Tree-sitter regexps are structured

Instead of trying to build a regular expression string, just build a
structured thing with seq() and choice() and whatnot. This is
technically uglier, but it fixes a problem I found with comment regular
expressions, so, you know, it works, which is better than not working.

Also now tokens get named and maybe that's good? It's so hard to say.
This commit is contained in:
John Doty 2024-09-05 11:51:29 -07:00
parent 5e12af9f31
commit ea5fab4e4e

View file

@ -28,9 +28,9 @@ def to_javascript_regex(re: parser.Re) -> str:
else:
final.append(part)
s = "".join([to_javascript_regex(p) for p in final])
s = ", ".join([to_javascript_regex(p) for p in final])
if len(final) > 1:
s = f"({s})"
s = f"seq({s})"
return s
elif isinstance(re, parser.ReAlt):
@ -45,22 +45,22 @@ def to_javascript_regex(re: parser.Re) -> str:
else:
final.append(part)
s = "|".join([to_javascript_regex(p) for p in final])
s = ", ".join([to_javascript_regex(p) for p in final])
if len(final) > 1:
s = f"({s})"
s = f"choice({s})"
return s
elif isinstance(re, parser.ReQuestion):
s = to_javascript_regex(re.child)
return f"{s}?"
return f"optional({s})"
elif isinstance(re, parser.RePlus):
s = to_javascript_regex(re.child)
return f"{s}+"
return f"repeat1({s})"
elif isinstance(re, parser.ReStar):
s = to_javascript_regex(re.child)
return f"{s}*"
return f"repeat({s})"
elif isinstance(re, parser.ReSet):
if (
@ -68,7 +68,7 @@ def to_javascript_regex(re: parser.Re) -> str:
and re.values[0].lower == 0
and re.values[0].upper == parser.UNICODE_MAX_CP
):
return "."
return "/./"
inverted = re.inversion
if inverted:
@ -93,21 +93,31 @@ def to_javascript_regex(re: parser.Re) -> str:
# The only time this isn't a "set" is if this is a set of one
# range that is one character long, in which case it's better
# represented as a literal.
s = f"[{s}]"
s = f"/[{s}]/"
else:
s = s.replace("'", "\\'")
s = f"'{s}'"
return s
raise Exception(f"Regex node {re} not supported for tree-sitter")
def terminal_name(t: parser.Terminal) -> str:
    """Return the name assigned to terminal ``t``.

    Args:
        t: The terminal whose ``name`` attribute is read.

    Returns:
        The value of ``t.name``.

    Raises:
        Exception: If ``t.name`` is ``None``; the message includes ``t``.
    """
    name = t.name
    if name is None:
        # This must be an f-string; otherwise the message contains the
        # literal text "{t}" and never identifies the offending terminal.
        raise Exception(f"The terminal was not assigned a name: {t}")
    return name
def terminal_to_tree_sitter(rule: parser.Terminal) -> str:
if isinstance(rule.pattern, parser.Re):
regex = to_javascript_regex(rule.pattern)
regex = regex.replace("/", "\\/")
result = f"/{regex}/"
result = to_javascript_regex(rule.pattern)
# regex = regex.replace("/", "\\/")
# result = f"/{regex}/"
else:
string = to_js_string(rule.pattern)
result = f'"{string}"'
return result
return f"token({result})"
def apply_precedence(js: str, name: str, grammar: parser.Grammar) -> str:
@ -130,7 +140,10 @@ def convert_to_tree_sitter(rule: parser.Rule, grammar: parser.Grammar) -> str:
return method(grammar)
if isinstance(rule, parser.Terminal):
return terminal_to_tree_sitter(rule)
# NOTE: We used to just inline these but now we explicitly have names
# for the tokens.
target_name = terminal_name(rule)
return f"$['{target_name}']"
elif isinstance(rule, parser.AlternativeRule):
final = []
@ -230,7 +243,7 @@ def emit_tree_sitter_grammar(grammar: parser.Grammar, path: pathlib.Path | str):
f.write("module.exports = grammar({\n")
f.write(f" name: '{grammar.name}',\n")
extras = ", ".join([terminal_to_tree_sitter(t) for t in grammar.trivia_terminals()])
extras = ", ".join([f"$['{terminal_name(t)}']" for t in grammar.trivia_terminals()])
f.write(f" extras: $ => [{extras}],\n")
f.write(" rules: {\n")
@ -248,5 +261,12 @@ def emit_tree_sitter_grammar(grammar: parser.Grammar, path: pathlib.Path | str):
f.write(f" '{rule_name}': $ => {rule_definition},")
f.write("\n")
for rule in grammar.terminals():
f.write("\n")
definition = terminal_to_tree_sitter(rule)
f.write(f" '{rule.name}': $ => {definition},")
f.write("\n }\n")
f.write("});")