Tree-sitter regexps are structured
Instead of trying to build a regular expression string, just build a structured expression out of seq(), choice(), and the like. This is technically uglier, but it fixes a problem I found with comment regular expressions, so, you know, it works, which is better than not working. Also, tokens now get named, and maybe that's good? It's so hard to say.
This commit is contained in:
parent
5e12af9f31
commit
ea5fab4e4e
1 changed files with 35 additions and 15 deletions
|
|
@ -28,9 +28,9 @@ def to_javascript_regex(re: parser.Re) -> str:
|
|||
else:
|
||||
final.append(part)
|
||||
|
||||
s = "".join([to_javascript_regex(p) for p in final])
|
||||
s = ", ".join([to_javascript_regex(p) for p in final])
|
||||
if len(final) > 1:
|
||||
s = f"({s})"
|
||||
s = f"seq({s})"
|
||||
return s
|
||||
|
||||
elif isinstance(re, parser.ReAlt):
|
||||
|
|
@ -45,22 +45,22 @@ def to_javascript_regex(re: parser.Re) -> str:
|
|||
else:
|
||||
final.append(part)
|
||||
|
||||
s = "|".join([to_javascript_regex(p) for p in final])
|
||||
s = ", ".join([to_javascript_regex(p) for p in final])
|
||||
if len(final) > 1:
|
||||
s = f"({s})"
|
||||
s = f"choice({s})"
|
||||
return s
|
||||
|
||||
elif isinstance(re, parser.ReQuestion):
|
||||
s = to_javascript_regex(re.child)
|
||||
return f"{s}?"
|
||||
return f"optional({s})"
|
||||
|
||||
elif isinstance(re, parser.RePlus):
|
||||
s = to_javascript_regex(re.child)
|
||||
return f"{s}+"
|
||||
return f"repeat1({s})"
|
||||
|
||||
elif isinstance(re, parser.ReStar):
|
||||
s = to_javascript_regex(re.child)
|
||||
return f"{s}*"
|
||||
return f"repeat({s})"
|
||||
|
||||
elif isinstance(re, parser.ReSet):
|
||||
if (
|
||||
|
|
@ -68,7 +68,7 @@ def to_javascript_regex(re: parser.Re) -> str:
|
|||
and re.values[0].lower == 0
|
||||
and re.values[0].upper == parser.UNICODE_MAX_CP
|
||||
):
|
||||
return "."
|
||||
return "/./"
|
||||
|
||||
inverted = re.inversion
|
||||
if inverted:
|
||||
|
|
@ -93,21 +93,31 @@ def to_javascript_regex(re: parser.Re) -> str:
|
|||
# The only time this isn't a "set" is if this is a set of one
|
||||
# range that is one character long, in which case it's better
|
||||
# represented as a literal.
|
||||
s = f"[{s}]"
|
||||
s = f"/[{s}]/"
|
||||
else:
|
||||
s = s.replace("'", "\\'")
|
||||
s = f"'{s}'"
|
||||
return s
|
||||
|
||||
raise Exception(f"Regex node {re} not supported for tree-sitter")
|
||||
|
||||
|
||||
def terminal_name(t: parser.Terminal) -> str:
    """Return the name assigned to terminal *t*.

    Args:
        t: The terminal whose ``name`` attribute is read.

    Returns:
        The value of ``t.name``.

    Raises:
        Exception: If ``t.name`` is ``None``. The message includes the
            terminal itself so the unnamed one can be identified.
    """
    name = t.name
    if name is None:
        # The f-prefix is required here: without it the message contains the
        # literal text "{t}" rather than the offending terminal.
        raise Exception(f"The terminal was not assigned a name: {t}")
    return name
|
||||
|
||||
|
||||
def terminal_to_tree_sitter(rule: parser.Terminal) -> str:
|
||||
if isinstance(rule.pattern, parser.Re):
|
||||
regex = to_javascript_regex(rule.pattern)
|
||||
regex = regex.replace("/", "\\/")
|
||||
result = f"/{regex}/"
|
||||
result = to_javascript_regex(rule.pattern)
|
||||
# regex = regex.replace("/", "\\/")
|
||||
# result = f"/{regex}/"
|
||||
else:
|
||||
string = to_js_string(rule.pattern)
|
||||
result = f'"{string}"'
|
||||
return result
|
||||
return f"token({result})"
|
||||
|
||||
|
||||
def apply_precedence(js: str, name: str, grammar: parser.Grammar) -> str:
|
||||
|
|
@ -130,7 +140,10 @@ def convert_to_tree_sitter(rule: parser.Rule, grammar: parser.Grammar) -> str:
|
|||
return method(grammar)
|
||||
|
||||
if isinstance(rule, parser.Terminal):
|
||||
return terminal_to_tree_sitter(rule)
|
||||
# NOTE: We used to just inline these but now we explicitly have names
|
||||
# for the tokens.
|
||||
target_name = terminal_name(rule)
|
||||
return f"$['{target_name}']"
|
||||
|
||||
elif isinstance(rule, parser.AlternativeRule):
|
||||
final = []
|
||||
|
|
@ -230,7 +243,7 @@ def emit_tree_sitter_grammar(grammar: parser.Grammar, path: pathlib.Path | str):
|
|||
f.write("module.exports = grammar({\n")
|
||||
f.write(f" name: '{grammar.name}',\n")
|
||||
|
||||
extras = ", ".join([terminal_to_tree_sitter(t) for t in grammar.trivia_terminals()])
|
||||
extras = ", ".join([f"$['{terminal_name(t)}']" for t in grammar.trivia_terminals()])
|
||||
f.write(f" extras: $ => [{extras}],\n")
|
||||
|
||||
f.write(" rules: {\n")
|
||||
|
|
@ -248,5 +261,12 @@ def emit_tree_sitter_grammar(grammar: parser.Grammar, path: pathlib.Path | str):
|
|||
|
||||
f.write(f" '{rule_name}': $ => {rule_definition},")
|
||||
|
||||
f.write("\n")
|
||||
for rule in grammar.terminals():
|
||||
f.write("\n")
|
||||
|
||||
definition = terminal_to_tree_sitter(rule)
|
||||
f.write(f" '{rule.name}': $ => {definition},")
|
||||
|
||||
f.write("\n }\n")
|
||||
f.write("});")
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue