lrparsers/parser/tree_sitter.py
John Doty ea5fab4e4e Tree-sitter regexps are structured
Instead of trying to build a regular expression string, just build a
structured thing with seq() and choice() and whatnot. This is
technically uglier but fixes a problem I found with comment regular
expressions so you know, it works, which is better than not working.

Also now tokens get named and maybe that's good? It's so hard to say.
2024-09-05 11:51:29 -07:00

272 lines
8.7 KiB
Python

import json
import pathlib
from . import parser
def to_js_string(s: str) -> str:
    """Escape `s` for embedding in generated JavaScript, without surrounding quotes.

    The text is JSON-encoded and the enclosing double quotes are stripped.
    Double quotes inside the text are left unescaped, so the result is only
    safe in a context that is not itself delimited by double quotes.
    """
    encoded = json.dumps(s)
    # Drop the opening and closing quote characters that JSON adds.
    inner = encoded[1:-1]
    # JSON escapes double-quotes but callers do not need that.
    return inner.replace('\\"', '"')
# Characters that have to be backslash-escaped to be taken literally when
# they appear inside a regular expression character class.
_RE_CLASS_SPECIALS = frozenset("[]^-")


def _flatten_re_tree(root, node_type) -> list:
    """Return the leaves of a left/right tree of `node_type` nodes, left to right.

    An explicit stack is used instead of recursion so that long chains of
    nested nodes do not consume Python stack frames.
    """
    leaves = []
    pending = [root]
    while pending:
        node = pending.pop()
        if isinstance(node, node_type):
            # Push right first so that the left side is popped (and therefore
            # emitted) first.
            pending.append(node.right)
            pending.append(node.left)
        else:
            leaves.append(node)
    return leaves


def _re_set_body(values, escape: bool) -> str:
    """Render the ranges of a set as the text that goes between `[` and `]`.

    When `escape` is true, characters that are special inside a character
    class are prefixed with a backslash.
    """

    def member(code_point: int) -> str:
        text = to_js_string(chr(code_point))
        if escape and text in _RE_CLASS_SPECIALS:
            text = "\\" + text
        return text

    parts = []
    for value in values:
        if len(value) == 1:
            parts.append(member(value.lower))
        else:
            # The last character of the range is `upper - 1`.
            parts.append(f"{member(value.lower)}-{member(value.upper - 1)}")
    return "".join(parts)


def to_javascript_regex(re: parser.Re) -> str:
    """Render a regular expression tree as a tree-sitter grammar DSL expression.

    Sequences become `seq(...)`, alternations `choice(...)`, and the `?`, `+`
    and `*` operators become `optional(...)`, `repeat1(...)` and
    `repeat(...)`. A character set becomes `/./` when it is the single range
    from 0 to `parser.UNICODE_MAX_CP`, a single-quoted literal when it is one
    plain character, and a `/[...]/` character class otherwise.

    Raises:
        Exception: if `re` is not one of the node types handled here.
    """
    # NOTE: In general it's bad to introduce parenthesis into regular
    #       expressions where they're not required because they also create
    #       capture groups, but I think it doesn't apply to tree-sitter
    #       regular expressions (and it doesn't mean anything to me either.)
    if isinstance(re, parser.ReSeq):
        items = _flatten_re_tree(re, parser.ReSeq)
        s = ", ".join(to_javascript_regex(p) for p in items)
        if len(items) > 1:
            s = f"seq({s})"
        return s

    if isinstance(re, parser.ReAlt):
        items = _flatten_re_tree(re, parser.ReAlt)
        s = ", ".join(to_javascript_regex(p) for p in items)
        if len(items) > 1:
            s = f"choice({s})"
        return s

    if isinstance(re, parser.ReQuestion):
        return f"optional({to_javascript_regex(re.child)})"

    if isinstance(re, parser.RePlus):
        return f"repeat1({to_javascript_regex(re.child)})"

    if isinstance(re, parser.ReStar):
        return f"repeat({to_javascript_regex(re.child)})"

    if isinstance(re, parser.ReSet):
        if (
            len(re.values) == 1
            and re.values[0].lower == 0
            and re.values[0].upper == parser.UNICODE_MAX_CP
        ):
            return "/./"

        inverted = re.inversion
        if inverted:
            re = re.invert()

        # The unescaped rendering decides between "literal" and "class" so
        # that the choice is not affected by the class-only escaping below.
        plain = _re_set_body(re.values, escape=False)
        if inverted:
            plain = "^" + plain

        if len(plain) > 1:
            # Inside a class, `]`, `[`, `^` and `-` must be escaped or they
            # would close the class, invert it, or create an unintended range.
            body = _re_set_body(re.values, escape=True)
            if inverted:
                body = "^" + body
            return f"/[{body}]/"

        # The only time this isn't a "set" is if this is a set of one
        # range that is one character long, in which case it's better
        # represented as a literal.
        literal = plain.replace("'", "\\'")
        return f"'{literal}'"

    raise Exception(f"Regex node {re} not supported for tree-sitter")
def terminal_name(t: parser.Terminal) -> str:
    """Return the name assigned to terminal `t`.

    Raises:
        Exception: if the terminal's `name` is None. The message includes
            the terminal itself so the offending token can be identified.
    """
    name = t.name
    if name is None:
        # This must be an f-string, otherwise the placeholder is printed
        # literally and the message does not say which terminal is unnamed.
        raise Exception(f"The terminal was not assigned a name: {t}")
    return name
def terminal_to_tree_sitter(rule: parser.Terminal) -> str:
    """Render the definition of terminal `rule` as a tree-sitter `token(...)`.

    A `parser.Re` pattern is rendered through `to_javascript_regex`; any
    other pattern is emitted as a double-quoted JavaScript string literal.
    """
    if isinstance(rule.pattern, parser.Re):
        result = to_javascript_regex(rule.pattern)
    else:
        # json.dumps yields a complete double-quoted literal with embedded
        # double quotes escaped. Wrapping the output of to_js_string in
        # quotes would not work here because that helper deliberately
        # un-escapes double quotes, producing a broken literal for any
        # pattern that contains one.
        result = json.dumps(rule.pattern)
    return f"token({result})"
def apply_precedence(js: str, name: str, grammar: parser.Grammar) -> str:
    """Wrap `js` in a tree-sitter precedence call if `name` has a precedence.

    `grammar.get_precedence(name)` is consulted; when it returns None the
    expression is returned unchanged. Otherwise the result is unpacked as
    `(assoc, level)` and `js` is wrapped in `prec.left`, `prec.right` or
    plain `prec` depending on the associativity.
    """
    entry = grammar.get_precedence(name)
    if entry is None:
        return js

    assoc, level = entry
    if assoc == parser.Assoc.LEFT:
        return f"prec.left({level}, {js})"
    if assoc == parser.Assoc.RIGHT:
        return f"prec.right({level}, {js})"
    return f"prec({level}, {js})"
def _flatten_rule_tree(root, node_type, near: str, far: str) -> list:
    """Collect the leaves of a binary tree of `node_type` nodes in order.

    `near` and `far` name the attributes holding the two children; leaves
    under `near` come before leaves under `far`. An explicit stack is used
    rather than recursion.
    """
    leaves = []
    pending = [root]
    while pending:
        node = pending.pop()
        if isinstance(node, node_type):
            # Far side is pushed first so the near side is popped first.
            pending.append(getattr(node, far))
            pending.append(getattr(node, near))
        else:
            leaves.append(node)
    return leaves


def _join_as_seq(pieces: list[str]) -> str:
    """Return the lone piece as-is, or wrap several pieces in `seq(...)`."""
    if len(pieces) == 1:
        return pieces[0]
    return "seq({})".format(", ".join(pieces))


def convert_to_tree_sitter(rule: parser.Rule, grammar: parser.Grammar) -> str:
    """Render a grammar rule as a tree-sitter grammar DSL expression.

    If `rule` has a `convert_to_tree_sitter` attribute that is not None, it
    is called with `grammar` and its result is returned directly. Otherwise
    the rule is dispatched on its type: terminals and non-terminals become
    `$['name']` references, alternatives become `choice(...)` (wrapped in
    `optional(...)` when one branch is a `NothingRule`), sequences become
    `seq(...)`, and metadata rules add a `field(...)` wrapper when they carry
    a "field" entry.

    Raises:
        Exception: for an alternative or sequence with no non-empty parts.
        ValueError: for a rule type that is not handled here.
    """
    override = getattr(rule, "convert_to_tree_sitter", None)
    if override is not None:
        return override(grammar)

    if isinstance(rule, parser.Terminal):
        # NOTE: We used to just inline these but now we explicitly have names
        #       for the tokens.
        return f"$['{terminal_name(rule)}']"

    if isinstance(rule, parser.AlternativeRule):
        leaves = _flatten_rule_tree(rule, parser.AlternativeRule, "left", "right")
        branches = [leaf for leaf in leaves if not isinstance(leaf, parser.NothingRule)]
        # Any leaf that was filtered out was an empty branch.
        has_nothing = len(branches) != len(leaves)

        if not branches:
            raise Exception("Unsupported rule: empty alternative")

        result = ", ".join(convert_to_tree_sitter(b, grammar) for b in branches)
        if len(branches) > 1:
            result = f"choice({result})"
        if has_nothing:
            result = f"optional({result})"
        return result

    if isinstance(rule, parser.SequenceRule):
        leaves = _flatten_rule_tree(rule, parser.SequenceRule, "first", "second")
        elements = [leaf for leaf in leaves if not isinstance(leaf, parser.NothingRule)]

        if not elements:
            raise Exception("Unsupported rule: empty sequence")

        # OK so there's a weird thing here? If we see a terminal in our
        # sequence that has a precedence then we need to group with the
        # previous element somehow.
        #
        # This is an incredible comment that explains how tree-sitter
        # actually thinks about conflicts:
        #
        #   https://github.com/tree-sitter/tree-sitter/issues/372
        #
        pieces = [convert_to_tree_sitter(e, grammar) for e in elements]

        # Walk from the last element to the first so that collapsing the
        # tail of `pieces` never disturbs indices still to be visited.
        for index in range(len(elements) - 1, -1, -1):
            element = elements[index]
            if (
                isinstance(element, parser.Terminal)
                and element.name is not None
                and grammar.get_precedence(element.name) is not None
            ):
                cut = max(index - 1, 0)
                grouped = _join_as_seq(pieces[cut:])
                grouped = apply_precedence(grouped, element.name, grammar)
                pieces = pieces[:cut] + [grouped]

        return _join_as_seq(pieces)

    if isinstance(rule, parser.NonTerminal):
        reference = rule.name
        if rule.transparent:
            reference = f"_{reference}"
        return f"$['{reference}']"

    if isinstance(rule, parser.MetadataRule):
        result = convert_to_tree_sitter(rule.rule, grammar)
        field = rule.metadata.get("field")
        if field is not None:
            result = f"field('{field}', {result})"
        return result

    raise ValueError(f"Rule {rule} not supported for tree-sitter")
# https://tree-sitter.github.io/tree-sitter/creating-parsers
def emit_tree_sitter_grammar(grammar: parser.Grammar, path: pathlib.Path | str):
    """Write a tree-sitter `grammar.js` for `grammar` into the directory `path`.

    The file contains a fixed header, the grammar name, the trivia terminals
    as `extras`, a `source_file` rule that refers to `grammar.start`, one rule
    per non-terminal (prefixed with `_` when transparent, wrapped in its
    precedence if it has one), and one `token(...)` rule per terminal.
    """
    target = pathlib.Path(path) / "grammar.js"
    with open(target, "w", encoding="utf-8") as out:
        out.writelines(
            [
                '/// <reference types="tree-sitter-cli/dsl" />\n',
                "// @ts-check\n",
                "// NOTE: This file was generated by a tool. Do not modify.\n",
                "\n",
                "module.exports = grammar({\n",
            ]
        )
        out.write(f" name: '{grammar.name}',\n")

        trivia_refs = [f"$['{terminal_name(t)}']" for t in grammar.trivia_terminals()]
        extras = ", ".join(trivia_refs)
        out.write(f" extras: $ => [{extras}],\n")

        out.write(" rules: {\n")
        out.write(f" source_file: $ => $['{grammar.start}'],\n")

        for non_terminal in grammar.non_terminals():
            out.write("\n")

            emitted_name = non_terminal.name
            if non_terminal.transparent:
                emitted_name = "_" + emitted_name

            definition = convert_to_tree_sitter(non_terminal.fn(grammar), grammar)
            # Precedence is looked up under the rule's own name, not the
            # underscore-prefixed name used in the output.
            definition = apply_precedence(definition, non_terminal.name, grammar)
            out.write(f" '{emitted_name}': $ => {definition},\n")

        for terminal in grammar.terminals():
            out.write("\n")
            token = terminal_to_tree_sitter(terminal)
            out.write(f" '{terminal.name}': $ => {token},")

        out.write("\n }\n});")