Compare commits

...

5 commits

4 changed files with 475 additions and 272 deletions

View file

@ -1607,13 +1607,13 @@ class Rule:
class Terminal(Rule):
"""A token, or terminal symbol in the grammar."""
value: str | None
name: str | None
pattern: "str | Re"
meta: dict[str, typing.Any]
regex: bool
def __init__(self, pattern: "str|Re", *, name: str | None = None, **kwargs):
self.value = name
self.name = name
self.pattern = pattern
self.meta = kwargs
self.regex = isinstance(pattern, Re)
@ -1623,7 +1623,7 @@ class Terminal(Rule):
yield [self]
def __repr__(self) -> str:
return self.value or "???"
return self.name or "<Unknown terminal>"
class NonTerminal(Rule):
@ -1782,217 +1782,6 @@ def rule(
return wrapper
PrecedenceList = list[typing.Tuple[Assoc, list[Rule]]]
class Grammar:
"""The base class for defining a grammar.
Inherit from this, and and define members for your nonterminals, and then
use the `build_tables` method to construct the parse tables.
Here's an example of a simple grammar:
class SimpleGrammar(Grammar):
@rule
def expression(self):
return seq(self.expression, self.PLUS, self.term) | self.term
@rule
def term(self):
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal('id')
Not very exciting, perhaps, but it's something.
"""
_precedence: dict[str, typing.Tuple[Assoc, int]]
_start: str
_generator: type[GenerateLR0]
_terminals: list[Terminal]
_trivia: list[Terminal]
def __init__(
self,
start: str | None = None,
precedence: PrecedenceList | None = None,
generator: type[GenerateLR0] | None = None,
trivia: list[str | Terminal] | None = None,
name: str | None = None,
):
if start is None:
start = getattr(self, "start", None)
if start is None:
raise ValueError(
"The default start rule must either be specified in the constructor or as an "
"attribute in the class."
)
if precedence is None:
precedence = getattr(self, "precedence", [])
assert precedence is not None
if generator is None:
generator = getattr(self, "generator", GenerateLALR)
assert generator is not None
if trivia is None:
trivia = getattr(self, "trivia", [])
assert trivia is not None
# Fixup terminal names with the name of the member that declared it.
terminals = {}
for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)):
if t.value is None:
t.value = n
if n in terminals:
raise ValueError(f"More than one terminal has the name '{n}'")
terminals[n] = t
# Resolve the trivia declarations correctly.
resolved_trivia: list[Terminal] = []
for t in trivia:
if isinstance(t, str):
resolved = terminals.get(t)
if resolved is None:
raise ValueError(f"The trivia '{t}' is not a terminal name")
resolved_trivia.append(resolved)
else:
resolved_trivia.append(t)
# Fix up the precedence table.
precedence_table = {}
for prec, (associativity, symbols) in enumerate(precedence):
for symbol in symbols:
if isinstance(symbol, Terminal):
key = symbol.value
elif isinstance(symbol, NonTerminal):
key = symbol.name
else:
raise ValueError(f"{symbol} must be either a Token or a NonTerminal")
precedence_table[key] = (associativity, prec + 1)
if name is None:
name = getattr(self, "name", None)
if name is None:
name = self.__class__.__name__.removesuffix("Grammar").lower()
self._precedence = precedence_table
self._start = start
self._generator = generator
self._terminals = list(terminals.values())
self._trivia = resolved_trivia
self.name = name
@property
def terminals(self) -> list[Terminal]:
return self._terminals
@property
def resolved_trivia(self) -> list[Terminal]:
return self._trivia
def generate_nonterminal_dict(
self, start: str | None = None
) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]:
"""Convert the rules into a dictionary of productions.
Our table generators work on a very flat set of productions. This is the
first step in flattening the productions from the members: walk the rules
starting from the given start rule and flatten them, one by one, into a
dictionary that maps nonterminal rule name to its associated list of
productions.
"""
if start is None:
start = self._start
rules = inspect.getmembers(self, lambda x: isinstance(x, NonTerminal))
nonterminals = {rule.name: rule for _, rule in rules}
transparents = {rule.name for _, rule in rules if rule.transparent}
grammar = {}
rule = nonterminals.get(start)
if rule is None:
raise ValueError(f"Cannot find a rule named '{start}'")
queue = [rule]
while len(queue) > 0:
rule = queue.pop()
if rule.name in grammar:
continue
body = rule.generate_body(self)
for clause in body:
for symbol in clause:
if not isinstance(symbol, Terminal):
assert isinstance(symbol, str)
nonterminal = nonterminals.get(symbol)
if nonterminal is None:
raise ValueError(f"While processing {rule.name}: cannot find {symbol}")
queue.append(nonterminal)
grammar[rule.name] = body
return (grammar, transparents)
def desugar(
self, start: str | None = None
) -> typing.Tuple[list[typing.Tuple[str, list[str]]], set[str]]:
"""Convert the rules into a flat list of productions.
Our table generators work from a very flat set of productions. The form
produced by this function is one level flatter than the one produced by
generate_nonterminal_dict- less useful to people, probably, but it is
the input form needed by the Generator.
"""
temp_grammar, transparents = self.generate_nonterminal_dict(start)
grammar = []
for rule_name, clauses in temp_grammar.items():
for clause in clauses:
new_clause = []
for symbol in clause:
if isinstance(symbol, Terminal):
if symbol.value in temp_grammar:
raise ValueError(
f"'{symbol.value}' is the name of both a Terminal and a NonTerminal rule. This will cause problems."
)
new_clause.append(symbol.value)
else:
new_clause.append(symbol)
grammar.append((rule_name, new_clause))
return grammar, transparents
def build_table(self, start: str | None = None, generator=None) -> ParseTable:
"""Construct a parse table for this grammar, starting at the named
nonterminal rule.
"""
if start is None:
start = self._start
desugared, transparents = self.desugar(start)
if generator is None:
generator = self._generator
gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
table = gen.gen_table()
for t in self._trivia:
assert t.value is not None
table.trivia.add(t.value)
return table
###############################################################################
# Lexer support
###############################################################################
@ -2211,7 +2000,7 @@ class NFAState:
continue
visited.add(state)
label = state.accept.value if state.accept is not None else ""
label = state.accept.name if state.accept is not None else ""
f.write(f' {id(state)} [label="{label}"];\n')
for target in state.epsilons:
stack.append(target)
@ -2275,6 +2064,7 @@ UNICODE_MAX_CP = 1114112
@dataclasses.dataclass
class ReSet(Re):
values: list[Span]
inversion: bool = False # No semantic meaning, just pretty.
@classmethod
def from_ranges(cls, *args: str | tuple[str, str]) -> "ReSet":
@ -2311,7 +2101,7 @@ class ReSet(Re):
assert lower < upper
spans.append(Span(lower, upper))
return ReSet(spans)
return ReSet(spans, inversion=not self.inversion)
def __invert__(self) -> "ReSet":
return self.invert()
@ -2495,69 +2285,24 @@ class NFASuperState:
if accept is None:
accept = st.accept
elif accept.value != st.accept.value:
elif accept.name != st.accept.name:
if accept.regex and not st.accept.regex:
accept = st.accept
elif st.accept.regex and not accept.regex:
pass
else:
raise ValueError(
f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {st.accept.value} ('{st.accept.pattern}')"
f"Lexer is ambiguous: cannot distinguish between {accept.name} ('{accept.pattern}') and {st.accept.name} ('{st.accept.pattern}')"
)
return accept
def compile_lexer(grammar: Grammar) -> LexerTable:
# Parse the terminals all together into a big NFA rooted at `NFA`.
NFA = NFAState()
for terminal in grammar.terminals:
pattern = terminal.pattern
if isinstance(pattern, Re):
start, ends = pattern.to_nfa()
for end in ends:
end.accept = terminal
NFA.epsilons.append(start)
else:
start = end = NFAState()
for c in pattern:
end = end.add_edge(Span.from_str(c), NFAState())
end.accept = terminal
NFA.epsilons.append(start)
NFA.dump_graph()
# Convert the NFA into a DFA in the most straightforward way (by tracking
# sets of state closures, called SuperStates.)
DFA: dict[NFASuperState, tuple[int, list[tuple[Span, NFASuperState]]]] = {}
stack = [NFASuperState([NFA])]
while len(stack) > 0:
ss = stack.pop()
if ss in DFA:
continue
edges = ss.edges()
DFA[ss] = (len(DFA), edges)
for _, target in edges:
stack.append(target)
return [
(
ss.accept_terminal(),
[(k, DFA[v][0]) for k, v in edges],
)
for ss, (_, edges) in DFA.items()
]
def dump_lexer_table(table: LexerTable, name: str = "lexer.dot"):
with open(name, "w", encoding="utf-8") as f:
f.write("digraph G {\n")
for index, (accept, edges) in enumerate(table):
label = accept.value if accept is not None else ""
label = accept.name if accept is not None else ""
f.write(f' {index} [label="{label}"];\n')
for span, target in edges:
label = str(span).replace('"', '\\"')
@ -2661,3 +2406,264 @@ class Highlight(SyntaxMeta):
class Variable(SyntaxMeta):
class Language(SyntaxMeta):
pass
###############################################################################
# Finally, the base class for grammars
###############################################################################
PrecedenceList = list[typing.Tuple[Assoc, list[Rule]]]
class Grammar:
    """The base class for defining a grammar.

    Inherit from this, define members for your nonterminals, and then
    use the `build_tables` method to construct the parse tables.

    Here's an example of a simple grammar:

        class SimpleGrammar(Grammar):
            @rule
            def expression(self):
                return seq(self.expression, self.PLUS, self.term) | self.term

            @rule
            def term(self):
                return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID

            PLUS = Terminal('+')
            LPAREN = Terminal('(')
            RPAREN = Terminal(')')
            ID = Terminal('id')

    Not very exciting, perhaps, but it's something.
    """

    # Maps a symbol name to (associativity, binding level). Levels start at
    # 1; see __init__.
    _precedence: dict[str, typing.Tuple[Assoc, int]]
    _generator: type[GenerateLR0]
    _terminals: list[Terminal]
    _trivia: list[Terminal]

    def __init__(
        self,
        start: str | None = None,
        precedence: PrecedenceList | None = None,
        generator: type[GenerateLR0] | None = None,
        trivia: list[str | Terminal] | None = None,
        name: str | None = None,
    ):
        """Initialize the grammar.

        Every argument falls back to a same-named class attribute when not
        provided: `start` (required one way or the other), `precedence`,
        `generator` (defaults to GenerateLALR), `trivia`, and `name`
        (defaults to the class name, minus any "Grammar" suffix, lowercased).

        Raises ValueError for a missing start rule, duplicate terminal
        names, unresolvable trivia names, or bad precedence entries.
        """
        if start is None:
            start = getattr(self, "start", None)
        if start is None:
            raise ValueError(
                "The default start rule must either be specified in the constructor or as an "
                "attribute in the class."
            )
        if precedence is None:
            precedence = getattr(self, "precedence", [])
        assert precedence is not None
        if generator is None:
            generator = getattr(self, "generator", GenerateLALR)
        assert generator is not None
        if trivia is None:
            trivia = getattr(self, "trivia", [])
        assert trivia is not None

        # Fixup terminal names with the name of the member that declared it.
        terminals = {}
        for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)):
            if t.name is None:
                t.name = n
            if n in terminals:
                raise ValueError(f"More than one terminal has the name '{n}'")
            terminals[n] = t

        # Resolve the trivia declarations correctly: strings must name a
        # declared terminal; Terminal objects pass through unchanged.
        resolved_trivia: list[Terminal] = []
        for t in trivia:
            if isinstance(t, str):
                resolved = terminals.get(t)
                if resolved is None:
                    raise ValueError(f"The trivia '{t}' is not a terminal name")
                resolved_trivia.append(resolved)
            else:
                resolved_trivia.append(t)

        # Fix up the precedence table. Earlier entries in the list bind
        # less tightly; `prec + 1` keeps declared levels >= 1.
        precedence_table = {}
        for prec, (associativity, symbols) in enumerate(precedence):
            for symbol in symbols:
                if isinstance(symbol, Terminal):
                    key = symbol.name
                elif isinstance(symbol, NonTerminal):
                    key = symbol.name
                else:
                    raise ValueError(f"{symbol} must be either a Token or a NonTerminal")
                precedence_table[key] = (associativity, prec + 1)

        if name is None:
            name = getattr(self, "name", None)
        if name is None:
            name = self.__class__.__name__.removesuffix("Grammar").lower()

        self._precedence = precedence_table
        self.start = start
        self._generator = generator
        self._terminals = list(terminals.values())
        self._trivia = resolved_trivia
        self.name = name

    def terminals(self) -> list[Terminal]:
        """Return all Terminal members declared on this grammar."""
        return self._terminals

    @property
    def resolved_trivia(self) -> list[Terminal]:
        """The trivia declarations, with string names resolved to Terminals."""
        return self._trivia

    def non_terminals(self) -> list[NonTerminal]:
        """Return all NonTerminal members declared on this grammar."""
        return [nt for _, nt in inspect.getmembers(self, lambda x: isinstance(x, NonTerminal))]

    def generate_nonterminal_dict(
        self, start: str | None = None
    ) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]:
        """Convert the rules into a dictionary of productions.

        Our table generators work on a very flat set of productions. This is the
        first step in flattening the productions from the members: walk the rules
        starting from the given start rule and flatten them, one by one, into a
        dictionary that maps nonterminal rule name to its associated list of
        productions.

        Also returns the set of names of "transparent" rules.
        """
        if start is None:
            start = self.start
        rules = self.non_terminals()
        nonterminals = {rule.name: rule for rule in rules}
        transparents = {rule.name for rule in rules if rule.transparent}

        grammar = {}
        rule = nonterminals.get(start)
        if rule is None:
            raise ValueError(f"Cannot find a rule named '{start}'")
        # Depth-first walk from the start rule: only rules reachable from
        # `start` end up in the result.
        queue = [rule]
        while len(queue) > 0:
            rule = queue.pop()
            if rule.name in grammar:
                continue
            body = rule.generate_body(self)
            for clause in body:
                for symbol in clause:
                    if not isinstance(symbol, Terminal):
                        assert isinstance(symbol, str)
                        nonterminal = nonterminals.get(symbol)
                        if nonterminal is None:
                            raise ValueError(f"While processing {rule.name}: cannot find {symbol}")
                        queue.append(nonterminal)
            grammar[rule.name] = body
        return (grammar, transparents)

    def desugar(
        self, start: str | None = None
    ) -> typing.Tuple[list[typing.Tuple[str, list[str]]], set[str]]:
        """Convert the rules into a flat list of productions.

        Our table generators work from a very flat set of productions. The form
        produced by this function is one level flatter than the one produced by
        generate_nonterminal_dict- less useful to people, probably, but it is
        the input form needed by the Generator.
        """
        temp_grammar, transparents = self.generate_nonterminal_dict(start)
        grammar = []
        for rule_name, clauses in temp_grammar.items():
            for clause in clauses:
                new_clause = []
                for symbol in clause:
                    if isinstance(symbol, Terminal):
                        # Terminals are referred to by name in the flat
                        # form, so a name collision with a rule is fatal.
                        if symbol.name in temp_grammar:
                            raise ValueError(
                                f"'{symbol.name}' is the name of both a Terminal and a NonTerminal rule. This will cause problems."
                            )
                        new_clause.append(symbol.name)
                    else:
                        new_clause.append(symbol)
                grammar.append((rule_name, new_clause))
        return grammar, transparents

    def build_table(self, start: str | None = None, generator=None) -> ParseTable:
        """Construct a parse table for this grammar, starting at the named
        nonterminal rule.
        """
        if start is None:
            start = self.start
        desugared, transparents = self.desugar(start)
        if generator is None:
            generator = self._generator
        gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
        table = gen.gen_table()
        # Record trivia terminals by name so the parser can skip them.
        for t in self._trivia:
            assert t.name is not None
            table.trivia.add(t.name)
        return table

    def compile_lexer(self) -> LexerTable:
        """Construct a lexer table for this grammar."""
        # Parse the terminals all together into a big NFA rooted at `NFA`.
        NFA = NFAState()
        for terminal in self.terminals():
            pattern = terminal.pattern
            if isinstance(pattern, Re):
                start, ends = pattern.to_nfa()
                for end in ends:
                    end.accept = terminal
                NFA.epsilons.append(start)
            else:
                # A plain string pattern becomes a linear chain of states,
                # one edge per character.
                start = end = NFAState()
                for c in pattern:
                    end = end.add_edge(Span.from_str(c), NFAState())
                end.accept = terminal
                NFA.epsilons.append(start)

        # NFA.dump_graph()

        # Convert the NFA into a DFA in the most straightforward way (by tracking
        # sets of state closures, called SuperStates.)
        DFA: dict[NFASuperState, tuple[int, list[tuple[Span, NFASuperState]]]] = {}
        stack = [NFASuperState([NFA])]
        while len(stack) > 0:
            ss = stack.pop()
            if ss in DFA:
                continue
            edges = ss.edges()
            DFA[ss] = (len(DFA), edges)
            for _, target in edges:
                stack.append(target)
        # Flatten: each DFA state becomes (accepting terminal or None,
        # [(span, target state index)]).
        return [
            (
                ss.accept_terminal(),
                [(k, DFA[v][0]) for k, v in edges],
            )
            for ss, (_, edges) in DFA.items()
        ]

View file

@ -292,9 +292,9 @@ class Parser:
# accessible in the tree.
input_tokens = tokens.tokens()
input: list[TokenValue] = [
TokenValue(kind=kind.value, start=start, end=start + length)
TokenValue(kind=kind.name, start=start, end=start + length)
for (kind, start, length) in input_tokens
if kind.value is not None and kind.value not in self.table.trivia
if kind.name is not None and kind.name not in self.table.trivia
]
eof = 0 if len(input) == 0 else input[-1].end
@ -514,9 +514,9 @@ class GenericTokenStream:
end = len(self._tokens)
max_terminal_name = max(
len(terminal.value)
len(terminal.name)
for terminal, _ in self.lexer
if terminal is not None and terminal.value is not None
if terminal is not None and terminal.name is not None
)
max_offset_len = len(str(len(self.src)))
@ -539,6 +539,6 @@ class GenericTokenStream:
else:
line_part = " |"
line = f"{start:{max_offset_len}} {line_part} {column_index:3} {kind.value:{max_terminal_name}} {repr(value)}"
line = f"{start:{max_offset_len}} {line_part} {column_index:3} {kind.name:{max_terminal_name}} {repr(value)}"
lines.append(line)
return lines

198
parser/tree_sitter.py Normal file
View file

@ -0,0 +1,198 @@
import json
import pathlib
from . import parser
def to_js_string(s: str) -> str:
    """Escape *s* for embedding in JavaScript source text.

    JSON string escaping is a subset of JavaScript's, so we lean on
    `json.dumps` and strip its surrounding quotes. Escaped double-quotes
    are restored to plain `"` because our emit contexts don't need them
    escaped.
    """
    escaped = json.dumps(s)[1:-1]
    return escaped.replace('\\"', '"')
def to_javascript_regex(re: parser.Re) -> str:
    """Render a `parser.Re` regex AST as JavaScript regex source text.

    Returns only the pattern body (no surrounding slashes); the caller is
    responsible for wrapping it in `/.../` as needed.
    """
    # NOTE: In general it's bad to introduce parenthesis into regular
    # expressions where they're not required because they also create
    # capture groups, but I think it doesn't apply to tree-sitter
    # regular expressions (and it doesn't mean anything to me either.)
    if isinstance(re, parser.ReSeq):
        # Flatten the binary ReSeq tree into an ordered list of parts.
        final = []
        queue = []
        queue.append(re)
        while len(queue) > 0:
            part = queue.pop()
            if isinstance(part, parser.ReSeq):
                # Right is pushed first so left pops (and renders) first,
                # preserving left-to-right order.
                queue.append(part.right)
                queue.append(part.left)
            else:
                final.append(part)
        s = "".join([to_javascript_regex(p) for p in final])
        if len(final) > 1:
            s = f"({s})"
        return s
    elif isinstance(re, parser.ReAlt):
        # Flatten nested alternations into a single a|b|c chain.
        final = []
        queue = []
        queue.append(re)
        while len(queue) > 0:
            part = queue.pop()
            if isinstance(part, parser.ReAlt):
                queue.append(part.right)
                queue.append(part.left)
            else:
                final.append(part)
        s = "|".join([to_javascript_regex(p) for p in final])
        if len(final) > 1:
            s = f"({s})"
        return s
    elif isinstance(re, parser.ReQuestion):
        s = to_javascript_regex(re.child)
        return f"{s}?"
    elif isinstance(re, parser.RePlus):
        s = to_javascript_regex(re.child)
        return f"{s}+"
    elif isinstance(re, parser.ReStar):
        s = to_javascript_regex(re.child)
        return f"{s}*"
    elif isinstance(re, parser.ReSet):
        if (
            len(re.values) == 1
            and re.values[0].lower == 0
            and re.values[0].upper == parser.UNICODE_MAX_CP
        ):
            # The set covers every code point: emit the universal match.
            # NOTE(review): JavaScript `.` does not match line terminators
            # without the /s flag; if this set can include "\n", "[^]" may
            # be more faithful — confirm against the lexer's semantics.
            return "."
        inverted = re.inversion
        if inverted:
            # `values` always holds the true (already-inverted) spans; undo
            # the inversion so we can print the original set behind a `^`.
            re = re.invert()
        parts = []
        for value in re.values:
            if len(value) == 1:
                parts.append(to_js_string(chr(value.lower)))
            else:
                # Span.upper is exclusive (see UNICODE_MAX_CP usage), hence
                # the -1 for the inclusive range end.
                # NOTE(review): characters special inside a class ("]", "-",
                # "^", "/") are not escaped here — assumes patterns avoid
                # them; verify.
                parts.append(
                    "{}-{}".format(
                        to_js_string(chr(value.lower)),
                        to_js_string(chr(value.upper - 1)),
                    )
                )
        s = "".join(parts)
        if inverted:
            s = "^" + s
        if len(s) > 1:
            # The only time this isn't a "set" is if this is a set of one
            # range that is one character long, in which case it's better
            # represented as a literal.
            s = f"[{s}]"
        return s
    raise Exception(f"Regex node {re} not supported for tree-sitter")
def convert_to_tree_sitter(rule: parser.Rule, grammar: parser.Grammar) -> str:
    """Render a grammar Rule as a tree-sitter JavaScript DSL expression.

    Returns JavaScript source text (e.g. `"+"`, `/[0-9]+/`,
    `choice(seq(...), ...)`) suitable for use as a rule body in a
    tree-sitter grammar.js.

    Raises if the rule (or any nested rule) has no tree-sitter equivalent.
    """
    # TODO: Precedence?
    # A rule may supply its own conversion; prefer it when present.
    method = getattr(rule, "convert_to_tree_sitter", None)
    if method is not None:
        return method(grammar)
    if isinstance(rule, parser.Terminal):
        if isinstance(rule.pattern, parser.Re):
            regex = to_javascript_regex(rule.pattern)
            return f"/{regex}/"
        else:
            string = to_js_string(rule.pattern)
            return f'"{string}"'
    elif isinstance(rule, parser.AlternativeRule):
        # Flatten nested alternatives into one choice(...). A NothingRule
        # branch means the whole choice may match nothing.
        final = []
        queue = []
        has_nothing = False
        queue.append(rule)
        while len(queue) > 0:
            part = queue.pop()
            if isinstance(part, parser.AlternativeRule):
                # Right pushed first so left pops (and renders) first.
                queue.append(part.right)
                queue.append(part.left)
            elif isinstance(part, parser.NothingRule):
                has_nothing = True
            else:
                final.append(part)
        if len(final) == 0:
            raise Exception("Unsupported rule: empty alternative")
        result = ", ".join([convert_to_tree_sitter(r, grammar) for r in final])
        if len(final) > 1:
            result = f"choice({result})"
        if has_nothing:
            # FIX: the tree-sitter DSL function is `optional`, not `opt`.
            result = f"optional({result})"
        return result
    elif isinstance(rule, parser.SequenceRule):
        # Flatten nested sequences into one seq(...); NothingRule
        # contributes nothing to a sequence.
        final = []
        queue = []
        queue.append(rule)
        while len(queue) > 0:
            part = queue.pop()
            if isinstance(part, parser.SequenceRule):
                queue.append(part.second)
                queue.append(part.first)
            elif isinstance(part, parser.NothingRule):
                pass
            else:
                final.append(part)
        if len(final) == 0:
            raise Exception("Unsupported rule: empty sequence")
        result = ", ".join([convert_to_tree_sitter(r, grammar) for r in final])
        if len(final) > 1:
            result = f"seq({result})"
        return result
    elif isinstance(rule, parser.NonTerminal):
        # FIX: transparent rules are *defined* with a leading underscore by
        # emit_tree_sitter_grammar (tree-sitter hides such rules), so
        # references must use the same name or tree-sitter reports an
        # undefined rule.
        name = "_" + rule.name if rule.transparent else rule.name
        return f"$['{name}']"
    elif isinstance(rule, parser.MetadataRule):
        # Metadata carries no syntax of its own; convert the wrapped rule.
        return convert_to_tree_sitter(rule.rule, grammar)
    else:
        raise ValueError(f"Rule {rule} not supported for tree-sitter")
# https://tree-sitter.github.io/tree-sitter/creating-parsers
def emit_tree_sitter_grammar(grammar: parser.Grammar, path: pathlib.Path | str):
    """Write a tree-sitter `grammar.js` for *grammar* into directory *path*.

    The generated file defines a `source_file` entry rule that delegates to
    the grammar's start rule, followed by one rule per nonterminal.
    Transparent rules are emitted with a leading underscore, which
    tree-sitter treats as "hidden".
    """
    # TODO: PRECEDENCE
    path = pathlib.Path(path) / "grammar.js"
    with open(path, "w", encoding="utf-8") as f:
        f.write('/// <reference types="tree-sitter-cli/dsl" />\n')
        f.write("// @ts-check\n")
        f.write("\n")
        f.write("module.exports = grammar({\n")
        f.write(f" name: '{grammar.name}',\n")
        f.write(" rules: {\n")
        f.write(f" source_file: $ => $['{grammar.start}'],\n")
        for rule in grammar.non_terminals():
            f.write("\n")
            rule_name = rule.name
            if rule.transparent:
                # Leading underscore marks the rule as hidden in tree-sitter.
                rule_name = "_" + rule_name
            body = rule.fn(grammar)
            rule_definition = convert_to_tree_sitter(body, grammar)
            # FIX: terminate each rule with a newline; previously the last
            # rule and the closing `}` were glued onto one line, and the
            # blank-line separator written above ended the previous rule
            # instead of separating rules.
            f.write(f" '{rule_name}': $ => {rule_definition},\n")
        f.write(" }\n")
        f.write("});\n")

View file

@ -11,7 +11,6 @@ from parser import (
Grammar,
rule,
Terminal,
compile_lexer,
dump_lexer_table,
Re,
)
@ -372,7 +371,7 @@ def test_lexer_compile():
)
BLANKS = Terminal(Re.set("\r", "\n", "\t", " ").plus())
lexer = compile_lexer(LexTest())
lexer = LexTest().compile_lexer()
dump_lexer_table(lexer)
tokens = list(generic_tokenize("xy is ass", lexer))
assert tokens == [
@ -410,7 +409,7 @@ def test_lexer_numbers(n: float):
)
)
lexer = compile_lexer(LexTest())
lexer = LexTest().compile_lexer()
dump_lexer_table(lexer)
number_string = str(n)