diff --git a/parser/parser.py b/parser/parser.py index 5d47160..bdc0349 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -1607,13 +1607,13 @@ class Rule: class Terminal(Rule): """A token, or terminal symbol in the grammar.""" - name: str | None + value: str | None pattern: "str | Re" meta: dict[str, typing.Any] regex: bool def __init__(self, pattern: "str|Re", *, name: str | None = None, **kwargs): - self.name = name + self.value = name self.pattern = pattern self.meta = kwargs self.regex = isinstance(pattern, Re) @@ -1623,7 +1623,7 @@ class Terminal(Rule): yield [self] def __repr__(self) -> str: - return self.name or "" + return self.value or "???" class NonTerminal(Rule): @@ -1782,6 +1782,217 @@ def rule( return wrapper +PrecedenceList = list[typing.Tuple[Assoc, list[Rule]]] + + +class Grammar: + """The base class for defining a grammar. + + Inherit from this, and define members for your nonterminals, and then + use the `build_table` method to construct the parse table. + + + Here's an example of a simple grammar: + + class SimpleGrammar(Grammar): + @rule + def expression(self): + return seq(self.expression, self.PLUS, self.term) | self.term + + @rule + def term(self): + return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID + + PLUS = Terminal('+') + LPAREN = Terminal('(') + RPAREN = Terminal(')') + ID = Terminal('id') + + + Not very exciting, perhaps, but it's something. 
+ """ + + _precedence: dict[str, typing.Tuple[Assoc, int]] + _start: str + _generator: type[GenerateLR0] + _terminals: list[Terminal] + _trivia: list[Terminal] + + def __init__( + self, + start: str | None = None, + precedence: PrecedenceList | None = None, + generator: type[GenerateLR0] | None = None, + trivia: list[str | Terminal] | None = None, + name: str | None = None, + ): + if start is None: + start = getattr(self, "start", None) + if start is None: + raise ValueError( + "The default start rule must either be specified in the constructor or as an " + "attribute in the class." + ) + + if precedence is None: + precedence = getattr(self, "precedence", []) + assert precedence is not None + + if generator is None: + generator = getattr(self, "generator", GenerateLALR) + assert generator is not None + + if trivia is None: + trivia = getattr(self, "trivia", []) + assert trivia is not None + + # Fixup terminal names with the name of the member that declared it. + terminals = {} + for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)): + if t.value is None: + t.value = n + + if n in terminals: + raise ValueError(f"More than one terminal has the name '{n}'") + terminals[n] = t + + # Resolve the trivia declarations correctly. + resolved_trivia: list[Terminal] = [] + for t in trivia: + if isinstance(t, str): + resolved = terminals.get(t) + if resolved is None: + raise ValueError(f"The trivia '{t}' is not a terminal name") + resolved_trivia.append(resolved) + else: + resolved_trivia.append(t) + + # Fix up the precedence table. 
+ precedence_table = {} + for prec, (associativity, symbols) in enumerate(precedence): + for symbol in symbols: + if isinstance(symbol, Terminal): + key = symbol.value + elif isinstance(symbol, NonTerminal): + key = symbol.name + else: + raise ValueError(f"{symbol} must be either a Token or a NonTerminal") + + precedence_table[key] = (associativity, prec + 1) + + if name is None: + name = getattr(self, "name", None) + if name is None: + name = self.__class__.__name__.removesuffix("Grammar").lower() + + self._precedence = precedence_table + self._start = start + self._generator = generator + self._terminals = list(terminals.values()) + self._trivia = resolved_trivia + self.name = name + + @property + def terminals(self) -> list[Terminal]: + return self._terminals + + @property + def resolved_trivia(self) -> list[Terminal]: + return self._trivia + + def generate_nonterminal_dict( + self, start: str | None = None + ) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]: + """Convert the rules into a dictionary of productions. + + Our table generators work on a very flat set of productions. This is the + first step in flattening the productions from the members: walk the rules + starting from the given start rule and flatten them, one by one, into a + dictionary that maps nonterminal rule name to its associated list of + productions. 
+ """ + if start is None: + start = self._start + + rules = inspect.getmembers(self, lambda x: isinstance(x, NonTerminal)) + nonterminals = {rule.name: rule for _, rule in rules} + transparents = {rule.name for _, rule in rules if rule.transparent} + + grammar = {} + + rule = nonterminals.get(start) + if rule is None: + raise ValueError(f"Cannot find a rule named '{start}'") + queue = [rule] + while len(queue) > 0: + rule = queue.pop() + if rule.name in grammar: + continue + + body = rule.generate_body(self) + for clause in body: + for symbol in clause: + if not isinstance(symbol, Terminal): + assert isinstance(symbol, str) + nonterminal = nonterminals.get(symbol) + if nonterminal is None: + raise ValueError(f"While processing {rule.name}: cannot find {symbol}") + queue.append(nonterminal) + + grammar[rule.name] = body + + return (grammar, transparents) + + def desugar( + self, start: str | None = None + ) -> typing.Tuple[list[typing.Tuple[str, list[str]]], set[str]]: + """Convert the rules into a flat list of productions. + + Our table generators work from a very flat set of productions. The form + produced by this function is one level flatter than the one produced by + generate_nonterminal_dict- less useful to people, probably, but it is + the input form needed by the Generator. + """ + temp_grammar, transparents = self.generate_nonterminal_dict(start) + + grammar = [] + for rule_name, clauses in temp_grammar.items(): + for clause in clauses: + new_clause = [] + for symbol in clause: + if isinstance(symbol, Terminal): + if symbol.value in temp_grammar: + raise ValueError( + f"'{symbol.value}' is the name of both a Terminal and a NonTerminal rule. This will cause problems." 
+ ) + new_clause.append(symbol.value) + else: + new_clause.append(symbol) + + grammar.append((rule_name, new_clause)) + + return grammar, transparents + + def build_table(self, start: str | None = None, generator=None) -> ParseTable: + """Construct a parse table for this grammar, starting at the named + nonterminal rule. + """ + if start is None: + start = self._start + desugared, transparents = self.desugar(start) + + if generator is None: + generator = self._generator + gen = generator(start, desugared, precedence=self._precedence, transparents=transparents) + table = gen.gen_table() + + for t in self._trivia: + assert t.value is not None + table.trivia.add(t.value) + + return table + + ############################################################################### # Lexer support ############################################################################### @@ -2000,7 +2211,7 @@ class NFAState: continue visited.add(state) - label = state.accept.name if state.accept is not None else "" + label = state.accept.value if state.accept is not None else "" f.write(f' {id(state)} [label="{label}"];\n') for target in state.epsilons: stack.append(target) @@ -2064,7 +2275,6 @@ UNICODE_MAX_CP = 1114112 @dataclasses.dataclass class ReSet(Re): values: list[Span] - inversion: bool = False # No semantic meaning, just pretty. 
@classmethod def from_ranges(cls, *args: str | tuple[str, str]) -> "ReSet": @@ -2101,7 +2311,7 @@ class ReSet(Re): assert lower < upper spans.append(Span(lower, upper)) - return ReSet(spans, inversion=not self.inversion) + return ReSet(spans) def __invert__(self) -> "ReSet": return self.invert() @@ -2285,24 +2495,69 @@ class NFASuperState: if accept is None: accept = st.accept - elif accept.name != st.accept.name: + elif accept.value != st.accept.value: if accept.regex and not st.accept.regex: accept = st.accept elif st.accept.regex and not accept.regex: pass else: raise ValueError( - f"Lexer is ambiguous: cannot distinguish between {accept.name} ('{accept.pattern}') and {st.accept.name} ('{st.accept.pattern}')" + f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {st.accept.value} ('{st.accept.pattern}')" ) return accept +def compile_lexer(grammar: Grammar) -> LexerTable: + # Parse the terminals all together into a big NFA rooted at `NFA`. + NFA = NFAState() + for terminal in grammar.terminals: + pattern = terminal.pattern + if isinstance(pattern, Re): + start, ends = pattern.to_nfa() + for end in ends: + end.accept = terminal + NFA.epsilons.append(start) + + else: + start = end = NFAState() + for c in pattern: + end = end.add_edge(Span.from_str(c), NFAState()) + end.accept = terminal + NFA.epsilons.append(start) + + NFA.dump_graph() + + # Convert the NFA into a DFA in the most straightforward way (by tracking + # sets of state closures, called SuperStates.) 
+ DFA: dict[NFASuperState, tuple[int, list[tuple[Span, NFASuperState]]]] = {} + + stack = [NFASuperState([NFA])] + while len(stack) > 0: + ss = stack.pop() + if ss in DFA: + continue + + edges = ss.edges() + + DFA[ss] = (len(DFA), edges) + for _, target in edges: + stack.append(target) + + return [ + ( + ss.accept_terminal(), + [(k, DFA[v][0]) for k, v in edges], + ) + for ss, (_, edges) in DFA.items() + ] + + def dump_lexer_table(table: LexerTable, name: str = "lexer.dot"): with open(name, "w", encoding="utf-8") as f: f.write("digraph G {\n") for index, (accept, edges) in enumerate(table): - label = accept.name if accept is not None else "" + label = accept.value if accept is not None else "" f.write(f' {index} [label="{label}"];\n') for span, target in edges: label = str(span).replace('"', '\\"') @@ -2406,264 +2661,3 @@ class Highlight(SyntaxMeta): class Variable(SyntaxMeta): class Language(SyntaxMeta): pass - - -############################################################################### -# Finally, the base class for grammars -############################################################################### - -PrecedenceList = list[typing.Tuple[Assoc, list[Rule]]] - - -class Grammar: - """The base class for defining a grammar. - - Inherit from this, and and define members for your nonterminals, and then - use the `build_tables` method to construct the parse tables. - - - Here's an example of a simple grammar: - - class SimpleGrammar(Grammar): - @rule - def expression(self): - return seq(self.expression, self.PLUS, self.term) | self.term - - @rule - def term(self): - return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID - - PLUS = Terminal('+') - LPAREN = Terminal('(') - RPAREN = Terminal(')') - ID = Terminal('id') - - - Not very exciting, perhaps, but it's something. 
- """ - - _precedence: dict[str, typing.Tuple[Assoc, int]] - _generator: type[GenerateLR0] - _terminals: list[Terminal] - _trivia: list[Terminal] - - def __init__( - self, - start: str | None = None, - precedence: PrecedenceList | None = None, - generator: type[GenerateLR0] | None = None, - trivia: list[str | Terminal] | None = None, - name: str | None = None, - ): - if start is None: - start = getattr(self, "start", None) - if start is None: - raise ValueError( - "The default start rule must either be specified in the constructor or as an " - "attribute in the class." - ) - - if precedence is None: - precedence = getattr(self, "precedence", []) - assert precedence is not None - - if generator is None: - generator = getattr(self, "generator", GenerateLALR) - assert generator is not None - - if trivia is None: - trivia = getattr(self, "trivia", []) - assert trivia is not None - - # Fixup terminal names with the name of the member that declared it. - terminals = {} - for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)): - if t.name is None: - t.name = n - - if n in terminals: - raise ValueError(f"More than one terminal has the name '{n}'") - terminals[n] = t - - # Resolve the trivia declarations correctly. - resolved_trivia: list[Terminal] = [] - for t in trivia: - if isinstance(t, str): - resolved = terminals.get(t) - if resolved is None: - raise ValueError(f"The trivia '{t}' is not a terminal name") - resolved_trivia.append(resolved) - else: - resolved_trivia.append(t) - - # Fix up the precedence table. 
- precedence_table = {} - for prec, (associativity, symbols) in enumerate(precedence): - for symbol in symbols: - if isinstance(symbol, Terminal): - key = symbol.name - elif isinstance(symbol, NonTerminal): - key = symbol.name - else: - raise ValueError(f"{symbol} must be either a Token or a NonTerminal") - - precedence_table[key] = (associativity, prec + 1) - - if name is None: - name = getattr(self, "name", None) - if name is None: - name = self.__class__.__name__.removesuffix("Grammar").lower() - - self._precedence = precedence_table - self.start = start - self._generator = generator - self._terminals = list(terminals.values()) - self._trivia = resolved_trivia - self.name = name - - def terminals(self) -> list[Terminal]: - return self._terminals - - @property - def resolved_trivia(self) -> list[Terminal]: - return self._trivia - - def non_terminals(self) -> list[NonTerminal]: - return [nt for _, nt in inspect.getmembers(self, lambda x: isinstance(x, NonTerminal))] - - def generate_nonterminal_dict( - self, start: str | None = None - ) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]: - """Convert the rules into a dictionary of productions. - - Our table generators work on a very flat set of productions. This is the - first step in flattening the productions from the members: walk the rules - starting from the given start rule and flatten them, one by one, into a - dictionary that maps nonterminal rule name to its associated list of - productions. 
- """ - if start is None: - start = self.start - - rules = self.non_terminals() - nonterminals = {rule.name: rule for rule in rules} - transparents = {rule.name for rule in rules if rule.transparent} - - grammar = {} - - rule = nonterminals.get(start) - if rule is None: - raise ValueError(f"Cannot find a rule named '{start}'") - queue = [rule] - while len(queue) > 0: - rule = queue.pop() - if rule.name in grammar: - continue - - body = rule.generate_body(self) - for clause in body: - for symbol in clause: - if not isinstance(symbol, Terminal): - assert isinstance(symbol, str) - nonterminal = nonterminals.get(symbol) - if nonterminal is None: - raise ValueError(f"While processing {rule.name}: cannot find {symbol}") - queue.append(nonterminal) - - grammar[rule.name] = body - - return (grammar, transparents) - - def desugar( - self, start: str | None = None - ) -> typing.Tuple[list[typing.Tuple[str, list[str]]], set[str]]: - """Convert the rules into a flat list of productions. - - Our table generators work from a very flat set of productions. The form - produced by this function is one level flatter than the one produced by - generate_nonterminal_dict- less useful to people, probably, but it is - the input form needed by the Generator. - """ - temp_grammar, transparents = self.generate_nonterminal_dict(start) - - grammar = [] - for rule_name, clauses in temp_grammar.items(): - for clause in clauses: - new_clause = [] - for symbol in clause: - if isinstance(symbol, Terminal): - if symbol.name in temp_grammar: - raise ValueError( - f"'{symbol.name}' is the name of both a Terminal and a NonTerminal rule. This will cause problems." - ) - new_clause.append(symbol.name) - else: - new_clause.append(symbol) - - grammar.append((rule_name, new_clause)) - - return grammar, transparents - - def build_table(self, start: str | None = None, generator=None) -> ParseTable: - """Construct a parse table for this grammar, starting at the named - nonterminal rule. 
- """ - if start is None: - start = self.start - desugared, transparents = self.desugar(start) - - if generator is None: - generator = self._generator - gen = generator(start, desugared, precedence=self._precedence, transparents=transparents) - table = gen.gen_table() - - for t in self._trivia: - assert t.name is not None - table.trivia.add(t.name) - - return table - - def compile_lexer(self) -> LexerTable: - """Construct a lexer table for this grammar.""" - # Parse the terminals all together into a big NFA rooted at `NFA`. - NFA = NFAState() - for terminal in self.terminals(): - pattern = terminal.pattern - if isinstance(pattern, Re): - start, ends = pattern.to_nfa() - for end in ends: - end.accept = terminal - NFA.epsilons.append(start) - - else: - start = end = NFAState() - for c in pattern: - end = end.add_edge(Span.from_str(c), NFAState()) - end.accept = terminal - NFA.epsilons.append(start) - - # NFA.dump_graph() - - # Convert the NFA into a DFA in the most straightforward way (by tracking - # sets of state closures, called SuperStates.) - DFA: dict[NFASuperState, tuple[int, list[tuple[Span, NFASuperState]]]] = {} - - stack = [NFASuperState([NFA])] - while len(stack) > 0: - ss = stack.pop() - if ss in DFA: - continue - - edges = ss.edges() - - DFA[ss] = (len(DFA), edges) - for _, target in edges: - stack.append(target) - - return [ - ( - ss.accept_terminal(), - [(k, DFA[v][0]) for k, v in edges], - ) - for ss, (_, edges) in DFA.items() - ] diff --git a/parser/runtime.py b/parser/runtime.py index 351c83a..24e617f 100644 --- a/parser/runtime.py +++ b/parser/runtime.py @@ -292,9 +292,9 @@ class Parser: # accessible in the tree. 
input_tokens = tokens.tokens() input: list[TokenValue] = [ - TokenValue(kind=kind.name, start=start, end=start + length) + TokenValue(kind=kind.value, start=start, end=start + length) for (kind, start, length) in input_tokens - if kind.name is not None and kind.name not in self.table.trivia + if kind.value is not None and kind.value not in self.table.trivia ] eof = 0 if len(input) == 0 else input[-1].end @@ -514,9 +514,9 @@ class GenericTokenStream: end = len(self._tokens) max_terminal_name = max( - len(terminal.name) + len(terminal.value) for terminal, _ in self.lexer - if terminal is not None and terminal.name is not None + if terminal is not None and terminal.value is not None ) max_offset_len = len(str(len(self.src))) @@ -539,6 +539,6 @@ class GenericTokenStream: else: line_part = " |" - line = f"{start:{max_offset_len}} {line_part} {column_index:3} {kind.name:{max_terminal_name}} {repr(value)}" + line = f"{start:{max_offset_len}} {line_part} {column_index:3} {kind.value:{max_terminal_name}} {repr(value)}" lines.append(line) return lines diff --git a/parser/tree_sitter.py b/parser/tree_sitter.py deleted file mode 100644 index d6270d2..0000000 --- a/parser/tree_sitter.py +++ /dev/null @@ -1,198 +0,0 @@ -import json -import pathlib - -from . import parser - - -def to_js_string(s: str) -> str: - result = json.dumps(s)[1:-1] - # JSON escapes double-quotes but we don't need to in our context. - result = result.replace('\\"', '"') - return result - - -def to_javascript_regex(re: parser.Re) -> str: - # NOTE: In general it's bad to introduce parenthesis into regular - # expressions where they're not required because they also create - # capture groups, but I think it doesn't apply to tree-sitter - # regular expressions (and it doesn't mean anything to me either.) 
- if isinstance(re, parser.ReSeq): - final = [] - queue = [] - queue.append(re) - while len(queue) > 0: - part = queue.pop() - if isinstance(part, parser.ReSeq): - queue.append(part.right) - queue.append(part.left) - else: - final.append(part) - - s = "".join([to_javascript_regex(p) for p in final]) - if len(final) > 1: - s = f"({s})" - return s - - elif isinstance(re, parser.ReAlt): - final = [] - queue = [] - queue.append(re) - while len(queue) > 0: - part = queue.pop() - if isinstance(part, parser.ReAlt): - queue.append(part.right) - queue.append(part.left) - else: - final.append(part) - - s = "|".join([to_javascript_regex(p) for p in final]) - if len(final) > 1: - s = f"({s})" - return s - - elif isinstance(re, parser.ReQuestion): - s = to_javascript_regex(re.child) - return f"{s}?" - - elif isinstance(re, parser.RePlus): - s = to_javascript_regex(re.child) - return f"{s}+" - - elif isinstance(re, parser.ReStar): - s = to_javascript_regex(re.child) - return f"{s}*" - - elif isinstance(re, parser.ReSet): - if ( - len(re.values) == 1 - and re.values[0].lower == 0 - and re.values[0].upper == parser.UNICODE_MAX_CP - ): - return "." - - inverted = re.inversion - if inverted: - re = re.invert() - - parts = [] - for value in re.values: - if len(value) == 1: - parts.append(to_js_string(chr(value.lower))) - else: - parts.append( - "{}-{}".format( - to_js_string(chr(value.lower)), - to_js_string(chr(value.upper - 1)), - ) - ) - - s = "".join(parts) - if inverted: - s = "^" + s - if len(s) > 1: - # The only time this isn't a "set" is if this is a set of one - # range that is one character long, in which case it's better - # represented as a literal. - s = f"[{s}]" - return s - - raise Exception(f"Regex node {re} not supported for tree-sitter") - - -def convert_to_tree_sitter(rule: parser.Rule, grammar: parser.Grammar) -> str: - # TODO: Precedence? 
- - method = getattr(rule, "convert_to_tree_sitter", None) - if method is not None: - return method(grammar) - - if isinstance(rule, parser.Terminal): - if isinstance(rule.pattern, parser.Re): - regex = to_javascript_regex(rule.pattern) - return f"/{regex}/" - else: - string = to_js_string(rule.pattern) - return f'"{string}"' - - elif isinstance(rule, parser.AlternativeRule): - final = [] - queue = [] - has_nothing = False - queue.append(rule) - while len(queue) > 0: - part = queue.pop() - if isinstance(part, parser.AlternativeRule): - queue.append(part.right) - queue.append(part.left) - elif isinstance(part, parser.NothingRule): - has_nothing = True - else: - final.append(part) - - if len(final) == 0: - raise Exception("Unsupported rule: empty alternative") - - result = ", ".join([convert_to_tree_sitter(r, grammar) for r in final]) - if len(final) > 1: - result = f"choice({result})" - if has_nothing: - result = f"opt({result})" - return result - - elif isinstance(rule, parser.SequenceRule): - final = [] - queue = [] - queue.append(rule) - while len(queue) > 0: - part = queue.pop() - if isinstance(part, parser.SequenceRule): - queue.append(part.second) - queue.append(part.first) - elif isinstance(part, parser.NothingRule): - pass - else: - final.append(part) - - if len(final) == 0: - raise Exception("Unsupported rule: empty sequence") - - result = ", ".join([convert_to_tree_sitter(r, grammar) for r in final]) - if len(final) > 1: - result = f"seq({result})" - return result - - elif isinstance(rule, parser.NonTerminal): - return f"$['{rule.name}']" - - elif isinstance(rule, parser.MetadataRule): - return convert_to_tree_sitter(rule.rule, grammar) - - else: - raise ValueError(f"Rule {rule} not supported for tree-sitter") - - -# https://tree-sitter.github.io/tree-sitter/creating-parsers -def emit_tree_sitter_grammar(grammar: parser.Grammar, path: pathlib.Path | str): - # TODO: PRECEDENCE - path = pathlib.Path(path) / "grammar.js" - with open(path, "w", 
encoding="utf-8") as f: - f.write('/// \n') - f.write("// @ts-check\n") - f.write("\n") - f.write("module.exports = grammar({\n") - f.write(f" name: '{grammar.name}',\n") - f.write(" rules: {\n") - f.write(f" source_file: $ => $['{grammar.start}'],\n") - for rule in grammar.non_terminals(): - f.write("\n") - - rule_name = rule.name - if rule.transparent: - rule_name = "_" + rule_name - - body = rule.fn(grammar) - rule_definition = convert_to_tree_sitter(body, grammar) - f.write(f" '{rule_name}': $ => {rule_definition},") - - f.write(" }\n") - f.write("});") diff --git a/tests/test_lexer.py b/tests/test_lexer.py index eec0415..bd7407a 100644 --- a/tests/test_lexer.py +++ b/tests/test_lexer.py @@ -11,6 +11,7 @@ from parser import ( Grammar, rule, Terminal, + compile_lexer, dump_lexer_table, Re, ) @@ -371,7 +372,7 @@ def test_lexer_compile(): ) BLANKS = Terminal(Re.set("\r", "\n", "\t", " ").plus()) - lexer = LexTest().compile_lexer() + lexer = compile_lexer(LexTest()) dump_lexer_table(lexer) tokens = list(generic_tokenize("xy is ass", lexer)) assert tokens == [ @@ -409,7 +410,7 @@ def test_lexer_numbers(n: float): ) ) - lexer = LexTest().compile_lexer() + lexer = compile_lexer(LexTest()) dump_lexer_table(lexer) number_string = str(n)