diff --git a/parser/parser.py b/parser/parser.py index c69fe65..e657897 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -2815,12 +2815,10 @@ class Grammar: def get_precedence(self, name: str) -> None | tuple[Assoc, int]: return self._precedence.get(name) - # TODO: The flattened form should retain NonTerminal, not just str. def generate_nonterminal_dict( self, start: str | None = None ) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]: - """Convert the rules into a dictionary of productions, and a set of - the names of transparent nonterminals. + """Convert the rules into a dictionary of productions. Our table generators work on a very flat set of productions. This is the first step in flattening the productions from the members: walk the rules @@ -2840,8 +2838,6 @@ class Grammar: rule = nonterminals.get(start) if rule is None: raise ValueError(f"Cannot find a rule named '{start}'") - if rule.transparent: - raise ValueError("The start rule cannot be transparent") queue = [rule] while len(queue) > 0: rule = queue.pop() diff --git a/parser/runtime.py b/parser/runtime.py index 3d55ef5..351c83a 100644 --- a/parser/runtime.py +++ b/parser/runtime.py @@ -22,29 +22,6 @@ class Tree: end: int children: typing.Tuple["Tree | TokenValue", ...] - def format_lines(self, source: str | None = None) -> list[str]: - lines = [] - - def format_node(node: Tree | TokenValue, indent: int): - match node: - case Tree(name=name, start=start, end=end, children=children): - lines.append((" " * indent) + f"{name or '???'} [{start}, {end})") - for child in children: - format_node(child, indent + 2) - - case TokenValue(kind=kind, start=start, end=end): - if source is not None: - value = f":'{source[start:end]}'" - else: - value = "" - lines.append((" " * indent) + f"{kind}{value} [{start}, {end})") - - format_node(self, 0) - return lines - - def format(self, source: str | None = None) -> str: - return "\n".join(self.format_lines(source)) - @dataclass class ParseError: @@ -301,15 +278,13 @@ class TokenStream(typing.Protocol): ... -# TODO: This runtime API sucks; the TokenStream is nice and all but I should -# also be able to have a function that takes a string and produces a -# tree directly, with caching intermediates for codegen and whatnot. - - class Parser: + # Our stack is a stack of tuples, where the first entry is the state + # number and the second entry is the 'value' that was generated when the + # state was pushed. table: parser.ParseTable - def __init__(self, table: parser.ParseTable): + def __init__(self, table): self.table = table def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]: @@ -326,9 +301,6 @@ class Parser: input = input + [TokenValue(kind="$", start=eof, end=eof)] input_index = 0 - # Our stack is a stack of tuples, where the first entry is the state - # number and the second entry is the 'value' that was generated when - # the state was pushed. stack: ParseStack = [(0, None)] result: Tree | None = None errors: list[ParseError] = [] diff --git a/parser/wadler.py b/parser/wadler.py index 381272d..fbcc2cc 100644 --- a/parser/wadler.py +++ b/parser/wadler.py @@ -1,7 +1,5 @@ # A prettier printer. -import abc import dataclasses -import math import typing from . import parser @@ -14,13 +12,6 @@ class Cons: right: "Document" -def cons(left: "Document", right: "Document") -> "Document": - if left and right: - return Cons(left, right) - else: - return left or right - - @dataclasses.dataclass(frozen=True) class NewLine: pass @@ -56,230 +47,144 @@ class Lazy: Document = None | Text | NewLine | Cons | Indent | Group | Lazy -def resolve_document(doc: Document) -> Document: - match doc: - case Cons(left, right): - lr = resolve_document(left) - rr = resolve_document(right) - if lr is not left or rr is not right: - return cons(lr, rr) - else: - return doc - - case Lazy(_): - return doc.resolve() - - case _: - return doc - - def layout_document(doc: Document) -> typing.Generator[str, None, None]: - del doc raise NotImplementedError() -@dataclasses.dataclass(frozen=True) -class MatchTerminal: - name: str +@dataclasses.dataclass +class Match: + doc: Document + remaining: list[runtime.Tree | runtime.TokenValue] -@dataclasses.dataclass(frozen=True) -class MatchNonTerminal: - name: str - - -@dataclasses.dataclass(frozen=True) -class Accept: - pass - - -@dataclasses.dataclass(frozen=True) -class StartGroup: - pass - - -@dataclasses.dataclass(frozen=True) -class EndGroup: - pass - - -@dataclasses.dataclass(frozen=True) -class StartIndent: - pass - - -@dataclasses.dataclass(frozen=True) -class EndIndent: - amount: int - - -@dataclasses.dataclass(frozen=True) -class Split: - left: int - right: int - - -@dataclasses.dataclass(frozen=True) -class Jump: - next: int - - -MatchInstruction = ( - MatchTerminal - | MatchNonTerminal - | Accept - | StartGroup - | EndGroup - | NewLine - | StartIndent - | EndIndent - | Split - | Jump -) - - -### THIS DOESN'T WORK -### -### YOU CANNOT MATCH RULES WITH TRANSPARENT CHILDREN WITH A FSM, THIS IS NOT -### A REGULAR LANGUAGE IT IS CONTEXT FREE SO WE NEED TO RUN OUR REAL PARSER -### WHICH MEANS YES WE NEED TO GENERATE TABLES AGAIN OUT OF SUB-GRAMMARS FOR -### PRODUCTIONS BUT ALSO GENERATE NEW ONES FOR META AND ALSO RUN ACTIONS -### -### CHRIST. -### class Matcher: - code: list[MatchInstruction] + def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: + raise NotImplementedError() - def __init__(self): - self.code = [] - @dataclasses.dataclass - class ThreadState: - pc: int - position: int - count: int - results: list[Document | StartGroup | StartIndent] +class NonTerminalMatcher(Matcher): + name: str + printer: "Printer" - def match(self, printer: "Printer", items: list[runtime.Tree | runtime.TokenValue]) -> Document: - threads: list[Matcher.ThreadState] = [ - Matcher.ThreadState(pc=0, position=0, results=[], count=0) - ] + def __init__(self, name: str, printer: "Printer"): + self.name = name + self.printer = printer - while len(threads) > 0: - thread = threads.pop() - results = thread.results - while True: - thread.count += 1 - if thread.count > 1000: - raise Exception("Too many steps!") + def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: + if len(items) == 0: + return None - inst = self.code[thread.pc] - print(f"THREAD: {thread.pc}: {inst} ({thread.position})") - match inst: - case MatchTerminal(name): - if thread.position >= len(items): - break - - item = items[thread.position] - if not isinstance(item, runtime.TokenValue): - break - - if item.kind != name: - break - - results.append(Text(item.start, item.end)) - thread.pc += 1 - thread.position += 1 - - case MatchNonTerminal(name): - if thread.position >= len(items): - break - - item = items[thread.position] - if not isinstance(item, runtime.Tree): - break - - if item.name != name: - break - - def thunk(capture: runtime.Tree): - return lambda: printer.convert_tree_to_document(capture) - - results.append(Lazy(thunk(item))) - thread.pc += 1 - thread.position += 1 - - case Accept(): - if thread.position != len(items): - break - - result = None - for r in thread.results: - assert not isinstance(r, (StartGroup, StartIndent)) - result = cons(result, r) - return result - - case StartGroup(): - results.append(inst) - thread.pc += 1 - - case EndGroup(): - group_items = None - while not isinstance(results[-1], StartGroup): - item = typing.cast(Document, results.pop()) - group_items = cons(item, group_items) - results.pop() - results.append(Group(group_items)) - thread.pc += 1 - - case NewLine(): - results.append(NewLine()) - thread.pc += 1 - - case StartIndent(): - results.append(inst) - thread.pc += 1 - - case EndIndent(amount): - indent_items = None - while not isinstance(results[-1], StartIndent): - item = typing.cast(Document, results.pop()) - indent_items = cons(item, indent_items) - results.pop() - results.append(Indent(amount, indent_items)) - thread.pc += 1 - - case Split(left, right): - new_thread = Matcher.ThreadState( - pc=right, - position=thread.position, - results=list(thread.results), - count=0, - ) - threads.append(new_thread) - thread.pc = left - - case Jump(where): - thread.pc = where - threads.append(thread) - - case _: - typing.assert_never(inst) + item = items[0] + if isinstance(item, runtime.Tree) and item.name == self.name: + return Match( + doc=Lazy(value=lambda: self.printer.convert_tree_to_document(item)), + remaining=items[1:], + ) return None - def format(self) -> str: - return "\n".join(self.format_lines()) - def format_lines(self) -> list[str]: - lines = [] - code_len = int(math.log10(len(self.code))) + 1 - for i, inst in enumerate(self.code): - lines.append(f"{i: >{code_len}} {inst}") - return lines +class TerminalMatcher(Matcher): + name: str - @abc.abstractmethod - def format_into(self, lines: list[str], visited: dict["Matcher", int], indent: int = 0): ... + def __init__(self, name: str): + self.name = name + + def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: + if len(items) == 0: + return None + + item = items[0] + if isinstance(item, runtime.TokenValue) and item.kind == self.name: + return Match( + doc=Text(start=item.start, end=item.end), + remaining=items[1:], + ) + + return None + + +class IndentMatcher(Matcher): + amount: int + child: Matcher + + def __init__(self, amount: int, child: Matcher): + self.amount = amount + self.child = child + + def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: + result = self.child.match(items) + if result is not None: + result.doc = Indent(amount=self.amount, doc=result.doc) + + return result + + +class NewLineMatcher(Matcher): + def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: + return Match( + doc=NewLine(), + remaining=items, + ) + + +class GroupMatcher(Matcher): + child: Matcher + + def __init__(self, child: Matcher): + self.child = child + + def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: + result = self.child.match(items) + if result is not None: + result.doc = Group(result.doc) + + return result + + +class CompleteMatcher(Matcher): + def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: + if len(items) == 0: + return Match(doc=None, remaining=[]) + else: + return None + + +class AlternativeMatcher(Matcher): + children: list[Matcher] + + def __init__(self, children: list[Matcher] | None = None): + self.children = children or [] + + def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: + for child in self.children: + m = child.match(items) + if m is not None: + return m + + return None + + +class SequenceMatcher(Matcher): + children: list[Matcher] + + def __init__(self, children: list[Matcher] | None = None): + self.children = children or [] + + def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: + doc = None + for child in self.children: + m = child.match(items) + if m is None: + return None + + items = m.remaining + doc = Cons(doc, m.doc) + + return Match( + doc=doc, + remaining=items, + ) class PrettyMeta(parser.SyntaxMeta): @@ -290,92 +195,68 @@ class PrettyMeta(parser.SyntaxMeta): class Printer: grammar: parser.Grammar - _matchers: dict[str, Matcher] - _nonterminals: dict[str, parser.NonTerminal] + matchers: dict[str, Matcher] def __init__(self, grammar: parser.Grammar): self.grammar = grammar - self._nonterminals = {nt.name: nt for nt in grammar.non_terminals()} - self._matchers = {} def lookup_nonterminal(self, name: str) -> parser.NonTerminal: - return self._nonterminals[name] + raise NotImplementedError() - def compile_rule(self, rule: parser.NonTerminal) -> Matcher: - matcher = Matcher() - code = matcher.code - patcher: dict[str, int] = {} + def production_to_matcher(self, production: parser.FlattenedWithMetadata) -> Matcher: + results = [] + for item in production: + if isinstance(item, str): + rule = self.lookup_nonterminal(item) + if rule.transparent: + # If it's transparent then we don't actually match a + # nonterminal here, we need to match against the contents + # of the rule, so we recurse. + results.append(self.rule_to_matcher(rule)) + else: + results.append(NonTerminalMatcher(item, self)) + + elif isinstance(item, parser.Terminal): + name = item.name + assert name is not None + results.append(TerminalMatcher(name)) - def compile_nonterminal(rule: parser.NonTerminal): - sub_start = patcher.get(rule.name) - if sub_start is not None: - code.append(Jump(sub_start)) else: - sub_start = len(code) - patcher[rule.name] = sub_start - tails = [] - subs = list(rule.fn(self.grammar).flatten(with_metadata=True)) - for sub in subs[:-1]: - split_pos = len(code) - code.append(Split(0, 0)) + meta, children = item - compile_production(sub) + child = self.production_to_matcher(children) - tails.append(len(code)) - code.append(Jump(0)) + prettier = meta.get("prettier") + if isinstance(prettier, PrettyMeta): + if prettier.indent: + child = IndentMatcher(prettier.indent, child) - code[split_pos] = Split(sub_start + 1, len(code)) - sub_start = len(code) + if prettier.group: + child = GroupMatcher(child) - compile_production(subs[-1]) + results.append(child) - for tail in tails: - code[tail] = Jump(len(code)) - - def compile_production(production: parser.FlattenedWithMetadata): - for item in production: - if isinstance(item, str): - rule = self.lookup_nonterminal(item) - if rule.transparent: - # If it's transparent then we need to inline the pattern here. - compile_nonterminal(rule) - else: - code.append(MatchNonTerminal(item)) - - elif isinstance(item, parser.Terminal): - name = item.name - assert name is not None - code.append(MatchTerminal(name)) + if prettier.newline: + results.append(NewLineMatcher()) else: - meta, children = item + results.append(child) - prettier = meta.get("prettier") - if isinstance(prettier, PrettyMeta): - if prettier.indent: - code.append(StartIndent()) - if prettier.group: - code.append(StartGroup()) - - compile_production(children) - - if isinstance(prettier, PrettyMeta): - if prettier.group: - code.append(EndGroup()) - if prettier.indent: - code.append(EndIndent(prettier.indent)) - if prettier.newline: - code.append(NewLine()) - - compile_nonterminal(rule) - code.append(Accept()) - return matcher + return SequenceMatcher(results) def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher: - result = self._matchers.get(rule.name) + result = self.matchers.get(rule.name) if result is None: - result = self.compile_rule(rule) - self._matchers[rule.name] = result + # Create the empty alternative, be sure to set up the + alts = AlternativeMatcher() + if rule.transparent: + result = alts + else: + result = SequenceMatcher(children=[alts, CompleteMatcher()]) + self.matchers[rule.name] = result + + for production in rule.fn(self.grammar).flatten(with_metadata=True): + alts.children.append(self.production_to_matcher(production)) return result @@ -385,16 +266,11 @@ class Printer: rule = self.lookup_nonterminal(name) matcher = self.rule_to_matcher(rule) - print(f"--------") - print(f"Matching with:\n{matcher.format()}") - m = matcher.match(self, list(tree.children)) - print(f"--------") - if m is None: - raise ValueError( - f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}\nMatcher:\n{matcher.format()}" - ) - # return m - return resolve_document(m) + + m = matcher.match(list(tree.children)) + assert m is not None, "Could not match a valid tree" # TODO: Exception rather I think + + return m.doc def format_tree(self, tree: runtime.Tree) -> str: doc = self.convert_tree_to_document(tree)