From 1d28c82007db626887cfcd14729772b4c28a5c79 Mon Sep 17 00:00:00 2001 From: John Doty Date: Mon, 9 Sep 2024 06:23:25 -0700 Subject: [PATCH] Saving this for posterity, but it is doomed Remember that tree levels are generated by context-free languages, not regular languages, and so they can only be recognized by pushdown automata, not finite state machines. What happened was that I failed to account for transparent rules. Without transparent rules the children of a tree node do not have any recursion in them (by definition!) and so therefore *are* a regular language. But transparent rules change that: there *can be* recursion hidden on the same tree level, and it should have been clear from a moment's reflection that the recursion there meant that tree levels were once again a context-free language. Fortunately we have a recognizer for context-free languages lying around, so we can just use that I guess. --- parser/wadler.py | 446 ++++++++++++++++++++++++++++++----------------- 1 file changed, 285 insertions(+), 161 deletions(-) diff --git a/parser/wadler.py b/parser/wadler.py index fbcc2cc..381272d 100644 --- a/parser/wadler.py +++ b/parser/wadler.py @@ -1,5 +1,7 @@ # A prettier printer. +import abc import dataclasses +import math import typing from . 
import parser @@ -12,6 +14,13 @@ class Cons: right: "Document" +def cons(left: "Document", right: "Document") -> "Document": + if left and right: + return Cons(left, right) + else: + return left or right + + @dataclasses.dataclass(frozen=True) class NewLine: pass @@ -47,144 +56,230 @@ class Lazy: Document = None | Text | NewLine | Cons | Indent | Group | Lazy +def resolve_document(doc: Document) -> Document: + match doc: + case Cons(left, right): + lr = resolve_document(left) + rr = resolve_document(right) + if lr is not left or rr is not right: + return cons(lr, rr) + else: + return doc + + case Lazy(_): + return doc.resolve() + + case _: + return doc + + def layout_document(doc: Document) -> typing.Generator[str, None, None]: + del doc raise NotImplementedError() -@dataclasses.dataclass -class Match: - doc: Document - remaining: list[runtime.Tree | runtime.TokenValue] - - -class Matcher: - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - raise NotImplementedError() - - -class NonTerminalMatcher(Matcher): - name: str - printer: "Printer" - - def __init__(self, name: str, printer: "Printer"): - self.name = name - self.printer = printer - - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - if len(items) == 0: - return None - - item = items[0] - if isinstance(item, runtime.Tree) and item.name == self.name: - return Match( - doc=Lazy(value=lambda: self.printer.convert_tree_to_document(item)), - remaining=items[1:], - ) - - return None - - -class TerminalMatcher(Matcher): +@dataclasses.dataclass(frozen=True) +class MatchTerminal: name: str - def __init__(self, name: str): - self.name = name - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - if len(items) == 0: - return None - - item = items[0] - if isinstance(item, runtime.TokenValue) and item.kind == self.name: - return Match( - doc=Text(start=item.start, end=item.end), - remaining=items[1:], - ) - - return None 
+@dataclasses.dataclass(frozen=True) +class MatchNonTerminal: + name: str -class IndentMatcher(Matcher): +@dataclasses.dataclass(frozen=True) +class Accept: + pass + + +@dataclasses.dataclass(frozen=True) +class StartGroup: + pass + + +@dataclasses.dataclass(frozen=True) +class EndGroup: + pass + + +@dataclasses.dataclass(frozen=True) +class StartIndent: + pass + + +@dataclasses.dataclass(frozen=True) +class EndIndent: amount: int - child: Matcher - - def __init__(self, amount: int, child: Matcher): - self.amount = amount - self.child = child - - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - result = self.child.match(items) - if result is not None: - result.doc = Indent(amount=self.amount, doc=result.doc) - - return result -class NewLineMatcher(Matcher): - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - return Match( - doc=NewLine(), - remaining=items, - ) +@dataclasses.dataclass(frozen=True) +class Split: + left: int + right: int -class GroupMatcher(Matcher): - child: Matcher - - def __init__(self, child: Matcher): - self.child = child - - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - result = self.child.match(items) - if result is not None: - result.doc = Group(result.doc) - - return result +@dataclasses.dataclass(frozen=True) +class Jump: + next: int -class CompleteMatcher(Matcher): - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - if len(items) == 0: - return Match(doc=None, remaining=[]) - else: - return None +MatchInstruction = ( + MatchTerminal + | MatchNonTerminal + | Accept + | StartGroup + | EndGroup + | NewLine + | StartIndent + | EndIndent + | Split + | Jump +) -class AlternativeMatcher(Matcher): - children: list[Matcher] +### THIS DOESN'T WORK +### +### YOU CANNOT MATCH RULES WITH TRANSPARENT CHILDREN WITH A FSM, THIS IS NOT +### A REGULAR LANGUAGE IT IS CONTEXT FREE SO WE NEED TO RUN OUR REAL PARSER +### WHICH 
MEANS YES WE NEED TO GENERATE TABLES AGAIN OUT OF SUB-GRAMMARS FOR +### PRODUCTIONS BUT ALSO GENERATE NEW ONES FOR META AND ALSO RUN ACTIONS +### +### CHRIST. +### +class Matcher: + code: list[MatchInstruction] - def __init__(self, children: list[Matcher] | None = None): - self.children = children or [] + def __init__(self): + self.code = [] - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - for child in self.children: - m = child.match(items) - if m is not None: - return m + @dataclasses.dataclass + class ThreadState: + pc: int + position: int + count: int + results: list[Document | StartGroup | StartIndent] + + def match(self, printer: "Printer", items: list[runtime.Tree | runtime.TokenValue]) -> Document: + threads: list[Matcher.ThreadState] = [ + Matcher.ThreadState(pc=0, position=0, results=[], count=0) + ] + + while len(threads) > 0: + thread = threads.pop() + results = thread.results + while True: + thread.count += 1 + if thread.count > 1000: + raise Exception("Too many steps!") + + inst = self.code[thread.pc] + print(f"THREAD: {thread.pc}: {inst} ({thread.position})") + match inst: + case MatchTerminal(name): + if thread.position >= len(items): + break + + item = items[thread.position] + if not isinstance(item, runtime.TokenValue): + break + + if item.kind != name: + break + + results.append(Text(item.start, item.end)) + thread.pc += 1 + thread.position += 1 + + case MatchNonTerminal(name): + if thread.position >= len(items): + break + + item = items[thread.position] + if not isinstance(item, runtime.Tree): + break + + if item.name != name: + break + + def thunk(capture: runtime.Tree): + return lambda: printer.convert_tree_to_document(capture) + + results.append(Lazy(thunk(item))) + thread.pc += 1 + thread.position += 1 + + case Accept(): + if thread.position != len(items): + break + + result = None + for r in thread.results: + assert not isinstance(r, (StartGroup, StartIndent)) + result = cons(result, r) + return result + 
+ case StartGroup(): + results.append(inst) + thread.pc += 1 + + case EndGroup(): + group_items = None + while not isinstance(results[-1], StartGroup): + item = typing.cast(Document, results.pop()) + group_items = cons(item, group_items) + results.pop() + results.append(Group(group_items)) + thread.pc += 1 + + case NewLine(): + results.append(NewLine()) + thread.pc += 1 + + case StartIndent(): + results.append(inst) + thread.pc += 1 + + case EndIndent(amount): + indent_items = None + while not isinstance(results[-1], StartIndent): + item = typing.cast(Document, results.pop()) + indent_items = cons(item, indent_items) + results.pop() + results.append(Indent(amount, indent_items)) + thread.pc += 1 + + case Split(left, right): + new_thread = Matcher.ThreadState( + pc=right, + position=thread.position, + results=list(thread.results), + count=0, + ) + threads.append(new_thread) + thread.pc = left + + case Jump(where): + thread.pc = where + threads.append(thread) + + case _: + typing.assert_never(inst) return None + def format(self) -> str: + return "\n".join(self.format_lines()) -class SequenceMatcher(Matcher): - children: list[Matcher] + def format_lines(self) -> list[str]: + lines = [] + code_len = int(math.log10(len(self.code))) + 1 + for i, inst in enumerate(self.code): + lines.append(f"{i: >{code_len}} {inst}") + return lines - def __init__(self, children: list[Matcher] | None = None): - self.children = children or [] - - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - doc = None - for child in self.children: - m = child.match(items) - if m is None: - return None - - items = m.remaining - doc = Cons(doc, m.doc) - - return Match( - doc=doc, - remaining=items, - ) + @abc.abstractmethod + def format_into(self, lines: list[str], visited: dict["Matcher", int], indent: int = 0): ... 
class PrettyMeta(parser.SyntaxMeta): @@ -195,68 +290,92 @@ class PrettyMeta(parser.SyntaxMeta): class Printer: grammar: parser.Grammar - matchers: dict[str, Matcher] + _matchers: dict[str, Matcher] + _nonterminals: dict[str, parser.NonTerminal] def __init__(self, grammar: parser.Grammar): self.grammar = grammar + self._nonterminals = {nt.name: nt for nt in grammar.non_terminals()} + self._matchers = {} def lookup_nonterminal(self, name: str) -> parser.NonTerminal: - raise NotImplementedError() + return self._nonterminals[name] - def production_to_matcher(self, production: parser.FlattenedWithMetadata) -> Matcher: - results = [] - for item in production: - if isinstance(item, str): - rule = self.lookup_nonterminal(item) - if rule.transparent: - # If it's transparent then we don't actually match a - # nonterminal here, we need to match against the contents - # of the rule, so we recurse. - results.append(self.rule_to_matcher(rule)) - else: - results.append(NonTerminalMatcher(item, self)) - - elif isinstance(item, parser.Terminal): - name = item.name - assert name is not None - results.append(TerminalMatcher(name)) + def compile_rule(self, rule: parser.NonTerminal) -> Matcher: + matcher = Matcher() + code = matcher.code + patcher: dict[str, int] = {} + def compile_nonterminal(rule: parser.NonTerminal): + sub_start = patcher.get(rule.name) + if sub_start is not None: + code.append(Jump(sub_start)) else: - meta, children = item + sub_start = len(code) + patcher[rule.name] = sub_start + tails = [] + subs = list(rule.fn(self.grammar).flatten(with_metadata=True)) + for sub in subs[:-1]: + split_pos = len(code) + code.append(Split(0, 0)) - child = self.production_to_matcher(children) + compile_production(sub) - prettier = meta.get("prettier") - if isinstance(prettier, PrettyMeta): - if prettier.indent: - child = IndentMatcher(prettier.indent, child) + tails.append(len(code)) + code.append(Jump(0)) - if prettier.group: - child = GroupMatcher(child) + code[split_pos] = 
Split(sub_start + 1, len(code)) + sub_start = len(code) - results.append(child) + compile_production(subs[-1]) - if prettier.newline: - results.append(NewLineMatcher()) + for tail in tails: + code[tail] = Jump(len(code)) + + def compile_production(production: parser.FlattenedWithMetadata): + for item in production: + if isinstance(item, str): + rule = self.lookup_nonterminal(item) + if rule.transparent: + # If it's transparent then we need to inline the pattern here. + compile_nonterminal(rule) + else: + code.append(MatchNonTerminal(item)) + + elif isinstance(item, parser.Terminal): + name = item.name + assert name is not None + code.append(MatchTerminal(name)) else: - results.append(child) + meta, children = item - return SequenceMatcher(results) + prettier = meta.get("prettier") + if isinstance(prettier, PrettyMeta): + if prettier.indent: + code.append(StartIndent()) + if prettier.group: + code.append(StartGroup()) + + compile_production(children) + + if isinstance(prettier, PrettyMeta): + if prettier.group: + code.append(EndGroup()) + if prettier.indent: + code.append(EndIndent(prettier.indent)) + if prettier.newline: + code.append(NewLine()) + + compile_nonterminal(rule) + code.append(Accept()) + return matcher def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher: - result = self.matchers.get(rule.name) + result = self._matchers.get(rule.name) if result is None: - # Create the empty alternative, be sure to set up the - alts = AlternativeMatcher() - if rule.transparent: - result = alts - else: - result = SequenceMatcher(children=[alts, CompleteMatcher()]) - self.matchers[rule.name] = result - - for production in rule.fn(self.grammar).flatten(with_metadata=True): - alts.children.append(self.production_to_matcher(production)) + result = self.compile_rule(rule) + self._matchers[rule.name] = result return result @@ -266,11 +385,16 @@ class Printer: rule = self.lookup_nonterminal(name) matcher = self.rule_to_matcher(rule) - - m = 
matcher.match(list(tree.children)) - assert m is not None, "Could not match a valid tree" # TODO: Exception rather I think - - return m.doc + print(f"--------") + print(f"Matching with:\n{matcher.format()}") + m = matcher.match(self, list(tree.children)) + print(f"--------") + if m is None: + raise ValueError( + f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}\nMatcher:\n{matcher.format()}" + ) + # return m + return resolve_document(m) def format_tree(self, tree: runtime.Tree) -> str: doc = self.convert_tree_to_document(tree)