From 1d28c82007db626887cfcd14729772b4c28a5c79 Mon Sep 17 00:00:00 2001 From: John Doty Date: Mon, 9 Sep 2024 06:23:25 -0700 Subject: [PATCH] Saving this for posterity, but it is doomed Remember that tree levels are generated by context-free languages, not regular languages, and so they can only be recognized by pushdown automata, not finite state machines. What happened was that I failed to account for transparent rules. Without transparent rules the children of a tree node do not have any recursion in them (by definition!) and so therefore *are* a regular language. But transparent rules change that: there *can be* recursion hidden on the same tree level, and it should have been clear from a moment's reflection that the recursion there meant that tree levels were once again a context-free language. Fortunately we have a recognizer for context-free languages lying around, so we can just use that I guess. --- parser/wadler.py | 446 ++++++++++++++++++++++++++++++----------------- 1 file changed, 285 insertions(+), 161 deletions(-) diff --git a/parser/wadler.py b/parser/wadler.py index fbcc2cc..381272d 100644 --- a/parser/wadler.py +++ b/parser/wadler.py @@ -1,5 +1,7 @@ # A prettier printer. +import abc import dataclasses +import math import typing from . 
import parser @@ -12,6 +14,13 @@ class Cons: right: "Document" +def cons(left: "Document", right: "Document") -> "Document": + if left and right: + return Cons(left, right) + else: + return left or right + + @dataclasses.dataclass(frozen=True) class NewLine: pass @@ -47,144 +56,230 @@ class Lazy: Document = None | Text | NewLine | Cons | Indent | Group | Lazy +def resolve_document(doc: Document) -> Document: + match doc: + case Cons(left, right): + lr = resolve_document(left) + rr = resolve_document(right) + if lr is not left or rr is not right: + return cons(lr, rr) + else: + return doc + + case Lazy(_): + return doc.resolve() + + case _: + return doc + + def layout_document(doc: Document) -> typing.Generator[str, None, None]: + del doc raise NotImplementedError() -@dataclasses.dataclass -class Match: - doc: Document - remaining: list[runtime.Tree | runtime.TokenValue] - - -class Matcher: - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - raise NotImplementedError() - - -class NonTerminalMatcher(Matcher): - name: str - printer: "Printer" - - def __init__(self, name: str, printer: "Printer"): - self.name = name - self.printer = printer - - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - if len(items) == 0: - return None - - item = items[0] - if isinstance(item, runtime.Tree) and item.name == self.name: - return Match( - doc=Lazy(value=lambda: self.printer.convert_tree_to_document(item)), - remaining=items[1:], - ) - - return None - - -class TerminalMatcher(Matcher): +@dataclasses.dataclass(frozen=True) +class MatchTerminal: name: str - def __init__(self, name: str): - self.name = name - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - if len(items) == 0: - return None - - item = items[0] - if isinstance(item, runtime.TokenValue) and item.kind == self.name: - return Match( - doc=Text(start=item.start, end=item.end), - remaining=items[1:], - ) - - return None 
+@dataclasses.dataclass(frozen=True) +class MatchNonTerminal: + name: str -class IndentMatcher(Matcher): +@dataclasses.dataclass(frozen=True) +class Accept: + pass + + +@dataclasses.dataclass(frozen=True) +class StartGroup: + pass + + +@dataclasses.dataclass(frozen=True) +class EndGroup: + pass + + +@dataclasses.dataclass(frozen=True) +class StartIndent: + pass + + +@dataclasses.dataclass(frozen=True) +class EndIndent: amount: int - child: Matcher - - def __init__(self, amount: int, child: Matcher): - self.amount = amount - self.child = child - - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - result = self.child.match(items) - if result is not None: - result.doc = Indent(amount=self.amount, doc=result.doc) - - return result -class NewLineMatcher(Matcher): - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - return Match( - doc=NewLine(), - remaining=items, - ) +@dataclasses.dataclass(frozen=True) +class Split: + left: int + right: int -class GroupMatcher(Matcher): - child: Matcher - - def __init__(self, child: Matcher): - self.child = child - - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - result = self.child.match(items) - if result is not None: - result.doc = Group(result.doc) - - return result +@dataclasses.dataclass(frozen=True) +class Jump: + next: int -class CompleteMatcher(Matcher): - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - if len(items) == 0: - return Match(doc=None, remaining=[]) - else: - return None +MatchInstruction = ( + MatchTerminal + | MatchNonTerminal + | Accept + | StartGroup + | EndGroup + | NewLine + | StartIndent + | EndIndent + | Split + | Jump +) -class AlternativeMatcher(Matcher): - children: list[Matcher] +### THIS DOESN'T WORK +### +### YOU CANNOT MATCH RULES WITH TRANSPARENT CHILDREN WITH A FSM, THIS IS NOT +### A REGULAR LANGUAGE IT IS CONTEXT FREE SO WE NEED TO RUN OUR REAL PARSER +### WHICH 
MEANS YES WE NEED TO GENERATE TABLES AGAIN OUT OF SUB-GRAMMARS FOR +### PRODUCTIONS BUT ALSO GENERATE NEW ONES FOR META AND ALSO RUN ACTIONS +### +### CHRIST. +### +class Matcher: + code: list[MatchInstruction] - def __init__(self, children: list[Matcher] | None = None): - self.children = children or [] + def __init__(self): + self.code = [] - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - for child in self.children: - m = child.match(items) - if m is not None: - return m + @dataclasses.dataclass + class ThreadState: + pc: int + position: int + count: int + results: list[Document | StartGroup | StartIndent] + + def match(self, printer: "Printer", items: list[runtime.Tree | runtime.TokenValue]) -> Document: + threads: list[Matcher.ThreadState] = [ + Matcher.ThreadState(pc=0, position=0, results=[], count=0) + ] + + while len(threads) > 0: + thread = threads.pop() + results = thread.results + while True: + thread.count += 1 + if thread.count > 1000: + raise Exception("Too many steps!") + + inst = self.code[thread.pc] + print(f"THREAD: {thread.pc}: {inst} ({thread.position})") + match inst: + case MatchTerminal(name): + if thread.position >= len(items): + break + + item = items[thread.position] + if not isinstance(item, runtime.TokenValue): + break + + if item.kind != name: + break + + results.append(Text(item.start, item.end)) + thread.pc += 1 + thread.position += 1 + + case MatchNonTerminal(name): + if thread.position >= len(items): + break + + item = items[thread.position] + if not isinstance(item, runtime.Tree): + break + + if item.name != name: + break + + def thunk(capture: runtime.Tree): + return lambda: printer.convert_tree_to_document(capture) + + results.append(Lazy(thunk(item))) + thread.pc += 1 + thread.position += 1 + + case Accept(): + if thread.position != len(items): + break + + result = None + for r in thread.results: + assert not isinstance(r, (StartGroup, StartIndent)) + result = cons(result, r) + return result + 
+ case StartGroup(): + results.append(inst) + thread.pc += 1 + + case EndGroup(): + group_items = None + while not isinstance(results[-1], StartGroup): + item = typing.cast(Document, results.pop()) + group_items = cons(item, group_items) + results.pop() + results.append(Group(group_items)) + thread.pc += 1 + + case NewLine(): + results.append(NewLine()) + thread.pc += 1 + + case StartIndent(): + results.append(inst) + thread.pc += 1 + + case EndIndent(amount): + indent_items = None + while not isinstance(results[-1], StartIndent): + item = typing.cast(Document, results.pop()) + indent_items = cons(item, indent_items) + results.pop() + results.append(Indent(amount, indent_items)) + thread.pc += 1 + + case Split(left, right): + new_thread = Matcher.ThreadState( + pc=right, + position=thread.position, + results=list(thread.results), + count=0, + ) + threads.append(new_thread) + thread.pc = left + + case Jump(where): + thread.pc = where + threads.append(thread) + + case _: + typing.assert_never(inst) return None + def format(self) -> str: + return "\n".join(self.format_lines()) -class SequenceMatcher(Matcher): - children: list[Matcher] + def format_lines(self) -> list[str]: + lines = [] + code_len = int(math.log10(len(self.code))) + 1 + for i, inst in enumerate(self.code): + lines.append(f"{i: >{code_len}} {inst}") + return lines - def __init__(self, children: list[Matcher] | None = None): - self.children = children or [] - - def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: - doc = None - for child in self.children: - m = child.match(items) - if m is None: - return None - - items = m.remaining - doc = Cons(doc, m.doc) - - return Match( - doc=doc, - remaining=items, - ) + @abc.abstractmethod + def format_into(self, lines: list[str], visited: dict["Matcher", int], indent: int = 0): ... 
class PrettyMeta(parser.SyntaxMeta): @@ -195,68 +290,92 @@ class PrettyMeta(parser.SyntaxMeta): class Printer: grammar: parser.Grammar - matchers: dict[str, Matcher] + _matchers: dict[str, Matcher] + _nonterminals: dict[str, parser.NonTerminal] def __init__(self, grammar: parser.Grammar): self.grammar = grammar + self._nonterminals = {nt.name: nt for nt in grammar.non_terminals()} + self._matchers = {} def lookup_nonterminal(self, name: str) -> parser.NonTerminal: - raise NotImplementedError() + return self._nonterminals[name] - def production_to_matcher(self, production: parser.FlattenedWithMetadata) -> Matcher: - results = [] - for item in production: - if isinstance(item, str): - rule = self.lookup_nonterminal(item) - if rule.transparent: - # If it's transparent then we don't actually match a - # nonterminal here, we need to match against the contents - # of the rule, so we recurse. - results.append(self.rule_to_matcher(rule)) - else: - results.append(NonTerminalMatcher(item, self)) - - elif isinstance(item, parser.Terminal): - name = item.name - assert name is not None - results.append(TerminalMatcher(name)) + def compile_rule(self, rule: parser.NonTerminal) -> Matcher: + matcher = Matcher() + code = matcher.code + patcher: dict[str, int] = {} + def compile_nonterminal(rule: parser.NonTerminal): + sub_start = patcher.get(rule.name) + if sub_start is not None: + code.append(Jump(sub_start)) else: - meta, children = item + sub_start = len(code) + patcher[rule.name] = sub_start + tails = [] + subs = list(rule.fn(self.grammar).flatten(with_metadata=True)) + for sub in subs[:-1]: + split_pos = len(code) + code.append(Split(0, 0)) - child = self.production_to_matcher(children) + compile_production(sub) - prettier = meta.get("prettier") - if isinstance(prettier, PrettyMeta): - if prettier.indent: - child = IndentMatcher(prettier.indent, child) + tails.append(len(code)) + code.append(Jump(0)) - if prettier.group: - child = GroupMatcher(child) + code[split_pos] = 
Split(sub_start + 1, len(code)) + sub_start = len(code) - results.append(child) + compile_production(subs[-1]) - if prettier.newline: - results.append(NewLineMatcher()) + for tail in tails: + code[tail] = Jump(len(code)) + + def compile_production(production: parser.FlattenedWithMetadata): + for item in production: + if isinstance(item, str): + rule = self.lookup_nonterminal(item) + if rule.transparent: + # If it's transparent then we need to inline the pattern here. + compile_nonterminal(rule) + else: + code.append(MatchNonTerminal(item)) + + elif isinstance(item, parser.Terminal): + name = item.name + assert name is not None + code.append(MatchTerminal(name)) else: - results.append(child) + meta, children = item - return SequenceMatcher(results) + prettier = meta.get("prettier") + if isinstance(prettier, PrettyMeta): + if prettier.indent: + code.append(StartIndent()) + if prettier.group: + code.append(StartGroup()) + + compile_production(children) + + if isinstance(prettier, PrettyMeta): + if prettier.group: + code.append(EndGroup()) + if prettier.indent: + code.append(EndIndent(prettier.indent)) + if prettier.newline: + code.append(NewLine()) + + compile_nonterminal(rule) + code.append(Accept()) + return matcher def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher: - result = self.matchers.get(rule.name) + result = self._matchers.get(rule.name) if result is None: - # Create the empty alternative, be sure to set up the - alts = AlternativeMatcher() - if rule.transparent: - result = alts - else: - result = SequenceMatcher(children=[alts, CompleteMatcher()]) - self.matchers[rule.name] = result - - for production in rule.fn(self.grammar).flatten(with_metadata=True): - alts.children.append(self.production_to_matcher(production)) + result = self.compile_rule(rule) + self._matchers[rule.name] = result return result @@ -266,11 +385,16 @@ class Printer: rule = self.lookup_nonterminal(name) matcher = self.rule_to_matcher(rule) - - m = 
matcher.match(list(tree.children)) - assert m is not None, "Could not match a valid tree" # TODO: Exception rather I think - - return m.doc + print(f"--------") + print(f"Matching with:\n{matcher.format()}") + m = matcher.match(self, list(tree.children)) + print(f"--------") + if m is None: + raise ValueError( + f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}\nMatcher:\n{matcher.format()}" + ) + # return m + return resolve_document(m) def format_tree(self, tree: runtime.Tree) -> str: doc = self.convert_tree_to_document(tree)