Compare commits

...

3 commits

Author SHA1 Message Date
1d28c82007 Saving this for posterity, but it is doomed
Remember that tree levels are generated by context free languages, not
regular languages, and so they can only be recognized by push-down
automata, not finite state machines.

What happened was that I failed to account for transparent rules.
Without transparent rules the children of a tree node do not have any
recursion in them (by definition!) and so therefore *are* a regular
language. But transparent rules change that: there *can be* recursion
hidden on the same tree level, and it should have been clear from a
moment's reflection that the recursion there meant that tree levels
were once again a context free language.

Fortunately we have a recognizer for context free languages lying
around, so we can just use that I guess.
2024-09-09 06:23:25 -07:00
0cbf696303 The start rule cannot be transparent 2024-09-09 06:23:11 -07:00
49b76b9bcc Teach trees to format themselves. 2024-09-09 06:22:56 -07:00
3 changed files with 322 additions and 166 deletions

View file

@ -2815,10 +2815,12 @@ class Grammar:
def get_precedence(self, name: str) -> None | tuple[Assoc, int]:
return self._precedence.get(name)
# TODO: The flattened form should retain NonTerminal, not just str.
def generate_nonterminal_dict(
self, start: str | None = None
) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]:
"""Convert the rules into a dictionary of productions.
"""Convert the rules into a dictionary of productions, and a set of
the names of transparent nonterminals.
Our table generators work on a very flat set of productions. This is the
first step in flattening the productions from the members: walk the rules
@ -2838,6 +2840,8 @@ class Grammar:
rule = nonterminals.get(start)
if rule is None:
raise ValueError(f"Cannot find a rule named '{start}'")
if rule.transparent:
raise ValueError("The start rule cannot be transparent")
queue = [rule]
while len(queue) > 0:
rule = queue.pop()

View file

@ -22,6 +22,29 @@ class Tree:
end: int
children: typing.Tuple["Tree | TokenValue", ...]
def format_lines(self, source: str | None = None) -> list[str]:
    """Render this tree as indented text, one node per line.

    Tree nodes print as ``name [start, end)`` (``???`` when the name is
    empty); token nodes print as ``kind [start, end)``, with the matched
    source text included when *source* is given. Children appear two
    spaces deeper than their parent, in pre-order.
    """
    out = []
    # Explicit work stack of (node, indent) pairs replaces the recursion.
    pending = [(self, 0)]
    while pending:
        node, depth = pending.pop()
        pad = " " * depth
        if isinstance(node, Tree):
            out.append(f"{pad}{node.name or '???'} [{node.start}, {node.end})")
            # Push children reversed so they pop in their original order.
            for child in reversed(node.children):
                pending.append((child, depth + 2))
        elif isinstance(node, TokenValue):
            if source is not None:
                shown = f":'{source[node.start:node.end]}'"
            else:
                shown = ""
            out.append(f"{pad}{node.kind}{shown} [{node.start}, {node.end})")
    return out
def format(self, source: str | None = None) -> str:
    """Return the whole tree rendering as one newline-joined string."""
    rendered = self.format_lines(source)
    return "\n".join(rendered)
@dataclass
class ParseError:
@ -278,13 +301,15 @@ class TokenStream(typing.Protocol):
...
# TODO: This runtime API sucks; the TokenStream is nice and all but I should
# also be able to have a function that takes a string and produces a
# tree directly, with caching intermediates for codegen and whatnot.
class Parser:
# Our stack is a stack of tuples, where the first entry is the state
# number and the second entry is the 'value' that was generated when the
# state was pushed.
table: parser.ParseTable
def __init__(self, table):
def __init__(self, table: parser.ParseTable):
self.table = table
def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]:
@ -301,6 +326,9 @@ class Parser:
input = input + [TokenValue(kind="$", start=eof, end=eof)]
input_index = 0
# Our stack is a stack of tuples, where the first entry is the state
# number and the second entry is the 'value' that was generated when
# the state was pushed.
stack: ParseStack = [(0, None)]
result: Tree | None = None
errors: list[ParseError] = []

View file

@ -1,5 +1,7 @@
# A prettier printer.
import abc
import dataclasses
import math
import typing
from . import parser
@ -12,6 +14,13 @@ class Cons:
right: "Document"
def cons(left: "Document", right: "Document") -> "Document":
    """Concatenate two documents, dropping an empty (falsy) side.

    Returns ``Cons(left, right)`` only when both sides are non-empty;
    otherwise returns whichever side is non-empty (or the shared empty
    value when both are).
    """
    if not (left and right):
        return left or right
    return Cons(left, right)
@dataclasses.dataclass(frozen=True)
class NewLine:
    """A line-break element; a member of the Document union."""
    pass
@ -47,144 +56,230 @@ class Lazy:
Document = None | Text | NewLine | Cons | Indent | Group | Lazy
def resolve_document(doc: Document) -> Document:
    """Recursively replace Lazy nodes in *doc* with their resolved values.

    Cons cells are rebuilt (via ``cons``) only when a child actually
    changed, so fully-resolved subtrees keep their identity.
    """
    if isinstance(doc, Cons):
        new_left = resolve_document(doc.left)
        new_right = resolve_document(doc.right)
        if new_left is doc.left and new_right is doc.right:
            return doc
        return cons(new_left, new_right)
    if isinstance(doc, Lazy):
        # NOTE: the resolved value is returned as-is, not re-resolved.
        return doc.resolve()
    return doc
def layout_document(doc: Document) -> typing.Generator[str, None, None]:
    """Lay out *doc* into output chunks. Not implemented yet."""
    del doc  # Explicitly discard the unused argument.
    raise NotImplementedError()
@dataclasses.dataclass
class Match:
    """The result of a successful match against a list of parse items."""
    # The document built from the items the matcher consumed.
    doc: Document
    # The suffix of the input items that was not consumed.
    remaining: list[runtime.Tree | runtime.TokenValue]
class Matcher:
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
raise NotImplementedError()
class NonTerminalMatcher(Matcher):
name: str
printer: "Printer"
def __init__(self, name: str, printer: "Printer"):
self.name = name
self.printer = printer
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
if len(items) == 0:
return None
item = items[0]
if isinstance(item, runtime.Tree) and item.name == self.name:
return Match(
doc=Lazy(value=lambda: self.printer.convert_tree_to_document(item)),
remaining=items[1:],
)
return None
class TerminalMatcher(Matcher):
@dataclasses.dataclass(frozen=True)
class MatchTerminal:
    """Instruction: consume one TokenValue whose kind is `name`, emitting a Text."""
    name: str
def __init__(self, name: str):
self.name = name
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
if len(items) == 0:
return None
item = items[0]
if isinstance(item, runtime.TokenValue) and item.kind == self.name:
return Match(
doc=Text(start=item.start, end=item.end),
remaining=items[1:],
)
return None
@dataclasses.dataclass(frozen=True)
class MatchNonTerminal:
    """Instruction: consume one Tree named `name`, emitting a Lazy conversion of it."""
    name: str
class IndentMatcher(Matcher):
@dataclasses.dataclass(frozen=True)
class Accept:
    """Instruction: succeed if the entire input has been consumed."""
    pass
@dataclasses.dataclass(frozen=True)
class StartGroup:
    """Instruction: mark the start of a Group on the result stack."""
    pass
@dataclasses.dataclass(frozen=True)
class EndGroup:
    """Instruction: pop results back to the matching StartGroup and wrap them in a Group."""
    pass
@dataclasses.dataclass(frozen=True)
class StartIndent:
    """Instruction: mark the start of an Indent region on the result stack."""
    pass
@dataclasses.dataclass(frozen=True)
class EndIndent:
    """Instruction: pop results back to the matching StartIndent and wrap them in Indent(amount)."""
    amount: int
child: Matcher
def __init__(self, amount: int, child: Matcher):
self.amount = amount
self.child = child
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
result = self.child.match(items)
if result is not None:
result.doc = Indent(amount=self.amount, doc=result.doc)
return result
class NewLineMatcher(Matcher):
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
return Match(
doc=NewLine(),
remaining=items,
)
@dataclasses.dataclass(frozen=True)
class Split:
    """Instruction: fork — the current thread continues at `left`, and a
    new thread is queued to start at `right`."""
    left: int
    right: int
class GroupMatcher(Matcher):
child: Matcher
def __init__(self, child: Matcher):
self.child = child
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
result = self.child.match(items)
if result is not None:
result.doc = Group(result.doc)
return result
@dataclasses.dataclass(frozen=True)
class Jump:
    """Instruction: unconditionally continue at instruction index `next`."""
    next: int
class CompleteMatcher(Matcher):
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
if len(items) == 0:
return Match(doc=None, remaining=[])
else:
return None
# The instruction set for the matching VM. Note that NewLine serves double
# duty: it is both a Document element and the emit-a-newline instruction.
MatchInstruction = (
    MatchTerminal
    | MatchNonTerminal
    | Accept
    | StartGroup
    | EndGroup
    | NewLine
    | StartIndent
    | EndIndent
    | Split
    | Jump
)
class AlternativeMatcher(Matcher):
children: list[Matcher]
### THIS DOESN'T WORK
###
### YOU CANNOT MATCH RULES WITH TRANSPARENT CHILDREN WITH A FSM, THIS IS NOT
### A REGULAR LANGUAGE IT IS CONTEXT FREE SO WE NEED TO RUN OUR REAL PARSER
### WHICH MEANS YES WE NEED TO GENERATE TABLES AGAIN OUT OF SUB-GRAMMARS FOR
### PRODUCTIONS BUT ALSO GENERATE NEW ONES FOR META AND ALSO RUN ACTIONS
###
### CHRIST.
###
class Matcher:
code: list[MatchInstruction]
def __init__(self, children: list[Matcher] | None = None):
self.children = children or []
def __init__(self):
self.code = []
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
for child in self.children:
m = child.match(items)
if m is not None:
return m
@dataclasses.dataclass
class ThreadState:
pc: int
position: int
count: int
results: list[Document | StartGroup | StartIndent]
def match(self, printer: "Printer", items: list[runtime.Tree | runtime.TokenValue]) -> Document:
threads: list[Matcher.ThreadState] = [
Matcher.ThreadState(pc=0, position=0, results=[], count=0)
]
while len(threads) > 0:
thread = threads.pop()
results = thread.results
while True:
thread.count += 1
if thread.count > 1000:
raise Exception("Too many steps!")
inst = self.code[thread.pc]
print(f"THREAD: {thread.pc}: {inst} ({thread.position})")
match inst:
case MatchTerminal(name):
if thread.position >= len(items):
break
item = items[thread.position]
if not isinstance(item, runtime.TokenValue):
break
if item.kind != name:
break
results.append(Text(item.start, item.end))
thread.pc += 1
thread.position += 1
case MatchNonTerminal(name):
if thread.position >= len(items):
break
item = items[thread.position]
if not isinstance(item, runtime.Tree):
break
if item.name != name:
break
def thunk(capture: runtime.Tree):
return lambda: printer.convert_tree_to_document(capture)
results.append(Lazy(thunk(item)))
thread.pc += 1
thread.position += 1
case Accept():
if thread.position != len(items):
break
result = None
for r in thread.results:
assert not isinstance(r, (StartGroup, StartIndent))
result = cons(result, r)
return result
case StartGroup():
results.append(inst)
thread.pc += 1
case EndGroup():
group_items = None
while not isinstance(results[-1], StartGroup):
item = typing.cast(Document, results.pop())
group_items = cons(item, group_items)
results.pop()
results.append(Group(group_items))
thread.pc += 1
case NewLine():
results.append(NewLine())
thread.pc += 1
case StartIndent():
results.append(inst)
thread.pc += 1
case EndIndent(amount):
indent_items = None
while not isinstance(results[-1], StartIndent):
item = typing.cast(Document, results.pop())
indent_items = cons(item, indent_items)
results.pop()
results.append(Indent(amount, indent_items))
thread.pc += 1
case Split(left, right):
new_thread = Matcher.ThreadState(
pc=right,
position=thread.position,
results=list(thread.results),
count=0,
)
threads.append(new_thread)
thread.pc = left
case Jump(where):
thread.pc = where
threads.append(thread)
case _:
typing.assert_never(inst)
return None
def format(self) -> str:
return "\n".join(self.format_lines())
class SequenceMatcher(Matcher):
children: list[Matcher]
def format_lines(self) -> list[str]:
lines = []
code_len = int(math.log10(len(self.code))) + 1
for i, inst in enumerate(self.code):
lines.append(f"{i: >{code_len}} {inst}")
return lines
def __init__(self, children: list[Matcher] | None = None):
self.children = children or []
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
doc = None
for child in self.children:
m = child.match(items)
if m is None:
return None
items = m.remaining
doc = Cons(doc, m.doc)
return Match(
doc=doc,
remaining=items,
)
@abc.abstractmethod
def format_into(self, lines: list[str], visited: dict["Matcher", int], indent: int = 0): ...
class PrettyMeta(parser.SyntaxMeta):
@ -195,68 +290,92 @@ class PrettyMeta(parser.SyntaxMeta):
class Printer:
grammar: parser.Grammar
matchers: dict[str, Matcher]
_matchers: dict[str, Matcher]
_nonterminals: dict[str, parser.NonTerminal]
def __init__(self, grammar: parser.Grammar):
self.grammar = grammar
self._nonterminals = {nt.name: nt for nt in grammar.non_terminals()}
self._matchers = {}
def lookup_nonterminal(self, name: str) -> parser.NonTerminal:
raise NotImplementedError()
return self._nonterminals[name]
def production_to_matcher(self, production: parser.FlattenedWithMetadata) -> Matcher:
results = []
for item in production:
if isinstance(item, str):
rule = self.lookup_nonterminal(item)
if rule.transparent:
# If it's transparent then we don't actually match a
# nonterminal here, we need to match against the contents
# of the rule, so we recurse.
results.append(self.rule_to_matcher(rule))
else:
results.append(NonTerminalMatcher(item, self))
elif isinstance(item, parser.Terminal):
name = item.name
assert name is not None
results.append(TerminalMatcher(name))
def compile_rule(self, rule: parser.NonTerminal) -> Matcher:
matcher = Matcher()
code = matcher.code
patcher: dict[str, int] = {}
def compile_nonterminal(rule: parser.NonTerminal):
sub_start = patcher.get(rule.name)
if sub_start is not None:
code.append(Jump(sub_start))
else:
meta, children = item
sub_start = len(code)
patcher[rule.name] = sub_start
tails = []
subs = list(rule.fn(self.grammar).flatten(with_metadata=True))
for sub in subs[:-1]:
split_pos = len(code)
code.append(Split(0, 0))
child = self.production_to_matcher(children)
compile_production(sub)
prettier = meta.get("prettier")
if isinstance(prettier, PrettyMeta):
if prettier.indent:
child = IndentMatcher(prettier.indent, child)
tails.append(len(code))
code.append(Jump(0))
if prettier.group:
child = GroupMatcher(child)
code[split_pos] = Split(sub_start + 1, len(code))
sub_start = len(code)
results.append(child)
compile_production(subs[-1])
if prettier.newline:
results.append(NewLineMatcher())
for tail in tails:
code[tail] = Jump(len(code))
def compile_production(production: parser.FlattenedWithMetadata):
for item in production:
if isinstance(item, str):
rule = self.lookup_nonterminal(item)
if rule.transparent:
# If it's transparent then we need to inline the pattern here.
compile_nonterminal(rule)
else:
code.append(MatchNonTerminal(item))
elif isinstance(item, parser.Terminal):
name = item.name
assert name is not None
code.append(MatchTerminal(name))
else:
results.append(child)
meta, children = item
return SequenceMatcher(results)
prettier = meta.get("prettier")
if isinstance(prettier, PrettyMeta):
if prettier.indent:
code.append(StartIndent())
if prettier.group:
code.append(StartGroup())
compile_production(children)
if isinstance(prettier, PrettyMeta):
if prettier.group:
code.append(EndGroup())
if prettier.indent:
code.append(EndIndent(prettier.indent))
if prettier.newline:
code.append(NewLine())
compile_nonterminal(rule)
code.append(Accept())
return matcher
def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher:
result = self.matchers.get(rule.name)
result = self._matchers.get(rule.name)
if result is None:
# Create the empty alternative, be sure to set up the
alts = AlternativeMatcher()
if rule.transparent:
result = alts
else:
result = SequenceMatcher(children=[alts, CompleteMatcher()])
self.matchers[rule.name] = result
for production in rule.fn(self.grammar).flatten(with_metadata=True):
alts.children.append(self.production_to_matcher(production))
result = self.compile_rule(rule)
self._matchers[rule.name] = result
return result
@ -266,11 +385,16 @@ class Printer:
rule = self.lookup_nonterminal(name)
matcher = self.rule_to_matcher(rule)
m = matcher.match(list(tree.children))
assert m is not None, "Could not match a valid tree" # TODO: Exception rather I think
return m.doc
print(f"--------")
print(f"Matching with:\n{matcher.format()}")
m = matcher.match(self, list(tree.children))
print(f"--------")
if m is None:
raise ValueError(
f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}\nMatcher:\n{matcher.format()}"
)
# return m
return resolve_document(m)
def format_tree(self, tree: runtime.Tree) -> str:
doc = self.convert_tree_to_document(tree)