Compare commits

...

3 commits

Author SHA1 Message Date
1d28c82007 Saving this for posterity, but it is doomed
Remember that tree levels are generated by context free languages, not
regular languages, and so they can only be recognized by push-down
automata, not finite state machines.

What happened was that I failed to account for transparent rules.
Without transparent rules the children of a tree node do not have any
recursion in them (by definition!) and so therefore *are* a regular
language. But transparent rules change that: there *can be* recursion
hidden on the same tree level, and it should have been clear from a
moment's reflection that the recursion there meant that tree levels
were once again a context free language.

Fortunately we have a recognizer for context free languages lying
around, so we can just use that I guess.
2024-09-09 06:23:25 -07:00
0cbf696303 The start rule cannot be transparent 2024-09-09 06:23:11 -07:00
49b76b9bcc Teach trees to format themselves. 2024-09-09 06:22:56 -07:00
3 changed files with 322 additions and 166 deletions

View file

@@ -2815,10 +2815,12 @@ class Grammar:
def get_precedence(self, name: str) -> None | tuple[Assoc, int]: def get_precedence(self, name: str) -> None | tuple[Assoc, int]:
return self._precedence.get(name) return self._precedence.get(name)
# TODO: The flattened form should retain NonTerminal, not just str.
def generate_nonterminal_dict( def generate_nonterminal_dict(
self, start: str | None = None self, start: str | None = None
) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]: ) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]:
"""Convert the rules into a dictionary of productions. """Convert the rules into a dictionary of productions, and a set of
the names of transparent nonterminals.
Our table generators work on a very flat set of productions. This is the Our table generators work on a very flat set of productions. This is the
first step in flattening the productions from the members: walk the rules first step in flattening the productions from the members: walk the rules
@@ -2838,6 +2840,8 @@ class Grammar:
rule = nonterminals.get(start) rule = nonterminals.get(start)
if rule is None: if rule is None:
raise ValueError(f"Cannot find a rule named '{start}'") raise ValueError(f"Cannot find a rule named '{start}'")
if rule.transparent:
raise ValueError("The start rule cannot be transparent")
queue = [rule] queue = [rule]
while len(queue) > 0: while len(queue) > 0:
rule = queue.pop() rule = queue.pop()

View file

@@ -22,6 +22,29 @@ class Tree:
end: int end: int
children: typing.Tuple["Tree | TokenValue", ...] children: typing.Tuple["Tree | TokenValue", ...]
def format_lines(self, source: str | None = None) -> list[str]:
lines = []
def format_node(node: Tree | TokenValue, indent: int):
match node:
case Tree(name=name, start=start, end=end, children=children):
lines.append((" " * indent) + f"{name or '???'} [{start}, {end})")
for child in children:
format_node(child, indent + 2)
case TokenValue(kind=kind, start=start, end=end):
if source is not None:
value = f":'{source[start:end]}'"
else:
value = ""
lines.append((" " * indent) + f"{kind}{value} [{start}, {end})")
format_node(self, 0)
return lines
def format(self, source: str | None = None) -> str:
return "\n".join(self.format_lines(source))
@dataclass @dataclass
class ParseError: class ParseError:
@@ -278,13 +301,15 @@ class TokenStream(typing.Protocol):
... ...
# TODO: This runtime API sucks; the TokenStream is nice and all but I should
# also be able to have a function that takes a string and produces a
# tree directly, with caching intermediates for codegen and whatnot.
class Parser: class Parser:
# Our stack is a stack of tuples, where the first entry is the state
# number and the second entry is the 'value' that was generated when the
# state was pushed.
table: parser.ParseTable table: parser.ParseTable
def __init__(self, table): def __init__(self, table: parser.ParseTable):
self.table = table self.table = table
def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]: def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]:
@@ -301,6 +326,9 @@ class Parser:
input = input + [TokenValue(kind="$", start=eof, end=eof)] input = input + [TokenValue(kind="$", start=eof, end=eof)]
input_index = 0 input_index = 0
# Our stack is a stack of tuples, where the first entry is the state
# number and the second entry is the 'value' that was generated when
# the state was pushed.
stack: ParseStack = [(0, None)] stack: ParseStack = [(0, None)]
result: Tree | None = None result: Tree | None = None
errors: list[ParseError] = [] errors: list[ParseError] = []

View file

@@ -1,5 +1,7 @@
# A prettier printer. # A prettier printer.
import abc
import dataclasses import dataclasses
import math
import typing import typing
from . import parser from . import parser
@@ -12,6 +14,13 @@ class Cons:
right: "Document" right: "Document"
def cons(left: "Document", right: "Document") -> "Document":
if left and right:
return Cons(left, right)
else:
return left or right
@dataclasses.dataclass(frozen=True) @dataclasses.dataclass(frozen=True)
class NewLine: class NewLine:
pass pass
@@ -47,144 +56,230 @@ class Lazy:
Document = None | Text | NewLine | Cons | Indent | Group | Lazy Document = None | Text | NewLine | Cons | Indent | Group | Lazy
def resolve_document(doc: Document) -> Document:
match doc:
case Cons(left, right):
lr = resolve_document(left)
rr = resolve_document(right)
if lr is not left or rr is not right:
return cons(lr, rr)
else:
return doc
case Lazy(_):
return doc.resolve()
case _:
return doc
def layout_document(doc: Document) -> typing.Generator[str, None, None]: def layout_document(doc: Document) -> typing.Generator[str, None, None]:
del doc
raise NotImplementedError() raise NotImplementedError()
@dataclasses.dataclass(frozen=True)
class MatchTerminal:
name: str
@dataclasses.dataclass(frozen=True)
class MatchNonTerminal:
name: str
@dataclasses.dataclass(frozen=True)
class Accept:
pass
@dataclasses.dataclass(frozen=True)
class StartGroup:
pass
@dataclasses.dataclass(frozen=True)
class EndGroup:
pass
@dataclasses.dataclass(frozen=True)
class StartIndent:
pass
@dataclasses.dataclass(frozen=True)
class EndIndent:
amount: int
@dataclasses.dataclass(frozen=True)
class Split:
left: int
right: int
@dataclasses.dataclass(frozen=True)
class Jump:
next: int
MatchInstruction = (
MatchTerminal
| MatchNonTerminal
| Accept
| StartGroup
| EndGroup
| NewLine
| StartIndent
| EndIndent
| Split
| Jump
)
### THIS DOESN'T WORK
###
### YOU CANNOT MATCH RULES WITH TRANSPARENT CHILDREN WITH A FSM, THIS IS NOT
### A REGULAR LANGUAGE IT IS CONTEXT FREE SO WE NEED TO RUN OUR REAL PARSER
### WHICH MEANS YES WE NEED TO GENERATE TABLES AGAIN OUT OF SUB-GRAMMARS FOR
### PRODUCTIONS BUT ALSO GENERATE NEW ONES FOR META AND ALSO RUN ACTIONS
###
### CHRIST.
###
class Matcher:
code: list[MatchInstruction]
def __init__(self):
self.code = []
@dataclasses.dataclass @dataclasses.dataclass
class Match: class ThreadState:
doc: Document pc: int
remaining: list[runtime.Tree | runtime.TokenValue] position: int
count: int
results: list[Document | StartGroup | StartIndent]
def match(self, printer: "Printer", items: list[runtime.Tree | runtime.TokenValue]) -> Document:
threads: list[Matcher.ThreadState] = [
Matcher.ThreadState(pc=0, position=0, results=[], count=0)
]
class Matcher: while len(threads) > 0:
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: thread = threads.pop()
raise NotImplementedError() results = thread.results
while True:
thread.count += 1
if thread.count > 1000:
raise Exception("Too many steps!")
inst = self.code[thread.pc]
print(f"THREAD: {thread.pc}: {inst} ({thread.position})")
match inst:
case MatchTerminal(name):
if thread.position >= len(items):
break
class NonTerminalMatcher(Matcher): item = items[thread.position]
name: str if not isinstance(item, runtime.TokenValue):
printer: "Printer" break
def __init__(self, name: str, printer: "Printer"): if item.kind != name:
self.name = name break
self.printer = printer
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: results.append(Text(item.start, item.end))
if len(items) == 0: thread.pc += 1
return None thread.position += 1
item = items[0] case MatchNonTerminal(name):
if isinstance(item, runtime.Tree) and item.name == self.name: if thread.position >= len(items):
return Match( break
doc=Lazy(value=lambda: self.printer.convert_tree_to_document(item)),
remaining=items[1:],
)
return None item = items[thread.position]
if not isinstance(item, runtime.Tree):
break
if item.name != name:
break
class TerminalMatcher(Matcher): def thunk(capture: runtime.Tree):
name: str return lambda: printer.convert_tree_to_document(capture)
def __init__(self, name: str): results.append(Lazy(thunk(item)))
self.name = name thread.pc += 1
thread.position += 1
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: case Accept():
if len(items) == 0: if thread.position != len(items):
return None break
item = items[0]
if isinstance(item, runtime.TokenValue) and item.kind == self.name:
return Match(
doc=Text(start=item.start, end=item.end),
remaining=items[1:],
)
return None
class IndentMatcher(Matcher):
amount: int
child: Matcher
def __init__(self, amount: int, child: Matcher):
self.amount = amount
self.child = child
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
result = self.child.match(items)
if result is not None:
result.doc = Indent(amount=self.amount, doc=result.doc)
result = None
for r in thread.results:
assert not isinstance(r, (StartGroup, StartIndent))
result = cons(result, r)
return result return result
case StartGroup():
results.append(inst)
thread.pc += 1
class NewLineMatcher(Matcher): case EndGroup():
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None: group_items = None
return Match( while not isinstance(results[-1], StartGroup):
doc=NewLine(), item = typing.cast(Document, results.pop())
remaining=items, group_items = cons(item, group_items)
results.pop()
results.append(Group(group_items))
thread.pc += 1
case NewLine():
results.append(NewLine())
thread.pc += 1
case StartIndent():
results.append(inst)
thread.pc += 1
case EndIndent(amount):
indent_items = None
while not isinstance(results[-1], StartIndent):
item = typing.cast(Document, results.pop())
indent_items = cons(item, indent_items)
results.pop()
results.append(Indent(amount, indent_items))
thread.pc += 1
case Split(left, right):
new_thread = Matcher.ThreadState(
pc=right,
position=thread.position,
results=list(thread.results),
count=0,
) )
threads.append(new_thread)
thread.pc = left
case Jump(where):
thread.pc = where
threads.append(thread)
class GroupMatcher(Matcher): case _:
child: Matcher typing.assert_never(inst)
def __init__(self, child: Matcher):
self.child = child
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
result = self.child.match(items)
if result is not None:
result.doc = Group(result.doc)
return result
class CompleteMatcher(Matcher):
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
if len(items) == 0:
return Match(doc=None, remaining=[])
else:
return None
class AlternativeMatcher(Matcher):
children: list[Matcher]
def __init__(self, children: list[Matcher] | None = None):
self.children = children or []
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
for child in self.children:
m = child.match(items)
if m is not None:
return m
return None return None
def format(self) -> str:
return "\n".join(self.format_lines())
class SequenceMatcher(Matcher): def format_lines(self) -> list[str]:
children: list[Matcher] lines = []
code_len = int(math.log10(len(self.code))) + 1
for i, inst in enumerate(self.code):
lines.append(f"{i: >{code_len}} {inst}")
return lines
def __init__(self, children: list[Matcher] | None = None): @abc.abstractmethod
self.children = children or [] def format_into(self, lines: list[str], visited: dict["Matcher", int], indent: int = 0): ...
def match(self, items: list[runtime.Tree | runtime.TokenValue]) -> Match | None:
doc = None
for child in self.children:
m = child.match(items)
if m is None:
return None
items = m.remaining
doc = Cons(doc, m.doc)
return Match(
doc=doc,
remaining=items,
)
class PrettyMeta(parser.SyntaxMeta): class PrettyMeta(parser.SyntaxMeta):
@@ -195,68 +290,92 @@ class PrettyMeta(parser.SyntaxMeta):
class Printer: class Printer:
grammar: parser.Grammar grammar: parser.Grammar
matchers: dict[str, Matcher] _matchers: dict[str, Matcher]
_nonterminals: dict[str, parser.NonTerminal]
def __init__(self, grammar: parser.Grammar): def __init__(self, grammar: parser.Grammar):
self.grammar = grammar self.grammar = grammar
self._nonterminals = {nt.name: nt for nt in grammar.non_terminals()}
self._matchers = {}
def lookup_nonterminal(self, name: str) -> parser.NonTerminal: def lookup_nonterminal(self, name: str) -> parser.NonTerminal:
raise NotImplementedError() return self._nonterminals[name]
def production_to_matcher(self, production: parser.FlattenedWithMetadata) -> Matcher: def compile_rule(self, rule: parser.NonTerminal) -> Matcher:
results = [] matcher = Matcher()
code = matcher.code
patcher: dict[str, int] = {}
def compile_nonterminal(rule: parser.NonTerminal):
sub_start = patcher.get(rule.name)
if sub_start is not None:
code.append(Jump(sub_start))
else:
sub_start = len(code)
patcher[rule.name] = sub_start
tails = []
subs = list(rule.fn(self.grammar).flatten(with_metadata=True))
for sub in subs[:-1]:
split_pos = len(code)
code.append(Split(0, 0))
compile_production(sub)
tails.append(len(code))
code.append(Jump(0))
code[split_pos] = Split(sub_start + 1, len(code))
sub_start = len(code)
compile_production(subs[-1])
for tail in tails:
code[tail] = Jump(len(code))
def compile_production(production: parser.FlattenedWithMetadata):
for item in production: for item in production:
if isinstance(item, str): if isinstance(item, str):
rule = self.lookup_nonterminal(item) rule = self.lookup_nonterminal(item)
if rule.transparent: if rule.transparent:
# If it's transparent then we don't actually match a # If it's transparent then we need to inline the pattern here.
# nonterminal here, we need to match against the contents compile_nonterminal(rule)
# of the rule, so we recurse.
results.append(self.rule_to_matcher(rule))
else: else:
results.append(NonTerminalMatcher(item, self)) code.append(MatchNonTerminal(item))
elif isinstance(item, parser.Terminal): elif isinstance(item, parser.Terminal):
name = item.name name = item.name
assert name is not None assert name is not None
results.append(TerminalMatcher(name)) code.append(MatchTerminal(name))
else: else:
meta, children = item meta, children = item
child = self.production_to_matcher(children)
prettier = meta.get("prettier") prettier = meta.get("prettier")
if isinstance(prettier, PrettyMeta): if isinstance(prettier, PrettyMeta):
if prettier.indent: if prettier.indent:
child = IndentMatcher(prettier.indent, child) code.append(StartIndent())
if prettier.group: if prettier.group:
child = GroupMatcher(child) code.append(StartGroup())
results.append(child) compile_production(children)
if isinstance(prettier, PrettyMeta):
if prettier.group:
code.append(EndGroup())
if prettier.indent:
code.append(EndIndent(prettier.indent))
if prettier.newline: if prettier.newline:
results.append(NewLineMatcher()) code.append(NewLine())
else: compile_nonterminal(rule)
results.append(child) code.append(Accept())
return matcher
return SequenceMatcher(results)
def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher: def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher:
result = self.matchers.get(rule.name) result = self._matchers.get(rule.name)
if result is None: if result is None:
# Create the empty alternative, be sure to set up the result = self.compile_rule(rule)
alts = AlternativeMatcher() self._matchers[rule.name] = result
if rule.transparent:
result = alts
else:
result = SequenceMatcher(children=[alts, CompleteMatcher()])
self.matchers[rule.name] = result
for production in rule.fn(self.grammar).flatten(with_metadata=True):
alts.children.append(self.production_to_matcher(production))
return result return result
@@ -266,11 +385,16 @@ class Printer:
rule = self.lookup_nonterminal(name) rule = self.lookup_nonterminal(name)
matcher = self.rule_to_matcher(rule) matcher = self.rule_to_matcher(rule)
print(f"--------")
m = matcher.match(list(tree.children)) print(f"Matching with:\n{matcher.format()}")
assert m is not None, "Could not match a valid tree" # TODO: Exception rather I think m = matcher.match(self, list(tree.children))
print(f"--------")
return m.doc if m is None:
raise ValueError(
f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}\nMatcher:\n{matcher.format()}"
)
# return m
return resolve_document(m)
def format_tree(self, tree: runtime.Tree) -> str: def format_tree(self, tree: runtime.Tree) -> str:
doc = self.convert_tree_to_document(tree) doc = self.convert_tree_to_document(tree)