Rebuild the matcher on grammars

Well, that wasn't so bad, now was it? Eh? Nice to have a parser generator lying around. Let's keep working to see if I can actually finish it.
2024-09-09 11:40:14 -07:00 · 2024-09-09 11:40:14 -07:00 · 7edf5e06bf
commit 7edf5e06bf
parent 1d28c82007
1 changed file with 132 additions and 242 deletions
--- a/parser/wadler.py
+++ b/parser/wadler.py
@ -1,7 +1,5 @@
 # A prettier printer.
 import abc
 import dataclasses
 import math
 import typing
 from . import parser
@ -52,6 +50,10 @@ class Lazy:
            self.value = self.value()
        return self.value
    @classmethod
    def from_tree(cls, tree: runtime.Tree, printer: "Printer") -> "Lazy":
        return Lazy(lambda: printer.convert_tree_to_document(tree))
 Document = None | Text | NewLine | Cons | Indent | Group | Lazy
@ -78,208 +80,88 @@ def layout_document(doc: Document) -> typing.Generator[str, None, None]:
    raise NotImplementedError()
-@dataclasses.dataclass(frozen=True)
+def child_to_name(child: runtime.Tree | runtime.TokenValue) -> str:
-class MatchTerminal:
+    if isinstance(child, runtime.Tree):
-    name: str
+        return f"tree_{child.name}"
    else:
        return f"token_{child.kind}"
@dataclasses.dataclass(frozen=True)
 class MatchNonTerminal:
    name: str
@dataclasses.dataclass(frozen=True)
 class Accept:
    pass
@dataclasses.dataclass(frozen=True)
 class StartGroup:
    pass
@dataclasses.dataclass(frozen=True)
 class EndGroup:
    pass
@dataclasses.dataclass(frozen=True)
 class StartIndent:
    pass
@dataclasses.dataclass(frozen=True)
 class EndIndent:
    amount: int
@dataclasses.dataclass(frozen=True)
 class Split:
    left: int
    right: int
@dataclasses.dataclass(frozen=True)
 class Jump:
    next: int
 MatchInstruction = (
    MatchTerminal
    | MatchNonTerminal
    | Accept
    | StartGroup
    | EndGroup
    | NewLine
    | StartIndent
    | EndIndent
    | Split
    | Jump
 )
 ### THIS DOESN'T WORK
 ###
 ### YOU CANNOT MATCH RULES WITH TRANSPARENT CHILDREN WITH A FSM, THIS IS NOT
 ### A REGULAR LANGUAGE IT IS CONTEXT FREE SO WE NEED TO RUN OUR REAL PARSER
 ### WHICH MEANS YES WE NEED TO GENERATE TABLES AGAIN OUT OF SUB-GRAMMARS FOR
 ### PRODUCTIONS BUT ALSO GENERATE NEW ONES FOR META AND ALSO RUN ACTIONS
 ###
 ### CHRIST.
 ###
 class Matcher:
-    code: list[MatchInstruction]
+    table: parser.ParseTable
    indent_amounts: dict[str, int]
-    def __init__(self):
+    def __init__(self, table: parser.ParseTable, indent_amounts):
-        self.code = []
+        self.table = table
-
+        self.indent_amounts = indent_amounts
    @dataclasses.dataclass
    class ThreadState:
        pc: int
        position: int
        count: int
        results: list[Document | StartGroup | StartIndent]
    def match(self, printer: "Printer", items: list[runtime.Tree | runtime.TokenValue]) -> Document:
-        threads: list[Matcher.ThreadState] = [
+        stack: list[tuple[int, Document]] = [(0, None)]
-            Matcher.ThreadState(pc=0, position=0, results=[], count=0)
+        table = self.table
        input = [(child_to_name(i), i) for i in items] + [
            ("$", runtime.TokenValue(kind="$", start=0, end=0))
        ]
        input_index = 0
-        while len(threads) > 0:
+        while True:
-            thread = threads.pop()
+            current_token = input[input_index]
-            results = thread.results
+            current_state = stack[-1][0]
-            while True:
+            action = table.actions[current_state].get(current_token[0], parser.Error())
                thread.count += 1
                if thread.count > 1000:
                    raise Exception("Too many steps!")
-                inst = self.code[thread.pc]
+            # print(
-                print(f"THREAD: {thread.pc}: {inst} ({thread.position})")
+            #     "{stack: <30} {input: <15} {action: <5}".format(
-                match inst:
+            #         stack=repr([s[0] for s in stack[-5:]]),
-                    case MatchTerminal(name):
+            #         input=current_token[0],
-                        if thread.position >= len(items):
+            #         action=repr(action),
-                            break
+            #     )
            # )
-                        item = items[thread.position]
+            match action:
-                        if not isinstance(item, runtime.TokenValue):
+                case parser.Accept():
-                            break
+                    return stack[-1][1]
-                        if item.kind != name:
+                case parser.Reduce(name=name, count=size):
-                            break
+                    child: Document = None
                    if size > 0:
                        for _, c in stack[-size:]:
                            if c is None:
                                continue
                            child = cons(child, c)
                        del stack[-size:]
-                        results.append(Text(item.start, item.end))
+                    if name[0] == "g":
-                        thread.pc += 1
+                        child = Group(child)
                        thread.position += 1
-                    case MatchNonTerminal(name):
+                    elif name[0] == "i":
-                        if thread.position >= len(items):
+                        amount = self.indent_amounts[name]
-                            break
+                        child = Indent(amount, child)
-                        item = items[thread.position]
+                    elif name[0] == "n":
-                        if not isinstance(item, runtime.Tree):
+                        child = cons(child, NewLine())
                            break
-                        if item.name != name:
+                    elif name[0] == "p":
-                            break
+                        child = cons(NewLine(), child)
-                        def thunk(capture: runtime.Tree):
+                    else:
-                            return lambda: printer.convert_tree_to_document(capture)
+                        pass  # ???
-                        results.append(Lazy(thunk(item)))
+                    goto = self.table.gotos[stack[-1][0]].get(name)
-                        thread.pc += 1
+                    assert goto is not None
-                        thread.position += 1
+                    stack.append((goto, child))
-                    case Accept():
+                case parser.Shift():
-                        if thread.position != len(items):
+                    value = current_token[1]
-                            break
+                    if isinstance(value, runtime.Tree):
                        child = Lazy.from_tree(value, printer)
                    else:
                        child = Text(value.start, value.end)
-                        result = None
+                    stack.append((action.state, child))
-                        for r in thread.results:
+                    input_index += 1
                            assert not isinstance(r, (StartGroup, StartIndent))
                            result = cons(result, r)
                        return result
-                    case StartGroup():
+                case parser.Error():
-                        results.append(inst)
+                    raise Exception("How did I get a parse error here??")
                        thread.pc += 1
                    case EndGroup():
                        group_items = None
                        while not isinstance(results[-1], StartGroup):
                            item = typing.cast(Document, results.pop())
                            group_items = cons(item, group_items)
                        results.pop()
                        results.append(Group(group_items))
                        thread.pc += 1
                    case NewLine():
                        results.append(NewLine())
                        thread.pc += 1
                    case StartIndent():
                        results.append(inst)
                        thread.pc += 1
                    case EndIndent(amount):
                        indent_items = None
                        while not isinstance(results[-1], StartIndent):
                            item = typing.cast(Document, results.pop())
                            indent_items = cons(item, indent_items)
                        results.pop()
                        results.append(Indent(amount, indent_items))
                        thread.pc += 1
                    case Split(left, right):
                        new_thread = Matcher.ThreadState(
                            pc=right,
                            position=thread.position,
                            results=list(thread.results),
                            count=0,
                        )
                        threads.append(new_thread)
                        thread.pc = left
                    case Jump(where):
                        thread.pc = where
                        threads.append(thread)
                    case _:
                        typing.assert_never(inst)
        return None
    def format(self) -> str:
        return "\n".join(self.format_lines())
    def format_lines(self) -> list[str]:
        lines = []
        code_len = int(math.log10(len(self.code))) + 1
        for i, inst in enumerate(self.code):
            lines.append(f"{i: >{code_len}} {inst}")
        return lines
    @abc.abstractmethod
    def format_into(self, lines: list[str], visited: dict["Matcher", int], indent: int = 0): ...
 class PrettyMeta(parser.SyntaxMeta):
@ -302,74 +184,86 @@ class Printer:
        return self._nonterminals[name]
    def compile_rule(self, rule: parser.NonTerminal) -> Matcher:
-        matcher = Matcher()
+        generated_grammar: list[typing.Tuple[str, list[str]]] = []
-        code = matcher.code
+        visited: set[str] = set()
-        patcher: dict[str, int] = {}
+        group_count = 0
        indent_amounts: dict[str, int] = {}
        done_newline = False
-        def compile_nonterminal(rule: parser.NonTerminal):
+        def compile_nonterminal(name: str, rule: parser.NonTerminal):
-            sub_start = patcher.get(rule.name)
+            if name not in visited:
-            if sub_start is not None:
+                visited.add(name)
-                code.append(Jump(sub_start))
+                for production in rule.fn(self.grammar).flatten(with_metadata=True):
-            else:
+                    trans_prod = compile_production(production)
-                sub_start = len(code)
+                    generated_grammar.append((name, trans_prod))
                patcher[rule.name] = sub_start
                tails = []
                subs = list(rule.fn(self.grammar).flatten(with_metadata=True))
                for sub in subs[:-1]:
                    split_pos = len(code)
                    code.append(Split(0, 0))
-                    compile_production(sub)
+        def compile_production(production: parser.FlattenedWithMetadata) -> list[str]:
            nonlocal group_count
            nonlocal indent_amounts
            nonlocal done_newline
-                    tails.append(len(code))
+            result = []
                    code.append(Jump(0))
                    code[split_pos] = Split(sub_start + 1, len(code))
                    sub_start = len(code)
                compile_production(subs[-1])
                for tail in tails:
                    code[tail] = Jump(len(code))
        def compile_production(production: parser.FlattenedWithMetadata):
            for item in production:
                if isinstance(item, str):
-                    rule = self.lookup_nonterminal(item)
+                    nt = self._nonterminals[item]
-                    if rule.transparent:
+                    if nt.transparent:
-                        # If it's transparent then we need to inline the pattern here.
+                        # If it's transparent then we make a new set of
-                        compile_nonterminal(rule)
+                        # productions that covers the contents of the
                        # transparent nonterminal.
                        name = "xxx_" + nt.name
                        compile_nonterminal(name, nt)
                        result.append(name)
                    else:
-                        code.append(MatchNonTerminal(item))
+                        # Otherwise it's a "token" in our input, named
                        # "tree_{whatever}".
                        result.append(f"tree_{item}")
                elif isinstance(item, parser.Terminal):
-                    name = item.name
+                    # If it's a terminal it will appear in our input as
-                    assert name is not None
+                    # "token_{whatever}".
-                    code.append(MatchTerminal(name))
+                    result.append(f"token_{item.name}")
                else:
                    meta, children = item
                    tx_children = compile_production(children)
-                    prettier = meta.get("prettier")
+                    pretty = meta.get("prettier")
-                    if isinstance(prettier, PrettyMeta):
+                    if isinstance(pretty, PrettyMeta):
-                        if prettier.indent:
+                        if pretty.group:
-                            code.append(StartIndent())
+                            # Make a fake rule.
-                        if prettier.group:
+                            rule_name = f"g_{group_count}"
-                            code.append(StartGroup())
+                            group_count += 1
                            generated_grammar.append((rule_name, tx_children))
                            tx_children = [rule_name]
-                    compile_production(children)
+                        if pretty.indent:
                            rule_name = f"i_{len(indent_amounts)}"
                            indent_amounts[rule_name] = pretty.indent
                            generated_grammar.append((rule_name, tx_children))
                            tx_children = [rule_name]
-                    if isinstance(prettier, PrettyMeta):
+                        if pretty.newline:
-                        if prettier.group:
+                            if not done_newline:
-                            code.append(EndGroup())
+                                generated_grammar.append(("newline", []))
-                        if prettier.indent:
+                                done_newline = True
-                            code.append(EndIndent(prettier.indent))
+                            tx_children.append("newline")
                        if prettier.newline:
                            code.append(NewLine())
-        compile_nonterminal(rule)
+                    # If it turned out to have formatting meta then we will
-        code.append(Accept())
+                    # have replaced or augmented the translated children
-        return matcher
+                    # appropriately. Otherwise, if it's highlighting meta or
                    # something else, we'll have ignored it and the
                    # translated children should just be inserted inline.
                    result.extend(tx_children)
            return result
        compile_nonterminal(rule.name, rule)
        gen = self.grammar._generator(rule.name, generated_grammar)
        parse_table = gen.gen_table()
        # print(parse_table.format())
        return Matcher(parse_table, indent_amounts)
    def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher:
        result = self._matchers.get(rule.name)
@ -385,15 +279,11 @@ class Printer:
        rule = self.lookup_nonterminal(name)
        matcher = self.rule_to_matcher(rule)
        print(f"--------")
        print(f"Matching with:\n{matcher.format()}")
        m = matcher.match(self, list(tree.children))
        print(f"--------")
        if m is None:
            raise ValueError(
-                f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}\nMatcher:\n{matcher.format()}"
+                f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}"
            )
        # return m
        return resolve_document(m)
    def format_tree(self, tree: runtime.Tree) -> str: