[wadler] Refactor: data and runtime split

Now we convert the grammar into data for a pretty-printer, so in theory we could write the pretty-printer in a different language.
2024-09-21 06:44:53 -07:00 · 2024-09-21 06:44:53 -07:00 · 1f84752538
commit 1f84752538
parent e4585170d8
2 changed files with 252 additions and 215 deletions
--- a/parser/wadler.py
+++ b/parser/wadler.py
@ -360,252 +360,34 @@ def slice_pre_post_trivia(
    return ([], tokens)


+############################################################################
+# Data to Drive the Pretty Printer
+############################################################################
+
+
@dataclasses.dataclass
-class Matcher:
+class MatcherTable:
+    """Information necessary to create a document from a concrete parse tree,
+    as generated by the parser.
+
+    (In order to do this we need to re-parse the children of the tree, in
+    order to recover structure added by transparent rules. That's why each
+    MatcherTable has an associated ParseTable!)
+    """
+
+    # Parse table to recover the node into a document
    table: parser.ParseTable
+    # Mapping from the name of i_ rules to indent counts
    indent_amounts: dict[str, int]
+    # Mapping from the names of n_ rules to the text they flatten to
    newline_replace: dict[str, str]
-    trivia_mode: dict[str, parser.TriviaMode]
-
-    def match(
-        self,
-        printer: "Printer",
-        items: list[runtime.Tree | runtime.TokenValue],
-        src: str,
-    ) -> Document:
-        stack: list[tuple[int, Document]] = [(0, None)]
-        table = self.table
-
-        # eof_trivia = []
-        # if len(items) > 0:
-        #     item = items[-1]
-        #     if isinstance(item, runtime.TokenValue):
-        #         eof_trivia = item.post_trivia
-
-        input = [(child_to_name(i), i) for i in items] + [
-            (
-                "$",
-                runtime.TokenValue(
-                    kind="$",
-                    start=0,
-                    end=0,
-                    pre_trivia=[],
-                    post_trivia=[],
-                ),
-            )
-        ]
-        input_index = 0
-
-        while True:
-            current_token = input[input_index]
-            current_state = stack[-1][0]
-            action = table.actions[current_state].get(current_token[0], parser.Error())
-
-            match action:
-                case parser.Accept():
-                    result = stack[-1][1]
-                    # result = cons(result, self.apply_trivia(eof_trivia))
-                    return result
-
-                case parser.Reduce(name=name, count=size):
-                    child: Document = None
-                    if size > 0:
-                        for _, c in stack[-size:]:
-                            if c is None:
-                                continue
-                            child = cons(child, c)
-                        del stack[-size:]
-
-                    if name[0] == "g":
-                        child = group(child)
-
-                    elif name[0] == "i":
-                        amount = self.indent_amounts[name]
-                        child = Indent(amount, child)
-
-                    elif name[0] == "n":
-                        replace = self.newline_replace[name]
-                        child = cons(child, NewLine(replace))
-
-                    elif name[0] == "p":
-                        replace = self.newline_replace[name]
-                        child = cons(NewLine(replace), child)
-
-                    elif name[0] == "f":
-                        child = cons(child, ForceBreak(False))
-
-                    elif name[0] == "d":
-                        child = cons(ForceBreak(False), child)
-
-                    else:
-                        pass  # Reducing a transparent rule probably.
-
-                    goto = table.gotos[stack[-1][0]].get(name)
-                    assert goto is not None
-                    stack.append((goto, child))
-
-                case parser.Shift():
-                    value = current_token[1]
-
-                    if isinstance(value, runtime.Tree):
-                        child = Lazy.from_tree(value, src, printer)
-                    else:
-                        child = cons(
-                            trivia(self.apply_pre_trivia(value.pre_trivia, src)),
-                            Literal(src[value.start : value.end]),
-                            trivia(self.apply_post_trivia(value.post_trivia, src)),
-                        )
-
-                    stack.append((action.state, child))
-                    input_index += 1
-
-                case parser.Error():
-                    raise Exception("How did I get a parse error here??")
-
-    def apply_pre_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document:
-        pre_trivia, _ = slice_pre_post_trivia(self.trivia_mode, trivia_tokens)
-        # print(f"PRE:\n{pre_trivia}")
-
-        if len(pre_trivia) == 0:
-            return None
-
-        at_start_of_file = pre_trivia[0][1].start == 0
-
-        trivia_doc = None
-        new_line_count = 0
-        for mode, token in pre_trivia:
-            # print(f"PRE  {mode:25} {token.kind:30} ({new_line_count})")
-            match mode:
-                case parser.TriviaMode.LineComment:
-                    trivia_doc = cons(
-                        trivia_doc,
-                        Literal(src[token.start : token.end]),
-                        ForceBreak(False),
-                    )
-                    new_line_count = 0  # There will be a newline after this.
-                    at_start_of_file = False
-
-                case parser.TriviaMode.Blank:
-                    pass
-
-                case parser.TriviaMode.NewLine:
-                    new_line_count += 1
-                    if new_line_count == 2 and not at_start_of_file:
-                        trivia_doc = cons(
-                            trivia_doc,
-                            ForceBreak(False),
-                        )
-
-                case _:
-                    typing.assert_never(mode)
-
-        return trivia_doc
-
-    def apply_post_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document:
-        if len(trivia_tokens) > 0 and trivia_tokens[-1].end == len(src):
-            return self.apply_eof_trivia(trivia_tokens, src)
-
-        _, post_trivia = slice_pre_post_trivia(self.trivia_mode, trivia_tokens)
-
-        trivia_doc = None
-        for mode, token in post_trivia:
-            # print(f"POST {mode:25} {token.kind:30}")
-            match mode:
-                case parser.TriviaMode.Blank:
-                    pass
-
-                case parser.TriviaMode.NewLine:
-                    # Anything after a line break is not processed as post
-                    # trivia.
-                    break
-
-                case parser.TriviaMode.LineComment:
-                    # Because this is post-trivia, we know there's something
-                    # to our left, and we can force the space.
-                    trivia_doc = cons(
-                        Literal(" "),
-                        Literal(src[token.start : token.end]),
-                        ForceBreak(True),  # And the line needs to end.
-                    )
-                    break
-
-                case _:
-                    typing.assert_never(mode)
-
-        return trivia_doc
-
-    def apply_eof_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document:
-        # EOF trivia has weird rules, namely, it's like pre and post joined together but.
-        tokens = [
-            (self.trivia_mode.get(token.kind, parser.TriviaMode.Blank), token)
-            for token in trivia_tokens
-        ]
-
-        at_start = True
-        newline_count = 0
-        trivia_doc = None
-        for mode, token in tokens:
-            match mode:
-                case parser.TriviaMode.Blank:
-                    pass
-
-                case parser.TriviaMode.NewLine:
-                    at_start = False
-                    newline_count += 1
-                    if newline_count <= 2:
-                        trivia_doc = cons(trivia_doc, ForceBreak(False))
-
-                case parser.TriviaMode.LineComment:
-                    # Because this is post-trivia, we know there's something
-                    # to our left, and we can force the space.
-                    trivia_doc = cons(
-                        trivia_doc,
-                        Literal(" ") if at_start else None,
-                        Literal(src[token.start : token.end]),
-                    )
-                    newline_count = 0
-                    at_start = False
-
-                case _:
-                    typing.assert_never(mode)
-
-        return trivia_doc


-class Printer:
-    # TODO: Pre-generate the matcher tables for a grammar, to make it
-    #       possible to do codegen in other languages.
-    grammar: parser.Grammar
-    _matchers: dict[str, Matcher]
-    _nonterminals: dict[str, parser.NonTerminal]
-    _indent: str
-    _trivia_mode: dict[str, parser.TriviaMode]
-
-    def __init__(self, grammar: parser.Grammar, indent: str | None = None):
-        self.grammar = grammar
-        self._nonterminals = {nt.name: nt for nt in grammar.non_terminals()}
-        self._matchers = {}
-
-        if indent is None:
-            indent = getattr(self.grammar, "pretty_indent", None)
-        if indent is None:
-            indent = " "
-        self._indent = indent
-
-        trivia_mode = {}
-        for t in grammar.terminals():
-            mode = t.meta.get("trivia_mode")
-            if t.name is not None and isinstance(mode, parser.TriviaMode):
-                trivia_mode[t.name] = mode
-        self._trivia_mode = trivia_mode
-
-    def indent(self) -> str:
-        return self._indent
-
-    def lookup_nonterminal(self, name: str) -> parser.NonTerminal:
-        return self._nonterminals[name]
-
-    def compile_rule(self, rule: parser.NonTerminal) -> Matcher:
+def _compile_nonterminal_matcher(
+    grammar: parser.Grammar,
+    nonterminals: dict[str, parser.NonTerminal],
+    rule: parser.NonTerminal,
+) -> MatcherTable:
    generated_grammar: list[typing.Tuple[str, list[str]]] = []
    visited: set[str] = set()

@ -629,7 +411,7 @@ class Printer:
    def compile_nonterminal(name: str, rule: parser.NonTerminal):
        if name not in visited:
            visited.add(name)
-                for production in rule.fn(self.grammar).flatten(with_metadata=True):
+            for production in rule.fn(grammar).flatten(with_metadata=True):
                trans_prod = compile_production(production)
                generated_grammar.append((name, trans_prod))

@ -645,7 +427,7 @@ class Printer:
        result = []
        for item in production:
            if isinstance(item, str):
-                    nt = self._nonterminals[item]
+                nt = nonterminals[item]
                if nt.transparent:
                    # If it's transparent then we make a new set of
                    # productions that covers the contents of the
@ -748,7 +530,7 @@ class Printer:

    start_name = f"yyy_{rule.name}"
    compile_nonterminal(start_name, rule)
-        gen = self.grammar._generator(start_name, generated_grammar)
+    gen = grammar._generator(start_name, generated_grammar)
    parse_table = gen.gen_table()

    for (_, replacement), rule_name in newlines.items():
@ -756,27 +538,284 @@ class Printer:

    indent_amounts = {rule_name: amount for ((_, amount), rule_name) in indents.items()}

-        return Matcher(
+    return MatcherTable(
        parse_table,
        indent_amounts,
        final_newlines,
-            self._trivia_mode,
    )

-    def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher:
-        result = self._matchers.get(rule.name)
-        if result is None:
-            result = self.compile_rule(rule)
-            self._matchers[rule.name] = result

+@dataclasses.dataclass
+class PrettyTable:
+    """Information necessary to convert a parsed tree into a wadler-style
+    pretty document, where it can then be formatted.
+
+    This is basically a bunch of "MatcherTables", one for each kind of tree,
+    that tell us how to recover document structure from the tree node.
+    """
+
+    indent: str
+    trivia_modes: dict[str, parser.TriviaMode]
+    matchers: dict[str, MatcherTable]
+
+
+def compile_pretty_table(grammar: parser.Grammar, indent: str | None = None) -> PrettyTable:
+    nonterminals = {nt.name: nt for nt in grammar.non_terminals()}
+    matchers = {}
+
+    if indent is None:
+        indent = getattr(grammar, "pretty_indent", None)
+    if indent is None:
+        indent = " "
+
+    trivia_mode = {}
+    for t in grammar.terminals():
+        mode = t.meta.get("trivia_mode")
+        if t.name is not None and isinstance(mode, parser.TriviaMode):
+            trivia_mode[t.name] = mode
+
+    for name, rule in nonterminals.items():
+        matchers[name] = _compile_nonterminal_matcher(grammar, nonterminals, rule)
+
+    return PrettyTable(
+        indent,
+        trivia_mode,
+        matchers,
+    )
+
+
+############################################################################
+# The Actual Pretty Printer
+############################################################################
+
+
+class Matcher:
+    table: MatcherTable
+    trivia_mode: dict[str, parser.TriviaMode]
+
+    def __init__(self, table: MatcherTable, trivia_mode: dict[str, parser.TriviaMode]):
+        self.table = table
+        self.trivia_mode = trivia_mode
+
+    def match(
+        self,
+        printer: "Printer",
+        items: list[runtime.Tree | runtime.TokenValue],
+        src: str,
+    ) -> Document:
+        stack: list[tuple[int, Document]] = [(0, None)]
+        table = self.table.table
+
+        # eof_trivia = []
+        # if len(items) > 0:
+        #     item = items[-1]
+        #     if isinstance(item, runtime.TokenValue):
+        #         eof_trivia = item.post_trivia
+
+        input = [(child_to_name(i), i) for i in items] + [
+            (
+                "$",
+                runtime.TokenValue(
+                    kind="$",
+                    start=0,
+                    end=0,
+                    pre_trivia=[],
+                    post_trivia=[],
+                ),
+            )
+        ]
+        input_index = 0
+
+        while True:
+            current_token = input[input_index]
+            current_state = stack[-1][0]
+            action = table.actions[current_state].get(current_token[0], parser.Error())
+
+            match action:
+                case parser.Accept():
+                    result = stack[-1][1]
+                    # result = cons(result, self.apply_trivia(eof_trivia))
                    return result

+                case parser.Reduce(name=name, count=size):
+                    child: Document = None
+                    if size > 0:
+                        for _, c in stack[-size:]:
+                            if c is None:
+                                continue
+                            child = cons(child, c)
+                        del stack[-size:]
+
+                    if name[0] == "g":
+                        child = group(child)
+
+                    elif name[0] == "i":
+                        amount = self.table.indent_amounts[name]
+                        child = Indent(amount, child)
+
+                    elif name[0] == "n":
+                        replace = self.table.newline_replace[name]
+                        child = cons(child, NewLine(replace))
+
+                    elif name[0] == "p":
+                        replace = self.table.newline_replace[name]
+                        child = cons(NewLine(replace), child)
+
+                    elif name[0] == "f":
+                        child = cons(child, ForceBreak(False))
+
+                    elif name[0] == "d":
+                        child = cons(ForceBreak(False), child)
+
+                    else:
+                        pass  # Reducing a transparent rule probably.
+
+                    goto = table.gotos[stack[-1][0]].get(name)
+                    assert goto is not None
+                    stack.append((goto, child))
+
+                case parser.Shift():
+                    value = current_token[1]
+
+                    if isinstance(value, runtime.Tree):
+                        child = Lazy.from_tree(value, src, printer)
+                    else:
+                        child = cons(
+                            trivia(self.apply_pre_trivia(value.pre_trivia, src)),
+                            Literal(src[value.start : value.end]),
+                            trivia(self.apply_post_trivia(value.post_trivia, src)),
+                        )
+
+                    stack.append((action.state, child))
+                    input_index += 1
+
+                case parser.Error():
+                    raise Exception("How did I get a parse error here??")
+
+    def apply_pre_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document:
+        pre_trivia, _ = slice_pre_post_trivia(self.trivia_mode, trivia_tokens)
+        if len(pre_trivia) == 0:
+            return None
+
+        at_start_of_file = pre_trivia[0][1].start == 0
+
+        trivia_doc = None
+        new_line_count = 0
+        for mode, token in pre_trivia:
+            match mode:
+                case parser.TriviaMode.LineComment:
+                    trivia_doc = cons(
+                        trivia_doc,
+                        Literal(src[token.start : token.end]),
+                        ForceBreak(False),
+                    )
+                    new_line_count = 0  # There will be a newline after this.
+                    at_start_of_file = False
+
+                case parser.TriviaMode.Blank:
+                    pass
+
+                case parser.TriviaMode.NewLine:
+                    new_line_count += 1
+                    if new_line_count == 2 and not at_start_of_file:
+                        trivia_doc = cons(
+                            trivia_doc,
+                            ForceBreak(False),
+                        )
+
+                case _:
+                    typing.assert_never(mode)
+
+        return trivia_doc
+
+    def apply_post_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document:
+        if len(trivia_tokens) > 0 and trivia_tokens[-1].end == len(src):
+            return self.apply_eof_trivia(trivia_tokens, src)
+
+        _, post_trivia = slice_pre_post_trivia(self.trivia_mode, trivia_tokens)
+
+        trivia_doc = None
+        for mode, token in post_trivia:
+            match mode:
+                case parser.TriviaMode.Blank:
+                    pass
+
+                case parser.TriviaMode.NewLine:
+                    # Anything after a line break is not processed as post
+                    # trivia.
+                    break
+
+                case parser.TriviaMode.LineComment:
+                    # Because this is post-trivia, we know there's something
+                    # to our left, and we can force the space.
+                    trivia_doc = cons(
+                        Literal(" "),
+                        Literal(src[token.start : token.end]),
+                        ForceBreak(True),  # And the line needs to end.
+                    )
+                    break
+
+                case _:
+                    typing.assert_never(mode)
+
+        return trivia_doc
+
+    def apply_eof_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document:
+        # EOF trivia has weird rules, namely, it's like pre and post joined together but.
+        tokens = [
+            (self.trivia_mode.get(token.kind, parser.TriviaMode.Blank), token)
+            for token in trivia_tokens
+        ]
+
+        at_start = True
+        newline_count = 0
+        trivia_doc = None
+        for mode, token in tokens:
+            match mode:
+                case parser.TriviaMode.Blank:
+                    pass
+
+                case parser.TriviaMode.NewLine:
+                    at_start = False
+                    newline_count += 1
+                    if newline_count <= 2:
+                        trivia_doc = cons(trivia_doc, ForceBreak(False))
+
+                case parser.TriviaMode.LineComment:
+                    # Because this is post-trivia, we know there's something
+                    # to our left, and we can force the space.
+                    trivia_doc = cons(
+                        trivia_doc,
+                        Literal(" ") if at_start else None,
+                        Literal(src[token.start : token.end]),
+                    )
+                    newline_count = 0
+                    at_start = False
+
+                case _:
+                    typing.assert_never(mode)
+
+        return trivia_doc
+
+
+class Printer:
+    table: PrettyTable
+    matchers: dict[str, Matcher]
+
+    def __init__(self, table: PrettyTable):
+        self.table = table
+        self.matchers = {
+            name: Matcher(value, self.table.trivia_modes) for name, value in table.matchers.items()
+        }
+
+    def indent(self) -> str:
+        return self.table.indent
+
    def convert_tree_to_document(self, tree: runtime.Tree, src: str) -> Document:
        name = tree.name
        assert name is not None, "Cannot format a tree if it still has transparent nodes inside"

-        rule = self.lookup_nonterminal(name)
-        matcher = self.rule_to_matcher(rule)
+        matcher = self.matchers[name]
        m = matcher.match(self, list(tree.children), src)
        if m is None:
            raise ValueError(
@ -786,4 +825,4 @@ class Printer:

    def format_tree(self, tree: runtime.Tree, src: str, width: int) -> DocumentLayout:
        doc = self.convert_tree_to_document(tree, src)
-        return layout_document(doc, width, self._indent)
+        return layout_document(doc, width, self.table.indent)
--- a/tests/test_wadler.py
+++ b/tests/test_wadler.py
@ -149,7 +149,7 @@ def test_convert_tree_to_document():
    assert [] == errors
    assert tree is not None

-    printer = wadler.Printer(JSON)
+    printer = wadler.Printer(wadler.compile_pretty_table(JSON))
    doc = flatten_document(printer.convert_tree_to_document(tree, text), text)

    assert doc == [
@ -216,7 +216,7 @@ def test_layout_basic():
    assert [] == errors
    assert tree is not None

-    printer = wadler.Printer(JSON)
+    printer = wadler.Printer(wadler.compile_pretty_table(JSON))
    result = printer.format_tree(tree, text, 50).apply_to_source(text)

    assert result == _output(
@ -278,7 +278,7 @@ def test_forced_break():
    assert errors == []
    assert tree is not None

-    printer = wadler.Printer(g)
+    printer = wadler.Printer(wadler.compile_pretty_table(g))
    result = printer.format_tree(tree, text, 200).apply_to_source(text)

    assert result == _output(
@ -318,7 +318,7 @@ def test_maintaining_line_breaks():
    assert errors == []
    assert tree is not None

-    printer = wadler.Printer(g)
+    printer = wadler.Printer(wadler.compile_pretty_table(g))
    result = printer.format_tree(tree, text, 200).apply_to_source(text)

    assert result == _output(
@ -352,7 +352,7 @@ def test_trailing_trivia():
    assert errors == []
    assert tree is not None

-    printer = wadler.Printer(g)
+    printer = wadler.Printer(wadler.compile_pretty_table(g))
    result = printer.format_tree(tree, text, 200).apply_to_source(text)

    assert result == _output(
@ -378,7 +378,7 @@ def test_trailing_trivia_two():
    assert errors == []
    assert tree is not None

-    printer = wadler.Printer(g)
+    printer = wadler.Printer(wadler.compile_pretty_table(g))
    result = printer.format_tree(tree, text, 200).apply_to_source(text)

    assert result == _output(
@ -432,9 +432,7 @@ def test_trailing_trivia_split():
        print(f"{mode:25} {t.kind:10}  {repr(text[t.start:t.end])}")

    trivia_doc = wadler.Matcher(
-        ParseTable([], [], set()),
-        {},
-        {},
+        wadler.MatcherTable(ParseTable([], [], set()), {}, {}),
        TRIVIA_MODES,
    ).apply_post_trivia(
        token.post_trivia,