[wadler] Refactor: data and runtime split

Now we convert the grammar into data for a pretty-printer, so in theory we could write the pretty-printer in a different language.
2024-09-21 06:44:53 -07:00 · 2024-09-21 06:44:53 -07:00 · 1f84752538
commit 1f84752538
parent e4585170d8
2 changed files with 252 additions and 215 deletions
--- a/parser/wadler.py
+++ b/parser/wadler.py
@ -360,13 +360,243 @@ def slice_pre_post_trivia(
    return ([], tokens)


+############################################################################
+# Data to Drive the Pretty Printer
+############################################################################
+
+
@dataclasses.dataclass
-class Matcher:
+class MatcherTable:
+    """Information necessary to create a document from a concrete parse tree,
+    as generated by the parser.
+
+    (In order to do this we need to re-parse the children of the tree, in
+    order to recover structure added by transparent rules. That's why each
+    MatcherTable has an associated ParseTable!)
+    """
+
+    # Parse table to recover the node into a document
    table: parser.ParseTable
+    # Mapping from the name of i_ rules to indent counts
    indent_amounts: dict[str, int]
+    # Mapping from the names of n_ rules to the text they flatten to
    newline_replace: dict[str, str]
+
+
+def _compile_nonterminal_matcher(
+    grammar: parser.Grammar,
+    nonterminals: dict[str, parser.NonTerminal],
+    rule: parser.NonTerminal,
+) -> MatcherTable:
+    generated_grammar: list[typing.Tuple[str, list[str]]] = []
+    visited: set[str] = set()
+
+    # In order to generate groups, indents, and newlines we need to
+    # synthesize new productions. And it happens sometimes that we get
+    # duplicates, repeated synthetic productions. It's important to
+    # de-duplicate productions, otherwise we'll wind up with ambiguities
+    # in the parser.
+    #
+    # These dictionaries track the synthetic rules: the keys are
+    # production and also the parameter (if any), and the values are the
+    # names of the productions that produce the effect.
+    #
+    groups: dict[tuple[str, ...], str] = {}
+    indents: dict[tuple[tuple[str, ...], int], str] = {}
+    newlines: dict[tuple[tuple[str, ...], str], str] = {}
+    prefix_count: int = 0
+
+    final_newlines: dict[str, str] = {}
+
+    def compile_nonterminal(name: str, rule: parser.NonTerminal):
+        if name not in visited:
+            visited.add(name)
+            for production in rule.fn(grammar).flatten(with_metadata=True):
+                trans_prod = compile_production(production)
+                generated_grammar.append((name, trans_prod))
+
+    def compile_production(production: parser.FlattenedWithMetadata) -> list[str]:
+        nonlocal groups
+        nonlocal indents
+        nonlocal newlines
+        nonlocal prefix_count
+        nonlocal final_newlines
+
+        prefix_stack: list[str] = []
+
+        result = []
+        for item in production:
+            if isinstance(item, str):
+                nt = nonterminals[item]
+                if nt.transparent:
+                    # If it's transparent then we make a new set of
+                    # productions that covers the contents of the
+                    # transparent nonterminal.
+                    name = "xxx_" + nt.name
+                    compile_nonterminal(name, nt)
+                    result.append(name)
+                else:
+                    # Otherwise it's a "token" in our input, named
+                    # "tree_{whatever}".
+                    result.append(f"tree_{item}")
+
+            elif isinstance(item, parser.Terminal):
+                # If it's a terminal it will appear in our input as
+                # "token_{whatever}".
+                result.append(f"token_{item.name}")
+
+            else:
+                meta, children = item
+                tx_children = compile_production(children)
+
+                pretty = meta.get("format")
+                if isinstance(pretty, parser.FormatMeta):
+                    if pretty.group:
+                        # Make a fake rule.
+                        child_key = tuple(tx_children)
+                        rule_name = groups.get(child_key)
+                        if rule_name is None:
+                            rule_name = f"g_{len(groups)}"
+                            groups[child_key] = rule_name
+                            generated_grammar.append((rule_name, tx_children))
+
+                        tx_children = [rule_name]
+
+                    if pretty.indent:
+                        child_key = (tuple(tx_children), pretty.indent)
+                        rule_name = indents.get(child_key)
+                        if rule_name is None:
+                            rule_name = f"i_{len(indents)}"
+                            indents[child_key] = rule_name
+                            generated_grammar.append((rule_name, tx_children))
+
+                        tx_children = [rule_name]
+
+                    if pretty.newline is not None:
+                        if len(tx_children) == 0:
+                            tx_children = result
+                            result = []
+
+                        if len(tx_children) > 0:
+                            # n == postfix newline
+                            child_key = (tuple(tx_children), pretty.newline)
+                            rule_name = newlines.get(child_key)
+                            if rule_name is None:
+                                rule_name = f"n_{len(newlines)}"
+                                newlines[child_key] = rule_name
+                                generated_grammar.append((rule_name, tx_children))
+
+                            tx_children = [rule_name]
+
+                        else:
+                            # p == prefix newline
+                            rule_name = f"p_{prefix_count}"
+                            prefix_count += 1
+                            final_newlines[rule_name] = pretty.newline
+                            prefix_stack.append(rule_name)
+
+                    if pretty.forced_break:
+                        if len(tx_children) == 0:
+                            tx_children = result
+                            result = []
+
+                        if len(tx_children) > 0:
+                            # f == postfix forced break
+                            rule_name = f"f_{prefix_count}"
+                            prefix_count += 1
+
+                            generated_grammar.append((rule_name, tx_children))
+                            tx_children = [rule_name]
+                        else:
+                            # d == prefix forced break (to the right of 'f' on my kbd)
+                            rule_name = f"d_{prefix_count}"
+                            prefix_count += 1
+                            prefix_stack.append(rule_name)
+
+                # If it turned out to have formatting meta then we will
+                # have replaced or augmented the translated children
+                # appropriately. Otherwise, if it's highlighting meta or
+                # something else, we'll have ignored it and the
+                # translated children should just be inserted inline.
+                result.extend(tx_children)
+
+        # OK so we might have some prefix newlines. They should contain... things.
+        while len(prefix_stack) > 0:
+            rule_name = prefix_stack.pop()
+            generated_grammar.append((rule_name, result))
+            result = [rule_name]
+
+        return result
+
+    start_name = f"yyy_{rule.name}"
+    compile_nonterminal(start_name, rule)
+    gen = grammar._generator(start_name, generated_grammar)
+    parse_table = gen.gen_table()
+
+    for (_, replacement), rule_name in newlines.items():
+        final_newlines[rule_name] = replacement
+
+    indent_amounts = {rule_name: amount for ((_, amount), rule_name) in indents.items()}
+
+    return MatcherTable(
+        parse_table,
+        indent_amounts,
+        final_newlines,
+    )
+
+
+@dataclasses.dataclass
+class PrettyTable:
+    """Information necessary to convert a parsed tree into a wadler-style
+    pretty document, where it can then be formatted.
+
+    This is basically a bunch of "MatcherTables", one for each kind of tree,
+    that tell us how to recover document structure from the tree node.
+    """
+
+    indent: str
+    trivia_modes: dict[str, parser.TriviaMode]
+    matchers: dict[str, MatcherTable]
+
+
+def compile_pretty_table(grammar: parser.Grammar, indent: str | None = None) -> PrettyTable:
+    nonterminals = {nt.name: nt for nt in grammar.non_terminals()}
+    matchers = {}
+
+    if indent is None:
+        indent = getattr(grammar, "pretty_indent", None)
+    if indent is None:
+        indent = " "
+
+    trivia_mode = {}
+    for t in grammar.terminals():
+        mode = t.meta.get("trivia_mode")
+        if t.name is not None and isinstance(mode, parser.TriviaMode):
+            trivia_mode[t.name] = mode
+
+    for name, rule in nonterminals.items():
+        matchers[name] = _compile_nonterminal_matcher(grammar, nonterminals, rule)
+
+    return PrettyTable(
+        indent,
+        trivia_mode,
+        matchers,
+    )
+
+
+############################################################################
+# The Actual Pretty Printer
+############################################################################
+
+
+class Matcher:
+    table: MatcherTable
    trivia_mode: dict[str, parser.TriviaMode]

+    def __init__(self, table: MatcherTable, trivia_mode: dict[str, parser.TriviaMode]):
+        self.table = table
+        self.trivia_mode = trivia_mode
+
    def match(
        self,
        printer: "Printer",
@ -374,7 +604,7 @@ class Matcher:
        src: str,
    ) -> Document:
        stack: list[tuple[int, Document]] = [(0, None)]
-        table = self.table
+        table = self.table.table

        # eof_trivia = []
        # if len(items) > 0:
@ -420,15 +650,15 @@ class Matcher:
                        child = group(child)

                    elif name[0] == "i":
-                        amount = self.indent_amounts[name]
+                        amount = self.table.indent_amounts[name]
                        child = Indent(amount, child)

                    elif name[0] == "n":
-                        replace = self.newline_replace[name]
+                        replace = self.table.newline_replace[name]
                        child = cons(child, NewLine(replace))

                    elif name[0] == "p":
-                        replace = self.newline_replace[name]
+                        replace = self.table.newline_replace[name]
                        child = cons(NewLine(replace), child)

                    elif name[0] == "f":
@ -464,8 +694,6 @@ class Matcher:

    def apply_pre_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document:
        pre_trivia, _ = slice_pre_post_trivia(self.trivia_mode, trivia_tokens)
-        # print(f"PRE:\n{pre_trivia}")
-
        if len(pre_trivia) == 0:
            return None

@ -474,7 +702,6 @@ class Matcher:
        trivia_doc = None
        new_line_count = 0
        for mode, token in pre_trivia:
-            # print(f"PRE  {mode:25} {token.kind:30} ({new_line_count})")
            match mode:
                case parser.TriviaMode.LineComment:
                    trivia_doc = cons(
@ -509,7 +736,6 @@ class Matcher:

        trivia_doc = None
        for mode, token in post_trivia:
-            # print(f"POST {mode:25} {token.kind:30}")
            match mode:
                case parser.TriviaMode.Blank:
                    pass
@ -573,210 +799,23 @@ class Matcher:


 class Printer:
-    # TODO: Pre-generate the matcher tables for a grammar, to make it
-    #       possible to do codegen in other languages.
-    grammar: parser.Grammar
-    _matchers: dict[str, Matcher]
-    _nonterminals: dict[str, parser.NonTerminal]
-    _indent: str
-    _trivia_mode: dict[str, parser.TriviaMode]
+    table: PrettyTable
+    matchers: dict[str, Matcher]

-    def __init__(self, grammar: parser.Grammar, indent: str | None = None):
-        self.grammar = grammar
-        self._nonterminals = {nt.name: nt for nt in grammar.non_terminals()}
-        self._matchers = {}
-
-        if indent is None:
-            indent = getattr(self.grammar, "pretty_indent", None)
-        if indent is None:
-            indent = " "
-        self._indent = indent
-
-        trivia_mode = {}
-        for t in grammar.terminals():
-            mode = t.meta.get("trivia_mode")
-            if t.name is not None and isinstance(mode, parser.TriviaMode):
-                trivia_mode[t.name] = mode
-        self._trivia_mode = trivia_mode
+    def __init__(self, table: PrettyTable):
+        self.table = table
+        self.matchers = {
+            name: Matcher(value, self.table.trivia_modes) for name, value in table.matchers.items()
+        }

    def indent(self) -> str:
-        return self._indent
-
-    def lookup_nonterminal(self, name: str) -> parser.NonTerminal:
-        return self._nonterminals[name]
-
-    def compile_rule(self, rule: parser.NonTerminal) -> Matcher:
-        generated_grammar: list[typing.Tuple[str, list[str]]] = []
-        visited: set[str] = set()
-
-        # In order to generate groups, indents, and newlines we need to
-        # synthesize new productions. And it happens sometimes that we get
-        # duplicates, repeated synthetic productions. It's important to
-        # de-duplicate productions, otherwise we'll wind up with ambiguities
-        # in the parser.
-        #
-        # These dictionaries track the synthetic rules: the keys are
-        # production and also the parameter (if any), and the values are the
-        # names of the productions that produce the effect.
-        #
-        groups: dict[tuple[str, ...], str] = {}
-        indents: dict[tuple[tuple[str, ...], int], str] = {}
-        newlines: dict[tuple[tuple[str, ...], str], str] = {}
-        prefix_count: int = 0
-
-        final_newlines: dict[str, str] = {}
-
-        def compile_nonterminal(name: str, rule: parser.NonTerminal):
-            if name not in visited:
-                visited.add(name)
-                for production in rule.fn(self.grammar).flatten(with_metadata=True):
-                    trans_prod = compile_production(production)
-                    generated_grammar.append((name, trans_prod))
-
-        def compile_production(production: parser.FlattenedWithMetadata) -> list[str]:
-            nonlocal groups
-            nonlocal indents
-            nonlocal newlines
-            nonlocal prefix_count
-            nonlocal final_newlines
-
-            prefix_stack: list[str] = []
-
-            result = []
-            for item in production:
-                if isinstance(item, str):
-                    nt = self._nonterminals[item]
-                    if nt.transparent:
-                        # If it's transparent then we make a new set of
-                        # productions that covers the contents of the
-                        # transparent nonterminal.
-                        name = "xxx_" + nt.name
-                        compile_nonterminal(name, nt)
-                        result.append(name)
-                    else:
-                        # Otherwise it's a "token" in our input, named
-                        # "tree_{whatever}".
-                        result.append(f"tree_{item}")
-
-                elif isinstance(item, parser.Terminal):
-                    # If it's a terminal it will appear in our input as
-                    # "token_{whatever}".
-                    result.append(f"token_{item.name}")
-
-                else:
-                    meta, children = item
-                    tx_children = compile_production(children)
-
-                    pretty = meta.get("format")
-                    if isinstance(pretty, parser.FormatMeta):
-                        if pretty.group:
-                            # Make a fake rule.
-                            child_key = tuple(tx_children)
-                            rule_name = groups.get(child_key)
-                            if rule_name is None:
-                                rule_name = f"g_{len(groups)}"
-                                groups[child_key] = rule_name
-                                generated_grammar.append((rule_name, tx_children))
-
-                            tx_children = [rule_name]
-
-                        if pretty.indent:
-                            child_key = (tuple(tx_children), pretty.indent)
-                            rule_name = indents.get(child_key)
-                            if rule_name is None:
-                                rule_name = f"i_{len(indents)}"
-                                indents[child_key] = rule_name
-                                generated_grammar.append((rule_name, tx_children))
-
-                            tx_children = [rule_name]
-
-                        if pretty.newline is not None:
-                            if len(tx_children) == 0:
-                                tx_children = result
-                                result = []
-
-                            if len(tx_children) > 0:
-                                # n == postfix newline
-                                child_key = (tuple(tx_children), pretty.newline)
-                                rule_name = newlines.get(child_key)
-                                if rule_name is None:
-                                    rule_name = f"n_{len(newlines)}"
-                                    newlines[child_key] = rule_name
-                                    generated_grammar.append((rule_name, tx_children))
-
-                                tx_children = [rule_name]
-
-                            else:
-                                # p == prefix newline
-                                rule_name = f"p_{prefix_count}"
-                                prefix_count += 1
-                                final_newlines[rule_name] = pretty.newline
-                                prefix_stack.append(rule_name)
-
-                        if pretty.forced_break:
-                            if len(tx_children) == 0:
-                                tx_children = result
-                                result = []
-
-                            if len(tx_children) > 0:
-                                # f == postfix forced break
-                                rule_name = f"f_{prefix_count}"
-                                prefix_count += 1
-
-                                generated_grammar.append((rule_name, tx_children))
-                                tx_children = [rule_name]
-                            else:
-                                # d == prefix forced break (to the right of 'f' on my kbd)
-                                rule_name = f"d_{prefix_count}"
-                                prefix_count += 1
-                                prefix_stack.append(rule_name)
-
-                    # If it turned out to have formatting meta then we will
-                    # have replaced or augmented the translated children
-                    # appropriately. Otherwise, if it's highlighting meta or
-                    # something else, we'll have ignored it and the
-                    # translated children should just be inserted inline.
-                    result.extend(tx_children)
-
-            # OK so we might have some prefix newlines. They should contain... things.
-            while len(prefix_stack) > 0:
-                rule_name = prefix_stack.pop()
-                generated_grammar.append((rule_name, result))
-                result = [rule_name]
-
-            return result
-
-        start_name = f"yyy_{rule.name}"
-        compile_nonterminal(start_name, rule)
-        gen = self.grammar._generator(start_name, generated_grammar)
-        parse_table = gen.gen_table()
-
-        for (_, replacement), rule_name in newlines.items():
-            final_newlines[rule_name] = replacement
-
-        indent_amounts = {rule_name: amount for ((_, amount), rule_name) in indents.items()}
-
-        return Matcher(
-            parse_table,
-            indent_amounts,
-            final_newlines,
-            self._trivia_mode,
-        )
-
-    def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher:
-        result = self._matchers.get(rule.name)
-        if result is None:
-            result = self.compile_rule(rule)
-            self._matchers[rule.name] = result
-
-        return result
+        return self.table.indent

    def convert_tree_to_document(self, tree: runtime.Tree, src: str) -> Document:
        name = tree.name
        assert name is not None, "Cannot format a tree if it still has transparent nodes inside"

-        rule = self.lookup_nonterminal(name)
-        matcher = self.rule_to_matcher(rule)
+        matcher = self.matchers[name]
        m = matcher.match(self, list(tree.children), src)
        if m is None:
            raise ValueError(
@ -786,4 +825,4 @@ class Printer:

    def format_tree(self, tree: runtime.Tree, src: str, width: int) -> DocumentLayout:
        doc = self.convert_tree_to_document(tree, src)
-        return layout_document(doc, width, self._indent)
+        return layout_document(doc, width, self.table.indent)
--- a/tests/test_wadler.py
+++ b/tests/test_wadler.py
@ -149,7 +149,7 @@ def test_convert_tree_to_document():
    assert [] == errors
    assert tree is not None

-    printer = wadler.Printer(JSON)
+    printer = wadler.Printer(wadler.compile_pretty_table(JSON))
    doc = flatten_document(printer.convert_tree_to_document(tree, text), text)

    assert doc == [
@ -216,7 +216,7 @@ def test_layout_basic():
    assert [] == errors
    assert tree is not None

-    printer = wadler.Printer(JSON)
+    printer = wadler.Printer(wadler.compile_pretty_table(JSON))
    result = printer.format_tree(tree, text, 50).apply_to_source(text)

    assert result == _output(
@ -278,7 +278,7 @@ def test_forced_break():
    assert errors == []
    assert tree is not None

-    printer = wadler.Printer(g)
+    printer = wadler.Printer(wadler.compile_pretty_table(g))
    result = printer.format_tree(tree, text, 200).apply_to_source(text)

    assert result == _output(
@ -318,7 +318,7 @@ def test_maintaining_line_breaks():
    assert errors == []
    assert tree is not None

-    printer = wadler.Printer(g)
+    printer = wadler.Printer(wadler.compile_pretty_table(g))
    result = printer.format_tree(tree, text, 200).apply_to_source(text)

    assert result == _output(
@ -352,7 +352,7 @@ def test_trailing_trivia():
    assert errors == []
    assert tree is not None

-    printer = wadler.Printer(g)
+    printer = wadler.Printer(wadler.compile_pretty_table(g))
    result = printer.format_tree(tree, text, 200).apply_to_source(text)

    assert result == _output(
@ -378,7 +378,7 @@ def test_trailing_trivia_two():
    assert errors == []
    assert tree is not None

-    printer = wadler.Printer(g)
+    printer = wadler.Printer(wadler.compile_pretty_table(g))
    result = printer.format_tree(tree, text, 200).apply_to_source(text)

    assert result == _output(
@ -432,9 +432,7 @@ def test_trailing_trivia_split():
        print(f"{mode:25} {t.kind:10}  {repr(text[t.start:t.end])}")

    trivia_doc = wadler.Matcher(
-        ParseTable([], [], set()),
-        {},
-        {},
+        wadler.MatcherTable(ParseTable([], [], set()), {}, {}),
        TRIVIA_MODES,
    ).apply_post_trivia(
        token.post_trivia,