[wadler] Re-factor into multiple modules

Hard split between builder and runtime, as is proper.
This commit is contained in:
John Doty 2024-09-21 07:42:52 -07:00
parent 1f84752538
commit 1a3ce02d48
4 changed files with 370 additions and 267 deletions

View file

@ -0,0 +1,5 @@
# A prettier printer.
from . import builder
from . import runtime
from .builder import *

316
parser/wadler/builder.py Normal file
View file

@ -0,0 +1,316 @@
"""Data structures to support pretty-printing.
Just like the parse tables, these tables could be written out in a different
format and used to drive a pretty-printer written in another programming
language, probably paired with a parser runtime written in that same language.
"""
import dataclasses
import typing
from .. import parser
@dataclasses.dataclass
class MatcherTable:
    """Information necessary to create a document from a single node of a
    concrete parse tree as generated by the parser.

    A "document" in this case is a wadler-style document. See the
    documentation of the module for what kinds of document nodes we expect
    to generate.

    The grammar contains extra metadata about how to add line-breaks and
    whatnot, but that information was discarded during the parse. (We don't
    need it!) That means we need to recover it after the fact. It would be
    easy, except transparent rules mean that the series of tree children
    form a context-free language instead of a regular language, and so we
    actually need a full parser again to recover the structure.

    The data to drive that parse is in `table`, which is an LR parse table of
    the usual form produced by this parser generator. To build the document,
    use the actions in the parse table to drive an LR parse, maintaining a
    stack of documents as you go.

    When matching terminals, interpret symbol names as follows:

    - `token_[NAME]` symbols are token children in the tree node we're parsing.
      (The token will have the name [NAME].) These should get shifted onto the
      stack as plain-text document nodes.
    - `tree_[KIND]` symbols are tree node children in the tree node we're
      parsing. (The tree kind will be [KIND].) These should get shifted onto
      the stack as document nodes, but recursively (by matching *their* children
      with the same strategy.)

    When reducing nonterminals, first concatenate all of the documents you remove
    from the stack into a single document, then use the first character to
    determine what (if any) additional work to do to the document:

    - `i...` symbols are productions used to generate "indent" documents. The
      `indent_amounts` dict indicates how far to indent each production. The
      concatenated documents become the child of the indent.
    - `g...` symbols are productions used to generate "group" documents. The
      concatenated documents become the child of the group.
    - `n...` symbols are productions that generate newlines. A newline document
      should be created and appended to the concatenated documents. The
      `newline_replace` dict indicates what the replacement text for the newline
      document should be.
    - `p...` symbols are just like `n...` symbols, except the newline symbol
      is prepended instead of appended.
    - `f...` symbols are like `n...` symbols, except that a force-break document
      is appended instead of a newline document.
    - `d...` symbols are like `f...` symbols, except that the force-break
      document is prepended instead of appended.
    - Any other prefix should be ignored.
    """

    # Parse table to recover the node into a document
    table: parser.ParseTable
    # Mapping from the name of i_ rules to indent counts
    indent_amounts: dict[str, int]
    # Mapping from the names of n_ rules (and p_ rules) to the text they
    # flatten to when the enclosing group fits on one line
    newline_replace: dict[str, str]
def _compile_nonterminal_matcher(
    grammar: parser.Grammar,
    nonterminals: dict[str, parser.NonTerminal],
    rule: parser.NonTerminal,
) -> MatcherTable:
    """Generate a matcher table for a single nonterminal.

    See the docs for [MatcherTable] to understand the result.

    Works by synthesizing a small grammar whose terminals are the possible
    children of a `rule` node (`token_*` / `tree_*` symbols) and whose
    synthetic nonterminals (`g_*`, `i_*`, `n_*`, `p_*`, `f_*`, `d_*`) encode
    the formatting metadata, then generating an LR parse table for it.
    """
    # Accumulates (nonterminal name, production symbols) pairs for the
    # synthesized grammar; fed to grammar._generator at the end.
    generated_grammar: list[typing.Tuple[str, list[str]]] = []
    # Names of synthetic nonterminals we've already expanded, so transparent
    # rules reachable along several paths are compiled only once.
    visited: set[str] = set()
    # In order to generate groups, indents, and newlines we need to
    # synthesize new productions. And it happens sometimes that we get
    # duplicates, repeated synthetic productions. It's important to
    # de-duplicate productions, otherwise we'll wind up with ambiguities in
    # the parser.
    #
    # These dictionaries track the synthetic rules: the keys are production
    # and also the parameter (if any), and the values are the names of the
    # productions that produce the effect.
    #
    groups: dict[tuple[str, ...], str] = {}
    indents: dict[tuple[tuple[str, ...], int], str] = {}
    newlines: dict[tuple[tuple[str, ...], str], str] = {}
    # Shared counter for uniquely naming p_/f_/d_ rules, which (unlike
    # g_/i_/n_ rules) are not de-duplicated.
    prefix_count: int = 0
    # Final name -> replacement-text mapping for newline rules; p_ rules are
    # added as they're created, n_ rules are folded in at the end.
    final_newlines: dict[str, str] = {}

    def compile_nonterminal(name: str, rule: parser.NonTerminal):
        # Emit productions for `rule` under the synthetic name `name`,
        # at most once per name.
        if name not in visited:
            visited.add(name)
            for production in rule.fn(grammar).flatten(with_metadata=True):
                trans_prod = compile_production(production)
                generated_grammar.append((name, trans_prod))

    def compile_production(production: parser.FlattenedWithMetadata) -> list[str]:
        # NOTE(review): only prefix_count is ever rebound here; the other
        # nonlocal declarations are redundant (mutating a dict does not need
        # nonlocal) but harmless.
        nonlocal groups
        nonlocal indents
        nonlocal newlines
        nonlocal prefix_count
        nonlocal final_newlines

        # Prefix newline/force-break rules that must wrap the *entire*
        # production; applied after the main loop below.
        prefix_stack: list[str] = []
        result = []
        for item in production:
            if isinstance(item, str):
                nt = nonterminals[item]
                if nt.transparent:
                    # If it's transparent then we make a new set of
                    # productions that covers the contents of the
                    # transparent nonterminal.
                    name = "xxx_" + nt.name
                    compile_nonterminal(name, nt)
                    result.append(name)
                else:
                    # Otherwise it's a "token" in our input, named
                    # "tree_{whatever}".
                    result.append(f"tree_{item}")
            elif isinstance(item, parser.Terminal):
                # If it's a terminal it will appear in our input as
                # "token_{whatever}".
                result.append(f"token_{item.name}")
            else:
                # A (metadata, children) pair: compile the children first,
                # then wrap them according to any formatting metadata.
                meta, children = item
                tx_children = compile_production(children)
                pretty = meta.get("format")
                if isinstance(pretty, parser.FormatMeta):
                    if pretty.group:
                        # Generate a group rule (re-using an existing one for
                        # identical children, to keep the grammar unambiguous).
                        child_key = tuple(tx_children)
                        rule_name = groups.get(child_key)
                        if rule_name is None:
                            rule_name = f"g_{len(groups)}"
                            groups[child_key] = rule_name
                            generated_grammar.append((rule_name, tx_children))
                        tx_children = [rule_name]
                    if pretty.indent:
                        # Generate an indent rule, de-duplicated on both the
                        # children and the indent amount.
                        child_key = (tuple(tx_children), pretty.indent)
                        rule_name = indents.get(child_key)
                        if rule_name is None:
                            rule_name = f"i_{len(indents)}"
                            indents[child_key] = rule_name
                            generated_grammar.append((rule_name, tx_children))
                        tx_children = [rule_name]
                    if pretty.newline is not None:
                        # Generate a newline rule.
                        #
                        # Newline rules are complicated because we need to avoid
                        # having a production that has zero children. Zero-child
                        # productions generate unpredictable parse trees, even
                        # when "unambiguous".
                        #
                        # Our first hedge is: if we don't have any children for
                        # this production but we *have* already converted some
                        # stuff, then take the stuff we've already converted as
                        # our child and wrap it in a newline production. (This
                        # works when the newline is not the first element in the
                        # production.)
                        #
                        if len(tx_children) == 0:
                            tx_children = result
                            result = []
                        if len(tx_children) > 0:
                            # n == postfix newline.
                            child_key = (tuple(tx_children), pretty.newline)
                            rule_name = newlines.get(child_key)
                            if rule_name is None:
                                rule_name = f"n_{len(newlines)}"
                                newlines[child_key] = rule_name
                                generated_grammar.append((rule_name, tx_children))
                            tx_children = [rule_name]
                        else:
                            # If we still have no tx_children then the newline must
                            # be the first thing in the production. Ugh. We will
                            # remember it for later, and apply it after we've
                            # finished handling everything else.
                            #
                            # p == prefix newline
                            rule_name = f"p_{prefix_count}"
                            prefix_count += 1
                            final_newlines[rule_name] = pretty.newline
                            prefix_stack.append(rule_name)
                    if pretty.forced_break:
                        # Generate a force-break rule.
                        #
                        # This follows the same strategies as newlines with
                        # respect to empty productions. (Unlike n_ rules, f_
                        # rules carry no parameter and are not de-duplicated;
                        # each gets a fresh name from prefix_count.)
                        if len(tx_children) == 0:
                            tx_children = result
                            result = []
                        if len(tx_children) > 0:
                            # f == postfix forced break
                            rule_name = f"f_{prefix_count}"
                            prefix_count += 1
                            generated_grammar.append((rule_name, tx_children))
                            tx_children = [rule_name]
                        else:
                            # d == prefix forced break (so-named because 'd' is
                            # to the right of 'f' on my keyboard)
                            rule_name = f"d_{prefix_count}"
                            prefix_count += 1
                            prefix_stack.append(rule_name)
                # If it turned out to have formatting meta then we will have
                # replaced or augmented the translated children appropriately.
                # Otherwise, if it's highlighting meta or something else, we
                # will have ignored it and the translated children should just
                # be inserted inline.
                result.extend(tx_children)

        # Now is the time to handle any prefix rules, by wrapping the results in
        # a new production for the prefix and replacing the results with that
        # one.
        while len(prefix_stack) > 0:
            rule_name = prefix_stack.pop()
            generated_grammar.append((rule_name, result))
            result = [rule_name]

        return result

    # Compile the whole rule under a synthetic start symbol and build the
    # LR table for the synthesized grammar.
    start_name = f"yyy_{rule.name}"
    compile_nonterminal(start_name, rule)
    gen = grammar._generator(start_name, generated_grammar)
    parse_table = gen.gen_table()

    # Fold the de-duplicated postfix newline rules into the same
    # name -> replacement mapping that already holds the prefix (p_) rules.
    for (_, replacement), rule_name in newlines.items():
        final_newlines[rule_name] = replacement
    indent_amounts = {rule_name: amount for ((_, amount), rule_name) in indents.items()}

    return MatcherTable(
        parse_table,
        indent_amounts,
        final_newlines,
    )
@dataclasses.dataclass
class PrettyTable:
    """Information necessary to convert a parsed tree into a wadler-style
    pretty document, where it can then be formatted.

    This is basically a bunch of "MatcherTables", one for each kind of tree,
    that tell us how to recover document structure from the tree node. We also
    record:

    - The indentation string to use.
    - The trivia modes of any terminals, for use in reconstructing trivia.
    """

    # String emitted once per indentation level.
    indent: str
    # Trivia mode for each named terminal that declares one in its metadata.
    trivia_modes: dict[str, parser.TriviaMode]
    # One MatcherTable per nonterminal, keyed by nonterminal name.
    matchers: dict[str, MatcherTable]
def compile_pretty_table(grammar: parser.Grammar, indent: str | None = None) -> PrettyTable:
    """Generate a [PrettyTable] to drive a pretty-printer from a grammar.

    If `indent` is not given, fall back to the grammar's `pretty_indent`
    attribute (when present), and finally to a single space.
    """
    if indent is None:
        grammar_indent = getattr(grammar, "pretty_indent", None)
        indent = " " if grammar_indent is None else grammar_indent

    # Collect the trivia mode of every named terminal that declares one.
    trivia_mode: dict[str, parser.TriviaMode] = {}
    for terminal in grammar.terminals():
        mode = terminal.meta.get("trivia_mode")
        if terminal.name is not None and isinstance(mode, parser.TriviaMode):
            trivia_mode[terminal.name] = mode

    # Build one matcher table per nonterminal in the grammar.
    nonterminals = {nt.name: nt for nt in grammar.non_terminals()}
    matchers = {
        name: _compile_nonterminal_matcher(grammar, nonterminals, rule)
        for name, rule in nonterminals.items()
    }

    return PrettyTable(
        indent,
        trivia_mode,
        matchers,
    )

View file

@ -1,9 +1,9 @@
# A prettier printer.
import dataclasses
import typing
from . import parser
from . import runtime
from . import builder
from .. import parser
from .. import runtime
############################################################################
@ -360,240 +360,16 @@ def slice_pre_post_trivia(
return ([], tokens)
############################################################################
# Data to Drive the Pretty Printer
############################################################################
@dataclasses.dataclass
class MatcherTable:
    """Information necessary to create a document from a concrete parse tree,
    as generated by the parser.

    (In order to do this we need to re-parse the children of the tree, in
    order to recover structure added by transparent rules. That's why each
    MatcherTable has an associated ParseTable!)
    """

    # Parse table to recover the node into a document
    table: parser.ParseTable
    # Mapping from the name of i_ rules to indent counts
    indent_amounts: dict[str, int]
    # Mapping from the names of n_ rules to the text they flatten to
    newline_replace: dict[str, str]
def _compile_nonterminal_matcher(
    grammar: parser.Grammar,
    nonterminals: dict[str, parser.NonTerminal],
    rule: parser.NonTerminal,
) -> MatcherTable:
    """Generate a [MatcherTable] for a single nonterminal.

    Synthesizes a small grammar over the possible children of a `rule` node
    (`token_*` / `tree_*` symbols) whose synthetic nonterminals (`g_*`, `i_*`,
    `n_*`, `p_*`, `f_*`, `d_*`) encode the formatting metadata, and generates
    an LR parse table for it.
    """
    generated_grammar: list[typing.Tuple[str, list[str]]] = []
    # Synthetic nonterminal names we've already expanded, so transparent
    # rules reachable along several paths are compiled only once.
    visited: set[str] = set()
    # In order to generate groups, indents, and newlines we need to
    # synthesize new productions. And it happens sometimes that we get
    # duplicates, repeated synthetic productions. It's important to
    # de-duplicate productions, otherwise we'll wind up with ambiguities
    # in the parser.
    #
    # These dictionaries track the synthetic rules: the keys are
    # production and also the parameter (if any), and the values are the
    # names of the productions that produce the effect.
    #
    groups: dict[tuple[str, ...], str] = {}
    indents: dict[tuple[tuple[str, ...], int], str] = {}
    newlines: dict[tuple[tuple[str, ...], str], str] = {}
    # Shared counter for uniquely naming p_/f_/d_ rules (not de-duplicated).
    prefix_count: int = 0
    # Final name -> replacement-text mapping for newline rules.
    final_newlines: dict[str, str] = {}

    def compile_nonterminal(name: str, rule: parser.NonTerminal):
        # Emit productions for `rule` under the synthetic name, at most once.
        if name not in visited:
            visited.add(name)
            for production in rule.fn(grammar).flatten(with_metadata=True):
                trans_prod = compile_production(production)
                generated_grammar.append((name, trans_prod))

    def compile_production(production: parser.FlattenedWithMetadata) -> list[str]:
        # NOTE(review): only prefix_count is ever rebound; the other nonlocal
        # declarations are redundant (dict mutation needs no nonlocal).
        nonlocal groups
        nonlocal indents
        nonlocal newlines
        nonlocal prefix_count
        nonlocal final_newlines

        # Prefix newline/force-break rules that must wrap the entire
        # production; applied after the main loop.
        prefix_stack: list[str] = []
        result = []
        for item in production:
            if isinstance(item, str):
                nt = nonterminals[item]
                if nt.transparent:
                    # If it's transparent then we make a new set of
                    # productions that covers the contents of the
                    # transparent nonterminal.
                    name = "xxx_" + nt.name
                    compile_nonterminal(name, nt)
                    result.append(name)
                else:
                    # Otherwise it's a "token" in our input, named
                    # "tree_{whatever}".
                    result.append(f"tree_{item}")
            elif isinstance(item, parser.Terminal):
                # If it's a terminal it will appear in our input as
                # "token_{whatever}".
                result.append(f"token_{item.name}")
            else:
                # A (metadata, children) pair: compile the children, then
                # wrap them according to any formatting metadata.
                meta, children = item
                tx_children = compile_production(children)
                pretty = meta.get("format")
                if isinstance(pretty, parser.FormatMeta):
                    if pretty.group:
                        # Make a synthetic group rule, re-using an existing
                        # one for identical children.
                        child_key = tuple(tx_children)
                        rule_name = groups.get(child_key)
                        if rule_name is None:
                            rule_name = f"g_{len(groups)}"
                            groups[child_key] = rule_name
                            generated_grammar.append((rule_name, tx_children))
                        tx_children = [rule_name]
                    if pretty.indent:
                        # Indent rule, de-duplicated on children + amount.
                        child_key = (tuple(tx_children), pretty.indent)
                        rule_name = indents.get(child_key)
                        if rule_name is None:
                            rule_name = f"i_{len(indents)}"
                            indents[child_key] = rule_name
                            generated_grammar.append((rule_name, tx_children))
                        tx_children = [rule_name]
                    if pretty.newline is not None:
                        # Avoid zero-child productions: if nothing was compiled
                        # for this sub-production, adopt what we've already
                        # accumulated as the newline rule's children.
                        if len(tx_children) == 0:
                            tx_children = result
                            result = []
                        if len(tx_children) > 0:
                            # n == postfix newline
                            child_key = (tuple(tx_children), pretty.newline)
                            rule_name = newlines.get(child_key)
                            if rule_name is None:
                                rule_name = f"n_{len(newlines)}"
                                newlines[child_key] = rule_name
                                generated_grammar.append((rule_name, tx_children))
                            tx_children = [rule_name]
                        else:
                            # The newline is the first thing in the production;
                            # defer it to the prefix_stack handling below.
                            # p == prefix newline
                            rule_name = f"p_{prefix_count}"
                            prefix_count += 1
                            final_newlines[rule_name] = pretty.newline
                            prefix_stack.append(rule_name)
                    if pretty.forced_break:
                        # Same empty-production strategy as newlines above.
                        if len(tx_children) == 0:
                            tx_children = result
                            result = []
                        if len(tx_children) > 0:
                            # f == postfix forced break
                            rule_name = f"f_{prefix_count}"
                            prefix_count += 1
                            generated_grammar.append((rule_name, tx_children))
                            tx_children = [rule_name]
                        else:
                            # d == prefix forced break (to the right of 'f' on my kbd)
                            rule_name = f"d_{prefix_count}"
                            prefix_count += 1
                            prefix_stack.append(rule_name)
                # If it turned out to have formatting meta then we will
                # have replaced or augmented the translated children
                # appropriately. Otherwise, if it's highlighting meta or
                # something else, we'll have ignored it and the
                # translated children should just be inserted inline.
                result.extend(tx_children)

        # Apply any deferred prefix newline/force-break rules: each wraps the
        # whole production compiled so far.
        while len(prefix_stack) > 0:
            rule_name = prefix_stack.pop()
            generated_grammar.append((rule_name, result))
            result = [rule_name]

        return result

    # Compile under a synthetic start symbol and build the LR table.
    start_name = f"yyy_{rule.name}"
    compile_nonterminal(start_name, rule)
    gen = grammar._generator(start_name, generated_grammar)
    parse_table = gen.gen_table()

    # Fold the de-duplicated postfix newline rules into the same mapping
    # that already holds the prefix (p_) rules.
    for (_, replacement), rule_name in newlines.items():
        final_newlines[rule_name] = replacement
    indent_amounts = {rule_name: amount for ((_, amount), rule_name) in indents.items()}

    return MatcherTable(
        parse_table,
        indent_amounts,
        final_newlines,
    )
@dataclasses.dataclass
class PrettyTable:
    """Information necessary to convert a parsed tree into a wadler-style
    pretty document, where it can then be formatted.

    This is basically a bunch of "MatcherTables", one for each kind of tree,
    that tell us how to recover document structure from the tree node.
    """

    # String emitted once per indentation level.
    indent: str
    # Trivia mode for each named terminal that declares one in its metadata.
    trivia_modes: dict[str, parser.TriviaMode]
    # One MatcherTable per nonterminal, keyed by nonterminal name.
    matchers: dict[str, MatcherTable]
def compile_pretty_table(grammar: parser.Grammar, indent: str | None = None) -> PrettyTable:
    """Generate a [PrettyTable] to drive a pretty-printer from a grammar.

    If `indent` is not given, fall back to the grammar's `pretty_indent`
    attribute (when present), and finally to a single space.
    """
    nonterminals = {nt.name: nt for nt in grammar.non_terminals()}
    matchers = {}
    if indent is None:
        indent = getattr(grammar, "pretty_indent", None)
    if indent is None:
        indent = " "
    # Collect the trivia mode of every named terminal that declares one.
    trivia_mode = {}
    for t in grammar.terminals():
        mode = t.meta.get("trivia_mode")
        if t.name is not None and isinstance(mode, parser.TriviaMode):
            trivia_mode[t.name] = mode
    # Build one matcher table per nonterminal in the grammar.
    for name, rule in nonterminals.items():
        matchers[name] = _compile_nonterminal_matcher(grammar, nonterminals, rule)
    return PrettyTable(
        indent,
        trivia_mode,
        matchers,
    )
############################################################################
# The Actual Pretty Printer
############################################################################
class Matcher:
table: MatcherTable
table: builder.MatcherTable
trivia_mode: dict[str, parser.TriviaMode]
def __init__(self, table: MatcherTable, trivia_mode: dict[str, parser.TriviaMode]):
def __init__(self, table: builder.MatcherTable, trivia_mode: dict[str, parser.TriviaMode]):
self.table = table
self.trivia_mode = trivia_mode
@ -799,10 +575,10 @@ class Matcher:
class Printer:
table: PrettyTable
table: builder.PrettyTable
matchers: dict[str, Matcher]
def __init__(self, table: PrettyTable):
def __init__(self, table: builder.PrettyTable):
self.table = table
self.matchers = {
name: Matcher(value, self.table.trivia_modes) for name, value in table.matchers.items()