lrparsers/parser/wadler/builder.py

"""Data structures to support pretty-printing.

Just like the parse tables, these tables could be written out in a different
format and used to drive a pretty-printer written in another programming
language, probably paired with a parser runtime written in that same language.
"""

import dataclasses
import typing

from .. import parser


@dataclasses.dataclass
class MatcherTable:
    """Information necessary to create a document from a single node of a
    concrete parse tree as generated by the parser.

    A "document" in this case is a wadler-style document. See the
    documentation of the module for what kinds of document nodes we expect
    to generate.

    The grammar contains extra metadata about how to add line-breaks and
    whatnot, but that information was discarded during the parse. (We don't
    need it!) That means we need to recover it after the fact. It would be
    easy, except transparent rules mean that the series of tree children
    form a context-free language instead of a regular language, and so we
    actually need a full parser again to recover the structure.

    The data to drive that parse is in `table`, which is an LR parse table of
    the usual form produced by this parser generator. To build the document,
    use the actions in the parse table to drive an LR parse, maintaining a
    stack of documents as you go.

    When matching terminals, interpret symbol names as follows:

    - `token_[NAME]` symbols are token children in the tree node we're parsing.
      (The token will have the name [NAME].) These should get shifted onto the
      stack as plain-text document nodes.

    - `tree_[KIND]` symbols are tree node children in the tree node we're
      parsing. (The tree kind will be [KIND].) These should get shifted onto
      the stack as document nodes, but recursively (by matching *their* children
      with the same strategy.)

    When reducing nonterminals, first concatenate all of the documents you remove
    from the stack into a single document, then use the first character to
    determine what (if any) additional work to do to the document:

    - `i...` symbols are productions used to generated "indent" documents. The
      `indent_amounts` dict indicates how far to indent each production. The
      concatenated documents become the child of the indent.

    - `g...` symbols are productions used to generate "group" documents. The
      concatenated documents become the child of the group.

    - `n...` symbols are productions that generate newlines. A newline document
      should be created and appended to the concatenated documents. The
      `newline_replace` dict indicates what the replacement text for the newline
      document should be.

    - `p...` symbols are just like `n...` symbols, except the newline symbol
      is prepended instead of appended.

    - `f...` symbols are like `n...` symbols, except that a force-break document
      is appended instead of a newline document.

    - `d...` symbols are like `f...` symbols, except that the force-break
      document is prepended instead of appended.

    - Any other prefix should be ignored.
    """

    # Parse table to recover the node into a document
    table: parser.ParseTable
    # Mapping from the name of i_ rules to indent counts
    indent_amounts: dict[str, int]
    # Mapping from the names of n_ rules to the text they flatten to
    newline_replace: dict[str, str]


def _compile_nonterminal_matcher(rule: parser.NonTerminal) -> MatcherTable:
    """Generate a matcher table for a single nonterminal.

    See the docs for [MatcherTable] to understand the result.
    """
    generated_grammar: list[typing.Tuple[str, list[str]]] = []
    visited: set[str] = set()

    # In order to generate groups, indents, and newlines we need to
    # synthesize new productions. And it happens sometimes that we get
    # duplicates, repeated synthetic productions. It's important to
    # de-duplicate productions, otherwise we'll wind up with ambiguities in
    # the parser.
    #
    # These dictionaries track the synthetic rules: the keys are production
    # and also the parameter (if any), and the values are the names of the
    # productions that produce the effect.
    #
    groups: dict[tuple[str, ...], str] = {}
    indents: dict[tuple[tuple[str, ...], int], str] = {}
    newlines: dict[tuple[tuple[str, ...], str], str] = {}
    prefix_count: int = 0

    final_newlines: dict[str, str] = {}

    def compile_nonterminal(name: str, rule: parser.NonTerminal):
        if name not in visited:
            visited.add(name)
            for production in rule.fn().flatten(with_metadata=True):
                trans_prod = compile_production(production)
                generated_grammar.append((name, trans_prod))

    def compile_production(production: parser.FlattenedWithMetadata) -> list[str]:
        nonlocal groups
        nonlocal indents
        nonlocal newlines
        nonlocal prefix_count
        nonlocal final_newlines

        prefix_stack: list[str] = []

        result = []
        for item in production:
            if isinstance(item, parser.NonTerminal):
                if item.transparent:
                    # If it's transparent then we make a new set of
                    # productions that covers the contents of the
                    # transparent nonterminal.
                    name = "xxx_" + item.name
                    compile_nonterminal(name, item)
                    result.append(name)
                else:
                    # Otherwise it's a "token" in our input, named
                    # "tree_{whatever}".
                    result.append(f"tree_{item.name}")

            elif isinstance(item, parser.Terminal):
                # If it's a terminal it will appear in our input as
                # "token_{whatever}".
                result.append(f"token_{item.name}")

            else:
                meta, children = item
                tx_children = compile_production(children)

                pretty = meta.get("format")
                if isinstance(pretty, parser.FormatMeta):
                    if pretty.group:
                        # Generate a group rule.
                        child_key = tuple(tx_children)
                        rule_name = groups.get(child_key)
                        if rule_name is None:
                            rule_name = f"g_{len(groups)}"
                            groups[child_key] = rule_name
                            generated_grammar.append((rule_name, tx_children))

                        tx_children = [rule_name]

                    if pretty.indent:
                        # Generate an indent rule.
                        child_key = (tuple(tx_children), pretty.indent)
                        rule_name = indents.get(child_key)
                        if rule_name is None:
                            rule_name = f"i_{len(indents)}"
                            indents[child_key] = rule_name
                            generated_grammar.append((rule_name, tx_children))

                        tx_children = [rule_name]

                    if pretty.newline is not None:
                        # Generate a newline rule.
                        #
                        # Newline rules are complicated because we need to avoid
                        # having a production that has zero children. Zero-child
                        # productions generate unpredictable parse trees, even
                        # when "unambiguous".
                        #
                        # Our first hedge is: if don't have any children for
                        # this production but we *have* already converted some
                        # stuff, then take the stuff we've already converted as
                        # our child and wrap it in a newline production. (This
                        # works when the newline is not the first element in the
                        # production.)
                        #
                        if len(tx_children) == 0:
                            tx_children = result
                            result = []

                        if len(tx_children) > 0:
                            # n == postfix newline.
                            child_key = (tuple(tx_children), pretty.newline)
                            rule_name = newlines.get(child_key)
                            if rule_name is None:
                                rule_name = f"n_{len(newlines)}"
                                newlines[child_key] = rule_name
                                generated_grammar.append((rule_name, tx_children))

                            tx_children = [rule_name]

                        else:
                            # If we still have no tx_children then the newline must
                            # be the first thing in the produciton. Ugh. We will
                            # remember it for later, and apply it after we've
                            # finished handling everything else.
                            #
                            # p == prefix newline
                            rule_name = f"p_{prefix_count}"
                            prefix_count += 1
                            final_newlines[rule_name] = pretty.newline
                            prefix_stack.append(rule_name)

                    if pretty.forced_break:
                        # Generate a force-break rule.
                        #
                        # This follows the same strategies as newlines with
                        # respect to empty productions.
                        if len(tx_children) == 0:
                            tx_children = result
                            result = []

                        if len(tx_children) > 0:
                            # f == postfix forced break
                            rule_name = f"f_{prefix_count}"
                            prefix_count += 1

                            generated_grammar.append((rule_name, tx_children))
                            tx_children = [rule_name]
                        else:
                            # d == prefix forced break (so-named because 'd' is
                            # to the right of 'f' on my keyboard)
                            rule_name = f"d_{prefix_count}"
                            prefix_count += 1
                            prefix_stack.append(rule_name)

                # If it turned out to have formatting meta then we will have
                # replaced or augmented the translated children appropriately.
                # Otherwise, if it's highlighting meta or something else, we
                # will have ignored it and the translated children should just
                # be inserted inline.
                result.extend(tx_children)

        # Now is the time to handle any prefix rules, by wrapping the results in
        # a new production for the prefix and replacing the results with that
        # one.
        while len(prefix_stack) > 0:
            rule_name = prefix_stack.pop()
            generated_grammar.append((rule_name, result))
            result = [rule_name]

        return result

    start_name = f"yyy_{rule.name}"
    compile_nonterminal(start_name, rule)
    gen = parser.ParserGenerator(start_name, generated_grammar)
    parse_table = gen.gen_table()

    for (_, replacement), rule_name in newlines.items():
        final_newlines[rule_name] = replacement

    indent_amounts = {rule_name: amount for ((_, amount), rule_name) in indents.items()}

    return MatcherTable(
        parse_table,
        indent_amounts,
        final_newlines,
    )


@dataclasses.dataclass
class PrettyTable:
    """Information necessary to convert a parsed tree into a wadler-style
    pretty document, where it can then be formatted.

    This is basically a bunch of "MatcherTables", one for each kind of tree,
    that tell us how to recover document structure from the tree node. We also
    record:

    - The indentation string to use.
    - The trivia modes of any terminals, for use in reconstructing trivia.
    """

    indent: str
    trivia_modes: dict[str, parser.TriviaMode]
    matchers: dict[str, MatcherTable]


def compile_pretty_table(grammar: parser.Grammar, indent: str | None = None) -> PrettyTable:
    """Generate a [PrettyTable] to drive a pretty-printer from a grammar."""
    nonterminals = {nt.name: nt for nt in grammar.non_terminals()}
    matchers = {}

    if indent is None:
        indent = grammar.pretty_indent
    if indent is None:
        indent = " "

    trivia_mode = {}
    for t in grammar.terminals():
        mode = t.meta.get("trivia_mode")
        if t.name is not None and isinstance(mode, parser.TriviaMode):
            trivia_mode[t.name] = mode

    for name, rule in nonterminals.items():
        matchers[name] = _compile_nonterminal_matcher(rule)

    return PrettyTable(
        indent,
        trivia_mode,
        matchers,
    )