"""Data structures to support pretty-printing. Just like the parse tables, these tables could be written out in a different format and used to drive a pretty-printer written in another programming language, probably paired with a parser runtime written in that same language. """ import dataclasses import typing from .. import parser @dataclasses.dataclass class MatcherTable: """Information necessary to create a document from a single node of a concrete parse tree as generated by the parser. A "document" in this case is a wadler-style document. See the documentation of the module for what kinds of document nodes we expect to generate. The grammar contains extra metadata about how to add line-breaks and whatnot, but that information was discarded during the parse. (We don't need it!) That means we need to recover it after the fact. It would be easy, except transparent rules mean that the series of tree children form a context-free language instead of a regular language, and so we actually need a full parser again to recover the structure. The data to drive that parse is in `table`, which is an LR parse table of the usual form produced by this parser generator. To build the document, use the actions in the parse table to drive an LR parse, maintaining a stack of documents as you go. When matching terminals, interpret symbol names as follows: - `token_[NAME]` symbols are token children in the tree node we're parsing. (The token will have the name [NAME].) These should get shifted onto the stack as plain-text document nodes. - `tree_[KIND]` symbols are tree node children in the tree node we're parsing. (The tree kind will be [KIND].) These should get shifted onto the stack as document nodes, but recursively (by matching *their* children with the same strategy.) When reducing nonterminals, first concatenate all of the documents you remove from the stack into a single document, then use the first character to determine what (if any) additional work to do to the document: - `i...` symbols are productions used to generated "indent" documents. The `indent_amounts` dict indicates how far to indent each production. The concatenated documents become the child of the indent. - `g...` symbols are productions used to generate "group" documents. The concatenated documents become the child of the group. - `n...` symbols are productions that generate newlines. A newline document should be created and appended to the concatenated documents. The `newline_replace` dict indicates what the replacement text for the newline document should be. - `p...` symbols are just like `n...` symbols, except the newline symbol is prepended instead of appended. - `f...` symbols are like `n...` symbols, except that a force-break document is appended instead of a newline document. - `d...` symbols are like `f...` symbols, except that the force-break document is prepended instead of appended. - Any other prefix should be ignored. """ # Parse table to recover the node into a document table: parser.ParseTable # Mapping from the name of i_ rules to indent counts indent_amounts: dict[str, int] # Mapping from the names of n_ rules to the text they flatten to newline_replace: dict[str, str] def _compile_nonterminal_matcher(rule: parser.NonTerminal) -> MatcherTable: """Generate a matcher table for a single nonterminal. See the docs for [MatcherTable] to understand the result. """ generated_grammar: list[typing.Tuple[str, list[str]]] = [] visited: set[str] = set() # In order to generate groups, indents, and newlines we need to # synthesize new productions. And it happens sometimes that we get # duplicates, repeated synthetic productions. It's important to # de-duplicate productions, otherwise we'll wind up with ambiguities in # the parser. # # These dictionaries track the synthetic rules: the keys are production # and also the parameter (if any), and the values are the names of the # productions that produce the effect. # groups: dict[tuple[str, ...], str] = {} indents: dict[tuple[tuple[str, ...], int], str] = {} newlines: dict[tuple[tuple[str, ...], str], str] = {} prefix_count: int = 0 final_newlines: dict[str, str] = {} def compile_nonterminal(name: str, rule: parser.NonTerminal): if name not in visited: visited.add(name) for production in rule.fn().flatten(with_metadata=True): trans_prod = compile_production(production) generated_grammar.append((name, trans_prod)) def compile_production(production: parser.FlattenedWithMetadata) -> list[str]: nonlocal groups nonlocal indents nonlocal newlines nonlocal prefix_count nonlocal final_newlines prefix_stack: list[str] = [] result = [] for item in production: if isinstance(item, parser.NonTerminal): if item.transparent: # If it's transparent then we make a new set of # productions that covers the contents of the # transparent nonterminal. name = "xxx_" + item.name compile_nonterminal(name, item) result.append(name) else: # Otherwise it's a "token" in our input, named # "tree_{whatever}". result.append(f"tree_{item.name}") elif isinstance(item, parser.Terminal): # If it's a terminal it will appear in our input as # "token_{whatever}". result.append(f"token_{item.name}") else: meta, children = item tx_children = compile_production(children) pretty = meta.get("format") if isinstance(pretty, parser.FormatMeta): if pretty.group: # Generate a group rule. child_key = tuple(tx_children) rule_name = groups.get(child_key) if rule_name is None: rule_name = f"g_{len(groups)}" groups[child_key] = rule_name generated_grammar.append((rule_name, tx_children)) tx_children = [rule_name] if pretty.indent: # Generate an indent rule. child_key = (tuple(tx_children), pretty.indent) rule_name = indents.get(child_key) if rule_name is None: rule_name = f"i_{len(indents)}" indents[child_key] = rule_name generated_grammar.append((rule_name, tx_children)) tx_children = [rule_name] if pretty.newline is not None: # Generate a newline rule. # # Newline rules are complicated because we need to avoid # having a production that has zero children. Zero-child # productions generate unpredictable parse trees, even # when "unambiguous". # # Our first hedge is: if don't have any children for # this production but we *have* already converted some # stuff, then take the stuff we've already converted as # our child and wrap it in a newline production. (This # works when the newline is not the first element in the # production.) # if len(tx_children) == 0: tx_children = result result = [] if len(tx_children) > 0: # n == postfix newline. child_key = (tuple(tx_children), pretty.newline) rule_name = newlines.get(child_key) if rule_name is None: rule_name = f"n_{len(newlines)}" newlines[child_key] = rule_name generated_grammar.append((rule_name, tx_children)) tx_children = [rule_name] else: # If we still have no tx_children then the newline must # be the first thing in the produciton. Ugh. We will # remember it for later, and apply it after we've # finished handling everything else. # # p == prefix newline rule_name = f"p_{prefix_count}" prefix_count += 1 final_newlines[rule_name] = pretty.newline prefix_stack.append(rule_name) if pretty.forced_break: # Generate a force-break rule. # # This follows the same strategies as newlines with # respect to empty productions. if len(tx_children) == 0: tx_children = result result = [] if len(tx_children) > 0: # f == postfix forced break rule_name = f"f_{prefix_count}" prefix_count += 1 generated_grammar.append((rule_name, tx_children)) tx_children = [rule_name] else: # d == prefix forced break (so-named because 'd' is # to the right of 'f' on my keyboard) rule_name = f"d_{prefix_count}" prefix_count += 1 prefix_stack.append(rule_name) # If it turned out to have formatting meta then we will have # replaced or augmented the translated children appropriately. # Otherwise, if it's highlighting meta or something else, we # will have ignored it and the translated children should just # be inserted inline. result.extend(tx_children) # Now is the time to handle any prefix rules, by wrapping the results in # a new production for the prefix and replacing the results with that # one. while len(prefix_stack) > 0: rule_name = prefix_stack.pop() generated_grammar.append((rule_name, result)) result = [rule_name] return result start_name = f"yyy_{rule.name}" compile_nonterminal(start_name, rule) gen = parser.ParserGenerator(start_name, generated_grammar) parse_table = gen.gen_table() for (_, replacement), rule_name in newlines.items(): final_newlines[rule_name] = replacement indent_amounts = {rule_name: amount for ((_, amount), rule_name) in indents.items()} return MatcherTable( parse_table, indent_amounts, final_newlines, ) @dataclasses.dataclass class PrettyTable: """Information necessary to convert a parsed tree into a wadler-style pretty document, where it can then be formatted. This is basically a bunch of "MatcherTables", one for each kind of tree, that tell us how to recover document structure from the tree node. We also record: - The indentation string to use. - The trivia modes of any terminals, for use in reconstructing trivia. """ indent: str trivia_modes: dict[str, parser.TriviaMode] matchers: dict[str, MatcherTable] def compile_pretty_table(grammar: parser.Grammar, indent: str | None = None) -> PrettyTable: """Generate a [PrettyTable] to drive a pretty-printer from a grammar.""" nonterminals = {nt.name: nt for nt in grammar.non_terminals()} matchers = {} if indent is None: indent = grammar.pretty_indent if indent is None: indent = " " trivia_mode = {} for t in grammar.terminals(): mode = t.meta.get("trivia_mode") if t.name is not None and isinstance(mode, parser.TriviaMode): trivia_mode[t.name] = mode for name, rule in nonterminals.items(): matchers[name] = _compile_nonterminal_matcher(rule) return PrettyTable( indent, trivia_mode, matchers, )