[wadler] Re-factor into multiple modules

Hard split between builder and runtime, as is proper.
This commit is contained in:
John Doty 2024-09-21 07:42:52 -07:00
parent 1f84752538
commit 1a3ce02d48
4 changed files with 370 additions and 267 deletions

View file

@ -0,0 +1,5 @@
# A prettier printer.
from . import builder
from . import runtime
from .builder import *

316
parser/wadler/builder.py Normal file
View file

@ -0,0 +1,316 @@
"""Data structures to support pretty-printing.
Just like the parse tables, these tables could be written out in a different
format and used to drive a pretty-printer written in another programming
language, probably paired with a parser runtime written in that same language.
"""
import dataclasses
import typing
from .. import parser
@dataclasses.dataclass
class MatcherTable:
    """Information necessary to create a document from a single node of a
    concrete parse tree as generated by the parser.

    A "document" in this case is a wadler-style document. See the
    documentation of the module for what kinds of document nodes we expect
    to generate.

    The grammar contains extra metadata about how to add line-breaks and
    whatnot, but that information was discarded during the parse. (We don't
    need it!) That means we need to recover it after the fact. It would be
    easy, except transparent rules mean that the series of tree children
    form a context-free language instead of a regular language, and so we
    actually need a full parser again to recover the structure.

    The data to drive that parse is in `table`, which is an LR parse table of
    the usual form produced by this parser generator. To build the document,
    use the actions in the parse table to drive an LR parse, maintaining a
    stack of documents as you go.

    When matching terminals, interpret symbol names as follows:

    - `token_[NAME]` symbols are token children in the tree node we're parsing.
      (The token will have the name [NAME].) These should get shifted onto the
      stack as plain-text document nodes.
    - `tree_[KIND]` symbols are tree node children in the tree node we're
      parsing. (The tree kind will be [KIND].) These should get shifted onto
      the stack as document nodes, but recursively (by matching *their* children
      with the same strategy.)

    When reducing nonterminals, first concatenate all of the documents you remove
    from the stack into a single document, then use the first character to
    determine what (if any) additional work to do to the document:

    - `i...` symbols are productions used to generate "indent" documents. The
      `indent_amounts` dict indicates how far to indent each production. The
      concatenated documents become the child of the indent.
    - `g...` symbols are productions used to generate "group" documents. The
      concatenated documents become the child of the group.
    - `n...` symbols are productions that generate newlines. A newline document
      should be created and appended to the concatenated documents. The
      `newline_replace` dict indicates what the replacement text for the newline
      document should be.
    - `p...` symbols are just like `n...` symbols, except the newline symbol
      is prepended instead of appended.
    - `f...` symbols are like `n...` symbols, except that a force-break document
      is appended instead of a newline document.
    - `d...` symbols are like `f...` symbols, except that the force-break
      document is prepended instead of appended.
    - Any other prefix should be ignored.
    """

    # Parse table to recover the node into a document
    table: parser.ParseTable
    # Mapping from the name of i_ rules to indent counts
    indent_amounts: dict[str, int]
    # Mapping from the names of n_ rules (and p_ rules) to the text they
    # flatten to when the enclosing group fits on one line
    newline_replace: dict[str, str]
def _compile_nonterminal_matcher(
    grammar: parser.Grammar,
    nonterminals: dict[str, parser.NonTerminal],
    rule: parser.NonTerminal,
) -> MatcherTable:
    """Generate a matcher table for a single nonterminal.

    See the docs for [MatcherTable] to understand the result.

    Works by synthesizing a small grammar whose terminals are the possible
    children of a `rule` node (`token_*` / `tree_*` symbols) and whose
    synthetic nonterminals (`g_*`, `i_*`, `n_*`, `p_*`, `f_*`, `d_*`) encode
    the formatting metadata, then generating an LR parse table for it.
    """
    # Accumulates (nonterminal name, production symbols) pairs for the
    # synthesized grammar; fed to grammar._generator at the end.
    generated_grammar: list[typing.Tuple[str, list[str]]] = []
    # Names of synthetic nonterminals we've already expanded, so transparent
    # rules reachable along several paths are compiled only once.
    visited: set[str] = set()
    # In order to generate groups, indents, and newlines we need to
    # synthesize new productions. And it happens sometimes that we get
    # duplicates, repeated synthetic productions. It's important to
    # de-duplicate productions, otherwise we'll wind up with ambiguities in
    # the parser.
    #
    # These dictionaries track the synthetic rules: the keys are production
    # and also the parameter (if any), and the values are the names of the
    # productions that produce the effect.
    #
    groups: dict[tuple[str, ...], str] = {}
    indents: dict[tuple[tuple[str, ...], int], str] = {}
    newlines: dict[tuple[tuple[str, ...], str], str] = {}
    # Shared counter for uniquely naming p_/f_/d_ rules, which (unlike
    # g_/i_/n_ rules) are not de-duplicated.
    prefix_count: int = 0
    # Final name -> replacement-text mapping for newline rules; p_ rules are
    # added as they're created, n_ rules are folded in at the end.
    final_newlines: dict[str, str] = {}

    def compile_nonterminal(name: str, rule: parser.NonTerminal):
        # Emit productions for `rule` under the synthetic name `name`,
        # at most once per name.
        if name not in visited:
            visited.add(name)
            for production in rule.fn(grammar).flatten(with_metadata=True):
                trans_prod = compile_production(production)
                generated_grammar.append((name, trans_prod))

    def compile_production(production: parser.FlattenedWithMetadata) -> list[str]:
        # NOTE(review): only prefix_count is ever rebound here; the other
        # nonlocal declarations are redundant (mutating a dict does not need
        # nonlocal) but harmless.
        nonlocal groups
        nonlocal indents
        nonlocal newlines
        nonlocal prefix_count
        nonlocal final_newlines

        # Prefix newline/force-break rules that must wrap the *entire*
        # production; applied after the main loop below.
        prefix_stack: list[str] = []
        result = []
        for item in production:
            if isinstance(item, str):
                nt = nonterminals[item]
                if nt.transparent:
                    # If it's transparent then we make a new set of
                    # productions that covers the contents of the
                    # transparent nonterminal.
                    name = "xxx_" + nt.name
                    compile_nonterminal(name, nt)
                    result.append(name)
                else:
                    # Otherwise it's a "token" in our input, named
                    # "tree_{whatever}".
                    result.append(f"tree_{item}")
            elif isinstance(item, parser.Terminal):
                # If it's a terminal it will appear in our input as
                # "token_{whatever}".
                result.append(f"token_{item.name}")
            else:
                # A (metadata, children) pair: compile the children first,
                # then wrap them according to any formatting metadata.
                meta, children = item
                tx_children = compile_production(children)
                pretty = meta.get("format")
                if isinstance(pretty, parser.FormatMeta):
                    if pretty.group:
                        # Generate a group rule (re-using an existing one for
                        # identical children, to keep the grammar unambiguous).
                        child_key = tuple(tx_children)
                        rule_name = groups.get(child_key)
                        if rule_name is None:
                            rule_name = f"g_{len(groups)}"
                            groups[child_key] = rule_name
                            generated_grammar.append((rule_name, tx_children))
                        tx_children = [rule_name]
                    if pretty.indent:
                        # Generate an indent rule, de-duplicated on both the
                        # children and the indent amount.
                        child_key = (tuple(tx_children), pretty.indent)
                        rule_name = indents.get(child_key)
                        if rule_name is None:
                            rule_name = f"i_{len(indents)}"
                            indents[child_key] = rule_name
                            generated_grammar.append((rule_name, tx_children))
                        tx_children = [rule_name]
                    if pretty.newline is not None:
                        # Generate a newline rule.
                        #
                        # Newline rules are complicated because we need to avoid
                        # having a production that has zero children. Zero-child
                        # productions generate unpredictable parse trees, even
                        # when "unambiguous".
                        #
                        # Our first hedge is: if we don't have any children for
                        # this production but we *have* already converted some
                        # stuff, then take the stuff we've already converted as
                        # our child and wrap it in a newline production. (This
                        # works when the newline is not the first element in the
                        # production.)
                        #
                        if len(tx_children) == 0:
                            tx_children = result
                            result = []
                        if len(tx_children) > 0:
                            # n == postfix newline.
                            child_key = (tuple(tx_children), pretty.newline)
                            rule_name = newlines.get(child_key)
                            if rule_name is None:
                                rule_name = f"n_{len(newlines)}"
                                newlines[child_key] = rule_name
                                generated_grammar.append((rule_name, tx_children))
                            tx_children = [rule_name]
                        else:
                            # If we still have no tx_children then the newline must
                            # be the first thing in the production. Ugh. We will
                            # remember it for later, and apply it after we've
                            # finished handling everything else.
                            #
                            # p == prefix newline
                            rule_name = f"p_{prefix_count}"
                            prefix_count += 1
                            final_newlines[rule_name] = pretty.newline
                            prefix_stack.append(rule_name)
                    if pretty.forced_break:
                        # Generate a force-break rule.
                        #
                        # This follows the same strategies as newlines with
                        # respect to empty productions. (Unlike n_ rules, f_
                        # rules carry no parameter and are not de-duplicated;
                        # each gets a fresh name from prefix_count.)
                        if len(tx_children) == 0:
                            tx_children = result
                            result = []
                        if len(tx_children) > 0:
                            # f == postfix forced break
                            rule_name = f"f_{prefix_count}"
                            prefix_count += 1
                            generated_grammar.append((rule_name, tx_children))
                            tx_children = [rule_name]
                        else:
                            # d == prefix forced break (so-named because 'd' is
                            # to the right of 'f' on my keyboard)
                            rule_name = f"d_{prefix_count}"
                            prefix_count += 1
                            prefix_stack.append(rule_name)
                # If it turned out to have formatting meta then we will have
                # replaced or augmented the translated children appropriately.
                # Otherwise, if it's highlighting meta or something else, we
                # will have ignored it and the translated children should just
                # be inserted inline.
                result.extend(tx_children)

        # Now is the time to handle any prefix rules, by wrapping the results in
        # a new production for the prefix and replacing the results with that
        # one.
        while len(prefix_stack) > 0:
            rule_name = prefix_stack.pop()
            generated_grammar.append((rule_name, result))
            result = [rule_name]

        return result

    # Compile the whole rule under a synthetic start symbol and build the
    # LR table for the synthesized grammar.
    start_name = f"yyy_{rule.name}"
    compile_nonterminal(start_name, rule)
    gen = grammar._generator(start_name, generated_grammar)
    parse_table = gen.gen_table()

    # Fold the de-duplicated postfix newline rules into the same
    # name -> replacement mapping that already holds the prefix (p_) rules.
    for (_, replacement), rule_name in newlines.items():
        final_newlines[rule_name] = replacement
    indent_amounts = {rule_name: amount for ((_, amount), rule_name) in indents.items()}

    return MatcherTable(
        parse_table,
        indent_amounts,
        final_newlines,
    )
@dataclasses.dataclass
class PrettyTable:
    """Information necessary to convert a parsed tree into a wadler-style
    pretty document, where it can then be formatted.

    This is basically a bunch of "MatcherTables", one for each kind of tree,
    that tell us how to recover document structure from the tree node. We also
    record:

    - The indentation string to use.
    - The trivia modes of any terminals, for use in reconstructing trivia.
    """

    # String emitted once per indentation level.
    indent: str
    # Trivia mode for each named terminal that declares one in its metadata.
    trivia_modes: dict[str, parser.TriviaMode]
    # One MatcherTable per nonterminal, keyed by nonterminal name.
    matchers: dict[str, MatcherTable]
def compile_pretty_table(grammar: parser.Grammar, indent: str | None = None) -> PrettyTable:
    """Generate a [PrettyTable] to drive a pretty-printer from a grammar.

    If `indent` is not given, fall back to the grammar's `pretty_indent`
    attribute (when present), and finally to a single space.
    """
    if indent is None:
        grammar_indent = getattr(grammar, "pretty_indent", None)
        indent = " " if grammar_indent is None else grammar_indent

    # Collect the trivia mode of every named terminal that declares one.
    trivia_mode: dict[str, parser.TriviaMode] = {}
    for terminal in grammar.terminals():
        mode = terminal.meta.get("trivia_mode")
        if terminal.name is not None and isinstance(mode, parser.TriviaMode):
            trivia_mode[terminal.name] = mode

    # Build one matcher table per nonterminal in the grammar.
    nonterminals = {nt.name: nt for nt in grammar.non_terminals()}
    matchers = {
        name: _compile_nonterminal_matcher(grammar, nonterminals, rule)
        for name, rule in nonterminals.items()
    }

    return PrettyTable(
        indent,
        trivia_mode,
        matchers,
    )

View file

@ -1,9 +1,9 @@
# A prettier printer.
import dataclasses
import typing
from . import parser
from . import runtime
from . import builder
from .. import parser
from .. import runtime
############################################################################
@ -360,240 +360,16 @@ def slice_pre_post_trivia(
return ([], tokens)
############################################################################
# Data to Drive the Pretty Printer
############################################################################
@dataclasses.dataclass
class MatcherTable:
    """Information necessary to create a document from a concrete parse tree,
    as generated by the parser.

    (In order to do this we need to re-parse the children of the tree, in
    order to recover structure added by transparent rules. That's why each
    MatcherTable has an associated ParseTable!)
    """

    # Parse table to recover the node into a document
    table: parser.ParseTable
    # Mapping from the name of i_ rules to indent counts
    indent_amounts: dict[str, int]
    # Mapping from the names of n_ rules to the text they flatten to
    newline_replace: dict[str, str]
def _compile_nonterminal_matcher(
    grammar: parser.Grammar,
    nonterminals: dict[str, parser.NonTerminal],
    rule: parser.NonTerminal,
) -> MatcherTable:
    """Generate a [MatcherTable] for a single nonterminal.

    Synthesizes a small grammar over the possible children of a `rule` node
    (`token_*` / `tree_*` symbols) whose synthetic nonterminals (`g_*`, `i_*`,
    `n_*`, `p_*`, `f_*`, `d_*`) encode the formatting metadata, and generates
    an LR parse table for it.
    """
    generated_grammar: list[typing.Tuple[str, list[str]]] = []
    # Synthetic nonterminal names we've already expanded, so transparent
    # rules reachable along several paths are compiled only once.
    visited: set[str] = set()
    # In order to generate groups, indents, and newlines we need to
    # synthesize new productions. And it happens sometimes that we get
    # duplicates, repeated synthetic productions. It's important to
    # de-duplicate productions, otherwise we'll wind up with ambiguities
    # in the parser.
    #
    # These dictionaries track the synthetic rules: the keys are
    # production and also the parameter (if any), and the values are the
    # names of the productions that produce the effect.
    #
    groups: dict[tuple[str, ...], str] = {}
    indents: dict[tuple[tuple[str, ...], int], str] = {}
    newlines: dict[tuple[tuple[str, ...], str], str] = {}
    # Shared counter for uniquely naming p_/f_/d_ rules (not de-duplicated).
    prefix_count: int = 0
    # Final name -> replacement-text mapping for newline rules.
    final_newlines: dict[str, str] = {}

    def compile_nonterminal(name: str, rule: parser.NonTerminal):
        # Emit productions for `rule` under the synthetic name, at most once.
        if name not in visited:
            visited.add(name)
            for production in rule.fn(grammar).flatten(with_metadata=True):
                trans_prod = compile_production(production)
                generated_grammar.append((name, trans_prod))

    def compile_production(production: parser.FlattenedWithMetadata) -> list[str]:
        # NOTE(review): only prefix_count is ever rebound; the other nonlocal
        # declarations are redundant (dict mutation needs no nonlocal).
        nonlocal groups
        nonlocal indents
        nonlocal newlines
        nonlocal prefix_count
        nonlocal final_newlines

        # Prefix newline/force-break rules that must wrap the entire
        # production; applied after the main loop.
        prefix_stack: list[str] = []
        result = []
        for item in production:
            if isinstance(item, str):
                nt = nonterminals[item]
                if nt.transparent:
                    # If it's transparent then we make a new set of
                    # productions that covers the contents of the
                    # transparent nonterminal.
                    name = "xxx_" + nt.name
                    compile_nonterminal(name, nt)
                    result.append(name)
                else:
                    # Otherwise it's a "token" in our input, named
                    # "tree_{whatever}".
                    result.append(f"tree_{item}")
            elif isinstance(item, parser.Terminal):
                # If it's a terminal it will appear in our input as
                # "token_{whatever}".
                result.append(f"token_{item.name}")
            else:
                # A (metadata, children) pair: compile the children, then
                # wrap them according to any formatting metadata.
                meta, children = item
                tx_children = compile_production(children)
                pretty = meta.get("format")
                if isinstance(pretty, parser.FormatMeta):
                    if pretty.group:
                        # Make a synthetic group rule, re-using an existing
                        # one for identical children.
                        child_key = tuple(tx_children)
                        rule_name = groups.get(child_key)
                        if rule_name is None:
                            rule_name = f"g_{len(groups)}"
                            groups[child_key] = rule_name
                            generated_grammar.append((rule_name, tx_children))
                        tx_children = [rule_name]
                    if pretty.indent:
                        # Indent rule, de-duplicated on children + amount.
                        child_key = (tuple(tx_children), pretty.indent)
                        rule_name = indents.get(child_key)
                        if rule_name is None:
                            rule_name = f"i_{len(indents)}"
                            indents[child_key] = rule_name
                            generated_grammar.append((rule_name, tx_children))
                        tx_children = [rule_name]
                    if pretty.newline is not None:
                        # Avoid zero-child productions: if nothing was compiled
                        # for this sub-production, adopt what we've already
                        # accumulated as the newline rule's children.
                        if len(tx_children) == 0:
                            tx_children = result
                            result = []
                        if len(tx_children) > 0:
                            # n == postfix newline
                            child_key = (tuple(tx_children), pretty.newline)
                            rule_name = newlines.get(child_key)
                            if rule_name is None:
                                rule_name = f"n_{len(newlines)}"
                                newlines[child_key] = rule_name
                                generated_grammar.append((rule_name, tx_children))
                            tx_children = [rule_name]
                        else:
                            # The newline is the first thing in the production;
                            # defer it to the prefix_stack handling below.
                            # p == prefix newline
                            rule_name = f"p_{prefix_count}"
                            prefix_count += 1
                            final_newlines[rule_name] = pretty.newline
                            prefix_stack.append(rule_name)
                    if pretty.forced_break:
                        # Same empty-production strategy as newlines above.
                        if len(tx_children) == 0:
                            tx_children = result
                            result = []
                        if len(tx_children) > 0:
                            # f == postfix forced break
                            rule_name = f"f_{prefix_count}"
                            prefix_count += 1
                            generated_grammar.append((rule_name, tx_children))
                            tx_children = [rule_name]
                        else:
                            # d == prefix forced break (to the right of 'f' on my kbd)
                            rule_name = f"d_{prefix_count}"
                            prefix_count += 1
                            prefix_stack.append(rule_name)
                # If it turned out to have formatting meta then we will
                # have replaced or augmented the translated children
                # appropriately. Otherwise, if it's highlighting meta or
                # something else, we'll have ignored it and the
                # translated children should just be inserted inline.
                result.extend(tx_children)

        # Apply any deferred prefix newline/force-break rules: each wraps the
        # whole production compiled so far.
        while len(prefix_stack) > 0:
            rule_name = prefix_stack.pop()
            generated_grammar.append((rule_name, result))
            result = [rule_name]

        return result

    # Compile under a synthetic start symbol and build the LR table.
    start_name = f"yyy_{rule.name}"
    compile_nonterminal(start_name, rule)
    gen = grammar._generator(start_name, generated_grammar)
    parse_table = gen.gen_table()

    # Fold the de-duplicated postfix newline rules into the same mapping
    # that already holds the prefix (p_) rules.
    for (_, replacement), rule_name in newlines.items():
        final_newlines[rule_name] = replacement
    indent_amounts = {rule_name: amount for ((_, amount), rule_name) in indents.items()}

    return MatcherTable(
        parse_table,
        indent_amounts,
        final_newlines,
    )
@dataclasses.dataclass
class PrettyTable:
    """Information necessary to convert a parsed tree into a wadler-style
    pretty document, where it can then be formatted.

    This is basically a bunch of "MatcherTables", one for each kind of tree,
    that tell us how to recover document structure from the tree node.
    """

    # String emitted once per indentation level.
    indent: str
    # Trivia mode for each named terminal that declares one in its metadata.
    trivia_modes: dict[str, parser.TriviaMode]
    # One MatcherTable per nonterminal, keyed by nonterminal name.
    matchers: dict[str, MatcherTable]
def compile_pretty_table(grammar: parser.Grammar, indent: str | None = None) -> PrettyTable:
    """Generate a [PrettyTable] to drive a pretty-printer from a grammar.

    If `indent` is not given, fall back to the grammar's `pretty_indent`
    attribute (when present), and finally to a single space.
    """
    nonterminals = {nt.name: nt for nt in grammar.non_terminals()}
    matchers = {}
    if indent is None:
        indent = getattr(grammar, "pretty_indent", None)
    if indent is None:
        indent = " "
    # Collect the trivia mode of every named terminal that declares one.
    trivia_mode = {}
    for t in grammar.terminals():
        mode = t.meta.get("trivia_mode")
        if t.name is not None and isinstance(mode, parser.TriviaMode):
            trivia_mode[t.name] = mode
    # Build one matcher table per nonterminal in the grammar.
    for name, rule in nonterminals.items():
        matchers[name] = _compile_nonterminal_matcher(grammar, nonterminals, rule)
    return PrettyTable(
        indent,
        trivia_mode,
        matchers,
    )
############################################################################
# The Actual Pretty Printer
############################################################################
class Matcher:
table: MatcherTable
table: builder.MatcherTable
trivia_mode: dict[str, parser.TriviaMode]
def __init__(self, table: MatcherTable, trivia_mode: dict[str, parser.TriviaMode]):
def __init__(self, table: builder.MatcherTable, trivia_mode: dict[str, parser.TriviaMode]):
self.table = table
self.trivia_mode = trivia_mode
@ -799,10 +575,10 @@ class Matcher:
class Printer:
table: PrettyTable
table: builder.PrettyTable
matchers: dict[str, Matcher]
def __init__(self, table: PrettyTable):
def __init__(self, table: builder.PrettyTable):
self.table = table
self.matchers = {
name: Matcher(value, self.table.trivia_modes) for name, value in table.matchers.items()