[wadler] Refactor: data and runtime split
Now we convert the grammar into data for a pretty-printer, so in theory we could write the pretty-printer in a different language.
This commit is contained in:
parent
e4585170d8
commit
1f84752538
2 changed files with 252 additions and 215 deletions
451
parser/wadler.py
451
parser/wadler.py
|
|
@ -360,13 +360,243 @@ def slice_pre_post_trivia(
|
|||
return ([], tokens)
|
||||
|
||||
|
||||
############################################################################
|
||||
# Data to Drive the Pretty Printer
|
||||
############################################################################
|
||||
|
||||
|
||||
@dataclasses.dataclass
class MatcherTable:
    """Pure-data description of how to turn one kind of tree node into a
    document.

    The children of a concrete parse tree node are re-parsed with `table` so
    that structure contributed by transparent rules can be recovered; the
    two dictionaries carry the parameters of the synthetic rules that the
    re-parse reduces.
    """

    # Parse table used to re-parse a node's children.
    table: parser.ParseTable

    # Indent count for each synthetic "i_" rule, keyed by rule name.
    indent_amounts: dict[str, int]

    # Replacement text for each synthetic newline rule, keyed by rule name:
    # the text the newline flattens to.
    newline_replace: dict[str, str]
|
||||
|
||||
|
||||
def _compile_nonterminal_matcher(
    grammar: parser.Grammar,
    nonterminals: dict[str, parser.NonTerminal],
    rule: parser.NonTerminal,
) -> MatcherTable:
    """Build the MatcherTable for tree nodes produced by `rule`.

    The productions of `rule` (flattened with metadata) are translated into
    a small synthetic grammar over the node's children:

    * a non-transparent nonterminal becomes the symbol ``tree_<name>``;
    * a terminal becomes the symbol ``token_<name>``;
    * a transparent nonterminal is expanded into its own set of
      productions under the name ``xxx_<name>``;
    * formatting metadata wraps the affected symbols in synthetic rules
      whose name prefix records the effect: ``g_`` group, ``i_`` indent,
      ``n_`` postfix newline, ``p_`` prefix newline, ``f_`` postfix forced
      break, ``d_`` prefix forced break.

    Args:
        grammar: The grammar that owns `rule`; also used to generate the
            parse table for the synthetic grammar.
        nonterminals: Every nonterminal of the grammar, keyed by name.
        rule: The nonterminal whose tree nodes are to be matched.

    Returns:
        A MatcherTable holding the generated parse table plus the indent
        amounts and newline replacement text of the synthetic rules.
    """
    productions: list[tuple[str, list[str]]] = []
    expanded: set[str] = set()

    # Synthetic group/indent/newline rules are de-duplicated: emitting the
    # same production twice under two names would make the generated parser
    # ambiguous. Each cache maps (children[, parameter]) to the name of the
    # rule that already produces that effect.
    group_rules: dict[tuple[str, ...], str] = {}
    indent_rules: dict[tuple[tuple[str, ...], int], str] = {}
    newline_rules: dict[tuple[tuple[str, ...], str], str] = {}

    # One counter is shared by the p_/f_/d_ rules, which are never
    # de-duplicated.
    serial: int = 0

    # Replacement text for newline rules; prefix ("p_") entries are recorded
    # as they are created, postfix ("n_") entries are merged in at the end.
    newline_text: dict[str, str] = {}

    def shared_rule(
        cache: dict[typing.Any, str],
        key: typing.Any,
        prefix: str,
        body: list[str],
    ) -> str:
        # Return the cached rule for `key`, creating (and emitting) it the
        # first time it is requested.
        found = cache.get(key)
        if found is None:
            found = f"{prefix}_{len(cache)}"
            cache[key] = found
            productions.append((found, body))
        return found

    def numbered_rule(prefix: str) -> str:
        # Allocate a fresh, never-shared rule name from the common counter.
        nonlocal serial
        fresh = f"{prefix}_{serial}"
        serial += 1
        return fresh

    def expand(target: str, nt: parser.NonTerminal) -> None:
        # Translate every production of `nt` under the name `target`, once.
        if target in expanded:
            return
        expanded.add(target)
        for flat in nt.fn(grammar).flatten(with_metadata=True):
            translated = translate(flat)
            productions.append((target, translated))

    def translate(production: parser.FlattenedWithMetadata) -> list[str]:
        # Prefix rules seen in this production; they wrap whatever the
        # production finally translates to (see the loop at the bottom).
        pending_prefixes: list[str] = []

        symbols: list[str] = []
        for item in production:
            if isinstance(item, str):
                nt = nonterminals[item]
                if nt.transparent:
                    # Transparent nonterminals leave no node in the tree, so
                    # their contents get their own set of productions.
                    inner = "xxx_" + nt.name
                    expand(inner, nt)
                    symbols.append(inner)
                else:
                    # A real child tree: a "token" of the synthetic grammar.
                    symbols.append(f"tree_{item}")
                continue

            if isinstance(item, parser.Terminal):
                # A real token in the input.
                symbols.append(f"token_{item.name}")
                continue

            meta, children = item
            wrapped = translate(children)

            fmt = meta.get("format")
            if isinstance(fmt, parser.FormatMeta):
                if fmt.group:
                    wrapped = [shared_rule(group_rules, tuple(wrapped), "g", wrapped)]

                if fmt.indent:
                    indent_key = (tuple(wrapped), fmt.indent)
                    wrapped = [shared_rule(indent_rules, indent_key, "i", wrapped)]

                if fmt.newline is not None:
                    if not wrapped:
                        # Nothing of its own to attach to: take over
                        # everything translated so far in this production.
                        wrapped, symbols = symbols, []

                    if wrapped:
                        # n == postfix newline
                        newline_key = (tuple(wrapped), fmt.newline)
                        wrapped = [shared_rule(newline_rules, newline_key, "n", wrapped)]
                    else:
                        # p == prefix newline
                        prefix_rule = numbered_rule("p")
                        newline_text[prefix_rule] = fmt.newline
                        pending_prefixes.append(prefix_rule)

                if fmt.forced_break:
                    if not wrapped:
                        wrapped, symbols = symbols, []

                    if wrapped:
                        # f == postfix forced break
                        break_rule = numbered_rule("f")
                        productions.append((break_rule, wrapped))
                        wrapped = [break_rule]
                    else:
                        # d == prefix forced break
                        pending_prefixes.append(numbered_rule("d"))

            # With formatting metadata `wrapped` has been replaced or
            # augmented above; with any other metadata it is untouched and
            # the translated children are simply spliced in.
            symbols.extend(wrapped)

        # Prefix rules wrap the whole translated production, most recently
        # added innermost.
        while pending_prefixes:
            outer = pending_prefixes.pop()
            productions.append((outer, symbols))
            symbols = [outer]

        return symbols

    start_name = f"yyy_{rule.name}"
    expand(start_name, rule)
    parse_table = grammar._generator(start_name, productions).gen_table()

    # Postfix newline rules keep their replacement text in the cache key.
    newline_text.update({name: text for (_, text), name in newline_rules.items()})

    return MatcherTable(
        table=parse_table,
        indent_amounts={name: amount for (_, amount), name in indent_rules.items()},
        newline_replace=newline_text,
    )
|
||||
|
||||
|
||||
@dataclasses.dataclass
class PrettyTable:
    """Everything needed to turn a parsed tree into a wadler-style pretty
    document, ready to be laid out.

    Essentially a collection of MatcherTables, one per kind of tree node,
    each describing how to recover document structure from that node.
    """

    # The indent string handed to the layout step.
    indent: str

    # Trivia handling mode, keyed by terminal name.
    trivia_modes: dict[str, parser.TriviaMode]

    # The MatcherTable for each kind of tree, keyed by tree name.
    matchers: dict[str, MatcherTable]
|
||||
|
||||
|
||||
def compile_pretty_table(grammar: parser.Grammar, indent: str | None = None) -> PrettyTable:
    """Compile `grammar` into the data that drives the pretty printer.

    Args:
        grammar: The grammar to compile.
        indent: The indent string to record in the table. When None, the
            grammar's `pretty_indent` attribute is used if it exists and is
            not None; otherwise a built-in default is used.

    Returns:
        A PrettyTable with the chosen indent, the trivia mode of every named
        terminal that declares one, and one MatcherTable per nonterminal.
    """
    nonterminals = {nt.name: nt for nt in grammar.non_terminals()}

    # Resolution order: explicit argument, grammar attribute, default.
    if indent is None:
        indent = getattr(grammar, "pretty_indent", None)
    if indent is None:
        indent = " "

    trivia_modes = {}
    for terminal in grammar.terminals():
        declared = terminal.meta.get("trivia_mode")
        # Skip anonymous terminals and terminals with no (valid) mode.
        if terminal.name is None or not isinstance(declared, parser.TriviaMode):
            continue
        trivia_modes[terminal.name] = declared

    matchers = {
        name: _compile_nonterminal_matcher(grammar, nonterminals, nt)
        for name, nt in nonterminals.items()
    }

    return PrettyTable(indent, trivia_modes, matchers)
|
||||
|
||||
|
||||
############################################################################
|
||||
# The Actual Pretty Printer
|
||||
############################################################################
|
||||
|
||||
|
||||
class Matcher:
|
||||
table: MatcherTable
|
||||
trivia_mode: dict[str, parser.TriviaMode]
|
||||
|
||||
def __init__(self, table: MatcherTable, trivia_mode: dict[str, parser.TriviaMode]):
|
||||
self.table = table
|
||||
self.trivia_mode = trivia_mode
|
||||
|
||||
def match(
|
||||
self,
|
||||
printer: "Printer",
|
||||
|
|
@ -374,7 +604,7 @@ class Matcher:
|
|||
src: str,
|
||||
) -> Document:
|
||||
stack: list[tuple[int, Document]] = [(0, None)]
|
||||
table = self.table
|
||||
table = self.table.table
|
||||
|
||||
# eof_trivia = []
|
||||
# if len(items) > 0:
|
||||
|
|
@ -420,15 +650,15 @@ class Matcher:
|
|||
child = group(child)
|
||||
|
||||
elif name[0] == "i":
|
||||
amount = self.indent_amounts[name]
|
||||
amount = self.table.indent_amounts[name]
|
||||
child = Indent(amount, child)
|
||||
|
||||
elif name[0] == "n":
|
||||
replace = self.newline_replace[name]
|
||||
replace = self.table.newline_replace[name]
|
||||
child = cons(child, NewLine(replace))
|
||||
|
||||
elif name[0] == "p":
|
||||
replace = self.newline_replace[name]
|
||||
replace = self.table.newline_replace[name]
|
||||
child = cons(NewLine(replace), child)
|
||||
|
||||
elif name[0] == "f":
|
||||
|
|
@ -464,8 +694,6 @@ class Matcher:
|
|||
|
||||
def apply_pre_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document:
|
||||
pre_trivia, _ = slice_pre_post_trivia(self.trivia_mode, trivia_tokens)
|
||||
# print(f"PRE:\n{pre_trivia}")
|
||||
|
||||
if len(pre_trivia) == 0:
|
||||
return None
|
||||
|
||||
|
|
@ -474,7 +702,6 @@ class Matcher:
|
|||
trivia_doc = None
|
||||
new_line_count = 0
|
||||
for mode, token in pre_trivia:
|
||||
# print(f"PRE {mode:25} {token.kind:30} ({new_line_count})")
|
||||
match mode:
|
||||
case parser.TriviaMode.LineComment:
|
||||
trivia_doc = cons(
|
||||
|
|
@ -509,7 +736,6 @@ class Matcher:
|
|||
|
||||
trivia_doc = None
|
||||
for mode, token in post_trivia:
|
||||
# print(f"POST {mode:25} {token.kind:30}")
|
||||
match mode:
|
||||
case parser.TriviaMode.Blank:
|
||||
pass
|
||||
|
|
@ -573,210 +799,23 @@ class Matcher:
|
|||
|
||||
|
||||
class Printer:
|
||||
# TODO: Pre-generate the matcher tables for a grammar, to make it
|
||||
# possible to do codegen in other languages.
|
||||
grammar: parser.Grammar
|
||||
_matchers: dict[str, Matcher]
|
||||
_nonterminals: dict[str, parser.NonTerminal]
|
||||
_indent: str
|
||||
_trivia_mode: dict[str, parser.TriviaMode]
|
||||
table: PrettyTable
|
||||
matchers: dict[str, Matcher]
|
||||
|
||||
def __init__(self, grammar: parser.Grammar, indent: str | None = None):
|
||||
self.grammar = grammar
|
||||
self._nonterminals = {nt.name: nt for nt in grammar.non_terminals()}
|
||||
self._matchers = {}
|
||||
|
||||
if indent is None:
|
||||
indent = getattr(self.grammar, "pretty_indent", None)
|
||||
if indent is None:
|
||||
indent = " "
|
||||
self._indent = indent
|
||||
|
||||
trivia_mode = {}
|
||||
for t in grammar.terminals():
|
||||
mode = t.meta.get("trivia_mode")
|
||||
if t.name is not None and isinstance(mode, parser.TriviaMode):
|
||||
trivia_mode[t.name] = mode
|
||||
self._trivia_mode = trivia_mode
|
||||
def __init__(self, table: PrettyTable):
|
||||
self.table = table
|
||||
self.matchers = {
|
||||
name: Matcher(value, self.table.trivia_modes) for name, value in table.matchers.items()
|
||||
}
|
||||
|
||||
def indent(self) -> str:
|
||||
return self._indent
|
||||
|
||||
def lookup_nonterminal(self, name: str) -> parser.NonTerminal:
|
||||
return self._nonterminals[name]
|
||||
|
||||
def compile_rule(self, rule: parser.NonTerminal) -> Matcher:
|
||||
generated_grammar: list[typing.Tuple[str, list[str]]] = []
|
||||
visited: set[str] = set()
|
||||
|
||||
# In order to generate groups, indents, and newlines we need to
|
||||
# synthesize new productions. And it happens sometimes that we get
|
||||
# duplicates, repeated synthetic productions. It's important to
|
||||
# de-duplicate productions, otherwise we'll wind up with ambiguities
|
||||
# in the parser.
|
||||
#
|
||||
# These dictionaries track the synthetic rules: the keys are
|
||||
# production and also the parameter (if any), and the values are the
|
||||
# names of the productions that produce the effect.
|
||||
#
|
||||
groups: dict[tuple[str, ...], str] = {}
|
||||
indents: dict[tuple[tuple[str, ...], int], str] = {}
|
||||
newlines: dict[tuple[tuple[str, ...], str], str] = {}
|
||||
prefix_count: int = 0
|
||||
|
||||
final_newlines: dict[str, str] = {}
|
||||
|
||||
def compile_nonterminal(name: str, rule: parser.NonTerminal):
|
||||
if name not in visited:
|
||||
visited.add(name)
|
||||
for production in rule.fn(self.grammar).flatten(with_metadata=True):
|
||||
trans_prod = compile_production(production)
|
||||
generated_grammar.append((name, trans_prod))
|
||||
|
||||
def compile_production(production: parser.FlattenedWithMetadata) -> list[str]:
|
||||
nonlocal groups
|
||||
nonlocal indents
|
||||
nonlocal newlines
|
||||
nonlocal prefix_count
|
||||
nonlocal final_newlines
|
||||
|
||||
prefix_stack: list[str] = []
|
||||
|
||||
result = []
|
||||
for item in production:
|
||||
if isinstance(item, str):
|
||||
nt = self._nonterminals[item]
|
||||
if nt.transparent:
|
||||
# If it's transparent then we make a new set of
|
||||
# productions that covers the contents of the
|
||||
# transparent nonterminal.
|
||||
name = "xxx_" + nt.name
|
||||
compile_nonterminal(name, nt)
|
||||
result.append(name)
|
||||
else:
|
||||
# Otherwise it's a "token" in our input, named
|
||||
# "tree_{whatever}".
|
||||
result.append(f"tree_{item}")
|
||||
|
||||
elif isinstance(item, parser.Terminal):
|
||||
# If it's a terminal it will appear in our input as
|
||||
# "token_{whatever}".
|
||||
result.append(f"token_{item.name}")
|
||||
|
||||
else:
|
||||
meta, children = item
|
||||
tx_children = compile_production(children)
|
||||
|
||||
pretty = meta.get("format")
|
||||
if isinstance(pretty, parser.FormatMeta):
|
||||
if pretty.group:
|
||||
# Make a fake rule.
|
||||
child_key = tuple(tx_children)
|
||||
rule_name = groups.get(child_key)
|
||||
if rule_name is None:
|
||||
rule_name = f"g_{len(groups)}"
|
||||
groups[child_key] = rule_name
|
||||
generated_grammar.append((rule_name, tx_children))
|
||||
|
||||
tx_children = [rule_name]
|
||||
|
||||
if pretty.indent:
|
||||
child_key = (tuple(tx_children), pretty.indent)
|
||||
rule_name = indents.get(child_key)
|
||||
if rule_name is None:
|
||||
rule_name = f"i_{len(indents)}"
|
||||
indents[child_key] = rule_name
|
||||
generated_grammar.append((rule_name, tx_children))
|
||||
|
||||
tx_children = [rule_name]
|
||||
|
||||
if pretty.newline is not None:
|
||||
if len(tx_children) == 0:
|
||||
tx_children = result
|
||||
result = []
|
||||
|
||||
if len(tx_children) > 0:
|
||||
# n == postfix newline
|
||||
child_key = (tuple(tx_children), pretty.newline)
|
||||
rule_name = newlines.get(child_key)
|
||||
if rule_name is None:
|
||||
rule_name = f"n_{len(newlines)}"
|
||||
newlines[child_key] = rule_name
|
||||
generated_grammar.append((rule_name, tx_children))
|
||||
|
||||
tx_children = [rule_name]
|
||||
|
||||
else:
|
||||
# p == prefix newline
|
||||
rule_name = f"p_{prefix_count}"
|
||||
prefix_count += 1
|
||||
final_newlines[rule_name] = pretty.newline
|
||||
prefix_stack.append(rule_name)
|
||||
|
||||
if pretty.forced_break:
|
||||
if len(tx_children) == 0:
|
||||
tx_children = result
|
||||
result = []
|
||||
|
||||
if len(tx_children) > 0:
|
||||
# f == postfix forced break
|
||||
rule_name = f"f_{prefix_count}"
|
||||
prefix_count += 1
|
||||
|
||||
generated_grammar.append((rule_name, tx_children))
|
||||
tx_children = [rule_name]
|
||||
else:
|
||||
# d == prefix forced break (to the right of 'f' on my kbd)
|
||||
rule_name = f"d_{prefix_count}"
|
||||
prefix_count += 1
|
||||
prefix_stack.append(rule_name)
|
||||
|
||||
# If it turned out to have formatting meta then we will
|
||||
# have replaced or augmented the translated children
|
||||
# appropriately. Otherwise, if it's highlighting meta or
|
||||
# something else, we'll have ignored it and the
|
||||
# translated children should just be inserted inline.
|
||||
result.extend(tx_children)
|
||||
|
||||
# OK so we might have some prefix newlines. They should contain... things.
|
||||
while len(prefix_stack) > 0:
|
||||
rule_name = prefix_stack.pop()
|
||||
generated_grammar.append((rule_name, result))
|
||||
result = [rule_name]
|
||||
|
||||
return result
|
||||
|
||||
start_name = f"yyy_{rule.name}"
|
||||
compile_nonterminal(start_name, rule)
|
||||
gen = self.grammar._generator(start_name, generated_grammar)
|
||||
parse_table = gen.gen_table()
|
||||
|
||||
for (_, replacement), rule_name in newlines.items():
|
||||
final_newlines[rule_name] = replacement
|
||||
|
||||
indent_amounts = {rule_name: amount for ((_, amount), rule_name) in indents.items()}
|
||||
|
||||
return Matcher(
|
||||
parse_table,
|
||||
indent_amounts,
|
||||
final_newlines,
|
||||
self._trivia_mode,
|
||||
)
|
||||
|
||||
def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher:
|
||||
result = self._matchers.get(rule.name)
|
||||
if result is None:
|
||||
result = self.compile_rule(rule)
|
||||
self._matchers[rule.name] = result
|
||||
|
||||
return result
|
||||
return self.table.indent
|
||||
|
||||
def convert_tree_to_document(self, tree: runtime.Tree, src: str) -> Document:
|
||||
name = tree.name
|
||||
assert name is not None, "Cannot format a tree if it still has transparent nodes inside"
|
||||
|
||||
rule = self.lookup_nonterminal(name)
|
||||
matcher = self.rule_to_matcher(rule)
|
||||
matcher = self.matchers[name]
|
||||
m = matcher.match(self, list(tree.children), src)
|
||||
if m is None:
|
||||
raise ValueError(
|
||||
|
|
@ -786,4 +825,4 @@ class Printer:
|
|||
|
||||
def format_tree(self, tree: runtime.Tree, src: str, width: int) -> DocumentLayout:
|
||||
doc = self.convert_tree_to_document(tree, src)
|
||||
return layout_document(doc, width, self._indent)
|
||||
return layout_document(doc, width, self.table.indent)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue