diff --git a/parser/wadler.py b/parser/wadler.py index db4c66b..f675caf 100644 --- a/parser/wadler.py +++ b/parser/wadler.py @@ -360,13 +360,243 @@ def slice_pre_post_trivia( return ([], tokens) +############################################################################ +# Data to Drive the Pretty Printer +############################################################################ + + @dataclasses.dataclass -class Matcher: +class MatcherTable: + """Information necessary to create a document from a concrete parse tree, + as generated by the parser. + + (In order to do this we need to re-parse the children of the tree, in + order to recover structure added by transparent rules. That's why each + MatcherTable has an associated ParseTable!) + """ + + # Parse table to recover the node into a document table: parser.ParseTable + # Mapping from the name of i_ rules to indent counts indent_amounts: dict[str, int] + # Mapping from the names of n_ rules to the text they flatten to newline_replace: dict[str, str] + + +def _compile_nonterminal_matcher( + grammar: parser.Grammar, + nonterminals: dict[str, parser.NonTerminal], + rule: parser.NonTerminal, +) -> MatcherTable: + generated_grammar: list[typing.Tuple[str, list[str]]] = [] + visited: set[str] = set() + + # In order to generate groups, indents, and newlines we need to + # synthesize new productions. And it happens sometimes that we get + # duplicates, repeated synthetic productions. It's important to + # de-duplicate productions, otherwise we'll wind up with ambiguities + # in the parser. + # + # These dictionaries track the synthetic rules: the keys are + # production and also the parameter (if any), and the values are the + # names of the productions that produce the effect. + # + groups: dict[tuple[str, ...], str] = {} + indents: dict[tuple[tuple[str, ...], int], str] = {} + newlines: dict[tuple[tuple[str, ...], str], str] = {} + prefix_count: int = 0 + + final_newlines: dict[str, str] = {} + + def compile_nonterminal(name: str, rule: parser.NonTerminal): + if name not in visited: + visited.add(name) + for production in rule.fn(grammar).flatten(with_metadata=True): + trans_prod = compile_production(production) + generated_grammar.append((name, trans_prod)) + + def compile_production(production: parser.FlattenedWithMetadata) -> list[str]: + nonlocal groups + nonlocal indents + nonlocal newlines + nonlocal prefix_count + nonlocal final_newlines + + prefix_stack: list[str] = [] + + result = [] + for item in production: + if isinstance(item, str): + nt = nonterminals[item] + if nt.transparent: + # If it's transparent then we make a new set of + # productions that covers the contents of the + # transparent nonterminal. + name = "xxx_" + nt.name + compile_nonterminal(name, nt) + result.append(name) + else: + # Otherwise it's a "token" in our input, named + # "tree_{whatever}". + result.append(f"tree_{item}") + + elif isinstance(item, parser.Terminal): + # If it's a terminal it will appear in our input as + # "token_{whatever}". + result.append(f"token_{item.name}") + + else: + meta, children = item + tx_children = compile_production(children) + + pretty = meta.get("format") + if isinstance(pretty, parser.FormatMeta): + if pretty.group: + # Make a fake rule. + child_key = tuple(tx_children) + rule_name = groups.get(child_key) + if rule_name is None: + rule_name = f"g_{len(groups)}" + groups[child_key] = rule_name + generated_grammar.append((rule_name, tx_children)) + + tx_children = [rule_name] + + if pretty.indent: + child_key = (tuple(tx_children), pretty.indent) + rule_name = indents.get(child_key) + if rule_name is None: + rule_name = f"i_{len(indents)}" + indents[child_key] = rule_name + generated_grammar.append((rule_name, tx_children)) + + tx_children = [rule_name] + + if pretty.newline is not None: + if len(tx_children) == 0: + tx_children = result + result = [] + + if len(tx_children) > 0: + # n == postfix newline + child_key = (tuple(tx_children), pretty.newline) + rule_name = newlines.get(child_key) + if rule_name is None: + rule_name = f"n_{len(newlines)}" + newlines[child_key] = rule_name + generated_grammar.append((rule_name, tx_children)) + + tx_children = [rule_name] + + else: + # p == prefix newline + rule_name = f"p_{prefix_count}" + prefix_count += 1 + final_newlines[rule_name] = pretty.newline + prefix_stack.append(rule_name) + + if pretty.forced_break: + if len(tx_children) == 0: + tx_children = result + result = [] + + if len(tx_children) > 0: + # f == postfix forced break + rule_name = f"f_{prefix_count}" + prefix_count += 1 + + generated_grammar.append((rule_name, tx_children)) + tx_children = [rule_name] + else: + # d == prefix forced break (to the right of 'f' on my kbd) + rule_name = f"d_{prefix_count}" + prefix_count += 1 + prefix_stack.append(rule_name) + + # If it turned out to have formatting meta then we will + # have replaced or augmented the translated children + # appropriately. Otherwise, if it's highlighting meta or + # something else, we'll have ignored it and the + # translated children should just be inserted inline. + result.extend(tx_children) + + # OK so we might have some prefix newlines. They should contain... things. + while len(prefix_stack) > 0: + rule_name = prefix_stack.pop() + generated_grammar.append((rule_name, result)) + result = [rule_name] + + return result + + start_name = f"yyy_{rule.name}" + compile_nonterminal(start_name, rule) + gen = grammar._generator(start_name, generated_grammar) + parse_table = gen.gen_table() + + for (_, replacement), rule_name in newlines.items(): + final_newlines[rule_name] = replacement + + indent_amounts = {rule_name: amount for ((_, amount), rule_name) in indents.items()} + + return MatcherTable( + parse_table, + indent_amounts, + final_newlines, + ) + + +@dataclasses.dataclass +class PrettyTable: + """Information necessary to convert a parsed tree into a wadler-style + pretty document, where it can then be formatted. + + This is basically a bunch of "MatcherTables", one for each kind of tree, + that tell us how to recover document structure from the tree node. + """ + + indent: str + trivia_modes: dict[str, parser.TriviaMode] + matchers: dict[str, MatcherTable] + + +def compile_pretty_table(grammar: parser.Grammar, indent: str | None = None) -> PrettyTable: + nonterminals = {nt.name: nt for nt in grammar.non_terminals()} + matchers = {} + + if indent is None: + indent = getattr(grammar, "pretty_indent", None) + if indent is None: + indent = " " + + trivia_mode = {} + for t in grammar.terminals(): + mode = t.meta.get("trivia_mode") + if t.name is not None and isinstance(mode, parser.TriviaMode): + trivia_mode[t.name] = mode + + for name, rule in nonterminals.items(): + matchers[name] = _compile_nonterminal_matcher(grammar, nonterminals, rule) + + return PrettyTable( + indent, + trivia_mode, + matchers, + ) + + +############################################################################ +# The Actual Pretty Printer +############################################################################ + + +class Matcher: + table: MatcherTable trivia_mode: dict[str, parser.TriviaMode] + def __init__(self, table: MatcherTable, trivia_mode: dict[str, parser.TriviaMode]): + self.table = table + self.trivia_mode = trivia_mode + def match( self, printer: "Printer", @@ -374,7 +604,7 @@ class Matcher: src: str, ) -> Document: stack: list[tuple[int, Document]] = [(0, None)] - table = self.table + table = self.table.table # eof_trivia = [] # if len(items) > 0: @@ -420,15 +650,15 @@ class Matcher: child = group(child) elif name[0] == "i": - amount = self.indent_amounts[name] + amount = self.table.indent_amounts[name] child = Indent(amount, child) elif name[0] == "n": - replace = self.newline_replace[name] + replace = self.table.newline_replace[name] child = cons(child, NewLine(replace)) elif name[0] == "p": - replace = self.newline_replace[name] + replace = self.table.newline_replace[name] child = cons(NewLine(replace), child) elif name[0] == "f": @@ -464,8 +694,6 @@ class Matcher: def apply_pre_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document: pre_trivia, _ = slice_pre_post_trivia(self.trivia_mode, trivia_tokens) - # print(f"PRE:\n{pre_trivia}") - if len(pre_trivia) == 0: return None @@ -474,7 +702,6 @@ class Matcher: trivia_doc = None new_line_count = 0 for mode, token in pre_trivia: - # print(f"PRE {mode:25} {token.kind:30} ({new_line_count})") match mode: case parser.TriviaMode.LineComment: trivia_doc = cons( @@ -509,7 +736,6 @@ class Matcher: trivia_doc = None for mode, token in post_trivia: - # print(f"POST {mode:25} {token.kind:30}") match mode: case parser.TriviaMode.Blank: pass @@ -573,210 +799,23 @@ class Matcher: class Printer: - # TODO: Pre-generate the matcher tables for a grammar, to make it - # possible to do codegen in other languages. - grammar: parser.Grammar - _matchers: dict[str, Matcher] - _nonterminals: dict[str, parser.NonTerminal] - _indent: str - _trivia_mode: dict[str, parser.TriviaMode] + table: PrettyTable + matchers: dict[str, Matcher] - def __init__(self, grammar: parser.Grammar, indent: str | None = None): - self.grammar = grammar - self._nonterminals = {nt.name: nt for nt in grammar.non_terminals()} - self._matchers = {} - - if indent is None: - indent = getattr(self.grammar, "pretty_indent", None) - if indent is None: - indent = " " - self._indent = indent - - trivia_mode = {} - for t in grammar.terminals(): - mode = t.meta.get("trivia_mode") - if t.name is not None and isinstance(mode, parser.TriviaMode): - trivia_mode[t.name] = mode - self._trivia_mode = trivia_mode + def __init__(self, table: PrettyTable): + self.table = table + self.matchers = { + name: Matcher(value, self.table.trivia_modes) for name, value in table.matchers.items() + } def indent(self) -> str: - return self._indent - - def lookup_nonterminal(self, name: str) -> parser.NonTerminal: - return self._nonterminals[name] - - def compile_rule(self, rule: parser.NonTerminal) -> Matcher: - generated_grammar: list[typing.Tuple[str, list[str]]] = [] - visited: set[str] = set() - - # In order to generate groups, indents, and newlines we need to - # synthesize new productions. And it happens sometimes that we get - # duplicates, repeated synthetic productions. It's important to - # de-duplicate productions, otherwise we'll wind up with ambiguities - # in the parser. - # - # These dictionaries track the synthetic rules: the keys are - # production and also the parameter (if any), and the values are the - # names of the productions that produce the effect. - # - groups: dict[tuple[str, ...], str] = {} - indents: dict[tuple[tuple[str, ...], int], str] = {} - newlines: dict[tuple[tuple[str, ...], str], str] = {} - prefix_count: int = 0 - - final_newlines: dict[str, str] = {} - - def compile_nonterminal(name: str, rule: parser.NonTerminal): - if name not in visited: - visited.add(name) - for production in rule.fn(self.grammar).flatten(with_metadata=True): - trans_prod = compile_production(production) - generated_grammar.append((name, trans_prod)) - - def compile_production(production: parser.FlattenedWithMetadata) -> list[str]: - nonlocal groups - nonlocal indents - nonlocal newlines - nonlocal prefix_count - nonlocal final_newlines - - prefix_stack: list[str] = [] - - result = [] - for item in production: - if isinstance(item, str): - nt = self._nonterminals[item] - if nt.transparent: - # If it's transparent then we make a new set of - # productions that covers the contents of the - # transparent nonterminal. - name = "xxx_" + nt.name - compile_nonterminal(name, nt) - result.append(name) - else: - # Otherwise it's a "token" in our input, named - # "tree_{whatever}". - result.append(f"tree_{item}") - - elif isinstance(item, parser.Terminal): - # If it's a terminal it will appear in our input as - # "token_{whatever}". - result.append(f"token_{item.name}") - - else: - meta, children = item - tx_children = compile_production(children) - - pretty = meta.get("format") - if isinstance(pretty, parser.FormatMeta): - if pretty.group: - # Make a fake rule. - child_key = tuple(tx_children) - rule_name = groups.get(child_key) - if rule_name is None: - rule_name = f"g_{len(groups)}" - groups[child_key] = rule_name - generated_grammar.append((rule_name, tx_children)) - - tx_children = [rule_name] - - if pretty.indent: - child_key = (tuple(tx_children), pretty.indent) - rule_name = indents.get(child_key) - if rule_name is None: - rule_name = f"i_{len(indents)}" - indents[child_key] = rule_name - generated_grammar.append((rule_name, tx_children)) - - tx_children = [rule_name] - - if pretty.newline is not None: - if len(tx_children) == 0: - tx_children = result - result = [] - - if len(tx_children) > 0: - # n == postfix newline - child_key = (tuple(tx_children), pretty.newline) - rule_name = newlines.get(child_key) - if rule_name is None: - rule_name = f"n_{len(newlines)}" - newlines[child_key] = rule_name - generated_grammar.append((rule_name, tx_children)) - - tx_children = [rule_name] - - else: - # p == prefix newline - rule_name = f"p_{prefix_count}" - prefix_count += 1 - final_newlines[rule_name] = pretty.newline - prefix_stack.append(rule_name) - - if pretty.forced_break: - if len(tx_children) == 0: - tx_children = result - result = [] - - if len(tx_children) > 0: - # f == postfix forced break - rule_name = f"f_{prefix_count}" - prefix_count += 1 - - generated_grammar.append((rule_name, tx_children)) - tx_children = [rule_name] - else: - # d == prefix forced break (to the right of 'f' on my kbd) - rule_name = f"d_{prefix_count}" - prefix_count += 1 - prefix_stack.append(rule_name) - - # If it turned out to have formatting meta then we will - # have replaced or augmented the translated children - # appropriately. Otherwise, if it's highlighting meta or - # something else, we'll have ignored it and the - # translated children should just be inserted inline. - result.extend(tx_children) - - # OK so we might have some prefix newlines. They should contain... things. - while len(prefix_stack) > 0: - rule_name = prefix_stack.pop() - generated_grammar.append((rule_name, result)) - result = [rule_name] - - return result - - start_name = f"yyy_{rule.name}" - compile_nonterminal(start_name, rule) - gen = self.grammar._generator(start_name, generated_grammar) - parse_table = gen.gen_table() - - for (_, replacement), rule_name in newlines.items(): - final_newlines[rule_name] = replacement - - indent_amounts = {rule_name: amount for ((_, amount), rule_name) in indents.items()} - - return Matcher( - parse_table, - indent_amounts, - final_newlines, - self._trivia_mode, - ) - - def rule_to_matcher(self, rule: parser.NonTerminal) -> Matcher: - result = self._matchers.get(rule.name) - if result is None: - result = self.compile_rule(rule) - self._matchers[rule.name] = result - - return result + return self.table.indent def convert_tree_to_document(self, tree: runtime.Tree, src: str) -> Document: name = tree.name assert name is not None, "Cannot format a tree if it still has transparent nodes inside" - rule = self.lookup_nonterminal(name) - matcher = self.rule_to_matcher(rule) + matcher = self.matchers[name] m = matcher.match(self, list(tree.children), src) if m is None: raise ValueError( @@ -786,4 +825,4 @@ class Printer: def format_tree(self, tree: runtime.Tree, src: str, width: int) -> DocumentLayout: doc = self.convert_tree_to_document(tree, src) - return layout_document(doc, width, self._indent) + return layout_document(doc, width, self.table.indent) diff --git a/tests/test_wadler.py b/tests/test_wadler.py index 8b43496..f5bfb37 100644 --- a/tests/test_wadler.py +++ b/tests/test_wadler.py @@ -149,7 +149,7 @@ def test_convert_tree_to_document(): assert [] == errors assert tree is not None - printer = wadler.Printer(JSON) + printer = wadler.Printer(wadler.compile_pretty_table(JSON)) doc = flatten_document(printer.convert_tree_to_document(tree, text), text) assert doc == [ @@ -216,7 +216,7 @@ def test_layout_basic(): assert [] == errors assert tree is not None - printer = wadler.Printer(JSON) + printer = wadler.Printer(wadler.compile_pretty_table(JSON)) result = printer.format_tree(tree, text, 50).apply_to_source(text) assert result == _output( @@ -278,7 +278,7 @@ def test_forced_break(): assert errors == [] assert tree is not None - printer = wadler.Printer(g) + printer = wadler.Printer(wadler.compile_pretty_table(g)) result = printer.format_tree(tree, text, 200).apply_to_source(text) assert result == _output( @@ -318,7 +318,7 @@ def test_maintaining_line_breaks(): assert errors == [] assert tree is not None - printer = wadler.Printer(g) + printer = wadler.Printer(wadler.compile_pretty_table(g)) result = printer.format_tree(tree, text, 200).apply_to_source(text) assert result == _output( @@ -352,7 +352,7 @@ def test_trailing_trivia(): assert errors == [] assert tree is not None - printer = wadler.Printer(g) + printer = wadler.Printer(wadler.compile_pretty_table(g)) result = printer.format_tree(tree, text, 200).apply_to_source(text) assert result == _output( @@ -378,7 +378,7 @@ def test_trailing_trivia_two(): assert errors == [] assert tree is not None - printer = wadler.Printer(g) + printer = wadler.Printer(wadler.compile_pretty_table(g)) result = printer.format_tree(tree, text, 200).apply_to_source(text) assert result == _output( @@ -432,9 +432,7 @@ def test_trailing_trivia_split(): print(f"{mode:25} {t.kind:10} {repr(text[t.start:t.end])}") trivia_doc = wadler.Matcher( - ParseTable([], [], set()), - {}, - {}, + wadler.MatcherTable(ParseTable([], [], set()), {}, {}), TRIVIA_MODES, ).apply_post_trivia( token.post_trivia,