From 8a17cfd586ab085cd601ed1bc8528832dfe7867d Mon Sep 17 00:00:00 2001 From: John Doty Date: Thu, 19 Sep 2024 16:39:32 -0700 Subject: [PATCH] [wadler] Prettier handling of trivia Split the rules for pre- and post- trivia, understand when we want to do either, handle multi-line-break (in an unsatisfying way, I guess) but otherwise lay the groundwork for thinking about it better. Also now we don't generate lazy "Text" nodes because I thought I might want to actually look at the newlines in the source but I don't yet. I *can* now, though. (I can also detect EOF so there's that.) --- grammar.py | 4 +- harness.py | 8 +-- parser/parser.py | 14 ++-- parser/wadler.py | 163 ++++++++++++++++++++++++++++--------------- tests/test_wadler.py | 78 +++++++++++---------- 5 files changed, 159 insertions(+), 108 deletions(-) diff --git a/grammar.py b/grammar.py index 7e098ff..5bf663d 100644 --- a/grammar.py +++ b/grammar.py @@ -24,7 +24,7 @@ class FineGrammar(Grammar): # generator = parser.GenerateLR1 start = "File" - trivia = ["BLANKS", "LINE_BREAKS", "COMMENT"] + trivia = ["BLANKS", "LINE_BREAK", "COMMENT"] pretty_indent = " " @@ -426,7 +426,7 @@ class FineGrammar(Grammar): return self.IDENTIFIER | group(self.IDENTIFIER, self.COLON, indent(sp, self.expression)) BLANKS = Terminal(Re.set(" ", "\t").plus()) - LINE_BREAKS = Terminal(Re.set("\r", "\n").plus(), trivia_mode=TriviaMode.NewLine) + LINE_BREAK = Terminal(Re.set("\r", "\n"), trivia_mode=TriviaMode.NewLine) COMMENT = Terminal( Re.seq(Re.literal("//"), Re.set("\n").invert().star()), highlight=highlight.comment.line, diff --git a/harness.py b/harness.py index b0352e4..10b993e 100644 --- a/harness.py +++ b/harness.py @@ -371,7 +371,7 @@ class Harness: printer = self.load_printer() if self.tree is not None: - self.document = printer.convert_tree_to_document(self.tree) + self.document = printer.convert_tree_to_document(self.tree, self.source) else: self.document = None @@ -541,12 +541,6 @@ class Harness: append(f"indent 
{doc.amount}") self.format_document(lines, doc.doc, indent + 1) - case wadler.Text(start, end): - if self.source is not None: - append(f"< {repr(self.source[start:end])}") - else: - append(f"< ??? {start}:{end}") - case wadler.Literal(text): append(f"literal {repr(text)}") diff --git a/parser/parser.py b/parser/parser.py index bff4034..f853a2e 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -2109,6 +2109,10 @@ class Re: UNICODE_MAX_CP = 1114112 +def _str_repr(x: int) -> str: + return repr(chr(x))[1:-1] + + @dataclasses.dataclass class ReSet(Re): values: list[Span] @@ -2165,12 +2169,12 @@ class ReSet(Re): if len(self.values) == 1: span = self.values[0] if len(span) == 1: - return chr(span.lower) + return _str_repr(span.lower) ranges = [] for span in self.values: - start = chr(span.lower) - end = chr(span.upper - 1) + start = _str_repr(span.lower) + end = _str_repr(span.upper - 1) if start == end: ranges.append(start) else: @@ -2736,7 +2740,7 @@ class TriviaMode(enum.Enum): pretty-printing. Attach this to a "trivia_mode" property on a Terminal definition. - - Ignore means that the trivia should be ignored. (This is the default.) + - Blank means that the trivia represents blank space. (This is the default.) - NewLine means that the trivia is a line break. This is important for other modes, specifically... @@ -2748,7 +2752,7 @@ class TriviaMode(enum.Enum): a forced break. 
""" - Ignore = 0 + Blank = 0 NewLine = 1 LineComment = 2 diff --git a/parser/wadler.py b/parser/wadler.py index e8e2534..5100241 100644 --- a/parser/wadler.py +++ b/parser/wadler.py @@ -32,12 +32,6 @@ class Indent: doc: "Document" -@dataclasses.dataclass(frozen=True) -class Text: - start: int - end: int - - @dataclasses.dataclass(frozen=True) class Literal: text: str @@ -69,13 +63,11 @@ class Lazy: return self.value @classmethod - def from_tree(cls, tree: runtime.Tree, printer: "Printer") -> "Lazy": - return Lazy(lambda: printer.convert_tree_to_document(tree)) + def from_tree(cls, tree: runtime.Tree, src: str, printer: "Printer") -> "Lazy": + return Lazy(lambda: printer.convert_tree_to_document(tree, src)) -Document = ( - None | Text | Literal | NewLine | ForceBreak | Cons | Indent | Group | Trivia | Marker | Lazy -) +Document = None | Literal | NewLine | ForceBreak | Cons | Indent | Group | Trivia | Marker | Lazy def cons(*documents: Document) -> Document: @@ -207,9 +199,6 @@ def layout_document(doc: Document, width: int, indent: str) -> DocumentLayout: case None: pass - case Text(start, end): - remaining -= end - start - case Literal(text): remaining -= len(text) @@ -268,10 +257,6 @@ def layout_document(doc: Document, width: int, indent: str) -> DocumentLayout: case None: pass - case Text(start, end): - output.append((start, end)) - column += end - start - case Literal(text): output.append(text) column += len(text) @@ -337,7 +322,7 @@ def resolve_document(doc: Document) -> Document: case Trivia(child): return Trivia(resolve_document(child)) - case Text() | Literal() | NewLine() | ForceBreak() | Indent() | None: + case Literal() | NewLine() | ForceBreak() | Indent() | None: return doc case _: @@ -358,7 +343,12 @@ class Matcher: newline_replace: dict[str, str] trivia_mode: dict[str, parser.TriviaMode] - def match(self, printer: "Printer", items: list[runtime.Tree | runtime.TokenValue]) -> Document: + def match( + self, + printer: "Printer", + items: 
list[runtime.Tree | runtime.TokenValue], + src: str, + ) -> Document: stack: list[tuple[int, Document]] = [(0, None)] table = self.table @@ -434,10 +424,13 @@ class Matcher: value = current_token[1] if isinstance(value, runtime.Tree): - child = Lazy.from_tree(value, printer) + child = Lazy.from_tree(value, src, printer) else: - child = Text(value.start, value.end) - child = cons(child, self.apply_trivia(value.post_trivia)) + child = cons( + trivia(self.apply_pre_trivia(value.pre_trivia, src)), + Literal(src[value.start : value.end]), + trivia(self.apply_post_trivia(value.post_trivia, src)), + ) stack.append((action.state, child)) input_index += 1 @@ -445,46 +438,100 @@ class Matcher: case parser.Error(): raise Exception("How did I get a parse error here??") - def apply_trivia(self, trivia_tokens: list[runtime.TokenValue]) -> Document: - has_newline = False + def slice_pre_post_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> tuple[ + list[tuple[parser.TriviaMode, runtime.TokenValue]], + list[tuple[parser.TriviaMode, runtime.TokenValue]], + ]: + tokens = [ + (self.trivia_mode.get(token.kind, parser.TriviaMode.Blank), token) + for token in trivia_tokens + ] + + for index, (mode, token) in enumerate(tokens): + if token.start == 0: + # Everything is pre-trivia if we're at the start of the file. + return (tokens, []) + + if mode == parser.TriviaMode.NewLine: + # This is the first newline; it belongs with the post-trivia. + return (tokens[index + 1 :], tokens[: index + 1]) + + # If we never found a new line then it's all post-trivia. 
+ return ([], tokens) + + def apply_pre_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document: + pre_trivia, _ = self.slice_pre_post_trivia(trivia_tokens, src) + if len(pre_trivia) == 0: + return None + + at_start_of_file = pre_trivia[0][1].start == 0 + trivia_doc = None - for token in trivia_tokens: - mode = self.trivia_mode.get(token.kind, parser.TriviaMode.Ignore) + new_line_count = 0 + for mode, token in pre_trivia: match mode: - case parser.TriviaMode.Ignore: + case parser.TriviaMode.LineComment: + trivia_doc = cons( + trivia_doc, + Literal(src[token.start : token.end]), + ForceBreak(False), + ) + new_line_count = 0 # There will be a newline after this. + at_start_of_file = False + + case parser.TriviaMode.Blank: pass case parser.TriviaMode.NewLine: - # We ignore line breaks because obviously - # we expect the pretty-printer to put the - # line breaks in where they belong *but* - # we track if they happened to influence - # the layout. - has_newline = True - - case parser.TriviaMode.LineComment: - if has_newline: - # This line comment is all alone on - # its line, so we need to maintain - # that. - line_break = NewLine("") - else: - # This line comment is attached to - # something to the left, reduce it to - # a space. - line_break = Literal(" ") - - trivia_doc = cons( - trivia_doc, - line_break, - Text(token.start, token.end), - ForceBreak(True), # This is probably the wrong place for this! 
- ) + new_line_count += 1 + if new_line_count == 2 and not at_start_of_file: + trivia_doc = cons( + trivia_doc, + ForceBreak(False), + ForceBreak(False), + ) case _: typing.assert_never(mode) - return trivia(trivia_doc) + return trivia_doc + + def apply_post_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document: + _, post_trivia = self.slice_pre_post_trivia(trivia_tokens, src) + if len(post_trivia) == 0: + return None + + trivia_doc = None + for mode, token in post_trivia: + match mode: + case parser.TriviaMode.Blank: + pass + + case parser.TriviaMode.NewLine: + # Anything after a line break is not processed as post + # trivia. + break + + case parser.TriviaMode.LineComment: + # Because this is post-trivia, we know there's something + # to our left, and we can force the space. + trivia_doc = cons( + Literal(" "), + Literal(src[token.start : token.end]), + ForceBreak(True), # And the line needs to end. + ) + break + + case _: + typing.assert_never(mode) + + if len(trivia_tokens) > 0 and trivia_tokens[-1].end == len(src): + # As a special case, if we're post trivia at the end of the file + # then we also need to be pre-trivia too, for the hypothetical EOF + # token that we never see. 
+ trivia_doc = cons(trivia_doc, self.apply_pre_trivia(trivia_tokens, src)) + + return trivia_doc class Printer: @@ -686,19 +733,19 @@ class Printer: return result - def convert_tree_to_document(self, tree: runtime.Tree) -> Document: + def convert_tree_to_document(self, tree: runtime.Tree, src: str) -> Document: name = tree.name assert name is not None, "Cannot format a tree if it still has transparent nodes inside" rule = self.lookup_nonterminal(name) matcher = self.rule_to_matcher(rule) - m = matcher.match(self, list(tree.children)) + m = matcher.match(self, list(tree.children), src) if m is None: raise ValueError( f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}" ) return resolve_document(m) - def format_tree(self, tree: runtime.Tree, width: int) -> DocumentLayout: - doc = self.convert_tree_to_document(tree) + def format_tree(self, tree: runtime.Tree, src: str, width: int) -> DocumentLayout: + doc = self.convert_tree_to_document(tree, src) return layout_document(doc, width, self._indent) diff --git a/tests/test_wadler.py b/tests/test_wadler.py index f53c97a..ff52ef2 100644 --- a/tests/test_wadler.py +++ b/tests/test_wadler.py @@ -14,6 +14,7 @@ from parser.parser import ( sp, nl, br, + TriviaMode, ) import parser.runtime as runtime @@ -72,6 +73,7 @@ class JsonGrammar(Grammar): ) BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) + LCURLY = Terminal("{") RCURLY = Terminal("}") COMMA = Terminal(",") @@ -118,8 +120,6 @@ def flatten_document(doc: wadler.Document, src: str) -> list: return [""] case wadler.Indent(): return [[f"", flatten_document(doc.doc, src)]] - case wadler.Text(start, end): - return [src[start:end]] case wadler.Literal(text): return [text] case wadler.Group(): @@ -149,7 +149,7 @@ def test_convert_tree_to_document(): assert tree is not None printer = wadler.Printer(JSON) - doc = flatten_document(printer.convert_tree_to_document(tree), text) + doc = 
flatten_document(printer.convert_tree_to_document(tree, text), text) assert doc == [ [ @@ -212,7 +212,7 @@ def test_layout_basic(): assert tree is not None printer = wadler.Printer(JSON) - result = printer.format_tree(tree, 50).apply_to_source(text) + result = printer.format_tree(tree, text, 50).apply_to_source(text) assert ( result @@ -226,38 +226,44 @@ def test_layout_basic(): ) +class TG(Grammar): + start = "root" + trivia = ["BLANKS", "LINE_BREAK", "COMMENT"] + + @rule + def root(self): + return self._expression + + @rule + def _expression(self): + return self.word | self.list + + @rule + def list(self): + return group(self.LPAREN, indent(nl, self._expressions), nl, self.RPAREN) + + @rule + def _expressions(self): + return self._expression | seq(self._expressions, sp, self._expression) + + @rule + def word(self): + return self.OK | seq(self.BREAK, br, self.BREAK) + + LPAREN = Terminal("(") + RPAREN = Terminal(")") + OK = Terminal("ok") + BREAK = Terminal("break") + + BLANKS = Terminal(Re.set(" ", "\t").plus()) + LINE_BREAK = Terminal(Re.set("\r", "\n"), trivia_mode=TriviaMode.NewLine) + COMMENT = Terminal( + Re.seq(Re.literal(";"), Re.set("\n").invert().star()), + trivia_mode=TriviaMode.LineComment, + ) + + def test_forced_break(): - class TG(Grammar): - start = "root" - trivia = ["BLANKS"] - - @rule - def root(self): - return self._expression - - @rule - def _expression(self): - return self.word | self.list - - @rule - def list(self): - return group(self.LPAREN, indent(nl, self._expressions), nl, self.RPAREN) - - @rule - def _expressions(self): - return self._expression | seq(self._expressions, sp, self._expression) - - @rule - def word(self): - return self.OK | seq(self.BREAK, br, self.BREAK) - - LPAREN = Terminal("(") - RPAREN = Terminal(")") - OK = Terminal("ok") - BREAK = Terminal("break") - - BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) - g = TG() g_lexer = g.compile_lexer() g_parser = runtime.Parser(g.build_table()) @@ -269,7 +275,7 @@ def 
test_forced_break(): assert tree is not None printer = wadler.Printer(g) - result = printer.format_tree(tree, 200).apply_to_source(text) + result = printer.format_tree(tree, text, 200).apply_to_source(text) assert ( result