[wadler] Prettier handling of trivia

Split the rules for pre- and post-trivia, work out when we want to apply each, and handle multiple consecutive line breaks (in an unsatisfying way, I guess); otherwise this lays the groundwork for thinking about trivia better. Also, we no longer generate lazy "Text" nodes, because I thought I might want to actually look at the newlines in the source. I don't yet, but I *can* now. (I can also detect EOF, so there's that.)
2024-09-19 16:39:32 -07:00 · 2024-09-19 16:39:32 -07:00 · 8a17cfd586
commit 8a17cfd586
parent c31d527077
5 changed files with 159 additions and 108 deletions
--- a/grammar.py
+++ b/grammar.py
@ -24,7 +24,7 @@ class FineGrammar(Grammar):
    # generator = parser.GenerateLR1
    start = "File"
-    trivia = ["BLANKS", "LINE_BREAKS", "COMMENT"]
+    trivia = ["BLANKS", "LINE_BREAK", "COMMENT"]
    pretty_indent = "  "
@ -426,7 +426,7 @@ class FineGrammar(Grammar):
        return self.IDENTIFIER | group(self.IDENTIFIER, self.COLON, indent(sp, self.expression))
    BLANKS = Terminal(Re.set(" ", "\t").plus())
-    LINE_BREAKS = Terminal(Re.set("\r", "\n").plus(), trivia_mode=TriviaMode.NewLine)
+    LINE_BREAK = Terminal(Re.set("\r", "\n"), trivia_mode=TriviaMode.NewLine)
    COMMENT = Terminal(
        Re.seq(Re.literal("//"), Re.set("\n").invert().star()),
        highlight=highlight.comment.line,
--- a/harness.py
+++ b/harness.py
@ -371,7 +371,7 @@ class Harness:
            printer = self.load_printer()
            if self.tree is not None:
-                self.document = printer.convert_tree_to_document(self.tree)
+                self.document = printer.convert_tree_to_document(self.tree, self.source)
            else:
                self.document = None
@ -541,12 +541,6 @@ class Harness:
                append(f"indent {doc.amount}")
                self.format_document(lines, doc.doc, indent + 1)
            case wadler.Text(start, end):
                if self.source is not None:
                    append(f"< {repr(self.source[start:end])}")
                else:
                    append(f"< ??? {start}:{end}")
            case wadler.Literal(text):
                append(f"literal {repr(text)}")
--- a/parser/parser.py
+++ b/parser/parser.py
@ -2109,6 +2109,10 @@ class Re:
 UNICODE_MAX_CP = 1114112
 def _str_repr(x: int) -> str:
    return repr(chr(x))[1:-1]
@dataclasses.dataclass
 class ReSet(Re):
    values: list[Span]
@ -2165,12 +2169,12 @@ class ReSet(Re):
        if len(self.values) == 1:
            span = self.values[0]
            if len(span) == 1:
-                return chr(span.lower)
+                return _str_repr(span.lower)
        ranges = []
        for span in self.values:
-            start = chr(span.lower)
+            start = _str_repr(span.lower)
-            end = chr(span.upper - 1)
+            end = _str_repr(span.upper - 1)
            if start == end:
                ranges.append(start)
            else:
@ -2736,7 +2740,7 @@ class TriviaMode(enum.Enum):
    pretty-printing. Attach this to a "trivia_mode" property on a Terminal
    definition.
-    - Ignore means that the trivia should be ignored. (This is the default.)
+    - Blank means that the trivia represents blank space. (This is the default.)
    - NewLine means that the trivia is a line break. This is important for
      other modes, specifically...
@ -2748,7 +2752,7 @@ class TriviaMode(enum.Enum):
      a forced break.
    """
-    Ignore = 0
+    Blank = 0
    NewLine = 1
    LineComment = 2
--- a/parser/wadler.py
+++ b/parser/wadler.py
@ -32,12 +32,6 @@ class Indent:
    doc: "Document"
@dataclasses.dataclass(frozen=True)
 class Text:
    start: int
    end: int
@dataclasses.dataclass(frozen=True)
 class Literal:
    text: str
@ -69,13 +63,11 @@ class Lazy:
        return self.value
    @classmethod
-    def from_tree(cls, tree: runtime.Tree, printer: "Printer") -> "Lazy":
+    def from_tree(cls, tree: runtime.Tree, src: str, printer: "Printer") -> "Lazy":
-        return Lazy(lambda: printer.convert_tree_to_document(tree))
+        return Lazy(lambda: printer.convert_tree_to_document(tree, src))
-Document = (
+Document = None | Literal | NewLine | ForceBreak | Cons | Indent | Group | Trivia | Marker | Lazy
    None | Text | Literal | NewLine | ForceBreak | Cons | Indent | Group | Trivia | Marker | Lazy
 )
 def cons(*documents: Document) -> Document:
@ -207,9 +199,6 @@ def layout_document(doc: Document, width: int, indent: str) -> DocumentLayout:
                case None:
                    pass
                case Text(start, end):
                    remaining -= end - start
                case Literal(text):
                    remaining -= len(text)
@ -268,10 +257,6 @@ def layout_document(doc: Document, width: int, indent: str) -> DocumentLayout:
            case None:
                pass
            case Text(start, end):
                output.append((start, end))
                column += end - start
            case Literal(text):
                output.append(text)
                column += len(text)
@ -337,7 +322,7 @@ def resolve_document(doc: Document) -> Document:
        case Trivia(child):
            return Trivia(resolve_document(child))
-        case Text() | Literal() | NewLine() | ForceBreak() | Indent() | None:
+        case Literal() | NewLine() | ForceBreak() | Indent() | None:
            return doc
        case _:
@ -358,7 +343,12 @@ class Matcher:
    newline_replace: dict[str, str]
    trivia_mode: dict[str, parser.TriviaMode]
-    def match(self, printer: "Printer", items: list[runtime.Tree | runtime.TokenValue]) -> Document:
+    def match(
        self,
        printer: "Printer",
        items: list[runtime.Tree | runtime.TokenValue],
        src: str,
    ) -> Document:
        stack: list[tuple[int, Document]] = [(0, None)]
        table = self.table
@ -434,10 +424,13 @@ class Matcher:
                    value = current_token[1]
                    if isinstance(value, runtime.Tree):
-                        child = Lazy.from_tree(value, printer)
+                        child = Lazy.from_tree(value, src, printer)
                    else:
-                        child = Text(value.start, value.end)
+                        child = cons(
-                        child = cons(child, self.apply_trivia(value.post_trivia))
+                            trivia(self.apply_pre_trivia(value.pre_trivia, src)),
                            Literal(src[value.start : value.end]),
                            trivia(self.apply_post_trivia(value.post_trivia, src)),
                        )
                    stack.append((action.state, child))
                    input_index += 1
@ -445,46 +438,100 @@ class Matcher:
                case parser.Error():
                    raise Exception("How did I get a parse error here??")
-    def apply_trivia(self, trivia_tokens: list[runtime.TokenValue]) -> Document:
+    def slice_pre_post_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> tuple[
-        has_newline = False
+        list[tuple[parser.TriviaMode, runtime.TokenValue]],
        list[tuple[parser.TriviaMode, runtime.TokenValue]],
    ]:
        tokens = [
            (self.trivia_mode.get(token.kind, parser.TriviaMode.Blank), token)
            for token in trivia_tokens
        ]
        for index, (mode, token) in enumerate(tokens):
            if token.start == 0:
                # Everything is pre-trivia if we're at the start of the file.
                return (tokens, [])
            if mode == parser.TriviaMode.NewLine:
                # This is the first newline; it belongs with the post-trivia.
                return (tokens[index + 1 :], tokens[: index + 1])
        # If we never found a new line then it's all post-trivia.
        return ([], tokens)
    def apply_pre_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document:
        pre_trivia, _ = self.slice_pre_post_trivia(trivia_tokens, src)
        if len(pre_trivia) == 0:
            return None
        at_start_of_file = pre_trivia[0][1].start == 0
        trivia_doc = None
-        for token in trivia_tokens:
+        new_line_count = 0
-            mode = self.trivia_mode.get(token.kind, parser.TriviaMode.Ignore)
+        for mode, token in pre_trivia:
            match mode:
-                case parser.TriviaMode.Ignore:
+                case parser.TriviaMode.LineComment:
                    trivia_doc = cons(
                        trivia_doc,
                        Literal(src[token.start : token.end]),
                        ForceBreak(False),
                    )
                    new_line_count = 0  # There will be a newline after this.
                    at_start_of_file = False
                case parser.TriviaMode.Blank:
                    pass
                case parser.TriviaMode.NewLine:
-                    # We ignore line breaks because obviously
+                    new_line_count += 1
-                    # we expect the pretty-printer to put the
+                    if new_line_count == 2 and not at_start_of_file:
                    # line breaks in where they belong *but*
                    # we track if they happened to influence
                    # the layout.
                    has_newline = True
                case parser.TriviaMode.LineComment:
                    if has_newline:
                        # This line comment is all alone on
                        # its line, so we need to maintain
                        # that.
                        line_break = NewLine("")
                    else:
                        # This line comment is attached to
                        # something to the left, reduce it to
                        # a space.
                        line_break = Literal(" ")
                        trivia_doc = cons(
                            trivia_doc,
-                        line_break,
+                            ForceBreak(False),
-                        Text(token.start, token.end),
+                            ForceBreak(False),
                        ForceBreak(True),  # This is probably the wrong place for this!
                        )
                case _:
                    typing.assert_never(mode)
-        return trivia(trivia_doc)
+        return trivia_doc
    def apply_post_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document:
        _, post_trivia = self.slice_pre_post_trivia(trivia_tokens, src)
        if len(post_trivia) == 0:
            return None
        trivia_doc = None
        for mode, token in post_trivia:
            match mode:
                case parser.TriviaMode.Blank:
                    pass
                case parser.TriviaMode.NewLine:
                    # Anything after a line break is not processed as post
                    # trivia.
                    break
                case parser.TriviaMode.LineComment:
                    # Because this is post-trivia, we know there's something
                    # to our left, and we can force the space.
                    trivia_doc = cons(
                        Literal(" "),
                        Literal(src[token.start : token.end]),
                        ForceBreak(True),  # And the line needs to end.
                    )
                    break
                case _:
                    typing.assert_never(mode)
        if len(trivia_tokens) > 0 and trivia_tokens[-1].end == len(src):
            # As a special case, if we're post trivia at the end of the file
            # then we also need to be pre-trivia too, for the hypothetical EOF
            # token that we never see.
            trivia_doc = cons(trivia_doc, self.apply_pre_trivia(trivia_tokens, src))
        return trivia_doc
 class Printer:
@ -686,19 +733,19 @@ class Printer:
        return result
-    def convert_tree_to_document(self, tree: runtime.Tree) -> Document:
+    def convert_tree_to_document(self, tree: runtime.Tree, src: str) -> Document:
        name = tree.name
        assert name is not None, "Cannot format a tree if it still has transparent nodes inside"
        rule = self.lookup_nonterminal(name)
        matcher = self.rule_to_matcher(rule)
-        m = matcher.match(self, list(tree.children))
+        m = matcher.match(self, list(tree.children), src)
        if m is None:
            raise ValueError(
                f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}"
            )
        return resolve_document(m)
-    def format_tree(self, tree: runtime.Tree, width: int) -> DocumentLayout:
+    def format_tree(self, tree: runtime.Tree, src: str, width: int) -> DocumentLayout:
-        doc = self.convert_tree_to_document(tree)
+        doc = self.convert_tree_to_document(tree, src)
        return layout_document(doc, width, self._indent)
--- a/tests/test_wadler.py
+++ b/tests/test_wadler.py
@ -14,6 +14,7 @@ from parser.parser import (
    sp,
    nl,
    br,
    TriviaMode,
 )
 import parser.runtime as runtime
@ -72,6 +73,7 @@ class JsonGrammar(Grammar):
        )
    BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
    LCURLY = Terminal("{")
    RCURLY = Terminal("}")
    COMMA = Terminal(",")
@ -118,8 +120,6 @@ def flatten_document(doc: wadler.Document, src: str) -> list:
            return ["<forced break>"]
        case wadler.Indent():
            return [[f"<indent {doc.amount}>", flatten_document(doc.doc, src)]]
        case wadler.Text(start, end):
            return [src[start:end]]
        case wadler.Literal(text):
            return [text]
        case wadler.Group():
@ -149,7 +149,7 @@ def test_convert_tree_to_document():
    assert tree is not None
    printer = wadler.Printer(JSON)
-    doc = flatten_document(printer.convert_tree_to_document(tree), text)
+    doc = flatten_document(printer.convert_tree_to_document(tree, text), text)
    assert doc == [
        [
@ -212,7 +212,7 @@ def test_layout_basic():
    assert tree is not None
    printer = wadler.Printer(JSON)
-    result = printer.format_tree(tree, 50).apply_to_source(text)
+    result = printer.format_tree(tree, text, 50).apply_to_source(text)
    assert (
        result
@ -226,10 +226,9 @@ def test_layout_basic():
    )
-def test_forced_break():
+class TG(Grammar):
    class TG(Grammar):
    start = "root"
-        trivia = ["BLANKS"]
+    trivia = ["BLANKS", "LINE_BREAK", "COMMENT"]
    @rule
    def root(self):
@ -256,8 +255,15 @@ def test_forced_break():
    OK = Terminal("ok")
    BREAK = Terminal("break")
-        BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
+    BLANKS = Terminal(Re.set(" ", "\t").plus())
    LINE_BREAK = Terminal(Re.set("\r", "\n"), trivia_mode=TriviaMode.NewLine)
    COMMENT = Terminal(
        Re.seq(Re.literal(";"), Re.set("\n").invert().star()),
        trivia_mode=TriviaMode.LineComment,
    )
 def test_forced_break():
    g = TG()
    g_lexer = g.compile_lexer()
    g_parser = runtime.Parser(g.build_table())
@ -269,7 +275,7 @@ def test_forced_break():
    assert tree is not None
    printer = wadler.Printer(g)
-    result = printer.format_tree(tree, 200).apply_to_source(text)
+    result = printer.format_tree(tree, text, 200).apply_to_source(text)
    assert (
        result