[wadler] Prettier handling of trivia

Split the rules for pre- and post-trivia and work out when each one applies. Multiple consecutive line breaks are now handled (in an unsatisfying way, I guess), but this otherwise lays the groundwork for thinking about it better. We also no longer generate lazy "Text" nodes, because I thought I might want to actually look at the newlines in the source. I don't yet, but I *can* now. (I can also detect EOF, so there's that.)
2024-09-19 16:39:32 -07:00 · 2024-09-19 16:39:32 -07:00 · 8a17cfd586
commit 8a17cfd586
parent c31d527077
5 changed files with 159 additions and 108 deletions
--- a/grammar.py
+++ b/grammar.py
@ -24,7 +24,7 @@ class FineGrammar(Grammar):
    # generator = parser.GenerateLR1
    start = "File"

-    trivia = ["BLANKS", "LINE_BREAKS", "COMMENT"]
+    trivia = ["BLANKS", "LINE_BREAK", "COMMENT"]

    pretty_indent = "  "

@ -426,7 +426,7 @@ class FineGrammar(Grammar):
        return self.IDENTIFIER | group(self.IDENTIFIER, self.COLON, indent(sp, self.expression))

    BLANKS = Terminal(Re.set(" ", "\t").plus())
-    LINE_BREAKS = Terminal(Re.set("\r", "\n").plus(), trivia_mode=TriviaMode.NewLine)
+    LINE_BREAK = Terminal(Re.set("\r", "\n"), trivia_mode=TriviaMode.NewLine)
    COMMENT = Terminal(
        Re.seq(Re.literal("//"), Re.set("\n").invert().star()),
        highlight=highlight.comment.line,
--- a/harness.py
+++ b/harness.py
@ -371,7 +371,7 @@ class Harness:

            printer = self.load_printer()
            if self.tree is not None:
-                self.document = printer.convert_tree_to_document(self.tree)
+                self.document = printer.convert_tree_to_document(self.tree, self.source)
            else:
                self.document = None

@ -541,12 +541,6 @@ class Harness:
                append(f"indent {doc.amount}")
                self.format_document(lines, doc.doc, indent + 1)

-            case wadler.Text(start, end):
-                if self.source is not None:
-                    append(f"< {repr(self.source[start:end])}")
-                else:
-                    append(f"< ??? {start}:{end}")
-
            case wadler.Literal(text):
                append(f"literal {repr(text)}")

--- a/parser/parser.py
+++ b/parser/parser.py
@ -2109,6 +2109,10 @@ class Re:
 UNICODE_MAX_CP = 1114112


+def _str_repr(x: int) -> str:
+    return repr(chr(x))[1:-1]
+
+
@dataclasses.dataclass
 class ReSet(Re):
    values: list[Span]
@ -2165,12 +2169,12 @@ class ReSet(Re):
        if len(self.values) == 1:
            span = self.values[0]
            if len(span) == 1:
-                return chr(span.lower)
+                return _str_repr(span.lower)

        ranges = []
        for span in self.values:
-            start = chr(span.lower)
-            end = chr(span.upper - 1)
+            start = _str_repr(span.lower)
+            end = _str_repr(span.upper - 1)
            if start == end:
                ranges.append(start)
            else:
@ -2736,7 +2740,7 @@ class TriviaMode(enum.Enum):
    pretty-printing. Attach this to a "trivia_mode" property on a Terminal
    definition.

-    - Ignore means that the trivia should be ignored. (This is the default.)
+    - Blank means that the trivia represents blank space. (This is the default.)

    - NewLine means that the trivia is a line break. This is important for
      other modes, specifically...
@ -2748,7 +2752,7 @@ class TriviaMode(enum.Enum):
      a forced break.
    """

-    Ignore = 0
+    Blank = 0
    NewLine = 1
    LineComment = 2

--- a/parser/wadler.py
+++ b/parser/wadler.py
@ -32,12 +32,6 @@ class Indent:
    doc: "Document"


-@dataclasses.dataclass(frozen=True)
-class Text:
-    start: int
-    end: int
-
-
@dataclasses.dataclass(frozen=True)
 class Literal:
    text: str
@ -69,13 +63,11 @@ class Lazy:
        return self.value

    @classmethod
-    def from_tree(cls, tree: runtime.Tree, printer: "Printer") -> "Lazy":
-        return Lazy(lambda: printer.convert_tree_to_document(tree))
+    def from_tree(cls, tree: runtime.Tree, src: str, printer: "Printer") -> "Lazy":
+        return Lazy(lambda: printer.convert_tree_to_document(tree, src))


-Document = (
-    None | Text | Literal | NewLine | ForceBreak | Cons | Indent | Group | Trivia | Marker | Lazy
-)
+Document = None | Literal | NewLine | ForceBreak | Cons | Indent | Group | Trivia | Marker | Lazy


 def cons(*documents: Document) -> Document:
@ -207,9 +199,6 @@ def layout_document(doc: Document, width: int, indent: str) -> DocumentLayout:
                case None:
                    pass

-                case Text(start, end):
-                    remaining -= end - start
-
                case Literal(text):
                    remaining -= len(text)

@ -268,10 +257,6 @@ def layout_document(doc: Document, width: int, indent: str) -> DocumentLayout:
            case None:
                pass

-            case Text(start, end):
-                output.append((start, end))
-                column += end - start
-
            case Literal(text):
                output.append(text)
                column += len(text)
@ -337,7 +322,7 @@ def resolve_document(doc: Document) -> Document:
        case Trivia(child):
            return Trivia(resolve_document(child))

-        case Text() | Literal() | NewLine() | ForceBreak() | Indent() | None:
+        case Literal() | NewLine() | ForceBreak() | Indent() | None:
            return doc

        case _:
@ -358,7 +343,12 @@ class Matcher:
    newline_replace: dict[str, str]
    trivia_mode: dict[str, parser.TriviaMode]

-    def match(self, printer: "Printer", items: list[runtime.Tree | runtime.TokenValue]) -> Document:
+    def match(
+        self,
+        printer: "Printer",
+        items: list[runtime.Tree | runtime.TokenValue],
+        src: str,
+    ) -> Document:
        stack: list[tuple[int, Document]] = [(0, None)]
        table = self.table

@ -434,10 +424,13 @@ class Matcher:
                    value = current_token[1]

                    if isinstance(value, runtime.Tree):
-                        child = Lazy.from_tree(value, printer)
+                        child = Lazy.from_tree(value, src, printer)
                    else:
-                        child = Text(value.start, value.end)
-                        child = cons(child, self.apply_trivia(value.post_trivia))
+                        child = cons(
+                            trivia(self.apply_pre_trivia(value.pre_trivia, src)),
+                            Literal(src[value.start : value.end]),
+                            trivia(self.apply_post_trivia(value.post_trivia, src)),
+                        )

                    stack.append((action.state, child))
                    input_index += 1
@ -445,46 +438,100 @@ class Matcher:
                case parser.Error():
                    raise Exception("How did I get a parse error here??")

-    def apply_trivia(self, trivia_tokens: list[runtime.TokenValue]) -> Document:
-        has_newline = False
+    def slice_pre_post_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> tuple[
+        list[tuple[parser.TriviaMode, runtime.TokenValue]],
+        list[tuple[parser.TriviaMode, runtime.TokenValue]],
+    ]:
+        tokens = [
+            (self.trivia_mode.get(token.kind, parser.TriviaMode.Blank), token)
+            for token in trivia_tokens
+        ]
+
+        for index, (mode, token) in enumerate(tokens):
+            if token.start == 0:
+                # Everything is pre-trivia if we're at the start of the file.
+                return (tokens, [])
+
+            if mode == parser.TriviaMode.NewLine:
+                # This is the first newline; it belongs with the post-trivia.
+                return (tokens[index + 1 :], tokens[: index + 1])
+
+        # If we never found a new line then it's all post-trivia.
+        return ([], tokens)
+
+    def apply_pre_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document:
+        pre_trivia, _ = self.slice_pre_post_trivia(trivia_tokens, src)
+        if len(pre_trivia) == 0:
+            return None
+
+        at_start_of_file = pre_trivia[0][1].start == 0
+
        trivia_doc = None
-        for token in trivia_tokens:
-            mode = self.trivia_mode.get(token.kind, parser.TriviaMode.Ignore)
+        new_line_count = 0
+        for mode, token in pre_trivia:
            match mode:
-                case parser.TriviaMode.Ignore:
+                case parser.TriviaMode.LineComment:
+                    trivia_doc = cons(
+                        trivia_doc,
+                        Literal(src[token.start : token.end]),
+                        ForceBreak(False),
+                    )
+                    new_line_count = 0  # There will be a newline after this.
+                    at_start_of_file = False
+
+                case parser.TriviaMode.Blank:
                    pass

                case parser.TriviaMode.NewLine:
-                    # We ignore line breaks because obviously
-                    # we expect the pretty-printer to put the
-                    # line breaks in where they belong *but*
-                    # we track if they happened to influence
-                    # the layout.
-                    has_newline = True
-
-                case parser.TriviaMode.LineComment:
-                    if has_newline:
-                        # This line comment is all alone on
-                        # its line, so we need to maintain
-                        # that.
-                        line_break = NewLine("")
-                    else:
-                        # This line comment is attached to
-                        # something to the left, reduce it to
-                        # a space.
-                        line_break = Literal(" ")
-
+                    new_line_count += 1
+                    if new_line_count == 2 and not at_start_of_file:
                        trivia_doc = cons(
                            trivia_doc,
-                        line_break,
-                        Text(token.start, token.end),
-                        ForceBreak(True),  # This is probably the wrong place for this!
+                            ForceBreak(False),
+                            ForceBreak(False),
                        )

                case _:
                    typing.assert_never(mode)

-        return trivia(trivia_doc)
+        return trivia_doc
+
+    def apply_post_trivia(self, trivia_tokens: list[runtime.TokenValue], src: str) -> Document:
+        _, post_trivia = self.slice_pre_post_trivia(trivia_tokens, src)
+        if len(post_trivia) == 0:
+            return None
+
+        trivia_doc = None
+        for mode, token in post_trivia:
+            match mode:
+                case parser.TriviaMode.Blank:
+                    pass
+
+                case parser.TriviaMode.NewLine:
+                    # Anything after a line break is not processed as post
+                    # trivia.
+                    break
+
+                case parser.TriviaMode.LineComment:
+                    # Because this is post-trivia, we know there's something
+                    # to our left, and we can force the space.
+                    trivia_doc = cons(
+                        Literal(" "),
+                        Literal(src[token.start : token.end]),
+                        ForceBreak(True),  # And the line needs to end.
+                    )
+                    break
+
+                case _:
+                    typing.assert_never(mode)
+
+        if len(trivia_tokens) > 0 and trivia_tokens[-1].end == len(src):
+            # As a special case, if we're post trivia at the end of the file
+            # then we need to be pre-trivia too, for the hypothetical EOF
+            # token that we never see.
+            trivia_doc = cons(trivia_doc, self.apply_pre_trivia(trivia_tokens, src))
+
+        return trivia_doc


 class Printer:
@ -686,19 +733,19 @@ class Printer:

        return result

-    def convert_tree_to_document(self, tree: runtime.Tree) -> Document:
+    def convert_tree_to_document(self, tree: runtime.Tree, src: str) -> Document:
        name = tree.name
        assert name is not None, "Cannot format a tree if it still has transparent nodes inside"

        rule = self.lookup_nonterminal(name)
        matcher = self.rule_to_matcher(rule)
-        m = matcher.match(self, list(tree.children))
+        m = matcher.match(self, list(tree.children), src)
        if m is None:
            raise ValueError(
                f"Could not match a valid tree for {tree.name} with {len(tree.children)} children:\n{tree.format()}"
            )
        return resolve_document(m)

-    def format_tree(self, tree: runtime.Tree, width: int) -> DocumentLayout:
-        doc = self.convert_tree_to_document(tree)
+    def format_tree(self, tree: runtime.Tree, src: str, width: int) -> DocumentLayout:
+        doc = self.convert_tree_to_document(tree, src)
        return layout_document(doc, width, self._indent)
--- a/tests/test_wadler.py
+++ b/tests/test_wadler.py
@ -14,6 +14,7 @@ from parser.parser import (
    sp,
    nl,
    br,
+    TriviaMode,
 )

 import parser.runtime as runtime
@ -72,6 +73,7 @@ class JsonGrammar(Grammar):
        )

    BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
+
    LCURLY = Terminal("{")
    RCURLY = Terminal("}")
    COMMA = Terminal(",")
@ -118,8 +120,6 @@ def flatten_document(doc: wadler.Document, src: str) -> list:
            return ["<forced break>"]
        case wadler.Indent():
            return [[f"<indent {doc.amount}>", flatten_document(doc.doc, src)]]
-        case wadler.Text(start, end):
-            return [src[start:end]]
        case wadler.Literal(text):
            return [text]
        case wadler.Group():
@ -149,7 +149,7 @@ def test_convert_tree_to_document():
    assert tree is not None

    printer = wadler.Printer(JSON)
-    doc = flatten_document(printer.convert_tree_to_document(tree), text)
+    doc = flatten_document(printer.convert_tree_to_document(tree, text), text)

    assert doc == [
        [
@ -212,7 +212,7 @@ def test_layout_basic():
    assert tree is not None

    printer = wadler.Printer(JSON)
-    result = printer.format_tree(tree, 50).apply_to_source(text)
+    result = printer.format_tree(tree, text, 50).apply_to_source(text)

    assert (
        result
@ -226,10 +226,9 @@ def test_layout_basic():
    )


-def test_forced_break():
-    class TG(Grammar):
+class TG(Grammar):
    start = "root"
-        trivia = ["BLANKS"]
+    trivia = ["BLANKS", "LINE_BREAK", "COMMENT"]

    @rule
    def root(self):
@ -256,8 +255,15 @@ def test_forced_break():
    OK = Terminal("ok")
    BREAK = Terminal("break")

-        BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
+    BLANKS = Terminal(Re.set(" ", "\t").plus())
+    LINE_BREAK = Terminal(Re.set("\r", "\n"), trivia_mode=TriviaMode.NewLine)
+    COMMENT = Terminal(
+        Re.seq(Re.literal(";"), Re.set("\n").invert().star()),
+        trivia_mode=TriviaMode.LineComment,
+    )

+
+def test_forced_break():
    g = TG()
    g_lexer = g.compile_lexer()
    g_parser = runtime.Parser(g.build_table())
@ -269,7 +275,7 @@ def test_forced_break():
    assert tree is not None

    printer = wadler.Printer(g)
-    result = printer.format_tree(tree, 200).apply_to_source(text)
+    result = printer.format_tree(tree, text, 200).apply_to_source(text)

    assert (
        result