Record trivia in tokens

This will make our formatting better, I think: each non-trivia token now records the trivia tokens that precede and follow it.
2024-09-12 06:22:49 -07:00 · 2024-09-12 06:22:49 -07:00 · b3b2102864
commit b3b2102864
parent 8a80bcad64
3 changed files with 90 additions and 12 deletions
--- a/parser/runtime.py
+++ b/parser/runtime.py
@ -13,6 +13,8 @@ class TokenValue:
    kind: str
    start: int
    end: int
    pre_trivia: list["TokenValue"]
    post_trivia: list["TokenValue"]
@dataclass
@ -313,17 +315,50 @@ class Parser:
        self.table = table
    def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]:
        # TODO: If this were a for reals for reals parser we would keep the trivia
        #       accessible in the tree.
        input_tokens = tokens.tokens()
-        input: list[TokenValue] = [
+
-            TokenValue(kind=kind.name, start=start, end=start + length)
+        # Filter the input tokens, to generate a list of non-trivia tokens.
-            for (kind, start, length) in input_tokens
+        # In addition, track the trivia tokens we find along the way, and put
-            if kind.name is not None and kind.name not in self.table.trivia
+        # them into a list attached to each non-trivia token, so we can
-        ]
+        # actually recover the document *as written*.
        input: list[TokenValue] = []
        trivia: list[TokenValue] = []
        for kind, start, length in input_tokens:
            assert kind.name is not None
            if kind.name in self.table.trivia:
                trivia.append(
                    TokenValue(
                        kind=kind.name,
                        start=start,
                        end=start + length,
                        pre_trivia=[],
                        post_trivia=[],
                    )
                )
            else:
                prev_trivia = trivia
                trivia = []
                input.append(
                    TokenValue(
                        kind=kind.name,
                        start=start,
                        end=start + length,
                        pre_trivia=prev_trivia,
                        post_trivia=trivia,
                    )
                )
        eof = 0 if len(input) == 0 else input[-1].end
-        input = input + [TokenValue(kind="$", start=eof, end=eof)]
+        input = input + [
            TokenValue(
                kind="$",
                start=eof,
                end=eof,
                pre_trivia=trivia,
                post_trivia=[],
            )
        ]
        input_index = 0
        # Our stack is a stack of tuples, where the first entry is the state
@ -428,7 +463,14 @@ class Parser:
                                assert repair.value is not None
                                pos = input[cursor].end
                                input.insert(
-                                    cursor, TokenValue(kind=repair.value, start=pos, end=pos)
+                                    cursor,
                                    TokenValue(
                                        kind=repair.value,
                                        start=pos,
                                        end=pos,
                                        pre_trivia=[],
                                        post_trivia=[],
                                    ),
                                )
                                cursor += 1
--- a/parser/wadler.py
+++ b/parser/wadler.py
@ -255,7 +255,16 @@ class Matcher:
        table = self.table
        input = [(child_to_name(i), i) for i in items] + [
-            ("$", runtime.TokenValue(kind="$", start=0, end=0))
+            (
                "$",
                runtime.TokenValue(
                    kind="$",
                    start=0,
                    end=0,
                    pre_trivia=[],
                    post_trivia=[],
                ),
            )
        ]
        input_index = 0
--- a/tests/test_grammar.py
+++ b/tests/test_grammar.py
@ -20,7 +20,7 @@ class Tokens:
 def _tree(treeform) -> runtime.Tree | runtime.TokenValue:
    if isinstance(treeform, str):
-        return runtime.TokenValue(treeform, 0, 0)
+        return runtime.TokenValue(treeform, 0, 0, [], [])
    else:
        assert isinstance(treeform, tuple)
        name = treeform[0]
@ -277,7 +277,34 @@ def test_grammar_ignore_trivia():
    )
    assert errors == []
-    assert tree == _tree(("sentence", ("sentence", "WORD"), "WORD"))
+    assert tree == runtime.Tree(
        "sentence",
        0,
        0,
        (
            runtime.Tree(
                "sentence",
                0,
                0,
                (
                    runtime.TokenValue(
                        "WORD",
                        0,
                        0,
                        [],
                        [runtime.TokenValue("BLANK", 0, 0, [], [])],
                    ),
                ),
            ),
            runtime.TokenValue(
                "WORD",
                0,
                0,
                [runtime.TokenValue("BLANK", 0, 0, [], [])],
                [runtime.TokenValue("BLANK", 0, 0, [], [])],
            ),
        ),
    )
 def test_grammar_unknown_trivia():