diff --git a/parser/runtime.py b/parser/runtime.py index 3d55ef5..b746cf1 100644 --- a/parser/runtime.py +++ b/parser/runtime.py @@ -13,6 +13,8 @@ class TokenValue: kind: str start: int end: int + pre_trivia: list["TokenValue"] + post_trivia: list["TokenValue"] @dataclass @@ -313,17 +315,50 @@ class Parser: self.table = table def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]: - # TODO: If this were a for reals for reals parser we would keep the trivia - # accessible in the tree. input_tokens = tokens.tokens() - input: list[TokenValue] = [ - TokenValue(kind=kind.name, start=start, end=start + length) - for (kind, start, length) in input_tokens - if kind.name is not None and kind.name not in self.table.trivia - ] + + # Filter the input tokens, to generate a list of non-trivia tokens. + # In addition, track the trivia tokens we find along the way, and put + # them into a list attached to each non-trivia token, so we can + # actually recover the document *as written*. + input: list[TokenValue] = [] + trivia: list[TokenValue] = [] + for kind, start, length in input_tokens: + assert kind.name is not None + if kind.name in self.table.trivia: + trivia.append( + TokenValue( + kind=kind.name, + start=start, + end=start + length, + pre_trivia=[], + post_trivia=[], + ) + ) + else: + prev_trivia = trivia + trivia = [] + + input.append( + TokenValue( + kind=kind.name, + start=start, + end=start + length, + pre_trivia=prev_trivia, + post_trivia=trivia, + ) + ) eof = 0 if len(input) == 0 else input[-1].end - input = input + [TokenValue(kind="$", start=eof, end=eof)] + input = input + [ + TokenValue( + kind="$", + start=eof, + end=eof, + pre_trivia=trivia, + post_trivia=[], + ) + ] input_index = 0 # Our stack is a stack of tuples, where the first entry is the state @@ -428,7 +463,14 @@ class Parser: assert repair.value is not None pos = input[cursor].end input.insert( - cursor, TokenValue(kind=repair.value, start=pos, end=pos) + cursor, + TokenValue( + kind=repair.value, + start=pos, + end=pos, + pre_trivia=[], + post_trivia=[], + ), ) cursor += 1 diff --git a/parser/wadler.py b/parser/wadler.py index bb9aba0..e7b4a81 100644 --- a/parser/wadler.py +++ b/parser/wadler.py @@ -255,7 +255,16 @@ class Matcher: table = self.table input = [(child_to_name(i), i) for i in items] + [ - ("$", runtime.TokenValue(kind="$", start=0, end=0)) + ( + "$", + runtime.TokenValue( + kind="$", + start=0, + end=0, + pre_trivia=[], + post_trivia=[], + ), + ) ] input_index = 0 diff --git a/tests/test_grammar.py b/tests/test_grammar.py index f3c1f85..ce234ec 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -20,7 +20,7 @@ class Tokens: def _tree(treeform) -> runtime.Tree | runtime.TokenValue: if isinstance(treeform, str): - return runtime.TokenValue(treeform, 0, 0) + return runtime.TokenValue(treeform, 0, 0, [], []) else: assert isinstance(treeform, tuple) name = treeform[0] @@ -277,7 +277,34 @@ def test_grammar_ignore_trivia(): ) assert errors == [] - assert tree == _tree(("sentence", ("sentence", "WORD"), "WORD")) + assert tree == runtime.Tree( + "sentence", + 0, + 0, + ( + runtime.Tree( + "sentence", + 0, + 0, + ( + runtime.TokenValue( + "WORD", + 0, + 0, + [], + [runtime.TokenValue("BLANK", 0, 0, [], [])], + ), + ), + ), + runtime.TokenValue( + "WORD", + 0, + 0, + [runtime.TokenValue("BLANK", 0, 0, [], [])], + [runtime.TokenValue("BLANK", 0, 0, [], [])], + ), + ), + ) def test_grammar_unknown_trivia():