From 0be0075cfe44a57e6d403a028b776afffc5268b7 Mon Sep 17 00:00:00 2001
From: John Doty
Date: Tue, 27 Aug 2024 16:47:26 -0700
Subject: [PATCH] Generic token stream

Compatible with the harness
---
 parser/runtime.py | 54 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/parser/runtime.py b/parser/runtime.py
index 2aefd7b..24e617f 100644
--- a/parser/runtime.py
+++ b/parser/runtime.py
@@ -1,7 +1,7 @@
 import bisect
 import enum
-import enum
 import logging
+import re
 import typing
 
 from dataclasses import dataclass
@@ -490,3 +490,55 @@ def generic_tokenize(
         pos = last_accept_pos
         start = pos
         state = 0
+
+
+class GenericTokenStream:
+    def __init__(self, src: str, lexer: parser.LexerTable):
+        self.src = src
+        self.lexer = lexer
+        self._tokens: list[typing.Tuple[parser.Terminal, int, int]] = list(
+            generic_tokenize(src, lexer)
+        )
+        self._lines = [m.start() for m in re.finditer("\n", src)]
+
+    def tokens(self):
+        return self._tokens
+
+    def lines(self):
+        return self._lines
+
+    def dump(self, *, start=None, end=None) -> list[str]:
+        if start is None:
+            start = 0
+        if end is None:
+            end = len(self._tokens)
+
+        max_terminal_name = max(
+            len(terminal.value)
+            for terminal, _ in self.lexer
+            if terminal is not None and terminal.value is not None
+        )
+        max_offset_len = len(str(len(self.src)))
+
+        prev_line = None
+        lines = []
+        for token in self._tokens[start:end]:
+            (kind, tok_start, length) = token
+            line_index = bisect.bisect_left(self._lines, tok_start)
+            if line_index == 0:
+                col_start = 0
+            else:
+                col_start = self._lines[line_index - 1] + 1
+            column_index = tok_start - col_start
+            value = self.src[tok_start : tok_start + length]
+
+            line_number = line_index + 1
+            if line_number != prev_line:
+                line_part = f"{line_number:4}"
+                prev_line = line_number
+            else:
+                line_part = " |"
+
+            line = f"{tok_start:{max_offset_len}} {line_part} {column_index:3} {kind.value:{max_terminal_name}} {repr(value)}"
+            lines.append(line)
+        return lines