From b2e7d15fb886753dbc993788a4eedff004bc33ac Mon Sep 17 00:00:00 2001 From: John Doty Date: Sun, 16 Feb 2025 08:06:18 -0800 Subject: [PATCH] Report actual error positions in token errors --- parser/runtime.py | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/parser/runtime.py b/parser/runtime.py index a74276a..c7fc9db 100644 --- a/parser/runtime.py +++ b/parser/runtime.py @@ -8,6 +8,22 @@ from dataclasses import dataclass from . import parser +def offset_to_line_column(lines: list[int], pos: int) -> tuple[int, int]: + """Convert a text offset to a line number and column number given a list + of line break positions. This is used to make errors intelligible. Lines + are 1-based, and columns are 0-based, in accordance with editor + traditions. + """ + line_index = bisect.bisect_left(lines, pos) + if line_index == 0: + col_start = 0 + else: + col_start = lines[line_index - 1] + 1 + column_index = pos - col_start + line_index += 1 + return (line_index, column_index) + + @dataclass class TokenValue: kind: str @@ -597,21 +613,16 @@ class Parser: if errors: lines = tokens.lines() for parse_error in errors: - line_index = bisect.bisect_left(lines, parse_error.start) - if line_index == 0: - col_start = 0 - else: - col_start = lines[line_index - 1] + 1 - column_index = parse_error.start - col_start - line_index += 1 - + line_index, column_index = offset_to_line_column(lines, parse_error.start) error_strings.append(f"{line_index}:{column_index}: {parse_error.message}") return (result, error_strings) def generic_tokenize( - src: str, table: parser.LexerTable + src: str, + table: parser.LexerTable, + lines: list[int], ) -> typing.Iterable[tuple[parser.Terminal, int, int]]: pos = 0 state = 0 @@ -647,7 +658,8 @@ def generic_tokenize( pass if last_accept is None: - raise Exception(f"Token error at {pos}") + line_index, column_index = offset_to_line_column(lines, pos) + raise Exception(f"{line_index}:{column_index}: Unexpected character '{src[pos]}'") yield (last_accept, start, last_accept_pos - start) @@ -661,10 +673,10 @@ class GenericTokenStream: def __init__(self, src: str, lexer: parser.LexerTable): self.src = src self.lexer = lexer - self._tokens: list[typing.Tuple[parser.Terminal, int, int]] = list( - generic_tokenize(src, lexer) - ) self._lines = [m.start() for m in re.finditer("\n", src)] + self._tokens: list[typing.Tuple[parser.Terminal, int, int]] = list( + generic_tokenize(src, lexer, self._lines) + ) def tokens(self): return self._tokens