Report actual error positions in token errors

Fix indentation in docs
2025-02-16 08:06:18 -08:00 · 2025-02-16 08:06:10 -08:00
3 changed files with 182 additions and 170 deletions
--- a/README.md
+++ b/README.md
@@ -17,31 +17,31 @@ class. Create one method per non-terminal, decorated with the `rule`
 decorator. Here's an example:

 ```python
-    from parser import *
+from parser import *

-    @rule
-    def expression():
-        return seq(expression, PLUS, term) | term
+@rule
+def expression():
+    return seq(expression, PLUS, term) | term

-    @rule
-    def term():
-        return seq(LPAREN, expression, RPAREN) | ID
+@rule
+def term():
+    return seq(LPAREN, expression, RPAREN) | ID

-    PLUS = Terminal('PLUS', '+')
-    LPAREN = Terminal('LPAREN', '(')
-    RPAREN = Terminal('RPAREN', ')')
-    ID = Terminal(
-        'ID',
-        Re.seq(
-            Re.set(("a", "z"), ("A", "Z"), "_"),
-            Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
-        ),
-    )
+PLUS = Terminal('PLUS', '+')
+LPAREN = Terminal('LPAREN', '(')
+RPAREN = Terminal('RPAREN', ')')
+ID = Terminal(
+    'ID',
+    Re.seq(
+        Re.set(("a", "z"), ("A", "Z"), "_"),
+        Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
+    ),
+)

-    SimpleGrammar = Grammar(
-        name="Simple",
-        start=expression,
-    )
+SimpleGrammar = Grammar(
+    name="Simple",
+    start=expression,
+)
 ```

 Terminals can be plain strings or regular expressions constructed with
@@ -59,17 +59,17 @@ There are no helpers (yet!) for consuming lists, so they need to be
 constructed in the classic context-free grammar way:

 ```python
-    @rule
-    def list():
-        return NUMBER | (list + COMMA + NUMBER)
+@rule
+def list():
+    return NUMBER | (list + COMMA + NUMBER)

-    NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+NUMBER = Terminal(Re.set(("0", "9")).plus())
+COMMA = Terminal(',')

-    NumberList = Grammar(
-      name="NumberList",
-      start=list,
-    )
+NumberList = Grammar(
+  name="NumberList",
+  start=list,
+)
 ```

 (Unlike with PEGs, you can write grammars with left or right-recursion,
@@ -95,23 +95,23 @@ which means they don't generate nodes in the tree and just dump their
 contents into the parent node instead.

 ```python
-    @rule
-    def list():
-        # The starting rule can't be transparent: there has to be something to
-        # hold on to!
-        return transparent_list
+@rule
+def list():
+    # The starting rule can't be transparent: there has to be something to
+    # hold on to!
+    return transparent_list

-    @rule(transparent=True)
-    def transparent_list() -> Rule:
-        return NUMBER | (transparent_list + COMMA + NUMBER)
+@rule(transparent=True)
+def transparent_list() -> Rule:
+    return NUMBER | (transparent_list + COMMA + NUMBER)

-    NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+NUMBER = Terminal(Re.set(("0", "9")).plus())
+COMMA = Terminal(',')

-    NumberList = Grammar(
-      name="NumberList",
-      start=list,
-    )
+NumberList = Grammar(
+  name="NumberList",
+  start=list,
+)
 ```

 This grammar will generate the far more useful tree:
@@ -130,23 +130,23 @@ following the lead set by tree-sitter, and so the grammar above is
 probably better-written as:

 ```python
-    @rule
-    def list():
-        # The starting rule can't be transparent: there has to be something to
-        # hold on to!
-        return transparent_list
+@rule
+def list():
+    # The starting rule can't be transparent: there has to be something to
+    # hold on to!
+    return transparent_list

-    @rule
-    def _list() -> Rule:
-        return NUMBER | (_list + COMMA + NUMBER)
+@rule
+def _list() -> Rule:
+    return NUMBER | (_list + COMMA + NUMBER)

-    NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+NUMBER = Terminal(Re.set(("0", "9")).plus())
+COMMA = Terminal(',')

-    NumberList = Grammar(
-      name="NumberList",
-      start=list,
-    )
+NumberList = Grammar(
+  name="NumberList",
+  start=list,
+)
 ```

 That will generate the same tree, but a little more succinctly.
@@ -155,17 +155,17 @@ Of course, it's a lot of work to write these transparent recursive
 rules by hand all the time, so there are helpers that do it for you:

 ```python
-    @rule
-    def list():
-        return zero_or_more(NUMBER, COMMA) + NUMBER
+@rule
+def list():
+    return zero_or_more(NUMBER, COMMA) + NUMBER

-    NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+NUMBER = Terminal(Re.set(("0", "9")).plus())
+COMMA = Terminal(',')

-    NumberList = Grammar(
-      name="NumberList",
-      start=list,
-    )
+NumberList = Grammar(
+  name="NumberList",
+  start=list,
+)
 ```

 Much better.
@@ -180,20 +180,20 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in
 our number lists, we would modify the grammar as follows:

 ```python
-    @rule
-    def list():
-        return zero_or_more(NUMBER, COMMA) + NUMBER
+@rule
+def list():
+    return zero_or_more(NUMBER, COMMA) + NUMBER

-    NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+NUMBER = Terminal(Re.set(("0", "9")).plus())
+COMMA = Terminal(',')

-    BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
+BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())

-    NumberList = Grammar(
-      name="NumberList",
-      start=list,
-      trivia=[BLANKS],
-    )
+NumberList = Grammar(
+  name="NumberList",
+  start=list,
+  trivia=[BLANKS],
+)
 ```

 Now we can parse a list with spaces! "1  , 2,   3" will parse happily
--- a/dingus/about.md
+++ b/dingus/about.md
@@ -20,31 +20,31 @@ object.
 Here's an example:

 ```python {.numberLines}
-    from parser import *
+from parser import *

-    @rule
-    def expression():
-        return seq(expression, PLUS, term) | term
+@rule
+def expression():
+    return seq(expression, PLUS, term) | term

-    @rule
-    def term():
-        return seq(LPAREN, expression, RPAREN) | ID
+@rule
+def term():
+    return seq(LPAREN, expression, RPAREN) | ID

-    PLUS = Terminal('PLUS', '+')
-    LPAREN = Terminal('LPAREN', '(')
-    RPAREN = Terminal('RPAREN', ')')
-    ID = Terminal(
-        'ID',
-        Re.seq(
-            Re.set(("a", "z"), ("A", "Z"), "_"),
-            Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
-        ),
-    )
+PLUS = Terminal('PLUS', '+')
+LPAREN = Terminal('LPAREN', '(')
+RPAREN = Terminal('RPAREN', ')')
+ID = Terminal(
+    'ID',
+    Re.seq(
+        Re.set(("a", "z"), ("A", "Z"), "_"),
+        Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
+    ),
+)

-    SimpleGrammar = Grammar(
-        name="Simple",
-        start=expression,
-    )
+SimpleGrammar = Grammar(
+    name="Simple",
+    start=expression,
+)
 ```

 Terminal patterns can be plain strings or regular expressions
@@ -61,17 +61,17 @@ things can be freely nested, as desired.
 You can make lists in the classic context-free grammar way:

 ```python {.numberLines}
-    @rule
-    def list():
-        return NUMBER | (list + COMMA + NUMBER)
+@rule
+def list():
+    return NUMBER | (list + COMMA + NUMBER)

-    NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+NUMBER = Terminal(Re.set(("0", "9")).plus())
+COMMA = Terminal(',')

-    NumberList = Grammar(
-      name="NumberList",
-      start=list,
-    )
+NumberList = Grammar(
+  name="NumberList",
+  start=list,
+)
 ```

 (Unlike with PEGs, you can write grammars with left or right-recursion,
@@ -97,23 +97,23 @@ which means they don't generate nodes in the tree and just dump their
 contents into the parent node instead.

 ```python {.numberLines}
-    @rule
-    def list():
-        # The starting rule can't be transparent: there has to be something to
-        # hold on to!
-        return transparent_list
+@rule
+def list():
+    # The starting rule can't be transparent: there has to be something to
+    # hold on to!
+    return transparent_list

-    @rule(transparent=True)
-    def transparent_list() -> Rule:
-        return NUMBER | (transparent_list + COMMA + NUMBER)
+@rule(transparent=True)
+def transparent_list() -> Rule:
+    return NUMBER | (transparent_list + COMMA + NUMBER)

-    NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+NUMBER = Terminal(Re.set(("0", "9")).plus())
+COMMA = Terminal(',')

-    NumberList = Grammar(
-      name="NumberList",
-      start=list,
-    )
+NumberList = Grammar(
+  name="NumberList",
+  start=list,
+)
 ```

 This grammar will generate the far more useful tree:
@@ -132,23 +132,23 @@ following the lead set by tree-sitter, and so the grammar above is
 probably better-written as:

 ```python {.numberLines}
-    @rule
-    def list():
-        # The starting rule can't be transparent: there has to be something to
-        # hold on to!
-        return transparent_list
+@rule
+def list():
+    # The starting rule can't be transparent: there has to be something to
+    # hold on to!
+    return transparent_list

-    @rule
-    def _list() -> Rule:
-        return NUMBER | (_list + COMMA + NUMBER)
+@rule
+def _list() -> Rule:
+    return NUMBER | (_list + COMMA + NUMBER)

-    NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+NUMBER = Terminal(Re.set(("0", "9")).plus())
+COMMA = Terminal(',')

-    NumberList = Grammar(
-      name="NumberList",
-      start=list,
-    )
+NumberList = Grammar(
+  name="NumberList",
+  start=list,
+)
 ```

 That will generate the same tree, but a little more succinctly.
@@ -157,17 +157,17 @@ Of course, it's a lot of work to write these transparent recursive
 rules by hand all the time, so there are helpers that do it for you:

 ```python {.numberLines}
-    @rule
-    def list():
-        return zero_or_more(NUMBER, COMMA) + NUMBER
+@rule
+def list():
+    return zero_or_more(NUMBER, COMMA) + NUMBER

-    NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+NUMBER = Terminal(Re.set(("0", "9")).plus())
+COMMA = Terminal(',')

-    NumberList = Grammar(
-      name="NumberList",
-      start=list,
-    )
+NumberList = Grammar(
+  name="NumberList",
+  start=list,
+)
 ```

 Much better.
@@ -182,21 +182,21 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in
 our number lists, we would modify the grammar as follows:

 ```python {.numberLines}
-    @rule
-    def list():
-        return zero_or_more(NUMBER, COMMA) + NUMBER
+@rule
+def list():
+    return zero_or_more(NUMBER, COMMA) + NUMBER

-    NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+NUMBER = Terminal(Re.set(("0", "9")).plus())
+COMMA = Terminal(',')

-    BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
-    # ^ and add a new terminal to describe what we're ignoring...
+BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
+# ^ and add a new terminal to describe what we're ignoring...

-    NumberList = Grammar(
-      name="NumberList",
-      start=list,
-      trivia=[BLANKS],
-    )
+NumberList = Grammar(
+  name="NumberList",
+  start=list,
+  trivia=[BLANKS],
+)
 ```

 Now we can parse a list with spaces! "1  , 2,   3" will parse happily
--- a/parser/runtime.py
+++ b/parser/runtime.py
@@ -8,6 +8,22 @@ from dataclasses import dataclass
 from . import parser


+def offset_to_line_column(lines: list[int], pos: int) -> tuple[int, int]:
+    """Convert a text offset to a line number and column number given a list
+    of line break positions. This is used to make errors intelligible. Lines
+    are 1-based, and columns are 0-based, in accordance with editor
+    traditions.
+    """
+    line_index = bisect.bisect_left(lines, pos)
+    if line_index == 0:
+        col_start = 0
+    else:
+        col_start = lines[line_index - 1] + 1
+    column_index = pos - col_start
+    line_index += 1
+    return (line_index, column_index)
+
+
@dataclass
 class TokenValue:
    kind: str
@@ -597,21 +613,16 @@ class Parser:
        if errors:
            lines = tokens.lines()
            for parse_error in errors:
-                line_index = bisect.bisect_left(lines, parse_error.start)
-                if line_index == 0:
-                    col_start = 0
-                else:
-                    col_start = lines[line_index - 1] + 1
-                column_index = parse_error.start - col_start
-                line_index += 1
-
+                line_index, column_index = offset_to_line_column(lines, parse_error.start)
                error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")

        return (result, error_strings)


 def generic_tokenize(
-    src: str, table: parser.LexerTable
+    src: str,
+    table: parser.LexerTable,
+    lines: list[int],
 ) -> typing.Iterable[tuple[parser.Terminal, int, int]]:
    pos = 0
    state = 0
@@ -647,7 +658,8 @@ def generic_tokenize(
                pass

        if last_accept is None:
-            raise Exception(f"Token error at {pos}")
+            line_index, column_index = offset_to_line_column(lines, pos)
+            raise Exception(f"{line_index}:{column_index}: Unexpected character '{src[pos]}'")

        yield (last_accept, start, last_accept_pos - start)

@@ -661,10 +673,10 @@ class GenericTokenStream:
    def __init__(self, src: str, lexer: parser.LexerTable):
        self.src = src
        self.lexer = lexer
-        self._tokens: list[typing.Tuple[parser.Terminal, int, int]] = list(
-            generic_tokenize(src, lexer)
-        )
        self._lines = [m.start() for m in re.finditer("\n", src)]
+        self._tokens: list[typing.Tuple[parser.Terminal, int, int]] = list(
+            generic_tokenize(src, lexer, self._lines)
+        )

    def tokens(self):
        return self._tokens
Author	SHA1	Message	Date
John Doty	b2e7d15fb8	Report actual error positions in token errors	2025-02-16 08:06:18 -08:00
John Doty	fbccaea2fa	Fix indentation in docs	2025-02-16 08:06:10 -08:00