Report actual error positions in token errors

Fix indentation in docs
2025-02-16 08:06:18 -08:00 · 2025-02-16 08:06:10 -08:00
3 changed files with 182 additions and 170 deletions
--- a/README.md
+++ b/README.md
@ -17,31 +17,31 @@ class. Create one method per non-terminal, decorated with the `rule`
 decorator. Here's an example:
 ```python
-    from parser import *
+from parser import *
-    @rule
+@rule
-    def expression():
+def expression():
-        return seq(expression, PLUS, term) | term
+    return seq(expression, PLUS, term) | term
-    @rule
+@rule
-    def term():
+def term():
-        return seq(LPAREN, expression, RPAREN) | ID
+    return seq(LPAREN, expression, RPAREN) | ID
-    PLUS = Terminal('PLUS', '+')
+PLUS = Terminal('PLUS', '+')
-    LPAREN = Terminal('LPAREN', '(')
+LPAREN = Terminal('LPAREN', '(')
-    RPAREN = Terminal('RPAREN', ')')
+RPAREN = Terminal('RPAREN', ')')
-    ID = Terminal(
+ID = Terminal(
-        'ID',
+    'ID',
-        Re.seq(
+    Re.seq(
-            Re.set(("a", "z"), ("A", "Z"), "_"),
+        Re.set(("a", "z"), ("A", "Z"), "_"),
-            Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
+        Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
-        ),
+    ),
-    )
+)
-    SimpleGrammar = Grammar(
+SimpleGrammar = Grammar(
-        name="Simple",
+    name="Simple",
-        start=expression,
+    start=expression,
-    )
+)
 ```
 Terminals can be plain strings or regular expressions constructed with
@ -59,17 +59,17 @@ There are no helpers (yet!) for consuming lists, so they need to be
 constructed in the classic context-free grammar way:
 ```python
-    @rule
+@rule
-    def list():
+def list():
-        return NUMBER | (list + COMMA + NUMBER)
+    return NUMBER | (list + COMMA + NUMBER)
-    NUMBER = Terminal(Re.set(("0", "9")).plus())
+NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+COMMA = Terminal(',')
-    NumberList = Grammar(
+NumberList = Grammar(
-      name="NumberList",
+  name="NumberList",
-      start=list,
+  start=list,
-    )
+)
 ```
 (Unlike with PEGs, you can write grammars with left or right-recursion,
@ -95,23 +95,23 @@ which means they don't generate nodes in the tree and just dump their
 contents into the parent node instead.
 ```python
-    @rule
+@rule
-    def list():
+def list():
-        # The starting rule can't be transparent: there has to be something to
+    # The starting rule can't be transparent: there has to be something to
-        # hold on to!
+    # hold on to!
-        return transparent_list
+    return transparent_list
-    @rule(transparent=True)
+@rule(transparent=True)
-    def transparent_list() -> Rule:
+def transparent_list() -> Rule:
-        return NUMBER | (transparent_list + COMMA + NUMBER)
+    return NUMBER | (transparent_list + COMMA + NUMBER)
-    NUMBER = Terminal(Re.set(("0", "9")).plus())
+NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+COMMA = Terminal(',')
-    NumberList = Grammar(
+NumberList = Grammar(
-      name="NumberList",
+  name="NumberList",
-      start=list,
+  start=list,
-    )
+)
 ```
 This grammar will generate the far more useful tree:
@ -130,23 +130,23 @@ following the lead set by tree-sitter, and so the grammar above is
 probably better-written as:
 ```python
-    @rule
+@rule
-    def list():
+def list():
-        # The starting rule can't be transparent: there has to be something to
+    # The starting rule can't be transparent: there has to be something to
-        # hold on to!
+    # hold on to!
-        return transparent_list
+    return transparent_list
-    @rule
+@rule
-    def _list() -> Rule:
+def _list() -> Rule:
-        return NUMBER | (_list + COMMA + NUMBER)
+    return NUMBER | (_list + COMMA + NUMBER)
-    NUMBER = Terminal(Re.set(("0", "9")).plus())
+NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+COMMA = Terminal(',')
-    NumberList = Grammar(
+NumberList = Grammar(
-      name="NumberList",
+  name="NumberList",
-      start=list,
+  start=list,
-    )
+)
 ```
 That will generate the same tree, but a little more succinctly.
@ -155,17 +155,17 @@ Of course, it's a lot of work to write these transparent recursive
 rules by hand all the time, so there are helpers that do it for you:
 ```python
-    @rule
+@rule
-    def list():
+def list():
-        return zero_or_more(NUMBER, COMMA) + NUMBER
+    return zero_or_more(NUMBER, COMMA) + NUMBER
-    NUMBER = Terminal(Re.set(("0", "9")).plus())
+NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+COMMA = Terminal(',')
-    NumberList = Grammar(
+NumberList = Grammar(
-      name="NumberList",
+  name="NumberList",
-      start=list,
+  start=list,
-    )
+)
 ```
 Much better.
@ -180,20 +180,20 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in
 our number lists, we would modify the grammar as follows:
 ```python
-    @rule
+@rule
-    def list():
+def list():
-        return zero_or_more(NUMBER, COMMA) + NUMBER
+    return zero_or_more(NUMBER, COMMA) + NUMBER
-    NUMBER = Terminal(Re.set(("0", "9")).plus())
+NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+COMMA = Terminal(',')
-    BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
+BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
-    NumberList = Grammar(
+NumberList = Grammar(
-      name="NumberList",
+  name="NumberList",
-      start=list,
+  start=list,
-      trivia=[BLANKS],
+  trivia=[BLANKS],
-    )
+)
 ```
 Now we can parse a list with spaces! "1  , 2,   3" will parse happily
--- a/dingus/about.md
+++ b/dingus/about.md
@ -20,31 +20,31 @@ object.
 Here's an example:
 ```python {.numberLines}
-    from parser import *
+from parser import *
-    @rule
+@rule
-    def expression():
+def expression():
-        return seq(expression, PLUS, term) | term
+    return seq(expression, PLUS, term) | term
-    @rule
+@rule
-    def term():
+def term():
-        return seq(LPAREN, expression, RPAREN) | ID
+    return seq(LPAREN, expression, RPAREN) | ID
-    PLUS = Terminal('PLUS', '+')
+PLUS = Terminal('PLUS', '+')
-    LPAREN = Terminal('LPAREN', '(')
+LPAREN = Terminal('LPAREN', '(')
-    RPAREN = Terminal('RPAREN', ')')
+RPAREN = Terminal('RPAREN', ')')
-    ID = Terminal(
+ID = Terminal(
-        'ID',
+    'ID',
-        Re.seq(
+    Re.seq(
-            Re.set(("a", "z"), ("A", "Z"), "_"),
+        Re.set(("a", "z"), ("A", "Z"), "_"),
-            Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
+        Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
-        ),
+    ),
-    )
+)
-    SimpleGrammar = Grammar(
+SimpleGrammar = Grammar(
-        name="Simple",
+    name="Simple",
-        start=expression,
+    start=expression,
-    )
+)
 ```
 Terminal patterns can be plain strings or regular expressions
@ -61,17 +61,17 @@ things can be freely nested, as desired.
 You can make lists in the classic context-free grammar way:
 ```python {.numberLines}
-    @rule
+@rule
-    def list():
+def list():
-        return NUMBER | (list + COMMA + NUMBER)
+    return NUMBER | (list + COMMA + NUMBER)
-    NUMBER = Terminal(Re.set(("0", "9")).plus())
+NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+COMMA = Terminal(',')
-    NumberList = Grammar(
+NumberList = Grammar(
-      name="NumberList",
+  name="NumberList",
-      start=list,
+  start=list,
-    )
+)
 ```
 (Unlike with PEGs, you can write grammars with left or right-recursion,
@ -97,23 +97,23 @@ which means they don't generate nodes in the tree and just dump their
 contents into the parent node instead.
 ```python {.numberLines}
-    @rule
+@rule
-    def list():
+def list():
-        # The starting rule can't be transparent: there has to be something to
+    # The starting rule can't be transparent: there has to be something to
-        # hold on to!
+    # hold on to!
-        return transparent_list
+    return transparent_list
-    @rule(transparent=True)
+@rule(transparent=True)
-    def transparent_list() -> Rule:
+def transparent_list() -> Rule:
-        return NUMBER | (transparent_list + COMMA + NUMBER)
+    return NUMBER | (transparent_list + COMMA + NUMBER)
-    NUMBER = Terminal(Re.set(("0", "9")).plus())
+NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+COMMA = Terminal(',')
-    NumberList = Grammar(
+NumberList = Grammar(
-      name="NumberList",
+  name="NumberList",
-      start=list,
+  start=list,
-    )
+)
 ```
 This grammar will generate the far more useful tree:
@ -132,23 +132,23 @@ following the lead set by tree-sitter, and so the grammar above is
 probably better-written as:
 ```python {.numberLines}
-    @rule
+@rule
-    def list():
+def list():
-        # The starting rule can't be transparent: there has to be something to
+    # The starting rule can't be transparent: there has to be something to
-        # hold on to!
+    # hold on to!
-        return transparent_list
+    return transparent_list
-    @rule
+@rule
-    def _list() -> Rule:
+def _list() -> Rule:
-        return NUMBER | (_list + COMMA + NUMBER)
+    return NUMBER | (_list + COMMA + NUMBER)
-    NUMBER = Terminal(Re.set(("0", "9")).plus())
+NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+COMMA = Terminal(',')
-    NumberList = Grammar(
+NumberList = Grammar(
-      name="NumberList",
+  name="NumberList",
-      start=list,
+  start=list,
-    )
+)
 ```
 That will generate the same tree, but a little more succinctly.
@ -157,17 +157,17 @@ Of course, it's a lot of work to write these transparent recursive
 rules by hand all the time, so there are helpers that do it for you:
 ```python {.numberLines}
-    @rule
+@rule
-    def list():
+def list():
-        return zero_or_more(NUMBER, COMMA) + NUMBER
+    return zero_or_more(NUMBER, COMMA) + NUMBER
-    NUMBER = Terminal(Re.set(("0", "9")).plus())
+NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+COMMA = Terminal(',')
-    NumberList = Grammar(
+NumberList = Grammar(
-      name="NumberList",
+  name="NumberList",
-      start=list,
+  start=list,
-    )
+)
 ```
 Much better.
@ -182,21 +182,21 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in
 our number lists, we would modify the grammar as follows:
 ```python {.numberLines}
-    @rule
+@rule
-    def list():
+def list():
-        return zero_or_more(NUMBER, COMMA) + NUMBER
+    return zero_or_more(NUMBER, COMMA) + NUMBER
-    NUMBER = Terminal(Re.set(("0", "9")).plus())
+NUMBER = Terminal(Re.set(("0", "9")).plus())
-    COMMA = Terminal(',')
+COMMA = Terminal(',')
-    BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
+BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
-    # ^ and add a new terminal to describe what we're ignoring...
+# ^ and add a new terminal to describe what we're ignoring...
-    NumberList = Grammar(
+NumberList = Grammar(
-      name="NumberList",
+  name="NumberList",
-      start=list,
+  start=list,
-      trivia=[BLANKS],
+  trivia=[BLANKS],
-    )
+)
 ```
 Now we can parse a list with spaces! "1  , 2,   3" will parse happily
--- a/parser/runtime.py
+++ b/parser/runtime.py
@ -8,6 +8,22 @@ from dataclasses import dataclass
 from . import parser
 def offset_to_line_column(lines: list[int], pos: int) -> tuple[int, int]:
    """Translate a text offset into a (line, column) pair for error messages.

    `lines` is the list of offsets at which line breaks occur; it is searched
    with a binary search, so it must be in ascending order. `pos` is the
    offset to translate.

    The returned line number is 1-based and the column number is 0-based,
    which is what editors traditionally expect. An offset that lands exactly
    on a line break is reported as part of the line that break terminates.
    """
    # The number of line breaks strictly before `pos` is also the 0-based
    # index of the line that contains `pos`.
    breaks_before = bisect.bisect_left(lines, pos)

    # The line begins just past the previous break, or at the very start of
    # the text when there is no previous break.
    line_start = lines[breaks_before - 1] + 1 if breaks_before > 0 else 0

    return (breaks_before + 1, pos - line_start)
@dataclass
 class TokenValue:
    kind: str
@ -597,21 +613,16 @@ class Parser:
        if errors:
            lines = tokens.lines()
            for parse_error in errors:
-                line_index = bisect.bisect_left(lines, parse_error.start)
+                line_index, column_index = offset_to_line_column(lines, parse_error.start)
                if line_index == 0:
                    col_start = 0
                else:
                    col_start = lines[line_index - 1] + 1
                column_index = parse_error.start - col_start
                line_index += 1
                error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")
        return (result, error_strings)
 def generic_tokenize(
-    src: str, table: parser.LexerTable
+    src: str,
    table: parser.LexerTable,
    lines: list[int],
 ) -> typing.Iterable[tuple[parser.Terminal, int, int]]:
    pos = 0
    state = 0
@ -647,7 +658,8 @@ def generic_tokenize(
                pass
        if last_accept is None:
-            raise Exception(f"Token error at {pos}")
+            line_index, column_index = offset_to_line_column(lines, pos)
            raise Exception(f"{line_index}:{column_index}: Unexpected character '{src[pos]}'")
        yield (last_accept, start, last_accept_pos - start)
@ -661,10 +673,10 @@ class GenericTokenStream:
    def __init__(self, src: str, lexer: parser.LexerTable):
        self.src = src
        self.lexer = lexer
        self._tokens: list[typing.Tuple[parser.Terminal, int, int]] = list(
            generic_tokenize(src, lexer)
        )
        self._lines = [m.start() for m in re.finditer("\n", src)]
        self._tokens: list[typing.Tuple[parser.Terminal, int, int]] = list(
            generic_tokenize(src, lexer, self._lines)
        )
    def tokens(self):
        return self._tokens
Author	SHA1	Message	Date
John Doty	b2e7d15fb8	Report actual error positions in token errors	2025-02-16 08:06:18 -08:00
John Doty	fbccaea2fa	Fix indentation in docs	2025-02-16 08:06:10 -08:00