Compare commits

...

2 commits

Author SHA1 Message Date
b2e7d15fb8 Report actual error positions in token errors 2025-02-16 08:06:18 -08:00
fbccaea2fa Fix indentation in docs 2025-02-16 08:06:10 -08:00
3 changed files with 182 additions and 170 deletions

156
README.md
View file

@ -17,31 +17,31 @@ class. Create one method per non-terminal, decorated with the `rule`
decorator. Here's an example:
```python
from parser import *
from parser import *
@rule
def expression():
return seq(expression, PLUS, term) | term
@rule
def expression():
return seq(expression, PLUS, term) | term
@rule
def term():
return seq(LPAREN, expression, RPAREN) | ID
@rule
def term():
return seq(LPAREN, expression, RPAREN) | ID
PLUS = Terminal('PLUS', '+')
LPAREN = Terminal('LPAREN', '(')
RPAREN = Terminal('RPAREN', ')')
ID = Terminal(
'ID',
Re.seq(
Re.set(("a", "z"), ("A", "Z"), "_"),
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
),
)
PLUS = Terminal('PLUS', '+')
LPAREN = Terminal('LPAREN', '(')
RPAREN = Terminal('RPAREN', ')')
ID = Terminal(
'ID',
Re.seq(
Re.set(("a", "z"), ("A", "Z"), "_"),
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
),
)
SimpleGrammar = Grammar(
name="Simple",
start=expression,
)
SimpleGrammar = Grammar(
name="Simple",
start=expression,
)
```
Terminals can be plain strings or regular expressions constructed with
@ -59,17 +59,17 @@ There are no helpers (yet!) for consuming lists, so they need to be
constructed in the classic context-free grammar way:
```python
@rule
def list():
return NUMBER | (list + COMMA + NUMBER)
@rule
def list():
return NUMBER | (list + COMMA + NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
NumberList = Grammar(
name="NumberList",
start=list,
)
```
(Unlike with PEGs, you can write grammars with left or right-recursion,
@ -95,23 +95,23 @@ which means they don't generate nodes in the tree and just dump their
contents into the parent node instead.
```python
@rule
def list():
# The starting rule can't be transparent: there has to be something to
# hold on to!
return transparent_list
@rule
def list():
# The starting rule can't be transparent: there has to be something to
# hold on to!
return transparent_list
@rule(transparent=True)
def transparent_list() -> Rule:
return NUMBER | (transparent_list + COMMA + NUMBER)
@rule(transparent=True)
def transparent_list() -> Rule:
return NUMBER | (transparent_list + COMMA + NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
NumberList = Grammar(
name="NumberList",
start=list,
)
```
This grammar will generate the far more useful tree:
@ -130,23 +130,23 @@ following the lead set by tree-sitter, and so the grammar above is
probably better-written as:
```python
@rule
def list():
# The starting rule can't be transparent: there has to be something to
# hold on to!
return transparent_list
@rule
def list():
# The starting rule can't be transparent: there has to be something to
# hold on to!
return transparent_list
@rule
def _list() -> Rule:
return NUMBER | (_list + COMMA + NUMBER)
@rule
def _list() -> Rule:
return NUMBER | (_list + COMMA + NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
NumberList = Grammar(
name="NumberList",
start=list,
)
```
That will generate the same tree, but a little more succinctly.
@ -155,17 +155,17 @@ Of course, it's a lot of work to write these transparent recursive
rules by hand all the time, so there are helpers that do it for you:
```python
@rule
def list():
return zero_or_more(NUMBER, COMMA) + NUMBER
@rule
def list():
return zero_or_more(NUMBER, COMMA) + NUMBER
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
NumberList = Grammar(
name="NumberList",
start=list,
)
```
Much better.
@ -180,20 +180,20 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in
our number lists, we would modify the grammar as follows:
```python
@rule
def list():
return zero_or_more(NUMBER, COMMA) + NUMBER
@rule
def list():
return zero_or_more(NUMBER, COMMA) + NUMBER
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
NumberList = Grammar(
name="NumberList",
start=list,
trivia=[BLANKS],
)
NumberList = Grammar(
name="NumberList",
start=list,
trivia=[BLANKS],
)
```
Now we can parse a list with spaces! "1 , 2, 3" will parse happily

View file

@ -20,31 +20,31 @@ object.
Here's an example:
```python {.numberLines}
from parser import *
from parser import *
@rule
def expression():
return seq(expression, PLUS, term) | term
@rule
def expression():
return seq(expression, PLUS, term) | term
@rule
def term():
return seq(LPAREN, expression, RPAREN) | ID
@rule
def term():
return seq(LPAREN, expression, RPAREN) | ID
PLUS = Terminal('PLUS', '+')
LPAREN = Terminal('LPAREN', '(')
RPAREN = Terminal('RPAREN', ')')
ID = Terminal(
'ID',
Re.seq(
Re.set(("a", "z"), ("A", "Z"), "_"),
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
),
)
PLUS = Terminal('PLUS', '+')
LPAREN = Terminal('LPAREN', '(')
RPAREN = Terminal('RPAREN', ')')
ID = Terminal(
'ID',
Re.seq(
Re.set(("a", "z"), ("A", "Z"), "_"),
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
),
)
SimpleGrammar = Grammar(
name="Simple",
start=expression,
)
SimpleGrammar = Grammar(
name="Simple",
start=expression,
)
```
Terminal patterns can be plain strings or regular expressions
@ -61,17 +61,17 @@ things can be freely nested, as desired.
You can make lists in the classic context-free grammar way:
```python {.numberLines}
@rule
def list():
return NUMBER | (list + COMMA + NUMBER)
@rule
def list():
return NUMBER | (list + COMMA + NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
NumberList = Grammar(
name="NumberList",
start=list,
)
```
(Unlike with PEGs, you can write grammars with left or right-recursion,
@ -97,23 +97,23 @@ which means they don't generate nodes in the tree and just dump their
contents into the parent node instead.
```python {.numberLines}
@rule
def list():
# The starting rule can't be transparent: there has to be something to
# hold on to!
return transparent_list
@rule
def list():
# The starting rule can't be transparent: there has to be something to
# hold on to!
return transparent_list
@rule(transparent=True)
def transparent_list() -> Rule:
return NUMBER | (transparent_list + COMMA + NUMBER)
@rule(transparent=True)
def transparent_list() -> Rule:
return NUMBER | (transparent_list + COMMA + NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
NumberList = Grammar(
name="NumberList",
start=list,
)
```
This grammar will generate the far more useful tree:
@ -132,23 +132,23 @@ following the lead set by tree-sitter, and so the grammar above is
probably better-written as:
```python {.numberLines}
@rule
def list():
# The starting rule can't be transparent: there has to be something to
# hold on to!
return transparent_list
@rule
def list():
# The starting rule can't be transparent: there has to be something to
# hold on to!
return transparent_list
@rule
def _list() -> Rule:
return NUMBER | (_list + COMMA + NUMBER)
@rule
def _list() -> Rule:
return NUMBER | (_list + COMMA + NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
NumberList = Grammar(
name="NumberList",
start=list,
)
```
That will generate the same tree, but a little more succinctly.
@ -157,17 +157,17 @@ Of course, it's a lot of work to write these transparent recursive
rules by hand all the time, so there are helpers that do it for you:
```python {.numberLines}
@rule
def list():
return zero_or_more(NUMBER, COMMA) + NUMBER
@rule
def list():
return zero_or_more(NUMBER, COMMA) + NUMBER
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
NumberList = Grammar(
name="NumberList",
start=list,
)
```
Much better.
@ -182,21 +182,21 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in
our number lists, we would modify the grammar as follows:
```python {.numberLines}
@rule
def list():
return zero_or_more(NUMBER, COMMA) + NUMBER
@rule
def list():
return zero_or_more(NUMBER, COMMA) + NUMBER
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
# ^ and add a new terminal to describe what we're ignoring...
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
# ^ and add a new terminal to describe what we're ignoring...
NumberList = Grammar(
name="NumberList",
start=list,
trivia=[BLANKS],
)
NumberList = Grammar(
name="NumberList",
start=list,
trivia=[BLANKS],
)
```
Now we can parse a list with spaces! "1 , 2, 3" will parse happily

View file

@ -8,6 +8,22 @@ from dataclasses import dataclass
from . import parser
def offset_to_line_column(lines: list[int], pos: int) -> tuple[int, int]:
    """Translate a text offset into a (line, column) pair for error messages.

    `lines` holds the offsets of the line breaks in the text, in ascending
    order (the binary search below relies on that ordering), and `pos` is
    the offset to locate. The returned line number is 1-based and the
    returned column number is 0-based, in accordance with editor traditions.
    """
    # Count the line breaks that lie strictly before `pos`. A break sitting
    # exactly at `pos` is not counted, so the break character itself is
    # reported as the last column of the line it terminates.
    preceding_breaks = bisect.bisect_left(lines, pos)

    # The current line starts one past the previous break, or at offset 0
    # when no break precedes `pos`.
    line_start = lines[preceding_breaks - 1] + 1 if preceding_breaks else 0

    return (preceding_breaks + 1, pos - line_start)
@dataclass
class TokenValue:
kind: str
@ -597,21 +613,16 @@ class Parser:
if errors:
lines = tokens.lines()
for parse_error in errors:
line_index = bisect.bisect_left(lines, parse_error.start)
if line_index == 0:
col_start = 0
else:
col_start = lines[line_index - 1] + 1
column_index = parse_error.start - col_start
line_index += 1
line_index, column_index = offset_to_line_column(lines, parse_error.start)
error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")
return (result, error_strings)
def generic_tokenize(
src: str, table: parser.LexerTable
src: str,
table: parser.LexerTable,
lines: list[int],
) -> typing.Iterable[tuple[parser.Terminal, int, int]]:
pos = 0
state = 0
@ -647,7 +658,8 @@ def generic_tokenize(
pass
if last_accept is None:
raise Exception(f"Token error at {pos}")
line_index, column_index = offset_to_line_column(lines, pos)
raise Exception(f"{line_index}:{column_index}: Unexpected character '{src[pos]}'")
yield (last_accept, start, last_accept_pos - start)
@ -661,10 +673,10 @@ class GenericTokenStream:
def __init__(self, src: str, lexer: parser.LexerTable):
self.src = src
self.lexer = lexer
self._tokens: list[typing.Tuple[parser.Terminal, int, int]] = list(
generic_tokenize(src, lexer)
)
self._lines = [m.start() for m in re.finditer("\n", src)]
self._tokens: list[typing.Tuple[parser.Terminal, int, int]] = list(
generic_tokenize(src, lexer, self._lines)
)
def tokens(self):
return self._tokens