Compare commits
2 commits
5f19b1e73e
...
b2e7d15fb8
| Author | SHA1 | Date | |
|---|---|---|---|
| b2e7d15fb8 | |||
| fbccaea2fa |
3 changed files with 182 additions and 170 deletions
156
README.md
156
README.md
|
|
@ -17,31 +17,31 @@ class. Create one method per non-terminal, decorated with the `rule`
|
|||
decorator. Here's an example:
|
||||
|
||||
```python
|
||||
from parser import *
|
||||
from parser import *
|
||||
|
||||
@rule
|
||||
def expression():
|
||||
return seq(expression, PLUS, term) | term
|
||||
@rule
|
||||
def expression():
|
||||
return seq(expression, PLUS, term) | term
|
||||
|
||||
@rule
|
||||
def term():
|
||||
return seq(LPAREN, expression, RPAREN) | ID
|
||||
@rule
|
||||
def term():
|
||||
return seq(LPAREN, expression, RPAREN) | ID
|
||||
|
||||
PLUS = Terminal('PLUS', '+')
|
||||
LPAREN = Terminal('LPAREN', '(')
|
||||
RPAREN = Terminal('RPAREN', ')')
|
||||
ID = Terminal(
|
||||
'ID',
|
||||
Re.seq(
|
||||
Re.set(("a", "z"), ("A", "Z"), "_"),
|
||||
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
|
||||
),
|
||||
)
|
||||
PLUS = Terminal('PLUS', '+')
|
||||
LPAREN = Terminal('LPAREN', '(')
|
||||
RPAREN = Terminal('RPAREN', ')')
|
||||
ID = Terminal(
|
||||
'ID',
|
||||
Re.seq(
|
||||
Re.set(("a", "z"), ("A", "Z"), "_"),
|
||||
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
|
||||
),
|
||||
)
|
||||
|
||||
SimpleGrammar = Grammar(
|
||||
name="Simple",
|
||||
start=expression,
|
||||
)
|
||||
SimpleGrammar = Grammar(
|
||||
name="Simple",
|
||||
start=expression,
|
||||
)
|
||||
```
|
||||
|
||||
Terminals can be plain strings or regular expressions constructed with
|
||||
|
|
@ -59,17 +59,17 @@ There are no helpers (yet!) for consuming lists, so they need to be
|
|||
constructed in the classic context-free grammar way:
|
||||
|
||||
```python
|
||||
@rule
|
||||
def list():
|
||||
return NUMBER | (list + COMMA + NUMBER)
|
||||
@rule
|
||||
def list():
|
||||
return NUMBER | (list + COMMA + NUMBER)
|
||||
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
```
|
||||
|
||||
(Unlike with PEGs, you can write grammars with left or right-recursion,
|
||||
|
|
@ -95,23 +95,23 @@ which means they don't generate nodes in the tree and just dump their
|
|||
contents into the parent node instead.
|
||||
|
||||
```python
|
||||
@rule
|
||||
def list():
|
||||
# The starting rule can't be transparent: there has to be something to
|
||||
# hold on to!
|
||||
return transparent_list
|
||||
@rule
|
||||
def list():
|
||||
# The starting rule can't be transparent: there has to be something to
|
||||
# hold on to!
|
||||
return transparent_list
|
||||
|
||||
@rule(transparent=True)
|
||||
def transparent_list() -> Rule:
|
||||
return NUMBER | (transparent_list + COMMA + NUMBER)
|
||||
@rule(transparent=True)
|
||||
def transparent_list() -> Rule:
|
||||
return NUMBER | (transparent_list + COMMA + NUMBER)
|
||||
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
```
|
||||
|
||||
This grammar will generate the far more useful tree:
|
||||
|
|
@ -130,23 +130,23 @@ following the lead set by tree-sitter, and so the grammar above is
|
|||
probably better-written as:
|
||||
|
||||
```python
|
||||
@rule
|
||||
def list():
|
||||
# The starting rule can't be transparent: there has to be something to
|
||||
# hold on to!
|
||||
return transparent_list
|
||||
@rule
|
||||
def list():
|
||||
# The starting rule can't be transparent: there has to be something to
|
||||
# hold on to!
|
||||
return transparent_list
|
||||
|
||||
@rule
|
||||
def _list() -> Rule:
|
||||
return NUMBER | (_list + COMMA + NUMBER)
|
||||
@rule
|
||||
def _list() -> Rule:
|
||||
return NUMBER | (_list + COMMA + NUMBER)
|
||||
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
```
|
||||
|
||||
That will generate the same tree, but a little more succinctly.
|
||||
|
|
@ -155,17 +155,17 @@ Of course, it's a lot of work to write these transparent recursive
|
|||
rules by hand all the time, so there are helpers that do it for you:
|
||||
|
||||
```python
|
||||
@rule
|
||||
def list():
|
||||
return zero_or_more(NUMBER, COMMA) + NUMBER
|
||||
@rule
|
||||
def list():
|
||||
return zero_or_more(NUMBER, COMMA) + NUMBER
|
||||
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
```
|
||||
|
||||
Much better.
|
||||
|
|
@ -180,20 +180,20 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in
|
|||
our number lists, we would modify the grammar as follows:
|
||||
|
||||
```python
|
||||
@rule
|
||||
def list():
|
||||
return zero_or_more(NUMBER, COMMA) + NUMBER
|
||||
@rule
|
||||
def list():
|
||||
return zero_or_more(NUMBER, COMMA) + NUMBER
|
||||
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
|
||||
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
|
||||
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
|
||||
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
trivia=[BLANKS],
|
||||
)
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
trivia=[BLANKS],
|
||||
)
|
||||
```
|
||||
|
||||
Now we can parse a list with spaces! "1 , 2, 3" will parse happily
|
||||
|
|
|
|||
158
dingus/about.md
158
dingus/about.md
|
|
@ -20,31 +20,31 @@ object.
|
|||
Here's an example:
|
||||
|
||||
```python {.numberLines}
|
||||
from parser import *
|
||||
from parser import *
|
||||
|
||||
@rule
|
||||
def expression():
|
||||
return seq(expression, PLUS, term) | term
|
||||
@rule
|
||||
def expression():
|
||||
return seq(expression, PLUS, term) | term
|
||||
|
||||
@rule
|
||||
def term():
|
||||
return seq(LPAREN, expression, RPAREN) | ID
|
||||
@rule
|
||||
def term():
|
||||
return seq(LPAREN, expression, RPAREN) | ID
|
||||
|
||||
PLUS = Terminal('PLUS', '+')
|
||||
LPAREN = Terminal('LPAREN', '(')
|
||||
RPAREN = Terminal('RPAREN', ')')
|
||||
ID = Terminal(
|
||||
'ID',
|
||||
Re.seq(
|
||||
Re.set(("a", "z"), ("A", "Z"), "_"),
|
||||
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
|
||||
),
|
||||
)
|
||||
PLUS = Terminal('PLUS', '+')
|
||||
LPAREN = Terminal('LPAREN', '(')
|
||||
RPAREN = Terminal('RPAREN', ')')
|
||||
ID = Terminal(
|
||||
'ID',
|
||||
Re.seq(
|
||||
Re.set(("a", "z"), ("A", "Z"), "_"),
|
||||
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
|
||||
),
|
||||
)
|
||||
|
||||
SimpleGrammar = Grammar(
|
||||
name="Simple",
|
||||
start=expression,
|
||||
)
|
||||
SimpleGrammar = Grammar(
|
||||
name="Simple",
|
||||
start=expression,
|
||||
)
|
||||
```
|
||||
|
||||
Terminal patterns can be plain strings or regular expressions
|
||||
|
|
@ -61,17 +61,17 @@ things can be freely nested, as desired.
|
|||
You can make lists in the classic context-free grammar way:
|
||||
|
||||
```python {.numberLines}
|
||||
@rule
|
||||
def list():
|
||||
return NUMBER | (list + COMMA + NUMBER)
|
||||
@rule
|
||||
def list():
|
||||
return NUMBER | (list + COMMA + NUMBER)
|
||||
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
```
|
||||
|
||||
(Unlike with PEGs, you can write grammars with left or right-recursion,
|
||||
|
|
@ -97,23 +97,23 @@ which means they don't generate nodes in the tree and just dump their
|
|||
contents into the parent node instead.
|
||||
|
||||
```python {.numberLines}
|
||||
@rule
|
||||
def list():
|
||||
# The starting rule can't be transparent: there has to be something to
|
||||
# hold on to!
|
||||
return transparent_list
|
||||
@rule
|
||||
def list():
|
||||
# The starting rule can't be transparent: there has to be something to
|
||||
# hold on to!
|
||||
return transparent_list
|
||||
|
||||
@rule(transparent=True)
|
||||
def transparent_list() -> Rule:
|
||||
return NUMBER | (transparent_list + COMMA + NUMBER)
|
||||
@rule(transparent=True)
|
||||
def transparent_list() -> Rule:
|
||||
return NUMBER | (transparent_list + COMMA + NUMBER)
|
||||
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
```
|
||||
|
||||
This grammar will generate the far more useful tree:
|
||||
|
|
@ -132,23 +132,23 @@ following the lead set by tree-sitter, and so the grammar above is
|
|||
probably better-written as:
|
||||
|
||||
```python {.numberLines}
|
||||
@rule
|
||||
def list():
|
||||
# The starting rule can't be transparent: there has to be something to
|
||||
# hold on to!
|
||||
return transparent_list
|
||||
@rule
|
||||
def list():
|
||||
# The starting rule can't be transparent: there has to be something to
|
||||
# hold on to!
|
||||
return transparent_list
|
||||
|
||||
@rule
|
||||
def _list() -> Rule:
|
||||
return NUMBER | (_list + COMMA + NUMBER)
|
||||
@rule
|
||||
def _list() -> Rule:
|
||||
return NUMBER | (_list + COMMA + NUMBER)
|
||||
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
```
|
||||
|
||||
That will generate the same tree, but a little more succinctly.
|
||||
|
|
@ -157,17 +157,17 @@ Of course, it's a lot of work to write these transparent recursive
|
|||
rules by hand all the time, so there are helpers that do it for you:
|
||||
|
||||
```python {.numberLines}
|
||||
@rule
|
||||
def list():
|
||||
return zero_or_more(NUMBER, COMMA) + NUMBER
|
||||
@rule
|
||||
def list():
|
||||
return zero_or_more(NUMBER, COMMA) + NUMBER
|
||||
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
)
|
||||
```
|
||||
|
||||
Much better.
|
||||
|
|
@ -182,21 +182,21 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in
|
|||
our number lists, we would modify the grammar as follows:
|
||||
|
||||
```python {.numberLines}
|
||||
@rule
|
||||
def list():
|
||||
return zero_or_more(NUMBER, COMMA) + NUMBER
|
||||
@rule
|
||||
def list():
|
||||
return zero_or_more(NUMBER, COMMA) + NUMBER
|
||||
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
COMMA = Terminal(',')
|
||||
|
||||
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
|
||||
# ^ and add a new terminal to describe what we're ignoring...
|
||||
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
|
||||
# ^ and add a new terminal to describe what we're ignoring...
|
||||
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
trivia=[BLANKS],
|
||||
)
|
||||
NumberList = Grammar(
|
||||
name="NumberList",
|
||||
start=list,
|
||||
trivia=[BLANKS],
|
||||
)
|
||||
```
|
||||
|
||||
Now we can parse a list with spaces! "1 , 2, 3" will parse happily
|
||||
|
|
|
|||
|
|
@ -8,6 +8,22 @@ from dataclasses import dataclass
|
|||
from . import parser
|
||||
|
||||
|
||||
def offset_to_line_column(lines: list[int], pos: int) -> tuple[int, int]:
    """Convert a text offset to a line number and column number given a list
    of line break positions. This is used to make errors intelligible. Lines
    are 1-based, and columns are 0-based, in accordance with editor
    traditions.

    Args:
        lines: Offsets of the line breaks in the text. The list is searched
            with ``bisect``, so it must be sorted in ascending order.
        pos: The text offset to convert.

    Returns:
        A ``(line, column)`` tuple. An offset that lands exactly on a line
        break is reported on the line that the break terminates, not on the
        following line.
    """
    # The count of line breaks strictly before `pos` is the 0-based line.
    breaks_before = bisect.bisect_left(lines, pos)

    if breaks_before == 0:
        # Still on the first line: columns are counted from offset 0.
        col_start = 0
    else:
        # The line starts one past the line break that precedes it.
        col_start = lines[breaks_before - 1] + 1

    return (breaks_before + 1, pos - col_start)
|
||||
|
||||
|
||||
@dataclass
|
||||
class TokenValue:
|
||||
kind: str
|
||||
|
|
@ -597,21 +613,16 @@ class Parser:
|
|||
if errors:
|
||||
lines = tokens.lines()
|
||||
for parse_error in errors:
|
||||
line_index = bisect.bisect_left(lines, parse_error.start)
|
||||
if line_index == 0:
|
||||
col_start = 0
|
||||
else:
|
||||
col_start = lines[line_index - 1] + 1
|
||||
column_index = parse_error.start - col_start
|
||||
line_index += 1
|
||||
|
||||
line_index, column_index = offset_to_line_column(lines, parse_error.start)
|
||||
error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")
|
||||
|
||||
return (result, error_strings)
|
||||
|
||||
|
||||
def generic_tokenize(
|
||||
src: str, table: parser.LexerTable
|
||||
src: str,
|
||||
table: parser.LexerTable,
|
||||
lines: list[int],
|
||||
) -> typing.Iterable[tuple[parser.Terminal, int, int]]:
|
||||
pos = 0
|
||||
state = 0
|
||||
|
|
@ -647,7 +658,8 @@ def generic_tokenize(
|
|||
pass
|
||||
|
||||
if last_accept is None:
|
||||
raise Exception(f"Token error at {pos}")
|
||||
line_index, column_index = offset_to_line_column(lines, pos)
|
||||
raise Exception(f"{line_index}:{column_index}: Unexpected character '{src[pos]}'")
|
||||
|
||||
yield (last_accept, start, last_accept_pos - start)
|
||||
|
||||
|
|
@ -661,10 +673,10 @@ class GenericTokenStream:
|
|||
def __init__(self, src: str, lexer: parser.LexerTable):
|
||||
self.src = src
|
||||
self.lexer = lexer
|
||||
self._tokens: list[typing.Tuple[parser.Terminal, int, int]] = list(
|
||||
generic_tokenize(src, lexer)
|
||||
)
|
||||
self._lines = [m.start() for m in re.finditer("\n", src)]
|
||||
self._tokens: list[typing.Tuple[parser.Terminal, int, int]] = list(
|
||||
generic_tokenize(src, lexer, self._lines)
|
||||
)
|
||||
|
||||
def tokens(self):
|
||||
return self._tokens
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue