Compare commits
2 commits
5f19b1e73e
...
b2e7d15fb8
| Author | SHA1 | Date | |
|---|---|---|---|
| b2e7d15fb8 | |||
| fbccaea2fa |
3 changed files with 182 additions and 170 deletions
156
README.md
156
README.md
|
|
@ -17,31 +17,31 @@ class. Create one method per non-terminal, decorated with the `rule`
|
||||||
decorator. Here's an example:
|
decorator. Here's an example:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from parser import *
|
from parser import *
|
||||||
|
|
||||||
@rule
|
@rule
|
||||||
def expression():
|
def expression():
|
||||||
return seq(expression, PLUS, term) | term
|
return seq(expression, PLUS, term) | term
|
||||||
|
|
||||||
@rule
|
@rule
|
||||||
def term():
|
def term():
|
||||||
return seq(LPAREN, expression, RPAREN) | ID
|
return seq(LPAREN, expression, RPAREN) | ID
|
||||||
|
|
||||||
PLUS = Terminal('PLUS', '+')
|
PLUS = Terminal('PLUS', '+')
|
||||||
LPAREN = Terminal('LPAREN', '(')
|
LPAREN = Terminal('LPAREN', '(')
|
||||||
RPAREN = Terminal('RPAREN', ')')
|
RPAREN = Terminal('RPAREN', ')')
|
||||||
ID = Terminal(
|
ID = Terminal(
|
||||||
'ID',
|
'ID',
|
||||||
Re.seq(
|
Re.seq(
|
||||||
Re.set(("a", "z"), ("A", "Z"), "_"),
|
Re.set(("a", "z"), ("A", "Z"), "_"),
|
||||||
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
|
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
SimpleGrammar = Grammar(
|
SimpleGrammar = Grammar(
|
||||||
name="Simple",
|
name="Simple",
|
||||||
start=expression,
|
start=expression,
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
Terminals can be plain strings or regular expressions constructed with
|
Terminals can be plain strings or regular expressions constructed with
|
||||||
|
|
@ -59,17 +59,17 @@ There are no helpers (yet!) for consuming lists, so they need to be
|
||||||
constructed in the classic context-free grammar way:
|
constructed in the classic context-free grammar way:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@rule
|
@rule
|
||||||
def list():
|
def list():
|
||||||
return NUMBER | (list + COMMA + NUMBER)
|
return NUMBER | (list + COMMA + NUMBER)
|
||||||
|
|
||||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||||
COMMA = Terminal(',')
|
COMMA = Terminal(',')
|
||||||
|
|
||||||
NumberList = Grammar(
|
NumberList = Grammar(
|
||||||
name="NumberList",
|
name="NumberList",
|
||||||
start=list,
|
start=list,
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
(Unlike with PEGs, you can write grammars with left or right-recursion,
|
(Unlike with PEGs, you can write grammars with left or right-recursion,
|
||||||
|
|
@ -95,23 +95,23 @@ which means they don't generate nodes in the tree and just dump their
|
||||||
contents into the parent node instead.
|
contents into the parent node instead.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@rule
|
@rule
|
||||||
def list():
|
def list():
|
||||||
# The starting rule can't be transparent: there has to be something to
|
# The starting rule can't be transparent: there has to be something to
|
||||||
# hold on to!
|
# hold on to!
|
||||||
return transparent_list
|
return transparent_list
|
||||||
|
|
||||||
@rule(transparent=True)
|
@rule(transparent=True)
|
||||||
def transparent_list() -> Rule:
|
def transparent_list() -> Rule:
|
||||||
return NUMBER | (transparent_list + COMMA + NUMBER)
|
return NUMBER | (transparent_list + COMMA + NUMBER)
|
||||||
|
|
||||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||||
COMMA = Terminal(',')
|
COMMA = Terminal(',')
|
||||||
|
|
||||||
NumberList = Grammar(
|
NumberList = Grammar(
|
||||||
name="NumberList",
|
name="NumberList",
|
||||||
start=list,
|
start=list,
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
This grammar will generate the far more useful tree:
|
This grammar will generate the far more useful tree:
|
||||||
|
|
@ -130,23 +130,23 @@ following the lead set by tree-sitter, and so the grammar above is
|
||||||
probably better-written as:
|
probably better-written as:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@rule
|
@rule
|
||||||
def list():
|
def list():
|
||||||
# The starting rule can't be transparent: there has to be something to
|
# The starting rule can't be transparent: there has to be something to
|
||||||
# hold on to!
|
# hold on to!
|
||||||
return _list
|
return _list
|
||||||
|
|
||||||
@rule
|
@rule
|
||||||
def _list() -> Rule:
|
def _list() -> Rule:
|
||||||
return NUMBER | (_list + COMMA + NUMBER)
|
return NUMBER | (_list + COMMA + NUMBER)
|
||||||
|
|
||||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||||
COMMA = Terminal(',')
|
COMMA = Terminal(',')
|
||||||
|
|
||||||
NumberList = Grammar(
|
NumberList = Grammar(
|
||||||
name="NumberList",
|
name="NumberList",
|
||||||
start=list,
|
start=list,
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
That will generate the same tree, but a little more succinctly.
|
That will generate the same tree, but a little more succinctly.
|
||||||
|
|
@ -155,17 +155,17 @@ Of course, it's a lot of work to write these transparent recursive
|
||||||
rules by hand all the time, so there are helpers that do it for you:
|
rules by hand all the time, so there are helpers that do it for you:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@rule
|
@rule
|
||||||
def list():
|
def list():
|
||||||
return zero_or_more(NUMBER, COMMA) + NUMBER
|
return zero_or_more(NUMBER, COMMA) + NUMBER
|
||||||
|
|
||||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||||
COMMA = Terminal(',')
|
COMMA = Terminal(',')
|
||||||
|
|
||||||
NumberList = Grammar(
|
NumberList = Grammar(
|
||||||
name="NumberList",
|
name="NumberList",
|
||||||
start=list,
|
start=list,
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
Much better.
|
Much better.
|
||||||
|
|
@ -180,20 +180,20 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in
|
||||||
our number lists, we would modify the grammar as follows:
|
our number lists, we would modify the grammar as follows:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@rule
|
@rule
|
||||||
def list():
|
def list():
|
||||||
return zero_or_more(NUMBER, COMMA) + NUMBER
|
return zero_or_more(NUMBER, COMMA) + NUMBER
|
||||||
|
|
||||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||||
COMMA = Terminal(',')
|
COMMA = Terminal(',')
|
||||||
|
|
||||||
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
|
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
|
||||||
|
|
||||||
NumberList = Grammar(
|
NumberList = Grammar(
|
||||||
name="NumberList",
|
name="NumberList",
|
||||||
start=list,
|
start=list,
|
||||||
trivia=[BLANKS],
|
trivia=[BLANKS],
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
Now we can parse a list with spaces! "1 , 2, 3" will parse happily
|
Now we can parse a list with spaces! "1 , 2, 3" will parse happily
|
||||||
|
|
|
||||||
158
dingus/about.md
158
dingus/about.md
|
|
@ -20,31 +20,31 @@ object.
|
||||||
Here's an example:
|
Here's an example:
|
||||||
|
|
||||||
```python {.numberLines}
|
```python {.numberLines}
|
||||||
from parser import *
|
from parser import *
|
||||||
|
|
||||||
@rule
|
@rule
|
||||||
def expression():
|
def expression():
|
||||||
return seq(expression, PLUS, term) | term
|
return seq(expression, PLUS, term) | term
|
||||||
|
|
||||||
@rule
|
@rule
|
||||||
def term():
|
def term():
|
||||||
return seq(LPAREN, expression, RPAREN) | ID
|
return seq(LPAREN, expression, RPAREN) | ID
|
||||||
|
|
||||||
PLUS = Terminal('PLUS', '+')
|
PLUS = Terminal('PLUS', '+')
|
||||||
LPAREN = Terminal('LPAREN', '(')
|
LPAREN = Terminal('LPAREN', '(')
|
||||||
RPAREN = Terminal('RPAREN', ')')
|
RPAREN = Terminal('RPAREN', ')')
|
||||||
ID = Terminal(
|
ID = Terminal(
|
||||||
'ID',
|
'ID',
|
||||||
Re.seq(
|
Re.seq(
|
||||||
Re.set(("a", "z"), ("A", "Z"), "_"),
|
Re.set(("a", "z"), ("A", "Z"), "_"),
|
||||||
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
|
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
SimpleGrammar = Grammar(
|
SimpleGrammar = Grammar(
|
||||||
name="Simple",
|
name="Simple",
|
||||||
start=expression,
|
start=expression,
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
Terminal patterns can be plain strings or regular expressions
|
Terminal patterns can be plain strings or regular expressions
|
||||||
|
|
@ -61,17 +61,17 @@ things can be freely nested, as desired.
|
||||||
You can make lists in the classic context-free grammar way:
|
You can make lists in the classic context-free grammar way:
|
||||||
|
|
||||||
```python {.numberLines}
|
```python {.numberLines}
|
||||||
@rule
|
@rule
|
||||||
def list():
|
def list():
|
||||||
return NUMBER | (list + COMMA + NUMBER)
|
return NUMBER | (list + COMMA + NUMBER)
|
||||||
|
|
||||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||||
COMMA = Terminal(',')
|
COMMA = Terminal(',')
|
||||||
|
|
||||||
NumberList = Grammar(
|
NumberList = Grammar(
|
||||||
name="NumberList",
|
name="NumberList",
|
||||||
start=list,
|
start=list,
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
(Unlike with PEGs, you can write grammars with left or right-recursion,
|
(Unlike with PEGs, you can write grammars with left or right-recursion,
|
||||||
|
|
@ -97,23 +97,23 @@ which means they don't generate nodes in the tree and just dump their
|
||||||
contents into the parent node instead.
|
contents into the parent node instead.
|
||||||
|
|
||||||
```python {.numberLines}
|
```python {.numberLines}
|
||||||
@rule
|
@rule
|
||||||
def list():
|
def list():
|
||||||
# The starting rule can't be transparent: there has to be something to
|
# The starting rule can't be transparent: there has to be something to
|
||||||
# hold on to!
|
# hold on to!
|
||||||
return transparent_list
|
return transparent_list
|
||||||
|
|
||||||
@rule(transparent=True)
|
@rule(transparent=True)
|
||||||
def transparent_list() -> Rule:
|
def transparent_list() -> Rule:
|
||||||
return NUMBER | (transparent_list + COMMA + NUMBER)
|
return NUMBER | (transparent_list + COMMA + NUMBER)
|
||||||
|
|
||||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||||
COMMA = Terminal(',')
|
COMMA = Terminal(',')
|
||||||
|
|
||||||
NumberList = Grammar(
|
NumberList = Grammar(
|
||||||
name="NumberList",
|
name="NumberList",
|
||||||
start=list,
|
start=list,
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
This grammar will generate the far more useful tree:
|
This grammar will generate the far more useful tree:
|
||||||
|
|
@ -132,23 +132,23 @@ following the lead set by tree-sitter, and so the grammar above is
|
||||||
probably better-written as:
|
probably better-written as:
|
||||||
|
|
||||||
```python {.numberLines}
|
```python {.numberLines}
|
||||||
@rule
|
@rule
|
||||||
def list():
|
def list():
|
||||||
# The starting rule can't be transparent: there has to be something to
|
# The starting rule can't be transparent: there has to be something to
|
||||||
# hold on to!
|
# hold on to!
|
||||||
return _list
|
return _list
|
||||||
|
|
||||||
@rule
|
@rule
|
||||||
def _list() -> Rule:
|
def _list() -> Rule:
|
||||||
return NUMBER | (_list + COMMA + NUMBER)
|
return NUMBER | (_list + COMMA + NUMBER)
|
||||||
|
|
||||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||||
COMMA = Terminal(',')
|
COMMA = Terminal(',')
|
||||||
|
|
||||||
NumberList = Grammar(
|
NumberList = Grammar(
|
||||||
name="NumberList",
|
name="NumberList",
|
||||||
start=list,
|
start=list,
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
That will generate the same tree, but a little more succinctly.
|
That will generate the same tree, but a little more succinctly.
|
||||||
|
|
@ -157,17 +157,17 @@ Of course, it's a lot of work to write these transparent recursive
|
||||||
rules by hand all the time, so there are helpers that do it for you:
|
rules by hand all the time, so there are helpers that do it for you:
|
||||||
|
|
||||||
```python {.numberLines}
|
```python {.numberLines}
|
||||||
@rule
|
@rule
|
||||||
def list():
|
def list():
|
||||||
return zero_or_more(NUMBER, COMMA) + NUMBER
|
return zero_or_more(NUMBER, COMMA) + NUMBER
|
||||||
|
|
||||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||||
COMMA = Terminal(',')
|
COMMA = Terminal(',')
|
||||||
|
|
||||||
NumberList = Grammar(
|
NumberList = Grammar(
|
||||||
name="NumberList",
|
name="NumberList",
|
||||||
start=list,
|
start=list,
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
Much better.
|
Much better.
|
||||||
|
|
@ -182,21 +182,21 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in
|
||||||
our number lists, we would modify the grammar as follows:
|
our number lists, we would modify the grammar as follows:
|
||||||
|
|
||||||
```python {.numberLines}
|
```python {.numberLines}
|
||||||
@rule
|
@rule
|
||||||
def list():
|
def list():
|
||||||
return zero_or_more(NUMBER, COMMA) + NUMBER
|
return zero_or_more(NUMBER, COMMA) + NUMBER
|
||||||
|
|
||||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||||
COMMA = Terminal(',')
|
COMMA = Terminal(',')
|
||||||
|
|
||||||
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
|
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
|
||||||
# ^ and add a new terminal to describe what we're ignoring...
|
# ^ and add a new terminal to describe what we're ignoring...
|
||||||
|
|
||||||
NumberList = Grammar(
|
NumberList = Grammar(
|
||||||
name="NumberList",
|
name="NumberList",
|
||||||
start=list,
|
start=list,
|
||||||
trivia=[BLANKS],
|
trivia=[BLANKS],
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
Now we can parse a list with spaces! "1 , 2, 3" will parse happily
|
Now we can parse a list with spaces! "1 , 2, 3" will parse happily
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,22 @@ from dataclasses import dataclass
|
||||||
from . import parser
|
from . import parser
|
||||||
|
|
||||||
|
|
||||||
|
def offset_to_line_column(lines: list[int], pos: int) -> tuple[int, int]:
|
||||||
|
"""Convert a text offset to a line number and column number given a list
|
||||||
|
of line break positions. This is used to make errors intelligible. Lines
|
||||||
|
are 1-based, and columns are 0-based, in accordance with editor
|
||||||
|
traditions.
|
||||||
|
"""
|
||||||
|
line_index = bisect.bisect_left(lines, pos)
|
||||||
|
if line_index == 0:
|
||||||
|
col_start = 0
|
||||||
|
else:
|
||||||
|
col_start = lines[line_index - 1] + 1
|
||||||
|
column_index = pos - col_start
|
||||||
|
line_index += 1
|
||||||
|
return (line_index, column_index)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class TokenValue:
|
class TokenValue:
|
||||||
kind: str
|
kind: str
|
||||||
|
|
@ -597,21 +613,16 @@ class Parser:
|
||||||
if errors:
|
if errors:
|
||||||
lines = tokens.lines()
|
lines = tokens.lines()
|
||||||
for parse_error in errors:
|
for parse_error in errors:
|
||||||
line_index = bisect.bisect_left(lines, parse_error.start)
|
line_index, column_index = offset_to_line_column(lines, parse_error.start)
|
||||||
if line_index == 0:
|
|
||||||
col_start = 0
|
|
||||||
else:
|
|
||||||
col_start = lines[line_index - 1] + 1
|
|
||||||
column_index = parse_error.start - col_start
|
|
||||||
line_index += 1
|
|
||||||
|
|
||||||
error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")
|
error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")
|
||||||
|
|
||||||
return (result, error_strings)
|
return (result, error_strings)
|
||||||
|
|
||||||
|
|
||||||
def generic_tokenize(
|
def generic_tokenize(
|
||||||
src: str, table: parser.LexerTable
|
src: str,
|
||||||
|
table: parser.LexerTable,
|
||||||
|
lines: list[int],
|
||||||
) -> typing.Iterable[tuple[parser.Terminal, int, int]]:
|
) -> typing.Iterable[tuple[parser.Terminal, int, int]]:
|
||||||
pos = 0
|
pos = 0
|
||||||
state = 0
|
state = 0
|
||||||
|
|
@ -647,7 +658,8 @@ def generic_tokenize(
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if last_accept is None:
|
if last_accept is None:
|
||||||
raise Exception(f"Token error at {pos}")
|
line_index, column_index = offset_to_line_column(lines, pos)
|
||||||
|
raise Exception(f"{line_index}:{column_index}: Unexpected character '{src[pos]}'")
|
||||||
|
|
||||||
yield (last_accept, start, last_accept_pos - start)
|
yield (last_accept, start, last_accept_pos - start)
|
||||||
|
|
||||||
|
|
@ -661,10 +673,10 @@ class GenericTokenStream:
|
||||||
def __init__(self, src: str, lexer: parser.LexerTable):
|
def __init__(self, src: str, lexer: parser.LexerTable):
|
||||||
self.src = src
|
self.src = src
|
||||||
self.lexer = lexer
|
self.lexer = lexer
|
||||||
self._tokens: list[typing.Tuple[parser.Terminal, int, int]] = list(
|
|
||||||
generic_tokenize(src, lexer)
|
|
||||||
)
|
|
||||||
self._lines = [m.start() for m in re.finditer("\n", src)]
|
self._lines = [m.start() for m in re.finditer("\n", src)]
|
||||||
|
self._tokens: list[typing.Tuple[parser.Terminal, int, int]] = list(
|
||||||
|
generic_tokenize(src, lexer, self._lines)
|
||||||
|
)
|
||||||
|
|
||||||
def tokens(self):
|
def tokens(self):
|
||||||
return self._tokens
|
return self._tokens
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue