Compare commits

...

2 commits

Author SHA1 Message Date
5f19b1e73e Rework the documentation examples 2025-02-15 15:06:42 -08:00
ed5baefd5d Fix errors 2025-02-14 19:09:35 -08:00
4 changed files with 204 additions and 131 deletions

149
README.md
View file

@ -17,26 +17,31 @@ class. Create one method per non-terminal, decorated with the `rule`
decorator. Here's an example:
```python
class SimpleGrammar(Grammar):
start = "expression"
from parser import *
@rule
def expression(self):
return seq(self.expression, self.PLUS, self.term) | self.term
@rule
def expression():
return seq(expression, PLUS, term) | term
@rule
def term(self):
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
@rule
def term():
return seq(LPAREN, expression, RPAREN) | ID
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal(
Re.seq(
Re.set(("a", "z"), ("A", "Z"), "_"),
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
),
)
PLUS = Terminal('PLUS', '+')
LPAREN = Terminal('LPAREN', '(')
RPAREN = Terminal('RPAREN', ')')
ID = Terminal(
'ID',
Re.seq(
Re.set(("a", "z"), ("A", "Z"), "_"),
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
),
)
SimpleGrammar = Grammar(
name="Simple",
start=expression,
)
```
Terminals can be plain strings or regular expressions constructed with
@ -54,15 +59,17 @@ There are no helpers (yet!) for consuming lists, so they need to be
constructed in the classic context-free grammar way:
```python
class NumberList(Grammar):
start = "list"
@rule
def list():
return NUMBER | (list + COMMA + NUMBER)
@rule
def list(self):
return self.NUMBER | (self.list + self.COMMA + self.NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
```
(Unlike with PEGs, you can write grammars with left or right-recursion,
@ -88,21 +95,23 @@ which means they don't generate nodes in the tree and just dump their
contents into the parent node instead.
```python
class NumberList(Grammar):
start = "list"
@rule
def list():
# The starting rule can't be transparent: there has to be something to
# hold on to!
return transparent_list
@rule
def list(self):
# The starting rule can't be transparent: there has to be something to
# hold on to!
return self.transparent_list
@rule(transparent=True)
def transparent_list() -> Rule:
return NUMBER | (transparent_list + COMMA + NUMBER)
@rule(transparent=True)
def transparent_list(self) -> Rule:
return self.NUMBER | (self.transparent_list + self.COMMA + self.NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
```
This grammar will generate the far more useful tree:
@ -121,23 +130,46 @@ following the lead set by tree-sitter, and so the grammar above is
probably better-written as:
```python
class NumberList(Grammar):
start = "list"
@rule
def list():
    # The starting rule can't be transparent: there has to be something to
    # hold on to! It delegates to `_list`, the underscore-prefixed rule
    # defined next in this example.
    return _list
@rule
def list(self):
return self._list
@rule
def _list() -> Rule:
return NUMBER | (_list + COMMA + NUMBER)
@rule
def _list(self):
return self.NUMBER | (self._list + self.COMMA + self.NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
```
That will generate the same tree, but a little more succinctly.
Of course, it's a lot of work to write these transparent recursive
rules by hand all the time, so there are helpers that do it for you:
```python
@rule
def list():
return zero_or_more(NUMBER, COMMA) + NUMBER
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
```
Much better.
### Trivia
Most folks that want to parse something want to skip blanks when they
@ -148,23 +180,20 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in
our number lists, we would modify the grammar as follows:
```python
class NumberList(Grammar):
start = "list"
trivia = ["BLANKS"] # <- Add a `trivia` member
@rule
def list():
return zero_or_more(NUMBER, COMMA) + NUMBER
@rule
def list(self):
return self._list
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
@rule
def _list(self):
return self.NUMBER | (self._list + self.COMMA + self.NUMBER)
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
# ^ and add a new terminal to describe it
NumberList = Grammar(
name="NumberList",
start=list,
trivia=[BLANKS],
)
```
Now we can parse a list with spaces! "1 , 2, 3" will parse happily

View file

@ -12,59 +12,66 @@ about doing fun things with grammars.
## Making Grammars
To get started, create a grammar that derives from the `Grammar`
class. Create one method per non-terminal, decorated with the `rule`
decorator. Here's an example:
To get started, create one function per non-terminal, decorated with
the `rule` decorator, and one instance of a `Terminal` object for each
terminal. Then tie it all together with an instance of a `Grammar`
object.
Here's an example:
```python {.numberLines}
from parser import *
class SimpleGrammar(Grammar):
start = "expression"
@rule
def expression():
return seq(expression, PLUS, term) | term
@rule
def expression(self):
return seq(self.expression, self.PLUS, self.term) | self.term
@rule
def term():
return seq(LPAREN, expression, RPAREN) | ID
@rule
def term(self):
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
PLUS = Terminal('PLUS', '+')
LPAREN = Terminal('LPAREN', '(')
RPAREN = Terminal('RPAREN', ')')
ID = Terminal(
'ID',
Re.seq(
Re.set(("a", "z"), ("A", "Z"), "_"),
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
),
)
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal(
Re.seq(
Re.set(("a", "z"), ("A", "Z"), "_"),
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
),
)
SimpleGrammar = Grammar(
name="Simple",
start=expression,
)
```
Terminals can be plain strings or regular expressions constructed with
the `Re` object. (Ironically, I guess this library is not clever
enough to parse a regular expression string into one of these
structures. If you want to build one, go nuts! It's just Python, you
can do whatever you want so long as the result is an `Re` object.)
Terminal patterns can be plain strings or regular expressions
constructed with the `Re` object. (Ironically, I guess this library is
not clever enough to parse a regular expression string into one of
these structures. If you want to build one, go nuts! It's just Python,
you can do whatever you want so long as the result is an `Re` object.)
Productions can be built out of terminals and non-terminals,
concatenated with the `seq` function or the `+` operator. Alternatives
can be expressed with the `alt` function or the `|` operator. These
things can be freely nested, as desired.
There are no helpers (yet!) for consuming lists, so they need to be
constructed in the classic context-free grammar way:
You can make lists in the classic context-free grammar way:
```python {.numberLines}
class NumberList(Grammar):
start = "list"
@rule
def list():
return NUMBER | (list + COMMA + NUMBER)
@rule
def list(self):
return self.NUMBER | (self.list + self.COMMA + self.NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
```
(Unlike with PEGs, you can write grammars with left or right-recursion,
@ -90,21 +97,23 @@ which means they don't generate nodes in the tree and just dump their
contents into the parent node instead.
```python {.numberLines}
class NumberList(Grammar):
start = "list"
@rule
def list():
# The starting rule can't be transparent: there has to be something to
# hold on to!
return transparent_list
@rule
def list(self):
# The starting rule can't be transparent: there has to be something to
# hold on to!
return self.transparent_list
@rule(transparent=True)
def transparent_list() -> Rule:
return NUMBER | (transparent_list + COMMA + NUMBER)
@rule(transparent=True)
def transparent_list(self) -> Rule:
return self.NUMBER | (self.transparent_list + self.COMMA + self.NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
```
This grammar will generate the far more useful tree:
@ -123,23 +132,46 @@ following the lead set by tree-sitter, and so the grammar above is
probably better-written as:
```python {.numberLines}
class NumberList(Grammar):
start = "list"
@rule
def list():
    # The starting rule can't be transparent: there has to be something to
    # hold on to! It delegates to `_list`, the underscore-prefixed rule
    # defined next in this example.
    return _list
@rule
def list(self):
return self._list
@rule
def _list() -> Rule:
return NUMBER | (_list + COMMA + NUMBER)
@rule
def _list(self):
return self.NUMBER | (self._list + self.COMMA + self.NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
```
That will generate the same tree, but a little more succinctly.
Of course, it's a lot of work to write these transparent recursive
rules by hand all the time, so there are helpers that do it for you:
```python {.numberLines}
@rule
def list():
return zero_or_more(NUMBER, COMMA) + NUMBER
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
```
Much better.
### Trivia
Most folks that want to parse something want to skip blanks when they
@ -150,23 +182,21 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in
our number lists, we would modify the grammar as follows:
```python {.numberLines}
class NumberList(Grammar):
start = "list"
trivia = ["BLANKS"] # <- Add a `trivia` member
@rule
def list():
return zero_or_more(NUMBER, COMMA) + NUMBER
@rule
def list(self):
return self._list
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
@rule
def _list(self):
return self.NUMBER | (self._list + self.COMMA + self.NUMBER)
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
# ^ and add a new terminal to describe what we're ignoring...
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
# ^ and add a new terminal to describe it
NumberList = Grammar(
name="NumberList",
start=list,
trivia=[BLANKS],
)
```
Now we can parse a list with spaces! "1 , 2, 3" will parse happily

View file

@ -18,7 +18,7 @@ include lrparser.mk
wheel: dist/lrparsers-$(VERSION)-py3-none-any.whl
dist/lrparsers-$(VERSION).tar.gz dist/lrparsers-$(VERSION)-py3-none-any.whl: pyproject.toml $(PYTHON_SOURCES)
uv build --no-clean
uv build --offline #--no-clean
.PHONY: clean
clean:

View file

@ -236,6 +236,20 @@ class ItemSet:
def __init__(self, items=None):
    """Set up the item mapping and an empty hash cache.

    Any falsy ``items`` argument (``None`` or an empty mapping) is replaced
    by a fresh dict; a non-empty mapping is stored as-is, not copied.
    """
    # Filled in lazily by __hash__ on first use.
    self._hash = None
    self.items = items if items else {}
def __hash__(self):
    """Return a hash of ``self.items``, computed once and cached in ``self._hash``.

    ``__eq__`` compares ``self.items`` as dicts, and dict equality ignores
    insertion order. The hash is therefore built from a frozenset of
    ``(key, frozenset(value))`` pairs rather than an ordered tuple, so that
    two equal item sets always hash alike regardless of the order in which
    their entries were added.
    """
    # TODO: FREEZE
    # NOTE(review): the cached value is never invalidated in this method, so
    # mutating `self.items` after the first call leaves a stale hash —
    # presumably what the FREEZE TODO refers to; confirm.
    if self._hash is None:
        self._hash = hash(
            frozenset((key, frozenset(value)) for key, value in self.items.items())
        )
    return self._hash
def __eq__(self, other):
    """Two item sets are equal when their ``items`` mappings are equal.

    Anything that is not an ``ItemSet`` compares unequal (returns ``False``).
    """
    return isinstance(other, ItemSet) and self.items == other.items
def weakly_compatible(self, other: "ItemSet") -> bool:
a = self.items