Rework the documentation examples

This commit is contained in:
John Doty 2025-02-15 15:06:42 -08:00
parent ed5baefd5d
commit 5f19b1e73e
2 changed files with 189 additions and 130 deletions

149
README.md
View file

@ -17,26 +17,31 @@ class. Create one method per non-terminal, decorated with the `rule`
decorator. Here's an example: decorator. Here's an example:
```python ```python
class SimpleGrammar(Grammar): from parser import *
start = "expression"
@rule @rule
def expression(self): def expression():
return seq(self.expression, self.PLUS, self.term) | self.term return seq(expression, PLUS, term) | term
@rule @rule
def term(self): def term():
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID return seq(LPAREN, expression, RPAREN) | ID
PLUS = Terminal('+') PLUS = Terminal('PLUS', '+')
LPAREN = Terminal('(') LPAREN = Terminal('LPAREN', '(')
RPAREN = Terminal(')') RPAREN = Terminal('RPAREN', ')')
ID = Terminal( ID = Terminal(
Re.seq( 'ID',
Re.set(("a", "z"), ("A", "Z"), "_"), Re.seq(
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), Re.set(("a", "z"), ("A", "Z"), "_"),
), Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
) ),
)
SimpleGrammar = Grammar(
name="Simple",
start=expression,
)
``` ```
Terminals can be plain strings or regular expressions constructed with Terminals can be plain strings or regular expressions constructed with
@ -54,15 +59,17 @@ There are no helpers (yet!) for consuming lists, so they need to be
constructed in the classic context-free grammar way: constructed in the classic context-free grammar way:
```python ```python
class NumberList(Grammar): @rule
start = "list" def list():
return NUMBER | (list + COMMA + NUMBER)
@rule NUMBER = Terminal(Re.set(("0", "9")).plus())
def list(self): COMMA = Terminal(',')
return self.NUMBER | (self.list + self.COMMA + self.NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus()) NumberList = Grammar(
COMMA = Terminal(',') name="NumberList",
start=list,
)
``` ```
(Unlike with PEGs, you can write grammars with left or right-recursion, (Unlike with PEGs, you can write grammars with left or right-recursion,
@ -88,21 +95,23 @@ which means they don't generate nodes in the tree and just dump their
contents into the parent node instead. contents into the parent node instead.
```python ```python
class NumberList(Grammar): @rule
start = "list" def list():
# The starting rule can't be transparent: there has to be something to
# hold on to!
return transparent_list
@rule @rule(transparent=True)
def list(self): def transparent_list() -> Rule:
# The starting rule can't be transparent: there has to be something to return NUMBER | (transparent_list + COMMA + NUMBER)
# hold on to!
return self.transparent_list
@rule(transparent=True) NUMBER = Terminal(Re.set(("0", "9")).plus())
def transparent_list(self) -> Rule: COMMA = Terminal(',')
return self.NUMBER | (self.transparent_list + self.COMMA + self.NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus()) NumberList = Grammar(
COMMA = Terminal(',') name="NumberList",
start=list,
)
``` ```
This grammar will generate the far more useful tree: This grammar will generate the far more useful tree:
@ -121,23 +130,46 @@ following the lead set by tree-sitter, and so the grammar above is
probably better-written as: probably better-written as:
```python ```python
class NumberList(Grammar): @rule
start = "list" def list():
# The starting rule can't be transparent: there has to be something to
# hold on to!
return _list
@rule @rule
def list(self): def _list() -> Rule:
return self._list return NUMBER | (_list + COMMA + NUMBER)
@rule NUMBER = Terminal(Re.set(("0", "9")).plus())
def _list(self): COMMA = Terminal(',')
return self.NUMBER | (self._list + self.COMMA + self.NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus()) NumberList = Grammar(
COMMA = Terminal(',') name="NumberList",
start=list,
)
``` ```
That will generate the same tree, but a little more succinctly. That will generate the same tree, but a little more succinctly.
Of course, it's a lot of work to write these transparent recursive
rules by hand all the time, so there are helpers that do it for you:
```python
@rule
def list():
return zero_or_more(NUMBER, COMMA) + NUMBER
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
```
Much better.
### Trivia ### Trivia
Most folks that want to parse something want to skip blanks when they Most folks that want to parse something want to skip blanks when they
@ -148,23 +180,20 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in
our number lists, we would modify the grammar as follows: our number lists, we would modify the grammar as follows:
```python ```python
class NumberList(Grammar): @rule
start = "list" def list():
trivia = ["BLANKS"] # <- Add a `trivia` member return zero_or_more(NUMBER, COMMA) + NUMBER
@rule NUMBER = Terminal(Re.set(("0", "9")).plus())
def list(self): COMMA = Terminal(',')
return self._list
@rule BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
def _list(self):
return self.NUMBER | (self._list + self.COMMA + self.NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus()) NumberList = Grammar(
COMMA = Terminal(',') name="NumberList",
start=list,
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) trivia=[BLANKS],
# ^ and add a new terminal to describe it )
``` ```
Now we can parse a list with spaces! "1 , 2, 3" will parse happily Now we can parse a list with spaces! "1 , 2, 3" will parse happily

View file

@ -12,59 +12,66 @@ about doing fun things with grammars.
## Making Grammars ## Making Grammars
To get started, create a grammar that derives from the `Grammar` To get started, create one function per non-terminal, decorated with
class. Create one method per non-terminal, decorated with the `rule` the `rule` decorator, and one instance of a `Terminal` object for each
decorator. Here's an example: terminal. Then tie it all together with an instance of a Grammar
object.
Here's an example:
```python {.numberLines} ```python {.numberLines}
from parser import * from parser import *
class SimpleGrammar(Grammar): @rule
start = "expression" def expression():
return seq(expression, PLUS, term) | term
@rule @rule
def expression(self): def term():
return seq(self.expression, self.PLUS, self.term) | self.term return seq(LPAREN, expression, RPAREN) | ID
@rule PLUS = Terminal('PLUS', '+')
def term(self): LPAREN = Terminal('LPAREN', '(')
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID RPAREN = Terminal('RPAREN', ')')
ID = Terminal(
'ID',
Re.seq(
Re.set(("a", "z"), ("A", "Z"), "_"),
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
),
)
PLUS = Terminal('+') SimpleGrammar = Grammar(
LPAREN = Terminal('(') name="Simple",
RPAREN = Terminal(')') start=expression,
ID = Terminal( )
Re.seq(
Re.set(("a", "z"), ("A", "Z"), "_"),
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
),
)
``` ```
Terminals can be plain strings or regular expressions constructed with Terminal patterns can be plain strings or regular expressions
the `Re` object. (Ironically, I guess this library is not clever constructed with the `Re` object. (Ironically, I guess this library is
enough to parse a regular expression string into one of these not clever enough to parse a regular expression string into one of
structures. If you want to build one, go nuts! It's just Python, you these structures. If you want to build one, go nuts! It's just Python,
can do whatever you want so long as the result is an `Re` object.) you can do whatever you want so long as the result is an `Re` object.)
Productions can be built out of terminals and non-terminals, Productions can be built out of terminals and non-terminals,
concatenated with the `seq` function or the `+` operator. Alternatives concatenated with the `seq` function or the `+` operator. Alternatives
can be expressed with the `alt` function or the `|` operator. These can be expressed with the `alt` function or the `|` operator. These
things can be freely nested, as desired. things can be freely nested, as desired.
There are no helpers (yet!) for consuming lists, so they need to be You can make lists in the classic context-free grammar way:
constructed in the classic context-free grammar way:
```python {.numberLines} ```python {.numberLines}
class NumberList(Grammar): @rule
start = "list" def list():
return NUMBER | (list + COMMA + NUMBER)
@rule NUMBER = Terminal(Re.set(("0", "9")).plus())
def list(self): COMMA = Terminal(',')
return self.NUMBER | (self.list + self.COMMA + self.NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus()) NumberList = Grammar(
COMMA = Terminal(',') name="NumberList",
start=list,
)
``` ```
(Unlike with PEGs, you can write grammars with left or right-recursion, (Unlike with PEGs, you can write grammars with left or right-recursion,
@ -90,21 +97,23 @@ which means they don't generate nodes in the tree and just dump their
contents into the parent node instead. contents into the parent node instead.
```python {.numberLines} ```python {.numberLines}
class NumberList(Grammar): @rule
start = "list" def list():
# The starting rule can't be transparent: there has to be something to
# hold on to!
return transparent_list
@rule @rule(transparent=True)
def list(self): def transparent_list() -> Rule:
# The starting rule can't be transparent: there has to be something to return NUMBER | (transparent_list + COMMA + NUMBER)
# hold on to!
return self.transparent_list
@rule(transparent=True) NUMBER = Terminal(Re.set(("0", "9")).plus())
def transparent_list(self) -> Rule: COMMA = Terminal(',')
return self.NUMBER | (self.transparent_list + self.COMMA + self.NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus()) NumberList = Grammar(
COMMA = Terminal(',') name="NumberList",
start=list,
)
``` ```
This grammar will generate the far more useful tree: This grammar will generate the far more useful tree:
@ -123,23 +132,46 @@ following the lead set by tree-sitter, and so the grammar above is
probably better-written as: probably better-written as:
```python {.numberLines} ```python {.numberLines}
class NumberList(Grammar): @rule
start = "list" def list():
# The starting rule can't be transparent: there has to be something to
# hold on to!
return _list
@rule @rule
def list(self): def _list() -> Rule:
return self._list return NUMBER | (_list + COMMA + NUMBER)
@rule NUMBER = Terminal(Re.set(("0", "9")).plus())
def _list(self): COMMA = Terminal(',')
return self.NUMBER | (self._list + self.COMMA + self.NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus()) NumberList = Grammar(
COMMA = Terminal(',') name="NumberList",
start=list,
)
``` ```
That will generate the same tree, but a little more succinctly. That will generate the same tree, but a little more succinctly.
Of course, it's a lot of work to write these transparent recursive
rules by hand all the time, so there are helpers that do it for you:
```python {.numberLines}
@rule
def list():
return zero_or_more(NUMBER, COMMA) + NUMBER
NUMBER = Terminal(Re.set(("0", "9")).plus())
COMMA = Terminal(',')
NumberList = Grammar(
name="NumberList",
start=list,
)
```
Much better.
### Trivia ### Trivia
Most folks that want to parse something want to skip blanks when they Most folks that want to parse something want to skip blanks when they
@ -150,23 +182,21 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in
our number lists, we would modify the grammar as follows: our number lists, we would modify the grammar as follows:
```python {.numberLines} ```python {.numberLines}
class NumberList(Grammar): @rule
start = "list" def list():
trivia = ["BLANKS"] # <- Add a `trivia` member return zero_or_more(NUMBER, COMMA) + NUMBER
@rule NUMBER = Terminal(Re.set(("0", "9")).plus())
def list(self): COMMA = Terminal(',')
return self._list
@rule BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
def _list(self): # ^ and add a new terminal to describe what we're ignoring...
return self.NUMBER | (self._list + self.COMMA + self.NUMBER)
NUMBER = Terminal(Re.set(("0", "9")).plus()) NumberList = Grammar(
COMMA = Terminal(',') name="NumberList",
start=list,
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) trivia=[BLANKS],
# ^ and add a new terminal to describe it )
``` ```
Now we can parse a list with spaces! "1 , 2, 3" will parse happily Now we can parse a list with spaces! "1 , 2, 3" will parse happily