diff --git a/README.md b/README.md index 44457ee..d416145 100644 --- a/README.md +++ b/README.md @@ -17,31 +17,26 @@ class. Create one method per non-terminal, decorated with the `rule` decorator. Here's an example: ```python - from parser import * + class SimpleGrammar(Grammar): + start = "expression" - @rule - def expression(): - return seq(expression, PLUS, term) | term + @rule + def expression(self): + return seq(self.expression, self.PLUS, self.term) | self.term - @rule - def term(): - return seq(LPAREN, expression, RPAREN) | ID + @rule + def term(self): + return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID - PLUS = Terminal('PLUS', '+') - LPAREN = Terminal('LPAREN', '(') - RPAREN = Terminal('RPAREN', ')') - ID = Terminal( - 'ID', - Re.seq( - Re.set(("a", "z"), ("A", "Z"), "_"), - Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), - ), - ) - - SimpleGrammar = Grammar( - name="Simple", - start=expression, - ) + PLUS = Terminal('+') + LPAREN = Terminal('(') + RPAREN = Terminal(')') + ID = Terminal( + Re.seq( + Re.set(("a", "z"), ("A", "Z"), "_"), + Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), + ), + ) ``` Terminals can be plain strings or regular expressions constructed with @@ -59,17 +54,15 @@ There are no helpers (yet!) for consuming lists, so they need to be constructed in the classic context-free grammar way: ```python - @rule - def list(): - return NUMBER | (list + COMMA + NUMBER) + class NumberList(Grammar): + start = "list" - NUMBER = Terminal(Re.set(("0", "9")).plus()) - COMMA = Terminal(',') + @rule + def list(self): + return self.NUMBER | (self.list + self.COMMA + self.NUMBER) - NumberList = Grammar( - name="NumberList", - start=list, - ) + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') ``` (Unlike with PEGs, you can write grammars with left or right-recursion, @@ -95,23 +88,21 @@ which means they don't generate nodes in the tree and just dump their contents into the parent node instead. ```python - @rule - def list(): - # The starting rule can't be transparent: there has to be something to - # hold on to! - return transparent_list + class NumberList(Grammar): + start = "list" - @rule(transparent=True) - def transparent_list() -> Rule: - return NUMBER | (transparent_list + COMMA + NUMBER) + @rule + def list(self): + # The starting rule can't be transparent: there has to be something to + # hold on to! + return self.transparent_list - NUMBER = Terminal(Re.set(("0", "9")).plus()) - COMMA = Terminal(',') + @rule(transparent=True) + def transparent_list(self) -> Rule: + return self.NUMBER | (self.transparent_list + self.COMMA + self.NUMBER) - NumberList = Grammar( - name="NumberList", - start=list, - ) + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') ``` This grammar will generate the far more useful tree: @@ -130,46 +121,23 @@ following the lead set by tree-sitter, and so the grammar above is probably better-written as: ```python - @rule - def list(): - # The starting rule can't be transparent: there has to be something to - # hold on to! - return transparent_list + class NumberList(Grammar): + start = "list" - @rule - def _list() -> Rule: - return NUMBER | (_list + COMMA + NUMBER) + @rule + def list(self): + return self._list - NUMBER = Terminal(Re.set(("0", "9")).plus()) - COMMA = Terminal(',') + @rule + def _list(self): + return self.NUMBER | (self._list + self.COMMA + self.NUMBER) - NumberList = Grammar( - name="NumberList", - start=list, - ) + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') ``` That will generate the same tree, but a little more succinctly. -Of course, it's a lot of work to write these transparent recursive -rules by hand all the time, so there are helpers that do it for you: - -```python - @rule - def list(): - return zero_or_more(NUMBER, COMMA) + NUMBER - - NUMBER = Terminal(Re.set(("0", "9")).plus()) - COMMA = Terminal(',') - - NumberList = Grammar( - name="NumberList", - start=list, - ) -``` - -Much better. - ### Trivia Most folks that want to parse something want to skip blanks when they @@ -180,20 +148,23 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in our number lists, we would modify the grammar as follows: ```python - @rule - def list(): - return zero_or_more(NUMBER, COMMA) + NUMBER + class NumberList(Grammar): + start = "list" + trivia = ["BLANKS"] # <- Add a `trivia` member - NUMBER = Terminal(Re.set(("0", "9")).plus()) - COMMA = Terminal(',') + @rule + def list(self): + return self._list - BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) + @rule + def _list(self): + return self.NUMBER | (self._list + self.COMMA + self.NUMBER) - NumberList = Grammar( - name="NumberList", - start=list, - trivia=[BLANKS], - ) + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') + + BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) + # ^ and add a new terminal to describe it ``` Now we can parse a list with spaces! "1 , 2, 3" will parse happily diff --git a/dingus/about.md b/dingus/about.md index 890eaa4..3290b5a 100644 --- a/dingus/about.md +++ b/dingus/about.md @@ -12,66 +12,59 @@ about doing fun things with grammars. ## Making Grammars -To get started, create one function per non-terminal, decorated with -the `rule` decorator, and one instance of a `Terminal` object for each -terminal. Then tie it all together with an instance of a Grammar -object. - -Here's an example: +To get started, create a grammar that derives from the `Grammar` +class. Create one method per non-terminal, decorated with the `rule` +decorator. Here's an example: ```python {.numberLines} from parser import * - @rule - def expression(): - return seq(expression, PLUS, term) | term + class SimpleGrammar(Grammar): + start = "expression" - @rule - def term(): - return seq(LPAREN, expression, RPAREN) | ID + @rule + def expression(self): + return seq(self.expression, self.PLUS, self.term) | self.term - PLUS = Terminal('PLUS', '+') - LPAREN = Terminal('LPAREN', '(') - RPAREN = Terminal('RPAREN', ')') - ID = Terminal( - 'ID', - Re.seq( - Re.set(("a", "z"), ("A", "Z"), "_"), - Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), - ), - ) + @rule + def term(self): + return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID - SimpleGrammar = Grammar( - name="Simple", - start=expression, - ) + PLUS = Terminal('+') + LPAREN = Terminal('(') + RPAREN = Terminal(')') + ID = Terminal( + Re.seq( + Re.set(("a", "z"), ("A", "Z"), "_"), + Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), + ), + ) ``` -Terminal patterns can be plain strings or regular expressions -constructed with the `Re` object. (Ironically, I guess this library is -not clever enough to parse a regular expression string into one of -these structures. If you want to build one, go nuts! It's just Python, -you can do whatever you want so long as the result is an `Re` object.) +Terminals can be plain strings or regular expressions constructed with +the `Re` object. (Ironically, I guess this library is not clever +enough to parse a regular expression string into one of these +structures. If you want to build one, go nuts! It's just Python, you +can do whatever you want so long as the result is an `Re` object.) Productions can be built out of terminals and non-terminals, concatenated with the `seq` function or the `+` operator. Alternatives can be expressed with the `alt` function or the `|` operator. These things can be freely nested, as desired. -You can make lists in the classic context-free grammar way: +There are no helpers (yet!) for consuming lists, so they need to be +constructed in the classic context-free grammar way: ```python {.numberLines} - @rule - def list(): - return NUMBER | (list + COMMA + NUMBER) + class NumberList(Grammar): + start = "list" - NUMBER = Terminal(Re.set(("0", "9")).plus()) - COMMA = Terminal(',') + @rule + def list(self): + return self.NUMBER | (self.list + self.COMMA + self.NUMBER) - NumberList = Grammar( - name="NumberList", - start=list, - ) + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') ``` (Unlike with PEGs, you can write grammars with left or right-recursion, @@ -97,23 +90,21 @@ which means they don't generate nodes in the tree and just dump their contents into the parent node instead. ```python {.numberLines} - @rule - def list(): - # The starting rule can't be transparent: there has to be something to - # hold on to! - return transparent_list + class NumberList(Grammar): + start = "list" - @rule(transparent=True) - def transparent_list() -> Rule: - return NUMBER | (transparent_list + COMMA + NUMBER) + @rule + def list(self): + # The starting rule can't be transparent: there has to be something to + # hold on to! + return self.transparent_list - NUMBER = Terminal(Re.set(("0", "9")).plus()) - COMMA = Terminal(',') + @rule(transparent=True) + def transparent_list(self) -> Rule: + return self.NUMBER | (self.transparent_list + self.COMMA + self.NUMBER) - NumberList = Grammar( - name="NumberList", - start=list, - ) + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') ``` This grammar will generate the far more useful tree: @@ -132,46 +123,23 @@ following the lead set by tree-sitter, and so the grammar above is probably better-written as: ```python {.numberLines} - @rule - def list(): - # The starting rule can't be transparent: there has to be something to - # hold on to! - return transparent_list + class NumberList(Grammar): + start = "list" - @rule - def _list() -> Rule: - return NUMBER | (_list + COMMA + NUMBER) + @rule + def list(self): + return self._list - NUMBER = Terminal(Re.set(("0", "9")).plus()) - COMMA = Terminal(',') + @rule + def _list(self): + return self.NUMBER | (self._list + self.COMMA + self.NUMBER) - NumberList = Grammar( - name="NumberList", - start=list, - ) + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') ``` That will generate the same tree, but a little more succinctly. -Of course, it's a lot of work to write these transparent recursive -rules by hand all the time, so there are helpers that do it for you: - -```python {.numberLines} - @rule - def list(): - return zero_or_more(NUMBER, COMMA) + NUMBER - - NUMBER = Terminal(Re.set(("0", "9")).plus()) - COMMA = Terminal(',') - - NumberList = Grammar( - name="NumberList", - start=list, - ) -``` - -Much better. - ### Trivia Most folks that want to parse something want to skip blanks when they @@ -182,21 +150,23 @@ To allow (and ignore) spaces, newlines, tabs, and carriage-returns in our number lists, we would modify the grammar as follows: ```python {.numberLines} - @rule - def list(): - return zero_or_more(NUMBER, COMMA) + NUMBER + class NumberList(Grammar): + start = "list" + trivia = ["BLANKS"] # <- Add a `trivia` member - NUMBER = Terminal(Re.set(("0", "9")).plus()) - COMMA = Terminal(',') + @rule + def list(self): + return self._list - BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) - # ^ and add a new terminal to describe what we're ignoring... + @rule + def _list(self): + return self.NUMBER | (self._list + self.COMMA + self.NUMBER) - NumberList = Grammar( - name="NumberList", - start=list, - trivia=[BLANKS], - ) + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') + + BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) + # ^ and add a new terminal to describe it ``` Now we can parse a list with spaces! "1 , 2, 3" will parse happily diff --git a/makefile b/makefile index 74e7ed7..88091e5 100644 --- a/makefile +++ b/makefile @@ -18,7 +18,7 @@ include lrparser.mk wheel: dist/lrparsers-$(VERSION)-py3-none-any.whl dist/lrparsers-$(VERSION).tar.gz dist/lrparsers-$(VERSION)-py3-none-any.whl: pyproject.toml $(PYTHON_SOURCES) - uv build --offline #--no-clean + uv build --no-clean .PHONY: clean clean: diff --git a/parser/parser.py b/parser/parser.py index 0bb2838..7b5c18e 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -236,20 +236,6 @@ class ItemSet: def __init__(self, items=None): self.items = items or {} - self._hash = None - - def __hash__(self): - # TODO: FREEZE - if self._hash is None: - self._hash = hash(tuple((key, frozenset(value)) for key, value in self.items.items())) - - return self._hash - - def __eq__(self, other): - if not isinstance(other, ItemSet): - return False - - return self.items == other.items def weakly_compatible(self, other: "ItemSet") -> bool: a = self.items