From 071cd29d8f0d6a208549549fa1e250244351ddeb Mon Sep 17 00:00:00 2001 From: John Doty Date: Sat, 21 Sep 2024 08:45:49 -0700 Subject: [PATCH] [readme] Rewrite the readme and add a helper The helper is nice actually. --- README.md | 312 ++++++++++++++++++++++++++++++------------- parser/runtime.py | 14 +- tests/test_wadler.py | 29 ++-- 3 files changed, 242 insertions(+), 113 deletions(-) diff --git a/README.md b/README.md index c5a8020..d416145 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,25 @@ -# A collection of LR parser generators, from LR0 through LALR. +# A library for grammars -This is a small helper library to generate LR parser tables. +This is a library to do interesting things with grammars. This was +originally built as a little toy for me to understand how LR parser +tables worked, but I discovered that what I *really* want is to be +able to leverage the grammar to do other things besides parsing. -The primary inspiration for this library is tree-sitter, which also generates -LR parsers for grammars written in a turing-complete language. Like that, we -write grammars in a language, only we do it in Python instead of JavaScript. - -Why Python? Because Python 3 is widely pre-installed on MacOS and Linux. This -library requires nothing more than the basic standard library, and not even a -new version of it. Therefore, it turns out to be a pretty light dependency for -a rust or C++ or something kind of project. (Tree-sitter, on the other hand, -requires node, which is a far less stable and available runtime in 2024.) - -The parser tables can really be used to power anything. I prefer to make -concrete syntax trees (again, see tree-sitter), and there is no facility at all -for actions or custom ASTs or whatnot. Any such processing needs to be done by -the thing that processes the tables. +The primary inspiration for this library is tree-sitter, which also +generates LR parsers for grammars written in a turing-complete +language. 
Like that, we write grammars in a language, only we do it in +Python instead of JavaScript. ## Making Grammars -To get started, create a grammar that derives from the `Grammar` class. Create -one method per nonterminal, decorated with the `rule` decorator. Here's an -example: - +To get started, create a grammar that derives from the `Grammar` +class. Create one method per non-terminal, decorated with the `rule` +decorator. Here's an example: +```python class SimpleGrammar(Grammar): + start = "expression" + @rule def expression(self): return seq(self.expression, self.PLUS, self.term) | self.term @@ -36,98 +31,231 @@ example: PLUS = Terminal('+') LPAREN = Terminal('(') RPAREN = Terminal(')') - ID = Terminal('id') + ID = Terminal( + Re.seq( + Re.set(("a", "z"), ("A", "Z"), "_"), + Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), + ), + ) +``` +Terminals can be plain strings or regular expressions constructed with +the `Re` object. (Ironically, I guess this library is not clever +enough to parse a regular expression string into one of these +structures. If you want to build one, go nuts! It's just Python, you +can do whatever you want so long as the result is an `Re` object.) -## Using grammars +Productions can be built out of terminals and non-terminals, +concatenated with the `seq` function or the `+` operator. Alternatives +can be expressed with the `alt` function or the `|` operator. These +things can be freely nested, as desired. -TODO +There are no helpers (yet!) 
for consuming lists, so they need to be +constructed in the classic context-free grammar way: -## Representation Choices +```python + class NumberList(Grammar): + start = "list" -The SimpleGrammar class might seem a little verbose compared to a dense -structure like: + @rule + def list(self): + return self.NUMBER | (self.list + self.COMMA + self.NUMBER) - grammar_simple = [ - ('E', ['E', '+', 'T']), - ('E', ['T']), - ('T', ['(', 'E', ')']), - ('T', ['id']), - ] + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') +``` -or +(Unlike with PEGs, you can write grammars with left or right-recursion, +without restriction, either is fine.) - grammar_simple = { - 'E': [ - ['E', '+', 'T'], - ['T'], - ], - 'T': [ - ['(', 'E', ')'], - ['id'], - ], - } +When used to generate a parser, the grammar describes a concrete +syntax tree. Unfortunately, that means that the list example above +will generate a very awkward tree for `1,2,3`: +``` +list + list + list + NUMBER ("1") + COMMA + NUMBER ("2") + COMMA + NUMBER ("3") +``` -The advantage that the class has over a table like this is that you get to have -all of your Python tools help you make sure your grammar is good, if you want -them. e.g., if you're working with an LSP or something, the members give you -autocomplete and jump-to-definition and possibly even type-checking. +In order to make this a little cleaner, rules can be "transparent", +which means they don't generate nodes in the tree and just dump their +contents into the parent node instead. -At the very least, if you mis-type the name of a nonterminal, or forget to -implement it, we will immediately raise an error that *INCLUDES THE LOCATION IN -THE SOURCE WHERE THE ERROR WAS MADE.* With tables, we can tell you that you -made a mistake but it's up to you to figure out where you did it. +```python + class NumberList(Grammar): + start = "list" -### Aside: What about a custom DSL/EBNF like thing? 
+ @rule + def list(self): + # The starting rule can't be transparent: there has to be something to + # hold on to! + return self.transparent_list -Yeah, OK, there's a rich history of writing your grammar in a domain-specific -language. YACC did it, ANTLR does it, GRMTools.... just about everybody except -Tree-Sitter does this. + @rule(transparent=True) + def transparent_list(self) -> Rule: + return self.NUMBER | (self.transparent_list + self.COMMA + self.NUMBER) -But look, I've got several reasons for not doing it. + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') +``` -First, I'm lazy, and don't want to write yet another parser for my parser. What -tools should I use to write my parser generator parser? I guess I don't have my -parser generator parser yet, so probably a hand-written top down parser? Some -other python parser generator? Ugh! +This grammar will generate the far more useful tree: -As an add-on to that, if I make my own format then I need to make tooling for -*that* too: syntax highlighters, jump to definition, the works. Yuck. An -existing language, and a format that builds on an existing language, gets me the -tooling that comes along with that language. If you can leverage that -effictively (and I think I have) then you start way ahead in terms of tooling. +``` +list + NUMBER ("1") + COMMA + NUMBER ("2") + COMMA + NUMBER ("3") +``` -Second, this whole thing is supposed to be easy to include in an existing -project, and adding a custom compiler doesn't seem to be that. Adding two python -files seems to be about the right speed. +Rules that start with `_` are also interpreted as transparent, +following the lead set by tree-sitter, and so the grammar above is +probably better-written as: -Thirdly, and this is just hypothetical, it's probably pretty easy to write your -own tooling around a grammar if it's already in Python. 
If you want to make -railroad diagrams or EBNF pictures or whatever, all the productions are already -right there in data structures for you to process. I've tried to keep them -accessible and at least somewhat easy to work with. There's nothing that says a -DSL-based system *has* to produce unusable intermediate data- certainly there -are some tools that *try*- but with this approach the accessibility and the -ergonomics of the tool go hand in hand. +```python + class NumberList(Grammar): + start = "list" -## Some History + @rule + def list(self): + return self._list -The first version of this code was written as an idle exercise to learn how LR -parser table generation even worked. It was... very simple, fairly easy to -follow, and just *incredibly* slow. Like, mind-bogglingly slow. Unusably slow -for anything but the most trivial grammar. + @rule + def _list(self): + return self.NUMBER | (self._list + self.COMMA + self.NUMBER) -As a result, when I decided I wanted to use it for a larger grammar, I found that -I just couldn't. So this has been hacked and significantly improved from that -version, now capable of building tables for nontrivial grammars. It could still -be a lot faster, but it meets my needs for now. + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') +``` -(BTW, the notes I read to learn how all this works are at -http://dragonbook.stanford.edu/lecture-notes/Stanford-CS143/. Specifically, -I started with handout 8, 'Bottom-up-parsing', and went from there. (I did -eventually have to backtrack a little into handout 7, since that's where -First() and Follow() are covered.) +That will generate the same tree, but a little more succinctly. -doty -May 2024 +### Trivia + +Most folks that want to parse something want to skip blanks when they +do it. Our grammars don't say anything about that by default (sorry), +so you probably want to be explicit about such things. 
+ +To allow (and ignore) spaces, newlines, tabs, and carriage-returns in +our number lists, we would modify the grammar as follows: + +```python + class NumberList(Grammar): + start = "list" + trivia = ["BLANKS"] # <- Add a `trivia` member + + @rule + def list(self): + return self._list + + @rule + def _list(self): + return self.NUMBER | (self._list + self.COMMA + self.NUMBER) + + NUMBER = Terminal(Re.set(("0", "9")).plus()) + COMMA = Terminal(',') + + BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) + # ^ and add a new terminal to describe it +``` + +Now we can parse a list with spaces! "1 , 2, 3" will parse happily +into: + +``` +list + NUMBER ("1") + COMMA + NUMBER ("2") + COMMA + NUMBER ("3") +``` + +## Using Grammars + +### Making Parsers and Parsing Text + +Once you have a grammar you can make a parse table from it by +constructing an instance of the grammar and calling the `build_table` +method on it. + +```python +grammar = NumberList() +parse_table = grammar.build_table() +lexer_table = grammar.compile_lexer() +``` + +In theory, in the future, you could pass the table to an output +generator and it would build a C source file or a Rust source file or +something to run the parse. Right now the only runtime is also written +in python, so you can do a parse as follows: + +``` +from parser import runtime + +text = "1,2,3" +result, errors = runtime.parse(parse_table, lexer_table, text) +``` + +`result` in the above example will be a concrete syntax tree, if the +parse was successful, and `errors` will be a list of error strings +from the parse. Note that the python runtime has automatic error +recovery (with a variant of +[CPCT+](https://tratt.net/laurie/blog/2020/automatic_syntax_error_recovery.html)), +so you may get a parse tree even if there were parse errors. + +## Questions + +### Why Python? + +There are a few reasons to use python here. + +First, Python 3 is widely pre-installed on MacOS and Linux. 
This +library requires nothing more than the basic standard library, and not +even a new version of it. Therefore, it turns out to be a pretty light +dependency for a rust or C++ or some other kind of project, where +you're using this to generate the parser tables but the parser itself +will be in some other language. + +(Tree-sitter, on the other hand, requires its own standalone binary in +addition to node, which is a far less stable and available runtime in +2024.) + +I also find the ergonomics of working in python a little nicer than +working in, say, JavaScript. Python gives me operator overloading for +things like `|` and `+`, which make the rules read a little closer to +EBNF for me. It gives me type annotations that work without running a +compiler over my input. + +It also *actually raises errors* when I accidentally misspell the name +of a rule. And those errors come with the source location of exactly +where I made the spelling mistake! + +Finally, I guess you could ask why I'm not using some DSL or something +like literally every other parser generator tool except for +tree-sitter. And the answer for that is: I just don't care to maintain +a parser for my parser generator. ("Yo dawg, I heard you liked +parsers...") Python gives me the ability to describe the data I want, +in an easy to leverage way, that comes with all the power and +flexibility of a general-purpose programming language. Turns out to be +pretty nice. + +### What about grammars where blank space is significant, like ... well, python? + +Right now there's no way to describe them natively. + +You could write the grammar and introduce terminals like `INDENT` and +`DEDENT` but you would have to write a custom lexer to produce those +terminals, and probably handle them differently in all the other uses +of the grammar as well. + +That limits the ability to write the grammar once and automatically +use it everywhere, but maybe it's good enough for you? 
diff --git a/parser/runtime.py b/parser/runtime.py index b746cf1..81b3145 100644 --- a/parser/runtime.py +++ b/parser/runtime.py @@ -303,11 +303,6 @@ class TokenStream(typing.Protocol): ... -# TODO: This runtime API sucks; the TokenStream is nice and all but I should -# also be able to have a function that takes a string and produces a -# tree directly, with caching intermediates for codegen and whatnot. - - class Parser: table: parser.ParseTable @@ -612,3 +607,12 @@ class GenericTokenStream: line = f"{start:{max_offset_len}} {line_part} {column_index:3} {kind.name:{max_terminal_name}} {repr(value)}" lines.append(line) return lines + + +def parse( + parse_table: parser.ParseTable, + lexer_table: parser.LexerTable, + text: str, +) -> typing.Tuple[Tree | None, list[str]]: + """Parse the provided text with the generated parse table and lex table.""" + return Parser(parse_table).parse(GenericTokenStream(text, lexer_table)) diff --git a/tests/test_wadler.py b/tests/test_wadler.py index ce53cb4..e66c29d 100644 --- a/tests/test_wadler.py +++ b/tests/test_wadler.py @@ -109,9 +109,8 @@ class JsonGrammar(Grammar): JSON = JsonGrammar() -JSON_TABLE = JSON.build_table() +JSON_PARSER = JSON.build_table() JSON_LEXER = JSON.compile_lexer() -JSON_PARSER = parser_runtime.Parser(JSON_TABLE) def flatten_document(doc: runtime.Document, src: str) -> list: @@ -145,8 +144,7 @@ def flatten_document(doc: runtime.Document, src: str) -> list: def test_convert_tree_to_document(): text = '{"a": true, "b":[1,2,3]}' - tokens = parser_runtime.GenericTokenStream(text, JSON_LEXER) - tree, errors = JSON_PARSER.parse(tokens) + tree, errors = parser_runtime.parse(JSON_PARSER, JSON_LEXER, text) assert [] == errors assert tree is not None @@ -212,8 +210,7 @@ def _output(txt: str) -> str: def test_layout_basic(): text = '{"a": true, "b":[1,2,3], "c":[1,2,3,4,5,6,7]}' - tokens = parser_runtime.GenericTokenStream(text, JSON_LEXER) - tree, errors = JSON_PARSER.parse(tokens) + tree, errors = 
parser_runtime.parse(JSON_PARSER, JSON_LEXER, text) assert [] == errors assert tree is not None @@ -271,11 +268,11 @@ class TG(Grammar): def test_forced_break(): g = TG() g_lexer = g.compile_lexer() - g_parser = parser_runtime.Parser(g.build_table()) + g_parser = g.build_table() text = "((ok ok) (ok break break ok) (ok ok ok ok))" - tree, errors = g_parser.parse(parser_runtime.GenericTokenStream(text, g_lexer)) + tree, errors = parser_runtime.parse(g_parser, g_lexer, text) assert errors == [] assert tree is not None @@ -301,7 +298,7 @@ def test_forced_break(): def test_maintaining_line_breaks(): g = TG() g_lexer = g.compile_lexer() - g_parser = parser_runtime.Parser(g.build_table()) + g_parser = g.build_table() text = """((ok ok) ; Don't break here. @@ -315,7 +312,7 @@ def test_maintaining_line_breaks(): ; ^ This should only be one break. (ok))""" - tree, errors = g_parser.parse(parser_runtime.GenericTokenStream(text, g_lexer)) + tree, errors = parser_runtime.parse(g_parser, g_lexer, text) assert errors == [] assert tree is not None @@ -342,14 +339,14 @@ def test_maintaining_line_breaks(): def test_trailing_trivia(): g = TG() g_lexer = g.compile_lexer() - g_parser = parser_runtime.Parser(g.build_table()) + g_parser = g.build_table() text = """((ok ok)); Don't lose this! ; Or this! """ - tree, errors = g_parser.parse(parser_runtime.GenericTokenStream(text, g_lexer)) + tree, errors = parser_runtime.parse(g_parser, g_lexer, text) assert errors == [] assert tree is not None @@ -368,14 +365,14 @@ def test_trailing_trivia(): def test_trailing_trivia_two(): g = TG() g_lexer = g.compile_lexer() - g_parser = parser_runtime.Parser(g.build_table()) + g_parser = g.build_table() text = """((ok ok)) ; Or this! 
""" - tree, errors = g_parser.parse(parser_runtime.GenericTokenStream(text, g_lexer)) + tree, errors = parser_runtime.parse(g_parser, g_lexer, text) assert errors == [] assert tree is not None @@ -394,14 +391,14 @@ def test_trailing_trivia_two(): def test_trailing_trivia_split(): g = TG() g_lexer = g.compile_lexer() - g_parser = parser_runtime.Parser(g.build_table()) + g_parser = g.build_table() text = """((ok ok)); Don't lose this! ; Or this! """ - tree, errors = g_parser.parse(parser_runtime.GenericTokenStream(text, g_lexer)) + tree, errors = parser_runtime.parse(g_parser, g_lexer, text) assert errors == [] assert tree is not None