From 071cd29d8f0d6a208549549fa1e250244351ddeb Mon Sep 17 00:00:00 2001
From: John Doty <john@d0ty.me>
Date: Sat, 21 Sep 2024 08:45:49 -0700
Subject: [PATCH] [readme] Rewrite the readme and add a helper

The helper is nice actually.
---
 README.md            | 312 ++++++++++++++++++++++++++++++-------------
 parser/runtime.py    |  14 +-
 tests/test_wadler.py |  29 ++--
 3 files changed, 242 insertions(+), 113 deletions(-)

diff --git a/README.md b/README.md
index c5a8020..d416145 100644
--- a/README.md
+++ b/README.md
@@ -1,30 +1,25 @@
-# A collection of LR parser generators, from LR0 through LALR.
+# A library for grammars
 
-This is a small helper library to generate LR parser tables.
+This is a library to do interesting things with grammars. This was
+originally built as a little toy for me to understand how LR parser
+tables worked, but I discovered that what I *really* want is to be
+able to leverage the grammar to do other things besides parsing.
 
-The primary inspiration for this library is tree-sitter, which also generates
-LR parsers for grammars written in a turing-complete language. Like that, we
-write grammars in a language, only we do it in Python instead of JavaScript.
-
-Why Python? Because Python 3 is widely pre-installed on MacOS and Linux. This
-library requires nothing more than the basic standard library, and not even a
-new version of it. Therefore, it turns out to be a pretty light dependency for
-a rust or C++ or something kind of project. (Tree-sitter, on the other hand,
-requires node, which is a far less stable and available runtime in 2024.)
-
-The parser tables can really be used to power anything. I prefer to make
-concrete syntax trees (again, see tree-sitter), and there is no facility at all
-for actions or custom ASTs or whatnot. Any such processing needs to be done by
-the thing that processes the tables.
+The primary inspiration for this library is tree-sitter, which also
+generates LR parsers for grammars written in a Turing-complete
+language. Like that, we write grammars in a language, only we do it in
+Python instead of JavaScript.
 
 ## Making Grammars
 
-To get started, create a grammar that derives from the `Grammar` class. Create
-one method per nonterminal, decorated with the `rule` decorator. Here's an
-example:
-
+To get started, create a grammar that derives from the `Grammar`
+class. Create one method per non-terminal, decorated with the `rule`
+decorator. Here's an example:
 
+```python
     class SimpleGrammar(Grammar):
+        start = "expression"
+
         @rule
         def expression(self):
             return seq(self.expression, self.PLUS, self.term) | self.term
@@ -36,98 +31,231 @@ example:
         PLUS = Terminal('+')
         LPAREN = Terminal('(')
         RPAREN = Terminal(')')
-        ID = Terminal('id')
+        ID = Terminal(
+            Re.seq(
+                Re.set(("a", "z"), ("A", "Z"), "_"),
+                Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
+            ),
+        )
+```
 
+Terminals can be plain strings or regular expressions constructed with
+the `Re` object. (Ironically, I guess this library is not clever
+enough to parse a regular expression string into one of these
+structures. If you want to build one, go nuts! It's just Python, you
+can do whatever you want so long as the result is an `Re` object.)
 
-## Using grammars
+Productions can be built out of terminals and non-terminals,
+concatenated with the `seq` function or the `+` operator. Alternatives
+can be expressed with the `alt` function or the `|` operator. These
+things can be freely nested, as desired.
 
-TODO
+There are no helpers (yet!) for consuming lists, so they need to be
+constructed in the classic context-free grammar way:
 
-## Representation Choices
+```python
+    class NumberList(Grammar):
+        start = "list"
 
-The SimpleGrammar class might seem a little verbose compared to a dense
-structure like:
+        @rule
+        def list(self):
+            return self.NUMBER | (self.list + self.COMMA + self.NUMBER)
 
-    grammar_simple = [
-        ('E', ['E', '+', 'T']),
-        ('E', ['T']),
-        ('T', ['(', 'E', ')']),
-        ('T', ['id']),
-    ]
+        NUMBER = Terminal(Re.set(("0", "9")).plus())
+        COMMA = Terminal(',')
+```
 
-or
+(Unlike with PEGs, you can write grammars with left- or right-recursion
+without restriction; either is fine.)
 
-    grammar_simple = {
-      'E': [
-          ['E', '+', 'T'],
-          ['T'],
-      ],
-      'T': [
-          ['(', 'E', ')'],
-          ['id'],
-      ],
-    }
+When used to generate a parser, the grammar describes a concrete
+syntax tree. Unfortunately, that means that the list example above
+will generate a very awkward tree for `1,2,3`:
 
+```
+list
+  list
+    list
+      NUMBER ("1")
+    COMMA
+    NUMBER ("2")
+  COMMA
+  NUMBER ("3")
+```
 
-The advantage that the class has over a table like this is that you get to have
-all of your Python tools help you make sure your grammar is good, if you want
-them. e.g., if you're working with an LSP or something, the members give you
-autocomplete and jump-to-definition and possibly even type-checking.
+In order to make this a little cleaner, rules can be "transparent",
+which means they don't generate nodes in the tree and just dump their
+contents into the parent node instead.
 
-At the very least, if you mis-type the name of a nonterminal, or forget to
-implement it, we will immediately raise an error that *INCLUDES THE LOCATION IN
-THE SOURCE WHERE THE ERROR WAS MADE.* With tables, we can tell you that you
-made a mistake but it's up to you to figure out where you did it.
+```python
+    class NumberList(Grammar):
+        start = "list"
 
-### Aside: What about a custom DSL/EBNF like thing?
+        @rule
+        def list(self):
+            # The starting rule can't be transparent: there has to be something to
+            # hold on to!
+            return self.transparent_list
 
-Yeah, OK, there's a rich history of writing your grammar in a domain-specific
-language. YACC did it, ANTLR does it, GRMTools.... just about everybody except
-Tree-Sitter does this.
+        @rule(transparent=True)
+        def transparent_list(self) -> Rule:
+            return self.NUMBER | (self.transparent_list + self.COMMA + self.NUMBER)
 
-But look, I've got several reasons for not doing it.
+        NUMBER = Terminal(Re.set(("0", "9")).plus())
+        COMMA = Terminal(',')
+```
 
-First, I'm lazy, and don't want to write yet another parser for my parser. What
-tools should I use to write my parser generator parser? I guess I don't have my
-parser generator parser yet, so probably a hand-written top down parser? Some
-other python parser generator? Ugh!
+This grammar will generate the far more useful tree:
 
-As an add-on to that, if I make my own format then I need to make tooling for
-*that* too: syntax highlighters, jump to definition, the works. Yuck. An
-existing language, and a format that builds on an existing language, gets me the
-tooling that comes along with that language. If you can leverage that
-effictively (and I think I have) then you start way ahead in terms of tooling.
+```
+list
+  NUMBER ("1")
+  COMMA
+  NUMBER ("2")
+  COMMA
+  NUMBER ("3")
+```
 
-Second, this whole thing is supposed to be easy to include in an existing
-project, and adding a custom compiler doesn't seem to be that. Adding two python
-files seems to be about the right speed.
+Rules that start with `_` are also interpreted as transparent,
+following the lead set by tree-sitter, and so the grammar above is
+probably better-written as:
 
-Thirdly, and this is just hypothetical, it's probably pretty easy to write your
-own tooling around a grammar if it's already in Python. If you want to make
-railroad diagrams or EBNF pictures or whatever, all the productions are already
-right there in data structures for you to process. I've tried to keep them
-accessible and at least somewhat easy to work with. There's nothing that says a
-DSL-based system *has* to produce unusable intermediate data- certainly there
-are some tools that *try*- but with this approach the accessibility and the
-ergonomics of the tool go hand in hand.
+```python
+    class NumberList(Grammar):
+        start = "list"
 
-## Some History
+        @rule
+        def list(self):
+            return self._list
 
-The first version of this code was written as an idle exercise to learn how LR
-parser table generation even worked. It was... very simple, fairly easy to
-follow, and just *incredibly* slow. Like, mind-bogglingly slow. Unusably slow
-for anything but the most trivial grammar.
+        @rule
+        def _list(self):
+            return self.NUMBER | (self._list + self.COMMA + self.NUMBER)
 
-As a result, when I decided I wanted to use it for a larger grammar, I found that
-I just couldn't. So this has been hacked and significantly improved from that
-version, now capable of building tables for nontrivial grammars. It could still
-be a lot faster, but it meets my needs for now.
+        NUMBER = Terminal(Re.set(("0", "9")).plus())
+        COMMA = Terminal(',')
+```
 
-(BTW, the notes I read to learn how all this works are at
-http://dragonbook.stanford.edu/lecture-notes/Stanford-CS143/. Specifically,
-I started with handout 8, 'Bottom-up-parsing', and went from there. (I did
-eventually have to backtrack a little into handout 7, since that's where
-First() and Follow() are covered.)
+That will generate the same tree, but a little more succinctly.
 
-doty
-May 2024
+### Trivia
+
+Most folks that want to parse something want to skip blanks when they
+do it. Our grammars don't say anything about that by default (sorry),
+so you probably want to be explicit about such things.
+
+To allow (and ignore) spaces, newlines, tabs, and carriage-returns in
+our number lists, we would modify the grammar as follows:
+
+```python
+    class NumberList(Grammar):
+        start = "list"
+        trivia = ["BLANKS"] # <- Add a `trivia` member
+
+        @rule
+        def list(self):
+            return self._list
+
+        @rule
+        def _list(self):
+            return self.NUMBER | (self._list + self.COMMA + self.NUMBER)
+
+        NUMBER = Terminal(Re.set(("0", "9")).plus())
+        COMMA = Terminal(',')
+
+        BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
+        # ^ and add a new terminal to describe it
+```
+
+Now we can parse a list with spaces! "1  , 2,   3" will parse happily
+into:
+
+```
+list
+  NUMBER ("1")
+  COMMA
+  NUMBER ("2")
+  COMMA
+  NUMBER ("3")
+```
+
+## Using Grammars
+
+### Making Parsers and Parsing Text
+
+Once you have a grammar, construct an instance of it and call its
+`build_table` method to make a parse table and its `compile_lexer`
+method to make the matching lexer table.
+
+```python
+grammar = NumberList()
+parse_table = grammar.build_table()
+lexer_table = grammar.compile_lexer()
+```
+
+In theory, in the future, you could pass the table to an output
+generator and it would build a C source file or a Rust source file or
+something to run the parse. Right now the only runtime is also written
+in Python, so you can do a parse as follows:
+
+```python
+from parser import runtime
+
+text = "1,2,3"
+result, errors = runtime.parse(parse_table, lexer_table, text)
+```
+
+`result` in the above example will be a concrete syntax tree, if the
+parse was successful, and `errors` will be a list of error strings
+from the parse. Note that the Python runtime has automatic error
+recovery (with a variant of
+[CPCT+](https://tratt.net/laurie/blog/2020/automatic_syntax_error_recovery.html)),
+so you may get a parse tree even if there were parse errors.
+
+## Questions
+
+### Why Python?
+
+There are a few reasons to use Python here.
+
+First, Python 3 is widely pre-installed on macOS and Linux. This
+library requires nothing more than the basic standard library, and not
+even a new version of it. Therefore, it turns out to be a pretty light
+dependency for a Rust or C++ or some other kind of project, where
+you're using this to generate the parser tables but the parser itself
+will be in some other language.
+
+(Tree-sitter, on the other hand, requires its own standalone binary in
+addition to node, which is a far less stable and available runtime in
+2024.)
+
+I also find the ergonomics of working in Python a little nicer than
+working in, say, JavaScript. Python gives me operator overloading for
+things like `|` and `+`, which make the rules read a little closer to
+EBNF for me. It gives me type annotations that work without running a
+compiler over my input.
+
+It also *actually raises errors* when I accidentally misspell the name
+of a rule. And those errors come with the source location of exactly
+where I made the spelling mistake!
+
+Finally, I guess you could ask why I'm not using some DSL or something
+like literally every other parser generator tool except for
+tree-sitter. And the answer for that is: I just don't care to maintain
+a parser for my parser generator. ("Yo dawg, I heard you liked
+parsers...") Python gives me the ability to describe the data I want,
+in an easy to leverage way, that comes with all the power and
+flexibility of a general-purpose programming language. Turns out to be
+pretty nice.
+
+### What about grammars where blank space is significant, like ... well, python?
+
+Right now there's no way to describe them natively.
+
+You could write the grammar and introduce terminals like `INDENT` and
+`DEDENT` but you would have to write a custom lexer to produce those
+terminals, and probably handle them differently in all the other uses
+of the grammar as well.
+
+That limits the ability to write the grammar once and automatically
+use it everywhere, but maybe it's good enough for you?
diff --git a/parser/runtime.py b/parser/runtime.py
index b746cf1..81b3145 100644
--- a/parser/runtime.py
+++ b/parser/runtime.py
@@ -303,11 +303,6 @@ class TokenStream(typing.Protocol):
         ...
 
 
-# TODO: This runtime API sucks; the TokenStream is nice and all but I should
-#       also be able to have a function that takes a string and produces a
-#       tree directly, with caching intermediates for codegen and whatnot.
-
-
 class Parser:
     table: parser.ParseTable
 
@@ -612,3 +607,12 @@ class GenericTokenStream:
             line = f"{start:{max_offset_len}} {line_part} {column_index:3} {kind.name:{max_terminal_name}} {repr(value)}"
             lines.append(line)
         return lines
+
+
+def parse(
+    parse_table: parser.ParseTable,
+    lexer_table: parser.LexerTable,
+    text: str,
+) -> typing.Tuple[Tree | None, list[str]]:
+    """Parse the provided text with the generated parse table and lex table."""
+    return Parser(parse_table).parse(GenericTokenStream(text, lexer_table))
diff --git a/tests/test_wadler.py b/tests/test_wadler.py
index ce53cb4..e66c29d 100644
--- a/tests/test_wadler.py
+++ b/tests/test_wadler.py
@@ -109,9 +109,8 @@ class JsonGrammar(Grammar):
 
 
 JSON = JsonGrammar()
-JSON_TABLE = JSON.build_table()
+JSON_PARSER = JSON.build_table()
 JSON_LEXER = JSON.compile_lexer()
-JSON_PARSER = parser_runtime.Parser(JSON_TABLE)
 
 
 def flatten_document(doc: runtime.Document, src: str) -> list:
@@ -145,8 +144,7 @@ def flatten_document(doc: runtime.Document, src: str) -> list:
 
 def test_convert_tree_to_document():
     text = '{"a": true, "b":[1,2,3]}'
-    tokens = parser_runtime.GenericTokenStream(text, JSON_LEXER)
-    tree, errors = JSON_PARSER.parse(tokens)
+    tree, errors = parser_runtime.parse(JSON_PARSER, JSON_LEXER, text)
     assert [] == errors
     assert tree is not None
 
@@ -212,8 +210,7 @@ def _output(txt: str) -> str:
 
 def test_layout_basic():
     text = '{"a": true, "b":[1,2,3], "c":[1,2,3,4,5,6,7]}'
-    tokens = parser_runtime.GenericTokenStream(text, JSON_LEXER)
-    tree, errors = JSON_PARSER.parse(tokens)
+    tree, errors = parser_runtime.parse(JSON_PARSER, JSON_LEXER, text)
     assert [] == errors
     assert tree is not None
 
@@ -271,11 +268,11 @@ class TG(Grammar):
 def test_forced_break():
     g = TG()
     g_lexer = g.compile_lexer()
-    g_parser = parser_runtime.Parser(g.build_table())
+    g_parser = g.build_table()
 
     text = "((ok ok) (ok break break ok) (ok ok ok ok))"
 
-    tree, errors = g_parser.parse(parser_runtime.GenericTokenStream(text, g_lexer))
+    tree, errors = parser_runtime.parse(g_parser, g_lexer, text)
     assert errors == []
     assert tree is not None
 
@@ -301,7 +298,7 @@ def test_forced_break():
 def test_maintaining_line_breaks():
     g = TG()
     g_lexer = g.compile_lexer()
-    g_parser = parser_runtime.Parser(g.build_table())
+    g_parser = g.build_table()
 
     text = """((ok ok)
 ; Don't break here.
@@ -315,7 +312,7 @@ def test_maintaining_line_breaks():
 ; ^ This should only be one break.
 (ok))"""
 
-    tree, errors = g_parser.parse(parser_runtime.GenericTokenStream(text, g_lexer))
+    tree, errors = parser_runtime.parse(g_parser, g_lexer, text)
     assert errors == []
     assert tree is not None
 
@@ -342,14 +339,14 @@ def test_maintaining_line_breaks():
 def test_trailing_trivia():
     g = TG()
     g_lexer = g.compile_lexer()
-    g_parser = parser_runtime.Parser(g.build_table())
+    g_parser = g.build_table()
 
     text = """((ok ok)); Don't lose this!
 
 ; Or this!
     """
 
-    tree, errors = g_parser.parse(parser_runtime.GenericTokenStream(text, g_lexer))
+    tree, errors = parser_runtime.parse(g_parser, g_lexer, text)
     assert errors == []
     assert tree is not None
 
@@ -368,14 +365,14 @@ def test_trailing_trivia():
 def test_trailing_trivia_two():
     g = TG()
     g_lexer = g.compile_lexer()
-    g_parser = parser_runtime.Parser(g.build_table())
+    g_parser = g.build_table()
 
     text = """((ok ok))
 
 ; Or this!
     """
 
-    tree, errors = g_parser.parse(parser_runtime.GenericTokenStream(text, g_lexer))
+    tree, errors = parser_runtime.parse(g_parser, g_lexer, text)
     assert errors == []
     assert tree is not None
 
@@ -394,14 +391,14 @@ def test_trailing_trivia_two():
 def test_trailing_trivia_split():
     g = TG()
     g_lexer = g.compile_lexer()
-    g_parser = parser_runtime.Parser(g.build_table())
+    g_parser = g.build_table()
 
     text = """((ok ok)); Don't lose this!
 
 ; Or this!
     """
 
-    tree, errors = g_parser.parse(parser_runtime.GenericTokenStream(text, g_lexer))
+    tree, errors = parser_runtime.parse(g_parser, g_lexer, text)
     assert errors == []
     assert tree is not None