Start moving the examples into tests

This commit is contained in:
John Doty 2024-06-15 07:52:16 -07:00
parent d3b8d0e836
commit e04aa1966e
6 changed files with 221 additions and 237 deletions

View file

@ -1,5 +1,6 @@
# This is an example grammar.
import re
import typing
import parser
from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule
@ -517,12 +518,15 @@ import bisect
# NOTE(review): this span is a rendered diff hunk — both the OLD and the NEW
# version of several changed lines appear back to back (e.g. the two
# `self._tokens = ...` assignments and the `self.lines` / `self._lines`
# pairs below), and all leading indentation has been stripped. It is not
# runnable as-is; the pairs show the rename lines -> _lines plus a type
# annotation being added to _tokens.
class FineTokens:
def __init__(self, src: str):
self.src = src
self._tokens = list(tokenize(src))
self.lines = [m.start() for m in re.finditer("\n", src)]
self._tokens: list[typing.Tuple[Terminal, int, int]] = list(tokenize(src))
self._lines = [m.start() for m in re.finditer("\n", src)]
def tokens(self):
return self._tokens
def lines(self):
return self._lines
def dump(self, *, start=None, end=None):
if start is None:
start = 0
@ -531,11 +535,11 @@ class FineTokens:
for token in self._tokens[start:end]:
(kind, start, length) = token
line_index = bisect.bisect_left(self.lines, start)
# NOTE(review): the line above is the OLD version, the line below the NEW.
line_index = bisect.bisect_left(self._lines, start)
if line_index == 0:
col_start = 0
else:
col_start = self.lines[line_index - 1] + 1
# NOTE(review): old/new pair again — `self.lines` became `self._lines`.
col_start = self._lines[line_index - 1] + 1
column_index = start - col_start
value = self.src[start : start + length]
print(f"{start:04} {kind.value:12} {value} ({line_index}, {column_index})")

View file

@ -300,7 +300,7 @@ class Harness:
# print(f"{tokens.lines}")
# tokens.dump(end=5)
(tree, errors) = runtime.Parser(table, trace=None).parse(self.tokens)
(tree, errors) = runtime.Parser(table).parse(self.tokens)
parse_time = time.time()
self.tree = tree
self.errors = errors

View file

@ -1,3 +1,3 @@
.PHONY: test
test:
pytest
pdm run pytest

View file

@ -1097,73 +1097,6 @@ class GenerateLR0:
return builder.flush(config_sets)
def parse(table: ParseTable, input, trace=False):
"""Parse the input with the generated parsing table and return the
concrete syntax tree.
The parsing table can be generated by GenerateLR0.gen_table() or by any
of the other generators below. The parsing mechanism never changes, only
the table generation mechanism.
input is a list of tokens. Don't stick an end-of-stream marker, I'll stick
one on for you.
This is not a *great* parser, it's really just a demo for what you can
do with the table.
"""
assert "$" not in input
input = input + ["$"]
input_index = 0
# Our stack is a stack of tuples, where the first entry is the state number
# and the second entry is the 'value' that was generated when the state was
# pushed.
stack: list[typing.Tuple[int, typing.Any]] = [(0, None)]
while True:
current_state = stack[-1][0]
current_token = input[input_index]
action = table.actions[current_state].get(current_token, Error())
if trace:
print(
"{stack: <20} {input: <50} {action: <5}".format(
stack=repr([s[0] for s in stack]),
input=repr(input[input_index:]),
action=repr(action),
)
)
match action:
case Accept():
return stack[-1][1]
case Reduce(name=name, count=size, transparent=transparent):
children = []
for _, c in stack[-size:]:
if isinstance(c, tuple) and c[0] is None:
children.extend(c[1])
else:
children.append(c)
value = (name if not transparent else None, tuple(children))
stack = stack[:-size]
goto = table.gotos[stack[-1][0]].get(name)
assert goto is not None
stack.append((goto, value))
case Shift(state):
stack.append((state, (current_token, ())))
input_index += 1
case Error():
raise ValueError(
"Syntax error: unexpected symbol {sym}".format(
sym=current_token,
),
)
###############################################################################
# SLR(1)
###############################################################################
@ -1978,150 +1911,3 @@ class Grammar:
gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
table = gen.gen_table()
return table
###############################################################################
# Formatting
###############################################################################
def format_node(node):
    """Render a concrete syntax tree, as produced by parse(), as an
    indented multi-line string (one level of indent per tree depth)."""
    name, children = node[0], node[1]
    out = ["{name}".format(name=name)]
    for child in children:
        # Indent every line of the child's rendering by one space.
        for child_line in format_node(child).split("\n"):
            out.append(" " + child_line)
    return "\n".join(out)
###############################################################################
# Examples
###############################################################################
# NOTE(review): rendered diff hunk — this whole function was DELETED by the
# commit (moved into tests); leading indentation has been stripped, so it is
# not runnable as shown. It demos each generator against small grammars.
def examples():
def dump_grammar(grammar):
for name, symbols in grammar:
print(f"{name} -> {symbols}")
print()
# OK, this is a very simple LR0 grammar.
print("grammar_simple:")
grammar_simple = [
("E", ["E", "+", "T"]),
("E", ["T"]),
("T", ["(", "E", ")"]),
("T", ["id"]),
]
gen = GenerateLR0("E", grammar_simple)
table = gen.gen_table()
print(table.format())
tree = parse(table, ["id", "+", "(", "id", ")"])
print(format_node(tree) + "\n")
print()
# This one doesn't work with LR0, though, it has a shift/reduce conflict.
print("grammar_lr0_shift_reduce (LR0):")
grammar_lr0_shift_reduce = grammar_simple + [
("T", ["id", "[", "E", "]"]),
]
# gen_table is expected to raise ValueError on the conflict.
try:
gen = GenerateLR0("E", grammar_lr0_shift_reduce)
table = gen.gen_table()
assert False
except ValueError as e:
print(e)
print()
# Nor does this: it has a reduce/reduce conflict.
print("grammar_lr0_reduce_reduce (LR0):")
grammar_lr0_reduce_reduce = grammar_simple + [
("E", ["V", "=", "E"]),
("V", ["id"]),
]
try:
gen = GenerateLR0("E", grammar_lr0_reduce_reduce)
table = gen.gen_table()
assert False
except ValueError as e:
print(e)
print()
# Nullable symbols just don't work with constructs like this, because you can't
# look ahead to figure out if you should reduce an empty 'F' or not.
print("grammar_nullable (LR0):")
grammar_nullable = [
("E", ["F", "boop"]),
("F", ["beep"]),
("F", []),
]
try:
gen = GenerateLR0("E", grammar_nullable)
table = gen.gen_table()
assert False
except ValueError as e:
print(e)
print()
# SLR1 handles the shift/reduce grammar that LR0 could not.
print("grammar_lr0_shift_reduce (SLR1):")
dump_grammar(grammar_lr0_shift_reduce)
gen = GenerateSLR1("E", grammar_lr0_shift_reduce)
print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}")
table = gen.gen_table()
print(table.format())
tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True)
print(format_node(tree) + "\n")
print()
# SLR1 can't handle this.
print("grammar_aho_ullman_1 (SLR1):")
grammar_aho_ullman_1 = [
("S", ["L", "=", "R"]),
("S", ["R"]),
("L", ["*", "R"]),
("L", ["id"]),
("R", ["L"]),
]
try:
gen = GenerateSLR1("S", grammar_aho_ullman_1)
table = gen.gen_table()
assert False
except ValueError as e:
print(e)
print()
# Here's an example with a full LR1 grammar, though.
print("grammar_aho_ullman_2 (LR1):")
grammar_aho_ullman_2 = [
("S", ["X", "X"]),
("X", ["a", "X"]),
("X", ["b"]),
]
gen = GenerateLR1("S", grammar_aho_ullman_2)
table = gen.gen_table()
print(table.format())
parse(table, ["b", "a", "a", "b"], trace=True)
print()
# What happens if we do LALR to it?
print("grammar_aho_ullman_2 (LALR):")
gen = GenerateLALR("S", grammar_aho_ullman_2)
table = gen.gen_table()
print(table.format())
print()
# A fun LALR grammar.
print("grammar_lalr:")
grammar_lalr = [
("S", ["V", "E"]),
("E", ["F"]),
("E", ["E", "+", "F"]),
("F", ["V"]),
("F", ["int"]),
("F", ["(", "E", ")"]),
("V", ["id"]),
]
gen = GenerateLALR("S", grammar_lalr)
table = gen.gen_table()
print(table.format())
print()
if __name__ == "__main__":
examples()

View file

@ -5,7 +5,7 @@ import logging
import typing
from dataclasses import dataclass
from . import parser # pyright: ignore # You're drunk.
from . import parser
@dataclass
@ -267,17 +267,27 @@ def recover(table: parser.ParseTable, input: list[TokenValue], start: int, stack
action_log = logging.getLogger("parser.action")
class TokenStream(typing.Protocol):
    """Structural (duck-typed) interface for a source of tokens that the
    Parser can consume."""

    def tokens(self) -> list[typing.Tuple[parser.Terminal, int, int]]:
        """The tokens in the stream, in the form (terminal, start, length)."""
        ...

    def lines(self) -> list[int]:
        """The offsets of line breaks in the tokens. (The end of line 0 is at
        index 0, etc.)"""
        ...
# NOTE(review): rendered diff hunk — OLD and NEW versions of changed lines
# are interleaved below (two __init__ definitions, two parse signatures,
# and both versions of the error-formatting loop), and indentation is
# stripped. The change removes the `trace` parameter, annotates `tokens`
# as a TokenStream, and switches from reading `tokens.lines` as an
# attribute to calling the `tokens.lines()` method.
class Parser:
# Our stack is a stack of tuples, where the first entry is the state
# number and the second entry is the 'value' that was generated when the
# state was pushed.
table: parser.ParseTable
def __init__(self, table, trace):
self.trace = trace
# NOTE(review): NEW __init__ below; the two lines above are the OLD one.
def __init__(self, table):
self.table = table
def parse(self, tokens) -> typing.Tuple[Tree | None, list[str]]:
# NOTE(review): NEW parse signature below; the line above is the OLD one.
def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]:
input_tokens = tokens.tokens()
input: list[TokenValue] = [
TokenValue(kind=kind.value, start=start, end=start + length)
@ -406,15 +416,17 @@ class Parser:
# All done.
error_strings = []
# NOTE(review): OLD error loop follows (tokens.lines as an attribute)...
for parse_error in errors:
line_index = bisect.bisect_left(tokens.lines, parse_error.start)
if line_index == 0:
col_start = 0
else:
col_start = tokens.lines[line_index - 1] + 1
column_index = parse_error.start - col_start
line_index += 1
# NOTE(review): ...and the NEW loop, guarded by `if errors` and using
# the tokens.lines() method instead.
if errors:
lines = tokens.lines()
for parse_error in errors:
line_index = bisect.bisect_left(lines, parse_error.start)
if line_index == 0:
col_start = 0
else:
col_start = lines[line_index - 1] + 1
column_index = parse_error.start - col_start
line_index += 1
error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")
error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")
return (result, error_strings)

View file

@ -1,6 +1,67 @@
import parser
import typing
import pytest
import parser
import parser.runtime as runtime
from parser import Grammar, seq, rule, Terminal
PLUS = Terminal("+")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
IDENTIFIER = Terminal("id")
class Tokens:
    """A canned TokenStream for tests: every token sits at offset 0 with
    length 0, and there are no line breaks."""

    def __init__(self, *toks: Terminal):
        self._tokens = []
        for term in toks:
            self._tokens.append((term, 0, 0))
        self._lines = []

    def tokens(self):
        return self._tokens

    def lines(self):
        return self._lines
def _tree(treeform) -> runtime.Tree | runtime.TokenValue:
    """Build an expected parse result from shorthand: a bare string becomes
    a TokenValue leaf, a tuple ('name', child, ...) becomes a Tree node.
    All positions are zeroed to match what Tokens produces."""
    if isinstance(treeform, str):
        return runtime.TokenValue(treeform, 0, 0)

    assert isinstance(treeform, tuple)
    name = treeform[0]
    assert isinstance(name, str)
    kids = tuple(_tree(sub) for sub in treeform[1:])
    return runtime.Tree(name=name, start=0, end=0, children=kids)
class LR0Grammar(Grammar):
    """The classic LR(0) expression grammar:
    E -> E + T | T ;  T -> ( E ) | id
    """

    start = "E"
    generator = parser.GenerateLR0

    @rule
    def E(self):
        return seq(self.E, PLUS, self.T) | self.T

    @rule
    def T(self):
        return seq(LPAREN, self.E, RPAREN) | IDENTIFIER
def test_lr0_lr0():
    """An LR0 grammar should work with an LR0 generator."""
    table = LR0Grammar().build_table()
    # Note: a fresh local name here avoids shadowing the `parser` module.
    p = runtime.Parser(table)
    tree, errors = p.parse(Tokens(IDENTIFIER, PLUS, LPAREN, IDENTIFIER, RPAREN))

    assert errors == []
    expected = ("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")"))
    assert tree == _tree(expected)
# NOTE(review): rendered diff hunk — the docstring is split by an embedded
# hunk header, and OLD/NEW versions of changed lines are interleaved (the
# change drops the `parser.` prefix in favor of the bare imported names:
# Terminal, Grammar, rule). Not runnable as shown.
def test_conflicting_names():
"""Terminals and nonterminals cannot have the same name.
@ -16,14 +77,135 @@ def test_conflicting_names():
to understand.
"""
IDENTIFIER = parser.Terminal("Identifier")
IDENTIFIER = Terminal("Identifier")
class TestGrammar(parser.Grammar):
class TestGrammar(Grammar):
start = "Identifier"
@parser.rule("Identifier")
@rule("Identifier")
def identifier(self):
return IDENTIFIER
with pytest.raises(ValueError):
TestGrammar().build_table()
###############################################################################
# Examples
###############################################################################
# def examples():
# def dump_grammar(grammar):
# for name, symbols in grammar:
# print(f"{name} -> {symbols}")
# print()
# # This one doesn't work with LR0, though, it has a shift/reduce conflict.
# print("grammar_lr0_shift_reduce (LR0):")
# grammar_lr0_shift_reduce = grammar_simple + [
# ("T", ["id", "[", "E", "]"]),
# ]
# try:
# gen = GenerateLR0("E", grammar_lr0_shift_reduce)
# table = gen.gen_table()
# assert False
# except ValueError as e:
# print(e)
# print()
# # Nor does this: it has a reduce/reduce conflict.
# print("grammar_lr0_reduce_reduce (LR0):")
# grammar_lr0_reduce_reduce = grammar_simple + [
# ("E", ["V", "=", "E"]),
# ("V", ["id"]),
# ]
# try:
# gen = GenerateLR0("E", grammar_lr0_reduce_reduce)
# table = gen.gen_table()
# assert False
# except ValueError as e:
# print(e)
# print()
# # Nullable symbols just don't work with constructs like this, because you can't
# # look ahead to figure out if you should reduce an empty 'F' or not.
# print("grammar_nullable (LR0):")
# grammar_nullable = [
# ("E", ["F", "boop"]),
# ("F", ["beep"]),
# ("F", []),
# ]
# try:
# gen = GenerateLR0("E", grammar_nullable)
# table = gen.gen_table()
# assert False
# except ValueError as e:
# print(e)
# print()
# print("grammar_lr0_shift_reduce (SLR1):")
# dump_grammar(grammar_lr0_shift_reduce)
# gen = GenerateSLR1("E", grammar_lr0_shift_reduce)
# print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}")
# table = gen.gen_table()
# print(table.format())
# tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True)
# print(format_node(tree) + "\n")
# print()
# # SLR1 can't handle this.
# print("grammar_aho_ullman_1 (SLR1):")
# grammar_aho_ullman_1 = [
# ("S", ["L", "=", "R"]),
# ("S", ["R"]),
# ("L", ["*", "R"]),
# ("L", ["id"]),
# ("R", ["L"]),
# ]
# try:
# gen = GenerateSLR1("S", grammar_aho_ullman_1)
# table = gen.gen_table()
# assert False
# except ValueError as e:
# print(e)
# print()
# # Here's an example with a full LR1 grammar, though.
# print("grammar_aho_ullman_2 (LR1):")
# grammar_aho_ullman_2 = [
# ("S", ["X", "X"]),
# ("X", ["a", "X"]),
# ("X", ["b"]),
# ]
# gen = GenerateLR1("S", grammar_aho_ullman_2)
# table = gen.gen_table()
# print(table.format())
# parse(table, ["b", "a", "a", "b"], trace=True)
# print()
# # What happens if we do LALR to it?
# print("grammar_aho_ullman_2 (LALR):")
# gen = GenerateLALR("S", grammar_aho_ullman_2)
# table = gen.gen_table()
# print(table.format())
# print()
# # A fun LALR grammar.
# print("grammar_lalr:")
# grammar_lalr = [
# ("S", ["V", "E"]),
# ("E", ["F"]),
# ("E", ["E", "+", "F"]),
# ("F", ["V"]),
# ("F", ["int"]),
# ("F", ["(", "E", ")"]),
# ("V", ["id"]),
# ]
# gen = GenerateLALR("S", grammar_lalr)
# table = gen.gen_table()
# print(table.format())
# print()
# if __name__ == "__main__":
# examples()