Start moving the examples into tests

2024-06-15 07:52:16 -07:00 · 2024-06-15 07:52:16 -07:00 · e04aa1966e
commit e04aa1966e
parent d3b8d0e836
6 changed files with 221 additions and 237 deletions
--- a/grammar.py
+++ b/grammar.py
@ -1,5 +1,6 @@
 # This is an example grammar.
 import re
 import typing
 import parser
 from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule
@ -517,12 +518,15 @@ import bisect
 class FineTokens:
    def __init__(self, src: str):
        self.src = src
-        self._tokens = list(tokenize(src))
+        self._tokens: list[typing.Tuple[Terminal, int, int]] = list(tokenize(src))
-        self.lines = [m.start() for m in re.finditer("\n", src)]
+        self._lines = [m.start() for m in re.finditer("\n", src)]
    def tokens(self):
        """Return the token list built from the source by tokenize(), as
        (terminal, start, length) tuples."""
        return self._tokens
    def lines(self):
        """Return the offsets of every newline character found in the source
        text, in the order they occur."""
        return self._lines
    def dump(self, *, start=None, end=None):
        if start is None:
            start = 0
@ -531,11 +535,11 @@ class FineTokens:
        for token in self._tokens[start:end]:
            (kind, start, length) = token
-            line_index = bisect.bisect_left(self.lines, start)
+            line_index = bisect.bisect_left(self._lines, start)
            if line_index == 0:
                col_start = 0
            else:
-                col_start = self.lines[line_index - 1] + 1
+                col_start = self._lines[line_index - 1] + 1
            column_index = start - col_start
            value = self.src[start : start + length]
            print(f"{start:04} {kind.value:12} {value} ({line_index}, {column_index})")
--- a/harness.py
+++ b/harness.py
@ -300,7 +300,7 @@ class Harness:
            # print(f"{tokens.lines()}")
            # tokens.dump(end=5)
-            (tree, errors) = runtime.Parser(table, trace=None).parse(self.tokens)
+            (tree, errors) = runtime.Parser(table).parse(self.tokens)
            parse_time = time.time()
            self.tree = tree
            self.errors = errors
--- a/2
+++ b/2
@ -1,3 +1,3 @@
 .PHONY: test
 test:
-	pytest
+	pdm run pytest
--- a/parser/parser.py
+++ b/parser/parser.py
@ -1097,73 +1097,6 @@ class GenerateLR0:
        return builder.flush(config_sets)
 def parse(table: ParseTable, input, trace=False):
    """Parse the input with the generated parsing table and return the
    concrete syntax tree.
    The parsing table can be generated by GenerateLR0.gen_table() or by any
    of the other generators below. The parsing mechanism never changes, only
    the table generation mechanism.
    input is a list of tokens. Don't stick an end-of-stream marker, I'll stick
    one on for you.
    This is not a *great* parser, it's really just a demo for what you can
    do with the table.
    """
    assert "$" not in input
    input = input + ["$"]
    input_index = 0
    # Our stack is a stack of tuples, where the first entry is the state number
    # and the second entry is the 'value' that was generated when the state was
    # pushed.
    stack: list[typing.Tuple[int, typing.Any]] = [(0, None)]
    while True:
        current_state = stack[-1][0]
        current_token = input[input_index]
        action = table.actions[current_state].get(current_token, Error())
        if trace:
            print(
                "{stack: <20}  {input: <50}  {action: <5}".format(
                    stack=repr([s[0] for s in stack]),
                    input=repr(input[input_index:]),
                    action=repr(action),
                )
            )
        match action:
            case Accept():
                return stack[-1][1]
            case Reduce(name=name, count=size, transparent=transparent):
                children = []
                for _, c in stack[-size:]:
                    if isinstance(c, tuple) and c[0] is None:
                        children.extend(c[1])
                    else:
                        children.append(c)
                value = (name if not transparent else None, tuple(children))
                stack = stack[:-size]
                goto = table.gotos[stack[-1][0]].get(name)
                assert goto is not None
                stack.append((goto, value))
            case Shift(state):
                stack.append((state, (current_token, ())))
                input_index += 1
            case Error():
                raise ValueError(
                    "Syntax error: unexpected symbol {sym}".format(
                        sym=current_token,
                    ),
                )
 ###############################################################################
 # SLR(1)
 ###############################################################################
@ -1978,150 +1911,3 @@ class Grammar:
        gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
        table = gen.gen_table()
        return table
 ###############################################################################
 # Formatting
 ###############################################################################
 def format_node(node):
    """Render a (name, children) syntax tree from parse() as indented text.
    The node's name goes on the first line; every line of every rendered
    child follows, pushed right by two spaces per nesting level.
    """
    label = node[0]
    rendered = [f"{label}"]
    for child in node[1]:
        # Indent each line of the child's own rendering by one level.
        rendered.extend("  " + text for text in format_node(child).split("\n"))
    return "\n".join(rendered)
 ###############################################################################
 # Examples
 ###############################################################################
 def examples():
    """Walk through a series of small grammars and print what each table
    generator makes of them.
    For grammars a generator can handle, the table is printed and, in some
    cases, a sample input is run through parse(). For grammars that are
    expected to conflict, table generation must raise ValueError (which is
    printed); the `assert False` after gen_table() fails the demo otherwise.
    """
    # Print each production of a (name, symbols) grammar list, then a blank line.
    def dump_grammar(grammar):
        for name, symbols in grammar:
            print(f"{name} -> {symbols}")
        print()
    # OK, this is a very simple LR0 grammar.
    print("grammar_simple:")
    grammar_simple = [
        ("E", ["E", "+", "T"]),
        ("E", ["T"]),
        ("T", ["(", "E", ")"]),
        ("T", ["id"]),
    ]
    gen = GenerateLR0("E", grammar_simple)
    table = gen.gen_table()
    print(table.format())
    tree = parse(table, ["id", "+", "(", "id", ")"])
    print(format_node(tree) + "\n")
    print()
    # This one doesn't work with LR0, though, it has a shift/reduce conflict.
    print("grammar_lr0_shift_reduce (LR0):")
    grammar_lr0_shift_reduce = grammar_simple + [
        ("T", ["id", "[", "E", "]"]),
    ]
    try:
        gen = GenerateLR0("E", grammar_lr0_shift_reduce)
        table = gen.gen_table()
        assert False
    except ValueError as e:
        print(e)
        print()
    # Nor does this: it has a reduce/reduce conflict.
    print("grammar_lr0_reduce_reduce (LR0):")
    grammar_lr0_reduce_reduce = grammar_simple + [
        ("E", ["V", "=", "E"]),
        ("V", ["id"]),
    ]
    try:
        gen = GenerateLR0("E", grammar_lr0_reduce_reduce)
        table = gen.gen_table()
        assert False
    except ValueError as e:
        print(e)
        print()
    # Nullable symbols just don't work with constructs like this, because you can't
    # look ahead to figure out if you should reduce an empty 'F' or not.
    print("grammar_nullable (LR0):")
    grammar_nullable = [
        ("E", ["F", "boop"]),
        ("F", ["beep"]),
        ("F", []),
    ]
    try:
        gen = GenerateLR0("E", grammar_nullable)
        table = gen.gen_table()
        assert False
    except ValueError as e:
        print(e)
        print()
    # The shift/reduce grammar that LR0 rejected above is retried with SLR1.
    print("grammar_lr0_shift_reduce (SLR1):")
    dump_grammar(grammar_lr0_shift_reduce)
    gen = GenerateSLR1("E", grammar_lr0_shift_reduce)
    print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}")
    table = gen.gen_table()
    print(table.format())
    tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True)
    print(format_node(tree) + "\n")
    print()
    # SLR1 can't handle this.
    print("grammar_aho_ullman_1 (SLR1):")
    grammar_aho_ullman_1 = [
        ("S", ["L", "=", "R"]),
        ("S", ["R"]),
        ("L", ["*", "R"]),
        ("L", ["id"]),
        ("R", ["L"]),
    ]
    try:
        gen = GenerateSLR1("S", grammar_aho_ullman_1)
        table = gen.gen_table()
        assert False
    except ValueError as e:
        print(e)
        print()
    # Here's an example with a full LR1 grammar, though.
    print("grammar_aho_ullman_2 (LR1):")
    grammar_aho_ullman_2 = [
        ("S", ["X", "X"]),
        ("X", ["a", "X"]),
        ("X", ["b"]),
    ]
    gen = GenerateLR1("S", grammar_aho_ullman_2)
    table = gen.gen_table()
    print(table.format())
    parse(table, ["b", "a", "a", "b"], trace=True)
    print()
    # What happens if we do LALR to it?
    print("grammar_aho_ullman_2 (LALR):")
    gen = GenerateLALR("S", grammar_aho_ullman_2)
    table = gen.gen_table()
    print(table.format())
    print()
    # A fun LALR grammar.
    print("grammar_lalr:")
    grammar_lalr = [
        ("S", ["V", "E"]),
        ("E", ["F"]),
        ("E", ["E", "+", "F"]),
        ("F", ["V"]),
        ("F", ["int"]),
        ("F", ["(", "E", ")"]),
        ("V", ["id"]),
    ]
    gen = GenerateLALR("S", grammar_lalr)
    table = gen.gen_table()
    print(table.format())
    print()
 # Run the demo walkthrough only when this module is executed as a script.
 if __name__ == "__main__":
    examples()
--- a/parser/runtime.py
+++ b/parser/runtime.py
@ -5,7 +5,7 @@ import logging
 import typing
 from dataclasses import dataclass
-from . import parser  # pyright: ignore # You're drunk.
+from . import parser
@dataclass
@ -267,17 +267,27 @@ def recover(table: parser.ParseTable, input: list[TokenValue], start: int, stack
 action_log = logging.getLogger("parser.action")
 class TokenStream(typing.Protocol):
    """Structural interface for the token source handed to Parser.parse().
    Any object providing matching tokens() and lines() methods satisfies it;
    no inheritance from this class is required.
    """
    def tokens(self) -> list[typing.Tuple[parser.Terminal, int, int]]:
        """The tokens in the stream, in the form (terminal, start, length)."""
        ...
    def lines(self) -> list[int]:
        """The offsets of line breaks in the tokens. (The end of line 0 is at
        index 0, etc.) Parser.parse() bisects this list to turn an error
        offset into a line and column, so it must be in ascending order."""
        ...
 class Parser:
    # Our stack is a stack of tuples, where the first entry is the state
    # number and the second entry is the 'value' that was generated when the
    # state was pushed.
    table: parser.ParseTable
-    def __init__(self, table, trace):
+    def __init__(self, table):
        self.trace = trace
        self.table = table
-    def parse(self, tokens) -> typing.Tuple[Tree | None, list[str]]:
+    def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]:
        input_tokens = tokens.tokens()
        input: list[TokenValue] = [
            TokenValue(kind=kind.value, start=start, end=start + length)
@ -406,15 +416,17 @@ class Parser:
        # All done.
        error_strings = []
-        for parse_error in errors:
+        if errors:
-            line_index = bisect.bisect_left(tokens.lines, parse_error.start)
+            lines = tokens.lines()
-            if line_index == 0:
+            for parse_error in errors:
-                col_start = 0
+                line_index = bisect.bisect_left(lines, parse_error.start)
-            else:
+                if line_index == 0:
-                col_start = tokens.lines[line_index - 1] + 1
+                    col_start = 0
-            column_index = parse_error.start - col_start
+                else:
-            line_index += 1
+                    col_start = lines[line_index - 1] + 1
                column_index = parse_error.start - col_start
                line_index += 1
-            error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")
+                error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")
        return (result, error_strings)
--- a/tests/test_grammar.py
+++ b/tests/test_grammar.py
@ -1,6 +1,67 @@
-import parser
+import typing
 import pytest
 import parser
 import parser.runtime as runtime
 from parser import Grammar, seq, rule, Terminal
 PLUS = Terminal("+")
 LPAREN = Terminal("(")
 RPAREN = Terminal(")")
 IDENTIFIER = Terminal("id")
 class Tokens:
    """Minimal in-memory token stream for tests.
    Wraps a fixed sequence of terminals and exposes the tokens()/lines()
    pair that the runtime parser reads from its input.
    """
    def __init__(self, *toks: Terminal):
        # No source text backs these tokens, so there are no line breaks and
        # every token is recorded at start 0 with length 0.
        self._lines = []
        self._tokens = [(terminal, 0, 0) for terminal in toks]
    def tokens(self):
        """Return the (terminal, start, length) tuples."""
        return self._tokens
    def lines(self):
        """Return the (always empty) list of line-break offsets."""
        return self._lines
 def _tree(treeform) -> runtime.Tree | runtime.TokenValue:
    """Expand a compact test literal into runtime parse-tree objects.
    A bare string becomes a TokenValue; a tuple of the form
    (name, child, child, ...) becomes a Tree whose children are expanded
    recursively. Every start/end position is zero.
    """
    # Leaf: a token written as its terminal name.
    if isinstance(treeform, str):
        return runtime.TokenValue(treeform, 0, 0)
    # Interior node: first element is the rule name, the rest are children.
    assert isinstance(treeform, tuple)
    name = treeform[0]
    assert isinstance(name, str)
    children = tuple(map(_tree, treeform[1:]))
    return runtime.Tree(name=name, start=0, end=0, children=children)
 class LR0Grammar(Grammar):
    # Small left-recursive expression grammar used by test_lr0_lr0:
    #   E -> E "+" T | T
    #   T -> "(" E ")" | "id"
    # `start` names the start rule and `generator` selects the LR0 table
    # generator for build_table().
    start = "E"
    generator = parser.GenerateLR0
    @rule
    def E(self):
        # A sum: left-recursive on E, or a single T.
        return seq(self.E, PLUS, self.T) | self.T
    @rule
    def T(self):
        # A term: a parenthesized E or an identifier token.
        return seq(LPAREN, self.E, RPAREN) | IDENTIFIER
 def test_lr0_lr0():
    """An LR0 grammar should work with an LR0 generator.
    Parses `id + ( id )` with the LR0-generated table and checks that there
    are no errors and the tree has the expected shape.
    """
    stream = Tokens(IDENTIFIER, PLUS, LPAREN, IDENTIFIER, RPAREN)
    expected = _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")")))
    # Use a distinct local name so the imported `parser` module is not shadowed.
    lr0_parser = runtime.Parser(LR0Grammar().build_table())
    tree, errors = lr0_parser.parse(stream)
    assert errors == []
    assert tree == expected
 def test_conflicting_names():
    """Terminals and nonterminals cannot have the same name.
@ -16,14 +77,135 @@ def test_conflicting_names():
    to understand.
    """
-    IDENTIFIER = parser.Terminal("Identifier")
+    IDENTIFIER = Terminal("Identifier")
-    class TestGrammar(parser.Grammar):
+    class TestGrammar(Grammar):
        start = "Identifier"
-        @parser.rule("Identifier")
+        @rule("Identifier")
        def identifier(self):
            return IDENTIFIER
    with pytest.raises(ValueError):
        TestGrammar().build_table()
 ###############################################################################
 # Examples
 ###############################################################################
 # def examples():
 #     def dump_grammar(grammar):
 #         for name, symbols in grammar:
 #             print(f"{name} -> {symbols}")
 #         print()
 #     # This one doesn't work with LR0, though, it has a shift/reduce conflict.
 #     print("grammar_lr0_shift_reduce (LR0):")
 #     grammar_lr0_shift_reduce = grammar_simple + [
 #         ("T", ["id", "[", "E", "]"]),
 #     ]
 #     try:
 #         gen = GenerateLR0("E", grammar_lr0_shift_reduce)
 #         table = gen.gen_table()
 #         assert False
 #     except ValueError as e:
 #         print(e)
 #         print()
 #     # Nor does this: it has a reduce/reduce conflict.
 #     print("grammar_lr0_reduce_reduce (LR0):")
 #     grammar_lr0_reduce_reduce = grammar_simple + [
 #         ("E", ["V", "=", "E"]),
 #         ("V", ["id"]),
 #     ]
 #     try:
 #         gen = GenerateLR0("E", grammar_lr0_reduce_reduce)
 #         table = gen.gen_table()
 #         assert False
 #     except ValueError as e:
 #         print(e)
 #         print()
 #     # Nullable symbols just don't work with constructs like this, because you can't
 #     # look ahead to figure out if you should reduce an empty 'F' or not.
 #     print("grammar_nullable (LR0):")
 #     grammar_nullable = [
 #         ("E", ["F", "boop"]),
 #         ("F", ["beep"]),
 #         ("F", []),
 #     ]
 #     try:
 #         gen = GenerateLR0("E", grammar_nullable)
 #         table = gen.gen_table()
 #         assert False
 #     except ValueError as e:
 #         print(e)
 #         print()
 #     print("grammar_lr0_shift_reduce (SLR1):")
 #     dump_grammar(grammar_lr0_shift_reduce)
 #     gen = GenerateSLR1("E", grammar_lr0_shift_reduce)
 #     print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}")
 #     table = gen.gen_table()
 #     print(table.format())
 #     tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True)
 #     print(format_node(tree) + "\n")
 #     print()
 #     # SLR1 can't handle this.
 #     print("grammar_aho_ullman_1 (SLR1):")
 #     grammar_aho_ullman_1 = [
 #         ("S", ["L", "=", "R"]),
 #         ("S", ["R"]),
 #         ("L", ["*", "R"]),
 #         ("L", ["id"]),
 #         ("R", ["L"]),
 #     ]
 #     try:
 #         gen = GenerateSLR1("S", grammar_aho_ullman_1)
 #         table = gen.gen_table()
 #         assert False
 #     except ValueError as e:
 #         print(e)
 #         print()
 #     # Here's an example with a full LR1 grammar, though.
 #     print("grammar_aho_ullman_2 (LR1):")
 #     grammar_aho_ullman_2 = [
 #         ("S", ["X", "X"]),
 #         ("X", ["a", "X"]),
 #         ("X", ["b"]),
 #     ]
 #     gen = GenerateLR1("S", grammar_aho_ullman_2)
 #     table = gen.gen_table()
 #     print(table.format())
 #     parse(table, ["b", "a", "a", "b"], trace=True)
 #     print()
 #     # What happens if we do LALR to it?
 #     print("grammar_aho_ullman_2 (LALR):")
 #     gen = GenerateLALR("S", grammar_aho_ullman_2)
 #     table = gen.gen_table()
 #     print(table.format())
 #     print()
 #     # A fun LALR grammar.
 #     print("grammar_lalr:")
 #     grammar_lalr = [
 #         ("S", ["V", "E"]),
 #         ("E", ["F"]),
 #         ("E", ["E", "+", "F"]),
 #         ("F", ["V"]),
 #         ("F", ["int"]),
 #         ("F", ["(", "E", ")"]),
 #         ("V", ["id"]),
 #     ]
 #     gen = GenerateLALR("S", grammar_lalr)
 #     table = gen.gen_table()
 #     print(table.format())
 #     print()
 # if __name__ == "__main__":
 #     examples()