Start moving the examples into tests

2024-06-15 07:52:16 -07:00 · 2024-06-15 07:52:16 -07:00 · e04aa1966e
commit e04aa1966e
parent d3b8d0e836
6 changed files with 221 additions and 237 deletions
--- a/parser/parser.py
+++ b/parser/parser.py
@ -1097,73 +1097,6 @@ class GenerateLR0:
        return builder.flush(config_sets)


-def parse(table: ParseTable, input, trace=False):
-    """Parse the input with the generated parsing table and return the
-    concrete syntax tree.
-
-    The parsing table can be generated by GenerateLR0.gen_table() or by any
-    of the other generators below. The parsing mechanism never changes, only
-    the table generation mechanism.
-
-    input is a list of tokens. Don't stick an end-of-stream marker, I'll stick
-    one on for you.
-
-    This is not a *great* parser, it's really just a demo for what you can
-    do with the table.
-    """
-    assert "$" not in input
-    input = input + ["$"]
-    input_index = 0
-
-    # Our stack is a stack of tuples, where the first entry is the state number
-    # and the second entry is the 'value' that was generated when the state was
-    # pushed.
-    stack: list[typing.Tuple[int, typing.Any]] = [(0, None)]
-    while True:
-        current_state = stack[-1][0]
-        current_token = input[input_index]
-
-        action = table.actions[current_state].get(current_token, Error())
-        if trace:
-            print(
-                "{stack: <20}  {input: <50}  {action: <5}".format(
-                    stack=repr([s[0] for s in stack]),
-                    input=repr(input[input_index:]),
-                    action=repr(action),
-                )
-            )
-
-        match action:
-            case Accept():
-                return stack[-1][1]
-
-            case Reduce(name=name, count=size, transparent=transparent):
-                children = []
-                for _, c in stack[-size:]:
-                    if isinstance(c, tuple) and c[0] is None:
-                        children.extend(c[1])
-                    else:
-                        children.append(c)
-
-                value = (name if not transparent else None, tuple(children))
-                stack = stack[:-size]
-
-                goto = table.gotos[stack[-1][0]].get(name)
-                assert goto is not None
-                stack.append((goto, value))
-
-            case Shift(state):
-                stack.append((state, (current_token, ())))
-                input_index += 1
-
-            case Error():
-                raise ValueError(
-                    "Syntax error: unexpected symbol {sym}".format(
-                        sym=current_token,
-                    ),
-                )
-
-
 ###############################################################################
 # SLR(1)
 ###############################################################################
@ -1978,150 +1911,3 @@ class Grammar:
        gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
        table = gen.gen_table()
        return table
-
-
-###############################################################################
-# Formatting
-###############################################################################
-def format_node(node):
-    """Print out an indented concrete syntax tree, from parse()."""
-    lines = ["{name}".format(name=node[0])] + [
-        "  " + line for child in node[1] for line in format_node(child).split("\n")
-    ]
-    return "\n".join(lines)
-
-
-###############################################################################
-# Examples
-###############################################################################
-def examples():
-    def dump_grammar(grammar):
-        for name, symbols in grammar:
-            print(f"{name} -> {symbols}")
-        print()
-
-    # OK, this is a very simple LR0 grammar.
-    print("grammar_simple:")
-    grammar_simple = [
-        ("E", ["E", "+", "T"]),
-        ("E", ["T"]),
-        ("T", ["(", "E", ")"]),
-        ("T", ["id"]),
-    ]
-
-    gen = GenerateLR0("E", grammar_simple)
-    table = gen.gen_table()
-    print(table.format())
-    tree = parse(table, ["id", "+", "(", "id", ")"])
-    print(format_node(tree) + "\n")
-    print()
-
-    # This one doesn't work with LR0, though, it has a shift/reduce conflict.
-    print("grammar_lr0_shift_reduce (LR0):")
-    grammar_lr0_shift_reduce = grammar_simple + [
-        ("T", ["id", "[", "E", "]"]),
-    ]
-    try:
-        gen = GenerateLR0("E", grammar_lr0_shift_reduce)
-        table = gen.gen_table()
-        assert False
-    except ValueError as e:
-        print(e)
-        print()
-
-    # Nor does this: it has a reduce/reduce conflict.
-    print("grammar_lr0_reduce_reduce (LR0):")
-    grammar_lr0_reduce_reduce = grammar_simple + [
-        ("E", ["V", "=", "E"]),
-        ("V", ["id"]),
-    ]
-    try:
-        gen = GenerateLR0("E", grammar_lr0_reduce_reduce)
-        table = gen.gen_table()
-        assert False
-    except ValueError as e:
-        print(e)
-        print()
-
-    # Nullable symbols just don't work with constructs like this, because you can't
-    # look ahead to figure out if you should reduce an empty 'F' or not.
-    print("grammar_nullable (LR0):")
-    grammar_nullable = [
-        ("E", ["F", "boop"]),
-        ("F", ["beep"]),
-        ("F", []),
-    ]
-    try:
-        gen = GenerateLR0("E", grammar_nullable)
-        table = gen.gen_table()
-        assert False
-    except ValueError as e:
-        print(e)
-        print()
-
-    print("grammar_lr0_shift_reduce (SLR1):")
-    dump_grammar(grammar_lr0_shift_reduce)
-    gen = GenerateSLR1("E", grammar_lr0_shift_reduce)
-    print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}")
-    table = gen.gen_table()
-    print(table.format())
-    tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True)
-    print(format_node(tree) + "\n")
-    print()
-
-    # SLR1 can't handle this.
-    print("grammar_aho_ullman_1 (SLR1):")
-    grammar_aho_ullman_1 = [
-        ("S", ["L", "=", "R"]),
-        ("S", ["R"]),
-        ("L", ["*", "R"]),
-        ("L", ["id"]),
-        ("R", ["L"]),
-    ]
-    try:
-        gen = GenerateSLR1("S", grammar_aho_ullman_1)
-        table = gen.gen_table()
-        assert False
-    except ValueError as e:
-        print(e)
-        print()
-
-    # Here's an example with a full LR1 grammar, though.
-    print("grammar_aho_ullman_2 (LR1):")
-    grammar_aho_ullman_2 = [
-        ("S", ["X", "X"]),
-        ("X", ["a", "X"]),
-        ("X", ["b"]),
-    ]
-    gen = GenerateLR1("S", grammar_aho_ullman_2)
-    table = gen.gen_table()
-    print(table.format())
-    parse(table, ["b", "a", "a", "b"], trace=True)
-    print()
-
-    # What happens if we do LALR to it?
-    print("grammar_aho_ullman_2 (LALR):")
-    gen = GenerateLALR("S", grammar_aho_ullman_2)
-    table = gen.gen_table()
-    print(table.format())
-    print()
-
-    # A fun LALAR grammar.
-    print("grammar_lalr:")
-    grammar_lalr = [
-        ("S", ["V", "E"]),
-        ("E", ["F"]),
-        ("E", ["E", "+", "F"]),
-        ("F", ["V"]),
-        ("F", ["int"]),
-        ("F", ["(", "E", ")"]),
-        ("V", ["id"]),
-    ]
-    gen = GenerateLALR("S", grammar_lalr)
-    table = gen.gen_table()
-    print(table.format())
-    print()
-
-
-if __name__ == "__main__":
-    examples()
--- a/parser/runtime.py
+++ b/parser/runtime.py
@ -5,7 +5,7 @@ import logging
 import typing
 from dataclasses import dataclass

-from . import parser  # pyright: ignore # You're drunk.
+from . import parser


@dataclass
@ -267,17 +267,27 @@ def recover(table: parser.ParseTable, input: list[TokenValue], start: int, stack
 action_log = logging.getLogger("parser.action")


+class TokenStream(typing.Protocol):
+    def tokens(self) -> list[typing.Tuple[parser.Terminal, int, int]]:
+        """The tokens in the stream, in the form (terminal, start, length)."""
+        ...
+
+    def lines(self) -> list[int]:
+        """The offsets of line breaks in the tokens, in ascending order.
+        (Entry 0 is the offset of the break that ends line 0, etc.)"""
+        ...
+
+
 class Parser:
    # Our stack is a stack of tuples, where the first entry is the state
    # number and the second entry is the 'value' that was generated when the
    # state was pushed.
    table: parser.ParseTable

-    def __init__(self, table, trace):
-        self.trace = trace
+    def __init__(self, table):
        self.table = table

-    def parse(self, tokens) -> typing.Tuple[Tree | None, list[str]]:
+    def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]:
        input_tokens = tokens.tokens()
        input: list[TokenValue] = [
            TokenValue(kind=kind.value, start=start, end=start + length)
@ -406,15 +416,17 @@ class Parser:

        # All done.
        error_strings = []
-        for parse_error in errors:
-            line_index = bisect.bisect_left(tokens.lines, parse_error.start)
-            if line_index == 0:
-                col_start = 0
-            else:
-                col_start = tokens.lines[line_index - 1] + 1
-            column_index = parse_error.start - col_start
-            line_index += 1
+        if errors:
+            lines = tokens.lines()
+            for parse_error in errors:
+                line_index = bisect.bisect_left(lines, parse_error.start)
+                if line_index == 0:
+                    col_start = 0
+                else:
+                    col_start = lines[line_index - 1] + 1
+                column_index = parse_error.start - col_start
+                line_index += 1

-            error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")
+                error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")

        return (result, error_strings)