Start moving the examples into tests

This commit is contained in:
John Doty 2024-06-15 07:52:16 -07:00
parent d3b8d0e836
commit e04aa1966e
6 changed files with 221 additions and 237 deletions

View file

@ -1097,73 +1097,6 @@ class GenerateLR0:
return builder.flush(config_sets)
def parse(table: ParseTable, input, trace=False):
"""Parse the input with the generated parsing table and return the
concrete syntax tree.
The parsing table can be generated by GenerateLR0.gen_table() or by any
of the other generators below. The parsing mechanism never changes, only
the table generation mechanism.
input is a list of tokens. Don't stick an end-of-stream marker, I'll stick
one on for you.
This is not a *great* parser, it's really just a demo for what you can
do with the table.
"""
assert "$" not in input
input = input + ["$"]
input_index = 0
# Our stack is a stack of tuples, where the first entry is the state number
# and the second entry is the 'value' that was generated when the state was
# pushed.
stack: list[typing.Tuple[int, typing.Any]] = [(0, None)]
while True:
current_state = stack[-1][0]
current_token = input[input_index]
action = table.actions[current_state].get(current_token, Error())
if trace:
print(
"{stack: <20} {input: <50} {action: <5}".format(
stack=repr([s[0] for s in stack]),
input=repr(input[input_index:]),
action=repr(action),
)
)
match action:
case Accept():
return stack[-1][1]
case Reduce(name=name, count=size, transparent=transparent):
children = []
for _, c in stack[-size:]:
if isinstance(c, tuple) and c[0] is None:
children.extend(c[1])
else:
children.append(c)
value = (name if not transparent else None, tuple(children))
stack = stack[:-size]
goto = table.gotos[stack[-1][0]].get(name)
assert goto is not None
stack.append((goto, value))
case Shift(state):
stack.append((state, (current_token, ())))
input_index += 1
case Error():
raise ValueError(
"Syntax error: unexpected symbol {sym}".format(
sym=current_token,
),
)
###############################################################################
# SLR(1)
###############################################################################
@ -1978,150 +1911,3 @@ class Grammar:
gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
table = gen.gen_table()
return table
###############################################################################
# Formatting
###############################################################################
def format_node(node):
    """Render a concrete syntax tree from parse() as indented text.

    node is a (name, children) pair. Every node's name goes on its own line,
    and each level of nesting adds one more copy of the indent prefix. The
    lines are joined with newlines; there is no trailing newline.
    """
    rendered = []

    def walk(current, depth):
        prefix = " " * depth
        # A name may itself span several lines; each piece gets the prefix.
        for piece in f"{current[0]}".split("\n"):
            rendered.append(prefix + piece)
        for child in current[1]:
            walk(child, depth + 1)

    walk(node, 0)
    return "\n".join(rendered)
###############################################################################
# Examples
###############################################################################
def examples():
    """Run a series of demo grammars through the table generators.

    Each section prints a title, then either the generated table (and, for
    some grammars, a parse of a sample input) or the ValueError that table
    generation raised for a grammar the generator cannot handle.

    Raises AssertionError if a grammar that is supposed to be rejected
    produces a table anyway.
    """

    def dump_grammar(grammar):
        """Print every production of grammar, followed by a blank line."""
        for name, symbols in grammar:
            print(f"{name} -> {symbols}")
        print()

    def expect_conflict(generator, start, grammar):
        """Check that generator cannot build a table for grammar.

        Prints the ValueError raised while building the table, then a blank
        line. The failure is raised explicitly instead of with a bare
        `assert False`, so the check still runs under `python -O`.
        """
        try:
            generator(start, grammar).gen_table()
        except ValueError as e:
            print(e)
            print()
        else:
            raise AssertionError(
                "expected table generation to fail with a conflict, but it succeeded"
            )

    # OK, this is a very simple LR0 grammar.
    print("grammar_simple:")
    grammar_simple = [
        ("E", ["E", "+", "T"]),
        ("E", ["T"]),
        ("T", ["(", "E", ")"]),
        ("T", ["id"]),
    ]

    gen = GenerateLR0("E", grammar_simple)
    table = gen.gen_table()
    print(table.format())
    tree = parse(table, ["id", "+", "(", "id", ")"])
    print(format_node(tree) + "\n")
    print()

    # This one doesn't work with LR0, though, it has a shift/reduce conflict.
    print("grammar_lr0_shift_reduce (LR0):")
    grammar_lr0_shift_reduce = grammar_simple + [
        ("T", ["id", "[", "E", "]"]),
    ]
    expect_conflict(GenerateLR0, "E", grammar_lr0_shift_reduce)

    # Nor does this: it has a reduce/reduce conflict.
    print("grammar_lr0_reduce_reduce (LR0):")
    grammar_lr0_reduce_reduce = grammar_simple + [
        ("E", ["V", "=", "E"]),
        ("V", ["id"]),
    ]
    expect_conflict(GenerateLR0, "E", grammar_lr0_reduce_reduce)

    # Nullable symbols just don't work with constructs like this, because you can't
    # look ahead to figure out if you should reduce an empty 'F' or not.
    print("grammar_nullable (LR0):")
    grammar_nullable = [
        ("E", ["F", "boop"]),
        ("F", ["beep"]),
        ("F", []),
    ]
    expect_conflict(GenerateLR0, "E", grammar_nullable)

    print("grammar_lr0_shift_reduce (SLR1):")
    dump_grammar(grammar_lr0_shift_reduce)
    gen = GenerateSLR1("E", grammar_lr0_shift_reduce)
    print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}")
    table = gen.gen_table()
    print(table.format())
    tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True)
    print(format_node(tree) + "\n")
    print()

    # SLR1 can't handle this.
    print("grammar_aho_ullman_1 (SLR1):")
    grammar_aho_ullman_1 = [
        ("S", ["L", "=", "R"]),
        ("S", ["R"]),
        ("L", ["*", "R"]),
        ("L", ["id"]),
        ("R", ["L"]),
    ]
    expect_conflict(GenerateSLR1, "S", grammar_aho_ullman_1)

    # Here's an example with a full LR1 grammar, though.
    print("grammar_aho_ullman_2 (LR1):")
    grammar_aho_ullman_2 = [
        ("S", ["X", "X"]),
        ("X", ["a", "X"]),
        ("X", ["b"]),
    ]
    gen = GenerateLR1("S", grammar_aho_ullman_2)
    table = gen.gen_table()
    print(table.format())
    # Only the trace output matters here; the resulting tree is discarded.
    parse(table, ["b", "a", "a", "b"], trace=True)
    print()

    # What happens if we do LALR to it?
    print("grammar_aho_ullman_2 (LALR):")
    gen = GenerateLALR("S", grammar_aho_ullman_2)
    table = gen.gen_table()
    print(table.format())
    print()

    # A fun LALR grammar.
    print("grammar_lalr:")
    grammar_lalr = [
        ("S", ["V", "E"]),
        ("E", ["F"]),
        ("E", ["E", "+", "F"]),
        ("F", ["V"]),
        ("F", ["int"]),
        ("F", ["(", "E", ")"]),
        ("V", ["id"]),
    ]
    gen = GenerateLALR("S", grammar_lalr)
    table = gen.gen_table()
    print(table.format())
    print()
# Run the demo grammars only when this file is executed as a script, not
# when it is imported.
if __name__ == "__main__":
    examples()

View file

@ -5,7 +5,7 @@ import logging
import typing
from dataclasses import dataclass
from . import parser # pyright: ignore # You're drunk.
from . import parser
@dataclass
@ -267,17 +267,27 @@ def recover(table: parser.ParseTable, input: list[TokenValue], start: int, stack
action_log = logging.getLogger("parser.action")
class TokenStream(typing.Protocol):
    """Structural interface for the token source accepted by Parser.parse().

    Any object with matching `tokens()` and `lines()` methods satisfies it;
    no inheritance is required.
    """

    def tokens(self) -> list[typing.Tuple[parser.Terminal, int, int]]:
        """The tokens in the stream, in the form (terminal, start, length).

        Parser.parse() turns each entry into a token value that spans from
        start to start + length.
        """
        ...

    def lines(self) -> list[int]:
        """The offsets of line breaks in the tokens. (The end of line 0 is at
        index 0, etc.)

        Parser.parse() bisects this list to turn an error's start offset into
        a line and column, so it is expected to be in ascending order.
        """
        ...
class Parser:
# Our stack is a stack of tuples, where the first entry is the state
# number and the second entry is the 'value' that was generated when the
# state was pushed.
table: parser.ParseTable
def __init__(self, table, trace):
self.trace = trace
def __init__(self, table):
self.table = table
def parse(self, tokens) -> typing.Tuple[Tree | None, list[str]]:
def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]:
input_tokens = tokens.tokens()
input: list[TokenValue] = [
TokenValue(kind=kind.value, start=start, end=start + length)
@ -406,15 +416,17 @@ class Parser:
# All done.
error_strings = []
for parse_error in errors:
line_index = bisect.bisect_left(tokens.lines, parse_error.start)
if line_index == 0:
col_start = 0
else:
col_start = tokens.lines[line_index - 1] + 1
column_index = parse_error.start - col_start
line_index += 1
if errors:
lines = tokens.lines()
for parse_error in errors:
line_index = bisect.bisect_left(lines, parse_error.start)
if line_index == 0:
col_start = 0
else:
col_start = lines[line_index - 1] + 1
column_index = parse_error.start - col_start
line_index += 1
error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")
error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")
return (result, error_strings)