Start moving the examples into tests

This commit is contained in:
John Doty 2024-06-15 07:52:16 -07:00
parent d3b8d0e836
commit e04aa1966e
6 changed files with 221 additions and 237 deletions

View file

@ -1,5 +1,6 @@
# This is an example grammar. # This is an example grammar.
import re import re
import typing
import parser import parser
from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule
@ -517,12 +518,15 @@ import bisect
class FineTokens: class FineTokens:
def __init__(self, src: str): def __init__(self, src: str):
self.src = src self.src = src
self._tokens = list(tokenize(src)) self._tokens: list[typing.Tuple[Terminal, int, int]] = list(tokenize(src))
self.lines = [m.start() for m in re.finditer("\n", src)] self._lines = [m.start() for m in re.finditer("\n", src)]
def tokens(self): def tokens(self):
return self._tokens return self._tokens
def lines(self):
return self._lines
def dump(self, *, start=None, end=None): def dump(self, *, start=None, end=None):
if start is None: if start is None:
start = 0 start = 0
@ -531,11 +535,11 @@ class FineTokens:
for token in self._tokens[start:end]: for token in self._tokens[start:end]:
(kind, start, length) = token (kind, start, length) = token
line_index = bisect.bisect_left(self.lines, start) line_index = bisect.bisect_left(self._lines, start)
if line_index == 0: if line_index == 0:
col_start = 0 col_start = 0
else: else:
col_start = self.lines[line_index - 1] + 1 col_start = self._lines[line_index - 1] + 1
column_index = start - col_start column_index = start - col_start
value = self.src[start : start + length] value = self.src[start : start + length]
print(f"{start:04} {kind.value:12} {value} ({line_index}, {column_index})") print(f"{start:04} {kind.value:12} {value} ({line_index}, {column_index})")

View file

@ -300,7 +300,7 @@ class Harness:
# print(f"{tokens.lines}") # print(f"{tokens.lines}")
# tokens.dump(end=5) # tokens.dump(end=5)
(tree, errors) = runtime.Parser(table, trace=None).parse(self.tokens) (tree, errors) = runtime.Parser(table).parse(self.tokens)
parse_time = time.time() parse_time = time.time()
self.tree = tree self.tree = tree
self.errors = errors self.errors = errors

View file

@ -1,3 +1,3 @@
.PHONY: test .PHONY: test
test: test:
pytest pdm run pytest

View file

@ -1097,73 +1097,6 @@ class GenerateLR0:
return builder.flush(config_sets) return builder.flush(config_sets)
def parse(table: ParseTable, input, trace=False):
"""Parse the input with the generated parsing table and return the
concrete syntax tree.
The parsing table can be generated by GenerateLR0.gen_table() or by any
of the other generators below. The parsing mechanism never changes, only
the table generation mechanism.
input is a list of tokens. Don't stick an end-of-stream marker, I'll stick
one on for you.
This is not a *great* parser, it's really just a demo for what you can
do with the table.
"""
assert "$" not in input
input = input + ["$"]
input_index = 0
# Our stack is a stack of tuples, where the first entry is the state number
# and the second entry is the 'value' that was generated when the state was
# pushed.
stack: list[typing.Tuple[int, typing.Any]] = [(0, None)]
while True:
current_state = stack[-1][0]
current_token = input[input_index]
action = table.actions[current_state].get(current_token, Error())
if trace:
print(
"{stack: <20} {input: <50} {action: <5}".format(
stack=repr([s[0] for s in stack]),
input=repr(input[input_index:]),
action=repr(action),
)
)
match action:
case Accept():
return stack[-1][1]
case Reduce(name=name, count=size, transparent=transparent):
children = []
for _, c in stack[-size:]:
if isinstance(c, tuple) and c[0] is None:
children.extend(c[1])
else:
children.append(c)
value = (name if not transparent else None, tuple(children))
stack = stack[:-size]
goto = table.gotos[stack[-1][0]].get(name)
assert goto is not None
stack.append((goto, value))
case Shift(state):
stack.append((state, (current_token, ())))
input_index += 1
case Error():
raise ValueError(
"Syntax error: unexpected symbol {sym}".format(
sym=current_token,
),
)
############################################################################### ###############################################################################
# SLR(1) # SLR(1)
############################################################################### ###############################################################################
@ -1978,150 +1911,3 @@ class Grammar:
gen = generator(start, desugared, precedence=self._precedence, transparents=transparents) gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
table = gen.gen_table() table = gen.gen_table()
return table return table
###############################################################################
# Formatting
###############################################################################
def format_node(node):
    """Render a concrete syntax tree from parse() as indented text.

    ``node`` is a ``(name, children)`` pair. The name goes on the first line
    and every line produced by a child is prefixed with one more space.
    """
    label, children = node[0], node[1]
    rendered = [f"{label}"]
    for child in children:
        for child_line in format_node(child).split("\n"):
            rendered.append(" " + child_line)
    return "\n".join(rendered)
###############################################################################
# Examples
###############################################################################
def examples():
    """Run the demo grammars through the table generators and print the results.

    Grammars that a generator can handle have their table printed (and, for
    some, a sample input parsed). Grammars that a generator is expected to
    reject have the resulting ValueError printed instead.

    Raises:
        AssertionError: If a grammar that is expected to be rejected
            produces a table after all.
    """

    def dump_grammar(grammar):
        """Print every production as ``name -> symbols``, then a blank line."""
        for name, symbols in grammar:
            print(f"{name} -> {symbols}")
        print()

    def expect_conflict(generator, start, grammar):
        """Print the ValueError raised while building a table for `grammar`.

        An explicit raise is used rather than a bare ``assert False`` so the
        check still fires when Python runs with ``-O``.
        """
        try:
            generator(start, grammar).gen_table()
        except ValueError as e:
            print(e)
        else:
            raise AssertionError("expected table generation to fail")
        print()

    # OK, this is a very simple LR0 grammar.
    print("grammar_simple:")
    grammar_simple = [
        ("E", ["E", "+", "T"]),
        ("E", ["T"]),
        ("T", ["(", "E", ")"]),
        ("T", ["id"]),
    ]

    gen = GenerateLR0("E", grammar_simple)
    table = gen.gen_table()
    print(table.format())
    tree = parse(table, ["id", "+", "(", "id", ")"])
    print(format_node(tree) + "\n")
    print()

    # This one doesn't work with LR0, though, it has a shift/reduce conflict.
    print("grammar_lr0_shift_reduce (LR0):")
    grammar_lr0_shift_reduce = grammar_simple + [
        ("T", ["id", "[", "E", "]"]),
    ]
    expect_conflict(GenerateLR0, "E", grammar_lr0_shift_reduce)

    # Nor does this: it has a reduce/reduce conflict.
    print("grammar_lr0_reduce_reduce (LR0):")
    grammar_lr0_reduce_reduce = grammar_simple + [
        ("E", ["V", "=", "E"]),
        ("V", ["id"]),
    ]
    expect_conflict(GenerateLR0, "E", grammar_lr0_reduce_reduce)

    # Nullable symbols just don't work with constructs like this, because you can't
    # look ahead to figure out if you should reduce an empty 'F' or not.
    print("grammar_nullable (LR0):")
    grammar_nullable = [
        ("E", ["F", "boop"]),
        ("F", ["beep"]),
        ("F", []),
    ]
    expect_conflict(GenerateLR0, "E", grammar_nullable)

    print("grammar_lr0_shift_reduce (SLR1):")
    dump_grammar(grammar_lr0_shift_reduce)
    gen = GenerateSLR1("E", grammar_lr0_shift_reduce)
    print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}")
    table = gen.gen_table()
    print(table.format())
    tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True)
    print(format_node(tree) + "\n")
    print()

    # SLR1 can't handle this.
    print("grammar_aho_ullman_1 (SLR1):")
    grammar_aho_ullman_1 = [
        ("S", ["L", "=", "R"]),
        ("S", ["R"]),
        ("L", ["*", "R"]),
        ("L", ["id"]),
        ("R", ["L"]),
    ]
    expect_conflict(GenerateSLR1, "S", grammar_aho_ullman_1)

    # Here's an example with a full LR1 grammar, though.
    print("grammar_aho_ullman_2 (LR1):")
    grammar_aho_ullman_2 = [
        ("S", ["X", "X"]),
        ("X", ["a", "X"]),
        ("X", ["b"]),
    ]
    gen = GenerateLR1("S", grammar_aho_ullman_2)
    table = gen.gen_table()
    print(table.format())
    parse(table, ["b", "a", "a", "b"], trace=True)
    print()

    # What happens if we do LALR to it?
    print("grammar_aho_ullman_2 (LALR):")
    gen = GenerateLALR("S", grammar_aho_ullman_2)
    table = gen.gen_table()
    print(table.format())
    print()

    # A fun LALR grammar.
    print("grammar_lalr:")
    grammar_lalr = [
        ("S", ["V", "E"]),
        ("E", ["F"]),
        ("E", ["E", "+", "F"]),
        ("F", ["V"]),
        ("F", ["int"]),
        ("F", ["(", "E", ")"]),
        ("V", ["id"]),
    ]
    gen = GenerateLALR("S", grammar_lalr)
    table = gen.gen_table()
    print(table.format())
    print()


if __name__ == "__main__":
    examples()

View file

@ -5,7 +5,7 @@ import logging
import typing import typing
from dataclasses import dataclass from dataclasses import dataclass
from . import parser # pyright: ignore # You're drunk. from . import parser
@dataclass @dataclass
@ -267,17 +267,27 @@ def recover(table: parser.ParseTable, input: list[TokenValue], start: int, stack
action_log = logging.getLogger("parser.action") action_log = logging.getLogger("parser.action")
class TokenStream(typing.Protocol):
    """Structural interface for the token source handed to ``Parser.parse``."""

    def tokens(self) -> list[typing.Tuple[parser.Terminal, int, int]]:
        """Return the tokens of the stream as ``(terminal, start, length)``
        triples."""
        ...

    def lines(self) -> list[int]:
        """Return the offsets of the line breaks in the tokens.

        Entry 0 is the end of line 0, entry 1 the end of line 1, and so on.
        """
        ...
class Parser: class Parser:
# Our stack is a stack of tuples, where the first entry is the state # Our stack is a stack of tuples, where the first entry is the state
# number and the second entry is the 'value' that was generated when the # number and the second entry is the 'value' that was generated when the
# state was pushed. # state was pushed.
table: parser.ParseTable table: parser.ParseTable
def __init__(self, table, trace): def __init__(self, table):
self.trace = trace
self.table = table self.table = table
def parse(self, tokens) -> typing.Tuple[Tree | None, list[str]]: def parse(self, tokens: TokenStream) -> typing.Tuple[Tree | None, list[str]]:
input_tokens = tokens.tokens() input_tokens = tokens.tokens()
input: list[TokenValue] = [ input: list[TokenValue] = [
TokenValue(kind=kind.value, start=start, end=start + length) TokenValue(kind=kind.value, start=start, end=start + length)
@ -406,15 +416,17 @@ class Parser:
# All done. # All done.
error_strings = [] error_strings = []
for parse_error in errors: if errors:
line_index = bisect.bisect_left(tokens.lines, parse_error.start) lines = tokens.lines()
if line_index == 0: for parse_error in errors:
col_start = 0 line_index = bisect.bisect_left(lines, parse_error.start)
else: if line_index == 0:
col_start = tokens.lines[line_index - 1] + 1 col_start = 0
column_index = parse_error.start - col_start else:
line_index += 1 col_start = lines[line_index - 1] + 1
column_index = parse_error.start - col_start
line_index += 1
error_strings.append(f"{line_index}:{column_index}: {parse_error.message}") error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")
return (result, error_strings) return (result, error_strings)

View file

@ -1,6 +1,67 @@
import parser import typing
import pytest import pytest
import parser
import parser.runtime as runtime
from parser import Grammar, seq, rule, Terminal
PLUS = Terminal("+")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
IDENTIFIER = Terminal("id")
class Tokens:
    """Bare-bones token stream for tests, built directly from terminals."""

    def __init__(self, *toks: Terminal):
        # Each terminal becomes a (terminal, start, length) triple with the
        # start and length both fixed at 0.
        self._tokens = [(terminal, 0, 0) for terminal in toks]
        # No line-break offsets are recorded.
        self._lines = []

    def tokens(self):
        """Return the stored (terminal, 0, 0) triples."""
        return self._tokens

    def lines(self):
        """Return the (empty) list of line-break offsets."""
        return self._lines
def _tree(treeform) -> runtime.Tree | runtime.TokenValue:
    """Build the expected parse result from a compact nested-tuple form.

    A string becomes a ``runtime.TokenValue``. A tuple ``(name, *children)``
    becomes a ``runtime.Tree`` whose children are converted recursively.
    All start/end positions are 0.
    """
    # Leaf: a bare string stands for a token.
    if isinstance(treeform, str):
        return runtime.TokenValue(treeform, 0, 0)

    # Interior node: the first element is the name, the rest are children.
    assert isinstance(treeform, tuple)
    name = treeform[0]
    assert isinstance(name, str)
    converted = tuple(_tree(child) for child in treeform[1:])
    return runtime.Tree(name=name, start=0, end=0, children=converted)
class LR0Grammar(Grammar):
    # A small expression grammar, built with the LR0 generator:
    #
    #   E -> E "+" T | T
    #   T -> "(" E ")" | "id"
    start = "E"
    generator = parser.GenerateLR0

    @rule
    def E(self):
        addition = seq(self.E, PLUS, self.T)
        return addition | self.T

    @rule
    def T(self):
        parenthesized = seq(LPAREN, self.E, RPAREN)
        return parenthesized | IDENTIFIER
def test_lr0_lr0():
    """An LR0 grammar should work with an LR0 generator."""
    # Input: id + ( id )
    token_stream = Tokens(IDENTIFIER, PLUS, LPAREN, IDENTIFIER, RPAREN)
    expected = _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")")))

    # Named lr0_parser so the imported `parser` module is not shadowed.
    lr0_parser = runtime.Parser(LR0Grammar().build_table())
    tree, errors = lr0_parser.parse(token_stream)

    assert errors == []
    assert tree == expected
def test_conflicting_names(): def test_conflicting_names():
"""Terminals and nonterminals cannot have the same name. """Terminals and nonterminals cannot have the same name.
@ -16,14 +77,135 @@ def test_conflicting_names():
to understand. to understand.
""" """
IDENTIFIER = parser.Terminal("Identifier") IDENTIFIER = Terminal("Identifier")
class TestGrammar(parser.Grammar): class TestGrammar(Grammar):
start = "Identifier" start = "Identifier"
@parser.rule("Identifier") @rule("Identifier")
def identifier(self): def identifier(self):
return IDENTIFIER return IDENTIFIER
with pytest.raises(ValueError): with pytest.raises(ValueError):
TestGrammar().build_table() TestGrammar().build_table()
###############################################################################
# Examples
###############################################################################
# def examples():
# def dump_grammar(grammar):
# for name, symbols in grammar:
# print(f"{name} -> {symbols}")
# print()
# # This one doesn't work with LR0, though, it has a shift/reduce conflict.
# print("grammar_lr0_shift_reduce (LR0):")
# grammar_lr0_shift_reduce = grammar_simple + [
# ("T", ["id", "[", "E", "]"]),
# ]
# try:
# gen = GenerateLR0("E", grammar_lr0_shift_reduce)
# table = gen.gen_table()
# assert False
# except ValueError as e:
# print(e)
# print()
# # Nor does this: it has a reduce/reduce conflict.
# print("grammar_lr0_reduce_reduce (LR0):")
# grammar_lr0_reduce_reduce = grammar_simple + [
# ("E", ["V", "=", "E"]),
# ("V", ["id"]),
# ]
# try:
# gen = GenerateLR0("E", grammar_lr0_reduce_reduce)
# table = gen.gen_table()
# assert False
# except ValueError as e:
# print(e)
# print()
# # Nullable symbols just don't work with constructs like this, because you can't
# # look ahead to figure out if you should reduce an empty 'F' or not.
# print("grammar_nullable (LR0):")
# grammar_nullable = [
# ("E", ["F", "boop"]),
# ("F", ["beep"]),
# ("F", []),
# ]
# try:
# gen = GenerateLR0("E", grammar_nullable)
# table = gen.gen_table()
# assert False
# except ValueError as e:
# print(e)
# print()
# print("grammar_lr0_shift_reduce (SLR1):")
# dump_grammar(grammar_lr0_shift_reduce)
# gen = GenerateSLR1("E", grammar_lr0_shift_reduce)
# print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}")
# table = gen.gen_table()
# print(table.format())
# tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True)
# print(format_node(tree) + "\n")
# print()
# # SLR1 can't handle this.
# print("grammar_aho_ullman_1 (SLR1):")
# grammar_aho_ullman_1 = [
# ("S", ["L", "=", "R"]),
# ("S", ["R"]),
# ("L", ["*", "R"]),
# ("L", ["id"]),
# ("R", ["L"]),
# ]
# try:
# gen = GenerateSLR1("S", grammar_aho_ullman_1)
# table = gen.gen_table()
# assert False
# except ValueError as e:
# print(e)
# print()
# # Here's an example with a full LR1 grammar, though.
# print("grammar_aho_ullman_2 (LR1):")
# grammar_aho_ullman_2 = [
# ("S", ["X", "X"]),
# ("X", ["a", "X"]),
# ("X", ["b"]),
# ]
# gen = GenerateLR1("S", grammar_aho_ullman_2)
# table = gen.gen_table()
# print(table.format())
# parse(table, ["b", "a", "a", "b"], trace=True)
# print()
# # What happens if we do LALR to it?
# print("grammar_aho_ullman_2 (LALR):")
# gen = GenerateLALR("S", grammar_aho_ullman_2)
# table = gen.gen_table()
# print(table.format())
# print()
# # A fun LALR grammar.
# print("grammar_lalr:")
# grammar_lalr = [
# ("S", ["V", "E"]),
# ("E", ["F"]),
# ("E", ["E", "+", "F"]),
# ("F", ["V"]),
# ("F", ["int"]),
# ("F", ["(", "E", ")"]),
# ("V", ["id"]),
# ]
# gen = GenerateLALR("S", grammar_lalr)
# table = gen.gen_table()
# print(table.format())
# print()
# if __name__ == "__main__":
# examples()