lrparsers/tests/test_grammar.py
John Doty b3b2102864 Record trivia in tokens
This will make our formatting better I think.
2024-09-12 06:22:49 -07:00

439 lines
9.9 KiB
Python

import pytest
import parser
import parser.runtime as runtime
from parser import Grammar, seq, rule, Terminal
class Tokens:
    """Minimal in-memory token stream used to feed the parser in these tests.

    Every terminal passed to the constructor is stored as a
    ``(terminal, 0, 0)`` triple; the list of lines is always empty.
    """

    def __init__(self, *toks: Terminal):
        # The two trailing zeros are placeholders; presumably a start offset
        # and a length, which these tests never inspect -- TODO confirm.
        self._tokens = [(terminal, 0, 0) for terminal in toks]
        self._lines = []

    def tokens(self):
        """Return the stored list of ``(terminal, 0, 0)`` triples."""
        return self._tokens

    def lines(self):
        """Return the stored (empty) list of lines."""
        return self._lines
def _tree(treeform) -> runtime.Tree | runtime.TokenValue:
    """Expand a compact tuple/string description into runtime tree objects.

    A string becomes a ``runtime.TokenValue`` built from that string, two
    zeros and two empty lists.  A tuple becomes a ``runtime.Tree`` whose name
    is the tuple's first element and whose children are the remaining
    elements, each converted recursively.  Start and end are always 0.
    """
    if isinstance(treeform, str):
        # Leaf: the two empty lists look like the trivia slots used by the
        # trivia tests in this file -- TODO confirm against runtime.TokenValue.
        return runtime.TokenValue(treeform, 0, 0, [], [])

    assert isinstance(treeform, tuple)
    name = treeform[0]
    assert isinstance(name, str)

    children = tuple(map(_tree, treeform[1:]))
    return runtime.Tree(name=name, start=0, end=0, children=children)
def test_lr0_lr0():
    """An LR0 grammar should build with the LR0 generator and parse cleanly."""

    class G(Grammar):
        start = "E"
        generator = parser.GenerateLR0

        @rule
        def E(self):
            addition = seq(self.E, self.PLUS, self.T)
            return addition | self.T

        @rule
        def T(self):
            parenthesized = seq(self.LPAREN, self.E, self.RPAREN)
            return parenthesized | self.IDENTIFIER

        PLUS = Terminal("+", name="+")
        LPAREN = Terminal("(", name="(")
        RPAREN = Terminal(")", name=")")
        IDENTIFIER = Terminal("id", name="id")

    table = G().build_table()
    lr_parser = runtime.Parser(table)

    # Token stream for:  id + ( id )
    stream = Tokens(G.IDENTIFIER, G.PLUS, G.LPAREN, G.IDENTIFIER, G.RPAREN)
    tree, errors = lr_parser.parse(stream)

    assert errors == []

    expected = _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")")))
    assert tree == expected
def test_lr0_shift_reduce():
    """A shift/reduce conflict must be rejected by LR0 but accepted by SLR1."""

    class G(Grammar):
        start = "E"
        generator = parser.GenerateLR0

        @rule
        def E(self):
            addition = seq(self.E, self.PLUS, self.T)
            return addition | self.T

        @rule
        def T(self):
            parenthesized = seq(self.LPAREN, self.E, self.RPAREN)
            simple = parenthesized | self.IDENTIFIER
            indexed = seq(self.IDENTIFIER, self.LSQUARE, self.E, self.RSQUARE)
            return simple | indexed

        PLUS = Terminal("+")
        LPAREN = Terminal("(")
        RPAREN = Terminal(")")
        LSQUARE = Terminal("[")
        RSQUARE = Terminal("]")
        IDENTIFIER = Terminal("id")

    # The class-level generator (LR0) is expected to report the conflict.
    with pytest.raises(parser.AmbiguityError):
        G().build_table()

    # Overriding the generator with SLR1 must build without raising.
    G().build_table(generator=parser.GenerateSLR1)
def test_lr0_reduce_reduce():
    """A grammar with a reduce/reduce conflict must be rejected by LR0."""

    class G(Grammar):
        start = "E"
        generator = parser.GenerateLR0

        @rule
        def E(self):
            addition = seq(self.E, self.PLUS, self.T)
            addition_or_term = addition | self.T
            assignment = seq(self.V, self.EQUAL, self.E)
            return addition_or_term | assignment

        @rule
        def T(self):
            parenthesized = seq(self.LPAREN, self.E, self.RPAREN)
            return parenthesized | self.IDENTIFIER

        @rule
        def V(self):
            # Both T and V can produce a bare IDENTIFIER.
            return self.IDENTIFIER

        PLUS = Terminal("+")
        EQUAL = Terminal("=")
        LPAREN = Terminal("(")
        RPAREN = Terminal(")")
        IDENTIFIER = Terminal("id")

    with pytest.raises(parser.AmbiguityError):
        G().build_table()
def test_lr0_empty():
    """LR0 cannot handle an empty production: it can't tell when to reduce."""

    class G(Grammar):
        start = "E"
        generator = parser.GenerateLR0

        @rule
        def E(self):
            return seq(self.F, self.BOOP)

        @rule
        def F(self):
            # F is either a BEEP or nothing at all.
            return self.BEEP | parser.Nothing

        BOOP = Terminal("boop")
        BEEP = Terminal("beep")

    with pytest.raises(parser.AmbiguityError):
        G().build_table()
def test_grammar_aho_ullman_1():
    """SLR1 must reject this grammar while LR1 builds it.

    Grammar: ``S -> L = R | R``, ``L -> * R | id``, ``R -> L``.
    Presumably the Aho & Ullman textbook example the name refers to.
    """

    class G(Grammar):
        start = "S"
        generator = parser.GenerateSLR1

        @rule
        def S(self):
            assignment = seq(self.L, self.EQUAL, self.R)
            return assignment | self.R

        @rule
        def L(self):
            dereference = seq(self.STAR, self.R)
            return dereference | self.ID

        @rule
        def R(self):
            return self.L

        EQUAL = Terminal("=")
        STAR = Terminal("*")
        ID = Terminal("id")

    # The class-level generator (SLR1) is expected to report an ambiguity.
    with pytest.raises(parser.AmbiguityError):
        G().build_table()

    # The same grammar must build when LR1 is requested explicitly.
    G().build_table(generator=parser.GenerateLR1)
def test_grammar_aho_ullman_2():
    """``S -> X X``, ``X -> a X | b`` must build with SLR1, LR1 and LALR."""

    class TestGrammar(Grammar):
        start = "S"
        generator = parser.GenerateSLR1

        @rule
        def S(self):
            return seq(self.X, self.X)

        @rule
        def X(self):
            prefixed = seq(self.A, self.X)
            return prefixed | self.B

        A = Terminal("a")
        B = Terminal("b")

    # First with the class-level generator (SLR1), then the explicit ones.
    TestGrammar().build_table()
    for explicit_generator in (parser.GenerateLR1, parser.GenerateLALR):
        TestGrammar().build_table(generator=explicit_generator)
def test_fun_lalr():
    """A small expression grammar must build with the LALR generator."""

    class TestGrammar(Grammar):
        start = "S"
        generator = parser.GenerateLALR

        @rule
        def S(self):
            return seq(self.V, self.E)

        @rule
        def E(self):
            return self.F | seq(self.E, self.PLUS, self.F)

        @rule
        def F(self):
            atom = self.V | self.INT
            return atom | seq(self.LPAREN, self.E, self.RPAREN)

        @rule
        def V(self):
            return self.ID

        PLUS = Terminal("+")
        INT = Terminal("int")
        ID = Terminal("id")
        LPAREN = Terminal("(")
        RPAREN = Terminal(")")

    # Passing means build_table() completes without raising.
    TestGrammar().build_table()
def test_conflicting_names():
    """Terminals and nonterminals cannot have the same name.

    I think that ultimately this gives a nicer experience, in error messages
    and understandability. The input grammar can distinguish between them
    throughout, and the system can always be unambiguous when it's working,
    but at times it needs to report errors or display the grammar to humans.
    There is no clean notation I can use at that time to distinguish between
    a terminal and a nonterminal.

    I think this restriction ultimately makes the grammars and the tooling
    easier to understand.
    """

    class TestGrammar(Grammar):
        start = "IDENTIFIER"

        # The rule is explicitly named "IDENTIFIER", the same name as the
        # terminal attribute below.
        @rule("IDENTIFIER")
        def identifier(self):
            return self.IDENTIFIER

        IDENTIFIER = Terminal("Identifier")

    # Construction stays inside the block: the ValueError may come from
    # either the constructor or build_table().
    with pytest.raises(ValueError):
        TestGrammar().build_table()
def test_grammar_ignore_trivia():
    """Trivia tokens are kept out of the tree but recorded on the WORD tokens.

    BLANK is declared as trivia by name.  In the expected tree, BLANK never
    appears as a child node; it only appears inside the two list arguments of
    the WORD token values.
    """

    class G(Grammar):
        start = "sentence"
        trivia = ["BLANK"]

        @rule
        def sentence(self):
            return self.WORD | seq(self.sentence, self.WORD)

        WORD = Terminal("blah")
        BLANK = Terminal(" ")

    def blank():
        # A fresh value with fresh lists for every use, exactly as separate
        # hand-written literals would be.
        return runtime.TokenValue("BLANK", 0, 0, [], [])

    table = G().build_table()
    assert "BLANK" in table.trivia

    lr_parser = runtime.Parser(table)
    stream = Tokens(G.WORD, G.BLANK, G.WORD, G.BLANK)
    tree, errors = lr_parser.parse(stream)

    assert errors == []

    # The two lists look like leading/trailing trivia -- TODO confirm.
    # First WORD: nothing in the first list, one BLANK in the second.
    first_word = runtime.TokenValue("WORD", 0, 0, [], [blank()])
    # Second WORD: one BLANK in each list.
    second_word = runtime.TokenValue("WORD", 0, 0, [blank()], [blank()])

    inner_sentence = runtime.Tree("sentence", 0, 0, (first_word,))
    assert tree == runtime.Tree("sentence", 0, 0, (inner_sentence, second_word))
def test_grammar_unknown_trivia():
    """Naming a trivia terminal the grammar does not define is a ValueError."""

    class G(Grammar):
        start = "sentence"
        # No BLANK terminal is defined anywhere in this grammar.
        trivia = ["BLANK"]

        @rule
        def sentence(self):
            single = self.WORD
            longer = seq(self.sentence, self.WORD)
            return single | longer

        WORD = Terminal("blah")

    # Construction stays inside the block: the error may be raised by the
    # constructor or by build_table().
    with pytest.raises(ValueError):
        G().build_table()
def test_grammar_trivia_symbol():
    """Trivia may be declared with the Terminal object itself, not its name."""

    class G(Grammar):
        start = "sentence"

        @rule
        def sentence(self):
            single = self.WORD
            longer = seq(self.sentence, self.WORD)
            return single | longer

        WORD = Terminal("blah")
        BLANK = Terminal(" ")

        # Must come after BLANK, since it refers to the object directly.
        trivia = [BLANK]

    trivia_names = G().build_table().trivia
    assert "BLANK" in trivia_names
def test_grammar_trivia_constructor():
    """Trivia may be passed to the Grammar constructor as Terminal objects."""

    class G(Grammar):
        start = "sentence"

        def __init__(self):
            super().__init__(trivia=[self.BLANK])

        @rule
        def sentence(self):
            single = self.WORD
            longer = seq(self.sentence, self.WORD)
            return single | longer

        WORD = Terminal("blah")
        BLANK = Terminal(" ")

    trivia_names = G().build_table().trivia
    assert "BLANK" in trivia_names
def test_grammar_trivia_constructor_string():
    """Trivia may be passed to the Grammar constructor as terminal names."""

    class G(Grammar):
        start = "sentence"

        def __init__(self):
            super().__init__(trivia=["BLANK"])

        @rule
        def sentence(self):
            single = self.WORD
            longer = seq(self.sentence, self.WORD)
            return single | longer

        WORD = Terminal("blah")
        BLANK = Terminal(" ")

    trivia_names = G().build_table().trivia
    assert "BLANK" in trivia_names
def test_grammar_trivia_constructor_string_unknown():
    """An unknown trivia name given to the constructor is a ValueError."""

    class G(Grammar):
        start = "sentence"

        def __init__(self):
            # No BLANK terminal is defined anywhere in this grammar.
            super().__init__(trivia=["BLANK"])

        @rule
        def sentence(self):
            single = self.WORD
            longer = seq(self.sentence, self.WORD)
            return single | longer

        WORD = Terminal("blah")

    # Construction stays inside the block: the error may be raised by the
    # constructor or by build_table().
    with pytest.raises(ValueError):
        G().build_table()
def test_grammar_name_implicit():
    """With no explicit name, ``FooGrammar`` reports its name as ``"foo"``.

    Presumably derived from the class name (lower-cased, ``Grammar`` suffix
    removed) -- confirm against the Grammar implementation.
    """

    class FooGrammar(Grammar):
        start = "x"

        @rule
        def x(self):
            return self.WORD

        WORD = Terminal("blah")

    grammar = FooGrammar()
    assert grammar.name == "foo"
def test_grammar_name_explicit_member():
    """A class-level ``name`` attribute is what the grammar reports."""

    class FooGrammar(Grammar):
        start = "x"
        name = "bar"

        @rule
        def x(self):
            return self.WORD

        WORD = Terminal("blah")

    grammar = FooGrammar()
    assert grammar.name == "bar"
def test_grammar_name_explicit_constructor():
    """A ``name`` passed to the constructor wins over the class-level one."""

    class FooGrammar(Grammar):
        start = "x"
        name = "bar"

        def __init__(self):
            # The constructor argument "baz" competes with the "bar" above.
            super().__init__(name="baz")

        @rule
        def x(self):
            return self.WORD

        WORD = Terminal("blah")

    grammar = FooGrammar()
    assert grammar.name == "baz"