lrparsers/tests/test_error_recovery.py
2024-10-27 09:10:31 -07:00

400 lines
10 KiB
Python

from parser.parser import (
Grammar,
Re,
Terminal,
rule,
opt,
Assoc,
)
import parser.runtime as runtime
# Tests based on
# https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html
class LGrammar(Grammar):
start = "File"
trivia = ["BLANKS"]
# Need a little bit of disambiguation for the symbol involved.
precedence = [
(Assoc.LEFT, ["PLUS", "MINUS"]),
(Assoc.LEFT, ["STAR", "SLASH"]),
(Assoc.LEFT, ["LPAREN"]),
]
@rule
def File(self):
# TODO: Make lists easier
return self._functions
@rule
def _functions(self):
return self.Function | (self._functions + self.Function)
@rule
def Function(self):
return self.FN + self.NAME + self.ParamList + opt(self.ARROW + self.TypeExpr) + self.Block
@rule
def ParamList(self):
return self.LPAREN + opt(self._parameters) + self.RPAREN
@rule
def _parameters(self):
# NOTE: The ungrammar in the reference does not talk about commas required between parameters
# so this massages it to make them required. Commas are in the list not the param, which
# is more awkward for processing but not terminally so.
return (self.Param + opt(self.COMMA)) | (self.Param + self.COMMA + self._parameters)
@rule
def Param(self):
return self.NAME + self.COLON + self.TypeExpr
@rule
def TypeExpr(self):
return self.NAME
@rule
def Block(self):
return self.LCURLY + opt(self._statements) + self.RCURLY
@rule
def _statements(self):
return self.Stmt | self._statements + self.Stmt
@rule
def Stmt(self):
return self.StmtExpr | self.StmtLet | self.StmtReturn
@rule
def StmtExpr(self):
return self.Expr + self.SEMICOLON
@rule
def StmtLet(self):
return self.LET + self.NAME + self.EQUAL + self.Expr + self.SEMICOLON
@rule
def StmtReturn(self):
return self.RETURN + self.Expr + self.SEMICOLON
@rule
def Expr(self):
return self.ExprLiteral | self.ExprName | self.ExprParen | self.ExprBinary | self.ExprCall
@rule
def ExprLiteral(self):
return self.INT | self.TRUE | self.FALSE
@rule
def ExprName(self):
return self.NAME
@rule
def ExprParen(self):
return self.LPAREN + self.Expr + self.RPAREN
@rule
def ExprBinary(self):
return self.Expr + (self.PLUS | self.MINUS | self.STAR | self.SLASH) + self.Expr
@rule
def ExprCall(self):
return self.Expr + self.ArgList
@rule
def ArgList(self):
return self.LPAREN + opt(self._arg_star) + self.RPAREN
@rule
def _arg_star(self):
# Again, a deviation from the original. See _parameters.
return (self.Expr + opt(self.COMMA)) | (self.Expr + self.COMMA + self._arg_star)
BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
TRUE = Terminal("true")
FALSE = Terminal("false")
INT = Terminal(Re.set(("0", "9")).plus())
FN = Terminal("fn")
ARROW = Terminal("->")
COMMA = Terminal(",")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
LCURLY = Terminal("{")
RCURLY = Terminal("}")
COLON = Terminal(":")
SEMICOLON = Terminal(";")
LET = Terminal("let")
EQUAL = Terminal("=")
RETURN = Terminal("return")
PLUS = Terminal("+")
MINUS = Terminal("-")
STAR = Terminal("*")
SLASH = Terminal("/")
NAME = Terminal(
Re.seq(
Re.set(("a", "z"), ("A", "Z"), "_"),
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
),
)
L_PARSE_TABLE = LGrammar().build_table()
L_LEXER_TABLE = LGrammar().compile_lexer()
def test_matklad_one():
"""This is the motivating example from the post.
CPCT+ finds the correct sequence of tokens to resynchronize the parse.
"""
text = """
fn fib_rec(f1: u32,
fn fib(n: u32) -> u32 {
return fib_rec(1, 1, n);
}
"""
tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)
assert len(errors) > 0, "We ought to have caught at least one error"
assert tree is not None, "Gee we ought to have had *something* from this parse"
assert (
tree.format(text, ignore_error=True)
== """
File [1, 74)
Function [1, 24)
FN:'fn' [1, 3)
NAME:'fib_rec' [4, 11)
ParamList [11, 24)
LPAREN:'(' [11, 12)
Param [12, 19)
NAME:'f1' [12, 14)
COLON:':' [14, 15)
TypeExpr [16, 19)
NAME:'u32' [16, 19)
COMMA:',' [19, 20)
Function [22, 74)
FN:'fn' [22, 24)
NAME:'fib' [25, 28)
ParamList [28, 36)
LPAREN:'(' [28, 29)
Param [29, 35)
NAME:'n' [29, 30)
COLON:':' [30, 31)
TypeExpr [32, 35)
NAME:'u32' [32, 35)
RPAREN:')' [35, 36)
ARROW:'->' [37, 39)
TypeExpr [40, 43)
NAME:'u32' [40, 43)
Block [44, 74)
LCURLY:'{' [44, 45)
Stmt [48, 72)
StmtReturn [48, 72)
RETURN:'return' [48, 54)
Expr [55, 71)
ExprCall [55, 71)
Expr [55, 62)
ExprName [55, 62)
NAME:'fib_rec' [55, 62)
ArgList [62, 71)
LPAREN:'(' [62, 63)
Expr [63, 64)
ExprLiteral [63, 64)
INT:'1' [63, 64)
COMMA:',' [64, 65)
Expr [66, 67)
ExprLiteral [66, 67)
INT:'1' [66, 67)
COMMA:',' [67, 68)
Expr [69, 70)
ExprName [69, 70)
NAME:'n' [69, 70)
RPAREN:')' [70, 71)
SEMICOLON:';' [71, 72)
RCURLY:'}' [73, 74)
""".strip()
)
def test_matklad_two():
"""Second example.
CPCT+ discovers that deleting the extra comma is the right way to correct
the parse, and we get a nice parse tree with all three functions visible.
"""
text = """
fn f1(x: i32,
fn f2(x: i32,, z: i32) {}
fn f3() {}
"""
tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)
assert len(errors) > 0, "We ought to have caught at least one error"
assert tree is not None, "Gee we ought to have had *something* from this parse"
assert (
tree.format(text, ignore_error=True)
== """
File [1, 53)
Function [1, 18)
FN:'fn' [1, 3)
NAME:'f1' [4, 6)
ParamList [6, 18)
LPAREN:'(' [6, 7)
Param [7, 13)
NAME:'x' [7, 8)
COLON:':' [8, 9)
TypeExpr [10, 13)
NAME:'i32' [10, 13)
COMMA:',' [13, 14)
Function [16, 41)
FN:'fn' [16, 18)
NAME:'f2' [19, 21)
ParamList [21, 38)
LPAREN:'(' [21, 22)
Param [22, 28)
NAME:'x' [22, 23)
COLON:':' [23, 24)
TypeExpr [25, 28)
NAME:'i32' [25, 28)
COMMA:',' [28, 29)
Param [31, 37)
NAME:'z' [31, 32)
COLON:':' [32, 33)
TypeExpr [34, 37)
NAME:'i32' [34, 37)
RPAREN:')' [37, 38)
Block [39, 41)
LCURLY:'{' [39, 40)
RCURLY:'}' [40, 41)
Function [43, 53)
FN:'fn' [43, 45)
NAME:'f3' [46, 48)
ParamList [48, 50)
LPAREN:'(' [48, 49)
RPAREN:')' [49, 50)
Block [51, 53)
LCURLY:'{' [51, 52)
RCURLY:'}' [52, 53)
""".strip()
)
def test_matklad_three():
"""Third example.
CPCT+ just... resynchronizes perfectly. I didn't have to do any kind of
grammar tweaking at all.
"""
text = """
fn f() {
g(1,
let x =
}
fn g() {}
"""
# TODO: Error reporting here is wild.
tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)
assert len(errors) > 0, "We ought to have caught at least one error"
assert tree is not None, "Gee we ought to have had *something* from this parse"
assert (
tree.format(text, ignore_error=True)
== """
File [1, 39)
Function [1, 28)
FN:'fn' [1, 3)
NAME:'f' [4, 5)
ParamList [5, 7)
LPAREN:'(' [5, 6)
RPAREN:')' [6, 7)
Block [8, 28)
LCURLY:'{' [8, 9)
Stmt [12, 22)
StmtExpr [12, 22)
Expr [12, 22)
ExprCall [12, 22)
Expr [12, 13)
ExprName [12, 13)
NAME:'g' [12, 13)
ArgList [13, 22)
LPAREN:'(' [13, 14)
Expr [14, 15)
ExprLiteral [14, 15)
INT:'1' [14, 15)
COMMA:',' [15, 16)
Stmt [19, 28)
StmtLet [19, 28)
LET:'let' [19, 22)
NAME:'x' [23, 24)
EQUAL:'=' [25, 26)
RCURLY:'}' [27, 28)
Function [30, 39)
FN:'fn' [30, 32)
NAME:'g' [33, 34)
ParamList [34, 36)
LPAREN:'(' [34, 35)
RPAREN:')' [35, 36)
Block [37, 39)
LCURLY:'{' [37, 38)
RCURLY:'}' [38, 39)
""".strip()
)
def test_matklad_four():
"""Fourth example.
Again, CPCT+ resynchronizes the tree. (Funny enough, it synchronizes by
completing that broken `let` into `let x = 1 + FALSE;` which, sure, why
not?)
"""
text = """
fn f() {
let x = 1 +
let y = 2
}
"""
# TODO: Error reporting here is weird.
tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)
assert len(errors) > 0, "We ought to have caught at least one error"
assert tree is not None, "Gee we ought to have had *something* from this parse"
assert (
tree.format(text, ignore_error=True)
== """
File [1, 37)
Function [1, 37)
FN:'fn' [1, 3)
NAME:'f' [4, 5)
ParamList [5, 7)
LPAREN:'(' [5, 6)
RPAREN:')' [6, 7)
Block [8, 37)
LCURLY:'{' [8, 9)
Stmt [12, 29)
StmtLet [12, 29)
LET:'let' [12, 15)
NAME:'x' [16, 17)
EQUAL:'=' [18, 19)
Expr [20, 29)
ExprBinary [20, 29)
Expr [20, 21)
ExprLiteral [20, 21)
INT:'1' [20, 21)
PLUS:'+' [22, 23)
Stmt [26, 37)
StmtLet [26, 37)
LET:'let' [26, 29)
NAME:'y' [30, 31)
EQUAL:'=' [32, 33)
Expr [34, 35)
ExprLiteral [34, 35)
INT:'2' [34, 35)
RCURLY:'}' [36, 37)
""".strip()
)