lrparsers/tests/test_error_recovery.py
John Doty 5064a768e7 [all] A whole new style for grammars
Say good by to the sea of `self.`!
2024-11-09 11:21:30 -08:00

403 lines
9.4 KiB
Python

from parser.parser import (
Grammar,
Re,
Terminal,
rule,
opt,
Assoc,
)
import parser.runtime as runtime
# Tests based on
# https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html

# Terminals of the small language exercised by the tests in this file.

# Whitespace: one or more of space, tab, CR, LF. Passed as `trivia` when the
# grammar is constructed.
BLANKS = Terminal("BLANKS", Re.set(" ", "\t", "\r", "\n").plus())

# Literals.
TRUE = Terminal("TRUE", "true")
FALSE = Terminal("FALSE", "false")
# NOTE(review): a 2-tuple passed to Re.set is presumably an inclusive
# character range ("0" through "9") — confirm against Re.set.
INT = Terminal("INT", Re.set(("0", "9")).plus())

# Keywords and punctuation, each matched as a fixed string.
FN = Terminal("FN", "fn")
ARROW = Terminal("ARROW", "->")
COMMA = Terminal("COMMA", ",")
LPAREN = Terminal("LPAREN", "(")
RPAREN = Terminal("RPAREN", ")")
LCURLY = Terminal("LCURLY", "{")
RCURLY = Terminal("RCURLY", "}")
COLON = Terminal("COLON", ":")
SEMICOLON = Terminal("SEMICOLON", ";")
LET = Terminal("LET", "let")
EQUAL = Terminal("EQUAL", "=")
RETURN = Terminal("RETURN", "return")

# Binary operators.
PLUS = Terminal("PLUS", "+")
MINUS = Terminal("MINUS", "-")
STAR = Terminal("STAR", "*")
SLASH = Terminal("SLASH", "/")

# Identifiers: a first character from the first set, followed by zero or more
# characters from the second set (which additionally admits digits).
NAME = Terminal(
    "NAME",
    Re.seq(
        Re.set(("a", "z"), ("A", "Z"), "_"),
        Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
    ),
)
@rule
def File():
    # Start symbol of the grammar (passed as `start=File` to Grammar): a file
    # is whatever `_functions` matches, i.e. one or more Function.
    # TODO: Make lists easier
    return _functions
@rule
def _functions():
    # Left-recursive list: a single Function, or a shorter list followed by
    # one more Function.
    # NOTE(review): no `_functions` node appears in the expected trees in this
    # file, so underscore-prefixed rules are presumably elided from the
    # output — confirm against the `rule` decorator.
    single = Function
    longer = _functions + Function
    return single | longer
@rule
def Function():
    # `fn` NAME ParamList, an optional `-> TypeExpr` return type, then the
    # body Block.
    signature = FN + NAME + ParamList
    return signature + opt(ARROW + TypeExpr) + Block
@rule
def ParamList():
    # "(" then an optional run of parameters, closed by ")".
    opened = LPAREN + opt(_parameters)
    return opened + RPAREN
@rule
def _parameters():
    # NOTE: The ungrammar in the reference says nothing about commas between
    # parameters; this version is massaged so that they are required. The
    # commas live in the list rather than in Param, which is more awkward
    # for processing but not terminally so.
    #
    # Either the last parameter (with an optional trailing comma), or a
    # parameter, a comma, and the rest of the list.
    last_param = Param + opt(COMMA)
    param_then_rest = Param + COMMA + _parameters
    return last_param | param_then_rest
@rule
def Param():
    # One parameter: NAME, a colon, then its type.
    labelled = NAME + COLON
    return labelled + TypeExpr
@rule
def TypeExpr():
    # A type is just a bare NAME (e.g. `u32` / `i32` in the test inputs).
    return NAME
@rule
def Block():
    # "{" then an optional run of statements, closed by "}".
    opened = LCURLY + opt(_statements)
    return opened + RCURLY
@rule
def _statements():
    # Left-recursive list of one or more Stmt. The parentheses only spell out
    # the grouping Python already applies (`+` binds tighter than `|`).
    return Stmt | (_statements + Stmt)
@rule
def Stmt():
    # A statement is exactly one of the three statement forms.
    return (
        StmtExpr
        | StmtLet
        | StmtReturn
    )
@rule
def StmtExpr():
    # Expression statement: an Expr terminated by a semicolon.
    return Expr + SEMICOLON
@rule
def StmtLet():
    # `let` NAME `=` Expr `;`
    binding = LET + NAME + EQUAL
    return binding + Expr + SEMICOLON
@rule
def StmtReturn():
    # `return` Expr `;` — the returned expression is mandatory.
    returned = RETURN + Expr
    return returned + SEMICOLON
@rule
def Expr():
    # An expression is one of the five expression forms. ExprBinary and
    # ExprCall recurse back into Expr on their left-hand side.
    return (
        ExprLiteral
        | ExprName
        | ExprParen
        | ExprBinary
        | ExprCall
    )
@rule
def ExprLiteral():
    # Literal values: an integer or one of the two boolean keywords.
    return (
        INT
        | TRUE
        | FALSE
    )
@rule
def ExprName():
    # A bare NAME used as an expression (e.g. `n` or the callee `fib_rec` in
    # the test inputs).
    return NAME
@rule
def ExprParen():
    # A parenthesised expression: "(" Expr ")".
    opened = LPAREN + Expr
    return opened + RPAREN
@rule
def ExprBinary():
    # Expr, one of the four arithmetic operators, Expr. Ambiguous as written;
    # the grammar's precedence table is what disambiguates it.
    binary_op = PLUS | MINUS | STAR | SLASH
    return Expr + binary_op + Expr
@rule
def ExprCall():
    # A call: the callee expression followed by its argument list. The rule
    # is left-recursive through Expr.
    # NOTE(review): presumably the LPAREN entry in the grammar's precedence
    # table is what resolves the resulting conflict — confirm.
    return Expr + ArgList
@rule
def ArgList():
    # "(" then an optional run of argument expressions, closed by ")".
    opened = LPAREN + opt(_arg_star)
    return opened + RPAREN
@rule
def _arg_star():
    # Again, a deviation from the original reference grammar: commas between
    # arguments are required and belong to the list (same shape as
    # `_parameters`).
    #
    # Either the last argument (with an optional trailing comma), or an
    # argument, a comma, and the rest of the list.
    last_arg = Expr + opt(COMMA)
    arg_then_rest = Expr + COMMA + _arg_star
    return last_arg | arg_then_rest
# The grammar under test: parsing starts at File, and BLANKS is handed to the
# grammar as trivia rather than appearing in any rule.
LGrammar = Grammar(
    start=File,
    trivia=[BLANKS],
    # Need a little bit of disambiguation for the symbol involved.
    # Expr is ambiguous as written (ExprBinary and ExprCall both recurse
    # through Expr on the left), so the operators and the call parenthesis
    # are all declared left-associative here.
    # NOTE(review): the rows are presumably ordered from loosest to tightest
    # binding (+/- then */ then call) — confirm against Grammar's docs.
    precedence = [
        (Assoc.LEFT, [PLUS, MINUS]),
        (Assoc.LEFT, [STAR, SLASH]),
        (Assoc.LEFT, [LPAREN]),
    ],
)

# Built once at import time; every test in this module reuses these tables.
L_PARSE_TABLE = LGrammar.build_table()
L_LEXER_TABLE = LGrammar.compile_lexer()
def test_matklad_one():
    """Recover from the unterminated parameter list in the post's motivating example.

    The first function never closes its parameter list. CPCT+ finds the
    correct sequence of tokens to resynchronize the parse, so both functions
    still show up in the resulting tree.
    """
    # NOTE(review): the expected spans only line up if `text` has one extra
    # character (a blank line) before the second `fn` at [22, 24) and two
    # characters of leading whitespace before `return` at [48, 54). The
    # literals below show neither, so whitespace was probably lost in this
    # copy of the file (the same may apply to the expected tree) — restore
    # from the repository before relying on this test.
    text = """
fn fib_rec(f1: u32,
fn fib(n: u32) -> u32 {
return fib_rec(1, 1, n);
}
"""
    expected = """
File [1, 74)
Function [1, 24)
FN:'fn' [1, 3)
NAME:'fib_rec' [4, 11)
ParamList [11, 24)
LPAREN:'(' [11, 12)
Param [12, 19)
NAME:'f1' [12, 14)
COLON:':' [14, 15)
TypeExpr [16, 19)
NAME:'u32' [16, 19)
COMMA:',' [19, 20)
Function [22, 74)
FN:'fn' [22, 24)
NAME:'fib' [25, 28)
ParamList [28, 36)
LPAREN:'(' [28, 29)
Param [29, 35)
NAME:'n' [29, 30)
COLON:':' [30, 31)
TypeExpr [32, 35)
NAME:'u32' [32, 35)
RPAREN:')' [35, 36)
ARROW:'->' [37, 39)
TypeExpr [40, 43)
NAME:'u32' [40, 43)
Block [44, 74)
LCURLY:'{' [44, 45)
Stmt [48, 72)
StmtReturn [48, 72)
RETURN:'return' [48, 54)
Expr [55, 71)
ExprCall [55, 71)
Expr [55, 62)
ExprName [55, 62)
NAME:'fib_rec' [55, 62)
ArgList [62, 71)
LPAREN:'(' [62, 63)
Expr [63, 64)
ExprLiteral [63, 64)
INT:'1' [63, 64)
COMMA:',' [64, 65)
Expr [66, 67)
ExprLiteral [66, 67)
INT:'1' [66, 67)
COMMA:',' [67, 68)
Expr [69, 70)
ExprName [69, 70)
NAME:'n' [69, 70)
RPAREN:')' [70, 71)
SEMICOLON:';' [71, 72)
RCURLY:'}' [73, 74)
""".strip()

    tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)

    # The input is deliberately broken, so a clean parse would itself be a bug.
    assert len(errors) > 0, "We ought to have caught at least one error"
    assert tree is not None, "Gee we ought to have had *something* from this parse"
    assert tree.format(text, ignore_error=True) == expected
def test_matklad_two():
    """Second example: an unterminated parameter list plus a doubled comma.

    CPCT+ discovers that deleting the extra comma is the right way to correct
    the parse, and we get a nice parse tree with all three functions visible.
    """
    # NOTE(review): the expected spans only line up if `text` has one extra
    # character (a blank line) before the second `fn` at [16, 18) and before
    # the third `fn` at [43, 45). The literal below shows neither, so
    # whitespace was probably lost in this copy of the file (the same may
    # apply to the expected tree) — restore from the repository before
    # relying on this test.
    text = """
fn f1(x: i32,
fn f2(x: i32,, z: i32) {}
fn f3() {}
"""
    expected = """
File [1, 53)
Function [1, 18)
FN:'fn' [1, 3)
NAME:'f1' [4, 6)
ParamList [6, 18)
LPAREN:'(' [6, 7)
Param [7, 13)
NAME:'x' [7, 8)
COLON:':' [8, 9)
TypeExpr [10, 13)
NAME:'i32' [10, 13)
COMMA:',' [13, 14)
Function [16, 41)
FN:'fn' [16, 18)
NAME:'f2' [19, 21)
ParamList [21, 38)
LPAREN:'(' [21, 22)
Param [22, 28)
NAME:'x' [22, 23)
COLON:':' [23, 24)
TypeExpr [25, 28)
NAME:'i32' [25, 28)
COMMA:',' [28, 29)
Param [31, 37)
NAME:'z' [31, 32)
COLON:':' [32, 33)
TypeExpr [34, 37)
NAME:'i32' [34, 37)
RPAREN:')' [37, 38)
Block [39, 41)
LCURLY:'{' [39, 40)
RCURLY:'}' [40, 41)
Function [43, 53)
FN:'fn' [43, 45)
NAME:'f3' [46, 48)
ParamList [48, 50)
LPAREN:'(' [48, 49)
RPAREN:')' [49, 50)
Block [51, 53)
LCURLY:'{' [51, 52)
RCURLY:'}' [52, 53)
""".strip()

    tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)

    # The input is deliberately broken, so a clean parse would itself be a bug.
    assert len(errors) > 0, "We ought to have caught at least one error"
    assert tree is not None, "Gee we ought to have had *something* from this parse"
    assert tree.format(text, ignore_error=True) == expected
def test_matklad_three():
    """Third example: an unfinished call followed by an unfinished `let`.

    CPCT+ just... resynchronizes perfectly, with no grammar tweaking needed
    at all; both statements and the following function appear in the tree.
    """
    # NOTE(review): the expected spans only line up if `text` has two
    # characters of leading whitespace before `g` at [12, 13) and `let` at
    # [19, 22), and one extra character (a blank line) before the second `fn`
    # at [30, 32). The literal below shows none of these, so whitespace was
    # probably lost in this copy of the file (the same may apply to the
    # expected tree) — restore from the repository before relying on this
    # test.
    text = """
fn f() {
g(1,
let x =
}
fn g() {}
"""
    expected = """
File [1, 39)
Function [1, 28)
FN:'fn' [1, 3)
NAME:'f' [4, 5)
ParamList [5, 7)
LPAREN:'(' [5, 6)
RPAREN:')' [6, 7)
Block [8, 28)
LCURLY:'{' [8, 9)
Stmt [12, 22)
StmtExpr [12, 22)
Expr [12, 22)
ExprCall [12, 22)
Expr [12, 13)
ExprName [12, 13)
NAME:'g' [12, 13)
ArgList [13, 22)
LPAREN:'(' [13, 14)
Expr [14, 15)
ExprLiteral [14, 15)
INT:'1' [14, 15)
COMMA:',' [15, 16)
Stmt [19, 28)
StmtLet [19, 28)
LET:'let' [19, 22)
NAME:'x' [23, 24)
EQUAL:'=' [25, 26)
RCURLY:'}' [27, 28)
Function [30, 39)
FN:'fn' [30, 32)
NAME:'g' [33, 34)
ParamList [34, 36)
LPAREN:'(' [34, 35)
RPAREN:')' [35, 36)
Block [37, 39)
LCURLY:'{' [37, 38)
RCURLY:'}' [38, 39)
""".strip()

    # TODO: Error reporting here is wild.
    tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)

    # The input is deliberately broken, so a clean parse would itself be a bug.
    assert len(errors) > 0, "We ought to have caught at least one error"
    assert tree is not None, "Gee we ought to have had *something* from this parse"
    assert tree.format(text, ignore_error=True) == expected
def test_matklad_four():
    """Fourth example: a dangling `+` and a `let` with no semicolon.

    Again, CPCT+ resynchronizes the tree. (Funnily enough, it synchronizes by
    completing the broken `let` into `let x = 1 + FALSE;` which, sure, why
    not?)
    """
    # NOTE(review): the expected spans only line up if `text` has two
    # characters of leading whitespace before each `let` (at [12, 15) and
    # [26, 29)). The literal below shows none, so whitespace was probably
    # lost in this copy of the file (the same may apply to the expected
    # tree) — restore from the repository before relying on this test.
    text = """
fn f() {
let x = 1 +
let y = 2
}
"""
    expected = """
File [1, 37)
Function [1, 37)
FN:'fn' [1, 3)
NAME:'f' [4, 5)
ParamList [5, 7)
LPAREN:'(' [5, 6)
RPAREN:')' [6, 7)
Block [8, 37)
LCURLY:'{' [8, 9)
Stmt [12, 29)
StmtLet [12, 29)
LET:'let' [12, 15)
NAME:'x' [16, 17)
EQUAL:'=' [18, 19)
Expr [20, 29)
ExprBinary [20, 29)
Expr [20, 21)
ExprLiteral [20, 21)
INT:'1' [20, 21)
PLUS:'+' [22, 23)
Stmt [26, 37)
StmtLet [26, 37)
LET:'let' [26, 29)
NAME:'y' [30, 31)
EQUAL:'=' [32, 33)
Expr [34, 35)
ExprLiteral [34, 35)
INT:'2' [34, 35)
RCURLY:'}' [36, 37)
""".strip()

    # TODO: Error reporting here is weird.
    tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)

    # The input is deliberately broken, so a clean parse would itself be a bug.
    assert len(errors) > 0, "We ought to have caught at least one error"
    assert tree is not None, "Gee we ought to have had *something* from this parse"
    assert tree.format(text, ignore_error=True) == expected