From ccdda886ffb96c3584bc13f4db4dc11373758fb7 Mon Sep 17 00:00:00 2001
From: John Doty
Date: Sun, 22 Sep 2024 08:49:33 -0700
Subject: [PATCH] [parser] Actually add the new tests

Whoops, missed a file.
---
 tests/test_error_recovery.py | 405 +++++++++++++++++++++++++++++++++++
 1 file changed, 405 insertions(+)
 create mode 100644 tests/test_error_recovery.py

diff --git a/tests/test_error_recovery.py b/tests/test_error_recovery.py
new file mode 100644
index 0000000..8e4d5df
--- /dev/null
+++ b/tests/test_error_recovery.py
@@ -0,0 +1,405 @@
+from parser.parser import (
+    Grammar,
+    Re,
+    Terminal,
+    rule,
+    opt,
+    Assoc,
+)
+import parser.runtime as runtime
+
+
+# Tests based on
+# https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html
+#
+# LGrammar adapts the grammar from that tutorial. Rules whose names start with
+# an underscore (`_functions`, `_parameters`, ...) never appear as nodes in the
+# expected trees below; their children hang directly off the enclosing node.
+class LGrammar(Grammar):
+    start = "File"
+    trivia = ["BLANKS"]
+
+    # Need a little bit of disambiguation for the symbols involved.
+    precedence = [
+        (Assoc.LEFT, ["PLUS", "MINUS"]),
+        (Assoc.LEFT, ["STAR", "SLASH"]),
+        (Assoc.LEFT, ["LPAREN"]),
+    ]
+
+    @rule
+    def File(self):
+        # TODO: Make lists easier
+        return self._functions
+
+    @rule
+    def _functions(self):
+        return self.Function | (self._functions + self.Function)
+
+    @rule
+    def Function(self):
+        return self.FN + self.NAME + self.ParamList + opt(self.ARROW + self.TypeExpr) + self.Block
+
+    @rule
+    def ParamList(self):
+        return self.LPAREN + opt(self._parameters) + self.RPAREN
+
+    @rule
+    def _parameters(self):
+        # NOTE: The ungrammar in the reference does not talk about commas required between parameters
+        #       so this massages it to make them required. Commas are in the list not the param, which
+        #       is more awkward for processing but not terminally so.
+        return (self.Param + opt(self.COMMA)) | (self.Param + self.COMMA + self._parameters)
+
+    @rule
+    def Param(self):
+        return self.NAME + self.COLON + self.TypeExpr
+
+    @rule
+    def TypeExpr(self):
+        return self.NAME
+
+    @rule
+    def Block(self):
+        return self.LCURLY + opt(self._statements) + self.RCURLY
+
+    @rule
+    def _statements(self):
+        return self.Stmt | self._statements + self.Stmt
+
+    @rule
+    def Stmt(self):
+        return self.StmtExpr | self.StmtLet | self.StmtReturn
+
+    @rule
+    def StmtExpr(self):
+        return self.Expr + self.SEMICOLON
+
+    @rule
+    def StmtLet(self):
+        return self.LET + self.NAME + self.EQUAL + self.Expr + self.SEMICOLON
+
+    @rule
+    def StmtReturn(self):
+        return self.RETURN + self.Expr + self.SEMICOLON
+
+    @rule
+    def Expr(self):
+        return self.ExprLiteral | self.ExprName | self.ExprParen | self.ExprBinary | self.ExprCall
+
+    @rule
+    def ExprLiteral(self):
+        return self.INT | self.TRUE | self.FALSE
+
+    @rule
+    def ExprName(self):
+        return self.NAME
+
+    @rule
+    def ExprParen(self):
+        return self.LPAREN + self.Expr + self.RPAREN
+
+    @rule
+    def ExprBinary(self):
+        return self.Expr + (self.PLUS | self.MINUS | self.STAR | self.SLASH) + self.Expr
+
+    @rule
+    def ExprCall(self):
+        return self.Expr + self.ArgList
+
+    @rule
+    def ArgList(self):
+        return self.LPAREN + opt(self._arg_star) + self.RPAREN
+
+    @rule
+    def _arg_star(self):
+        # Again, a deviation from the original. See _parameters.
+        return (self.Expr + opt(self.COMMA)) | (self.Expr + self.COMMA + self._arg_star)
+
+    BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
+
+    TRUE = Terminal("true")
+    FALSE = Terminal("false")
+    INT = Terminal(Re.set(("0", "9")).plus())
+    FN = Terminal("fn")
+    ARROW = Terminal("->")
+    COMMA = Terminal(",")
+    LPAREN = Terminal("(")
+    RPAREN = Terminal(")")
+    LCURLY = Terminal("{")
+    RCURLY = Terminal("}")
+    COLON = Terminal(":")
+    SEMICOLON = Terminal(";")
+    LET = Terminal("let")
+    EQUAL = Terminal("=")
+    RETURN = Terminal("return")
+    PLUS = Terminal("+")
+    MINUS = Terminal("-")
+    STAR = Terminal("*")
+    SLASH = Terminal("/")
+
+    NAME = Terminal(
+        Re.seq(
+            Re.set(("a", "z"), ("A", "Z"), "_"),
+            Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
+        ),
+    )
+
+
+# Both tables are built once at import time and shared by every test below.
+L_PARSE_TABLE = LGrammar().build_table()
+L_LEXER_TABLE = LGrammar().compile_lexer()
+
+
+def test_matklad_one():
+    """This is the motivating example from the post.
+
+    CPCT+ finds the correct sequence of tokens to resynchronize the parse.
+    """
+    # `text` starts with a newline, so the first token sits at offset 1; every
+    # [start, end) span in the expected tree is an offset into `text`.
+    text = """
+fn fib_rec(f1: u32,
+
+fn fib(n: u32) -> u32 {
+  return fib_rec(1, 1, n);
+}
+"""
+    tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)
+    assert len(errors) > 0, "We ought to have caught at least one error"
+    assert tree is not None, "Gee we ought to have had *something* from this parse"
+    assert (
+        tree.format(text, ignore_error=True)
+        == """
+File [1, 74)
+  Function [1, 24)
+    FN:'fn' [1, 3)
+    NAME:'fib_rec' [4, 11)
+    ParamList [11, 24)
+      LPAREN:'(' [11, 12)
+      Param [12, 19)
+        NAME:'f1' [12, 14)
+        COLON:':' [14, 15)
+        TypeExpr [16, 19)
+          NAME:'u32' [16, 19)
+      COMMA:',' [19, 20)
+  Function [22, 74)
+    FN:'fn' [22, 24)
+    NAME:'fib' [25, 28)
+    ParamList [28, 36)
+      LPAREN:'(' [28, 29)
+      Param [29, 35)
+        NAME:'n' [29, 30)
+        COLON:':' [30, 31)
+        TypeExpr [32, 35)
+          NAME:'u32' [32, 35)
+      RPAREN:')' [35, 36)
+    ARROW:'->' [37, 39)
+    TypeExpr [40, 43)
+      NAME:'u32' [40, 43)
+    Block [44, 74)
+      LCURLY:'{' [44, 45)
+      Stmt [48, 72)
+        StmtReturn [48, 72)
+          RETURN:'return' [48, 54)
+          Expr [55, 71)
+            ExprCall [55, 71)
+              Expr [55, 62)
+                ExprName [55, 62)
+                  NAME:'fib_rec' [55, 62)
+              ArgList [62, 71)
+                LPAREN:'(' [62, 63)
+                Expr [63, 64)
+                  ExprLiteral [63, 64)
+                    INT:'1' [63, 64)
+                COMMA:',' [64, 65)
+                Expr [66, 67)
+                  ExprLiteral [66, 67)
+                    INT:'1' [66, 67)
+                COMMA:',' [67, 68)
+                Expr [69, 70)
+                  ExprName [69, 70)
+                    NAME:'n' [69, 70)
+                RPAREN:')' [70, 71)
+          SEMICOLON:';' [71, 72)
+      RCURLY:'}' [73, 74)
+    """.strip()
+    )
+
+
+def test_matklad_two():
+    """Second example.
+
+    CPCT+ discovers that deleting the extra comma is the right way to correct
+    the parse, and we get a nice parse tree with all three functions visible.
+    """
+    text = """
+fn f1(x: i32,
+
+fn f2(x: i32,, z: i32) {}
+
+fn f3() {}
+"""
+    tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)
+    assert len(errors) > 0, "We ought to have caught at least one error"
+    assert tree is not None, "Gee we ought to have had *something* from this parse"
+    assert (
+        tree.format(text, ignore_error=True)
+        == """
+File [1, 53)
+  Function [1, 18)
+    FN:'fn' [1, 3)
+    NAME:'f1' [4, 6)
+    ParamList [6, 18)
+      LPAREN:'(' [6, 7)
+      Param [7, 13)
+        NAME:'x' [7, 8)
+        COLON:':' [8, 9)
+        TypeExpr [10, 13)
+          NAME:'i32' [10, 13)
+      COMMA:',' [13, 14)
+  Function [16, 41)
+    FN:'fn' [16, 18)
+    NAME:'f2' [19, 21)
+    ParamList [21, 38)
+      LPAREN:'(' [21, 22)
+      Param [22, 28)
+        NAME:'x' [22, 23)
+        COLON:':' [23, 24)
+        TypeExpr [25, 28)
+          NAME:'i32' [25, 28)
+      COMMA:',' [28, 29)
+      Param [31, 37)
+        NAME:'z' [31, 32)
+        COLON:':' [32, 33)
+        TypeExpr [34, 37)
+          NAME:'i32' [34, 37)
+      RPAREN:')' [37, 38)
+    Block [39, 41)
+      LCURLY:'{' [39, 40)
+      RCURLY:'}' [40, 41)
+  Function [43, 53)
+    FN:'fn' [43, 45)
+    NAME:'f3' [46, 48)
+    ParamList [48, 50)
+      LPAREN:'(' [48, 49)
+      RPAREN:')' [49, 50)
+    Block [51, 53)
+      LCURLY:'{' [51, 52)
+      RCURLY:'}' [52, 53)
+    """.strip()
+    )
+
+
+def test_matklad_three():
+    """Third example.
+
+    CPCT+ just... resynchronizes perfectly. I didn't have to do any kind of
+    grammar tweaking at all.
+    """
+
+    text = """
+fn f() {
+  g(1,
+  let x =
+}
+
+fn g() {}
+"""
+    tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)
+    assert len(errors) > 0, "We ought to have caught at least one error"
+    assert tree is not None, "Gee we ought to have had *something* from this parse"
+    assert (
+        tree.format(text, ignore_error=True)
+        == """
+File [1, 39)
+  Function [1, 28)
+    FN:'fn' [1, 3)
+    NAME:'f' [4, 5)
+    ParamList [5, 7)
+      LPAREN:'(' [5, 6)
+      RPAREN:')' [6, 7)
+    Block [8, 28)
+      LCURLY:'{' [8, 9)
+      Stmt [12, 22)
+        StmtExpr [12, 22)
+          Expr [12, 22)
+            ExprCall [12, 22)
+              Expr [12, 13)
+                ExprName [12, 13)
+                  NAME:'g' [12, 13)
+              ArgList [13, 22)
+                LPAREN:'(' [13, 14)
+                Expr [14, 15)
+                  ExprLiteral [14, 15)
+                    INT:'1' [14, 15)
+                COMMA:',' [15, 16)
+      Stmt [19, 28)
+        StmtLet [19, 28)
+          LET:'let' [19, 22)
+          NAME:'x' [23, 24)
+          EQUAL:'=' [25, 26)
+      RCURLY:'}' [27, 28)
+  Function [30, 39)
+    FN:'fn' [30, 32)
+    NAME:'g' [33, 34)
+    ParamList [34, 36)
+      LPAREN:'(' [34, 35)
+      RPAREN:')' [35, 36)
+    Block [37, 39)
+      LCURLY:'{' [37, 38)
+      RCURLY:'}' [38, 39)
+    """.strip()
+    )
+
+
+def test_matklad_four():
+    """Fourth example.
+
+    Again, CPCT+ resynchronizes the tree. (Funny enough, it synchronizes by
+    completing that broken `let` into `let x = 1 + FALSE;` which, sure, why
+    not?)
+    """
+
+    text = """
+fn f() {
+  let x = 1 +
+  let y = 2
+}
+"""
+    tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)
+    assert len(errors) > 0, "We ought to have caught at least one error"
+    assert tree is not None, "Gee we ought to have had *something* from this parse"
+    assert (
+        tree.format(text, ignore_error=True)
+        == """
+File [1, 37)
+  Function [1, 37)
+    FN:'fn' [1, 3)
+    NAME:'f' [4, 5)
+    ParamList [5, 7)
+      LPAREN:'(' [5, 6)
+      RPAREN:')' [6, 7)
+    Block [8, 37)
+      LCURLY:'{' [8, 9)
+      Stmt [12, 29)
+        StmtLet [12, 29)
+          LET:'let' [12, 15)
+          NAME:'x' [16, 17)
+          EQUAL:'=' [18, 19)
+          Expr [20, 29)
+            ExprBinary [20, 29)
+              Expr [20, 21)
+                ExprLiteral [20, 21)
+                  INT:'1' [20, 21)
+              PLUS:'+' [22, 23)
+      Stmt [26, 37)
+        StmtLet [26, 37)
+          LET:'let' [26, 29)
+          NAME:'y' [30, 31)
+          EQUAL:'=' [32, 33)
+          Expr [34, 35)
+            ExprLiteral [34, 35)
+              INT:'2' [34, 35)
+      RCURLY:'}' [36, 37)
+    """.strip()
+    )