from parser.parser import ( Grammar, Re, Terminal, rule, opt, Assoc, ) import parser.runtime as runtime # Tests based on # https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html class LGrammar(Grammar): start = "File" trivia = ["BLANKS"] # Need a little bit of disambiguation for the symbol involved. precedence = [ (Assoc.LEFT, ["PLUS", "MINUS"]), (Assoc.LEFT, ["STAR", "SLASH"]), (Assoc.LEFT, ["LPAREN"]), ] @rule def File(self): # TODO: Make lists easier return self._functions @rule def _functions(self): return self.Function | (self._functions + self.Function) @rule def Function(self): return self.FN + self.NAME + self.ParamList + opt(self.ARROW + self.TypeExpr) + self.Block @rule def ParamList(self): return self.LPAREN + opt(self._parameters) + self.RPAREN @rule def _parameters(self): # NOTE: The ungrammar in the reference does not talk about commas required between parameters # so this massages it to make them required. Commas are in the list not the param, which # is more awkward for processing but not terminally so. return (self.Param + opt(self.COMMA)) | (self.Param + self.COMMA + self._parameters) @rule def Param(self): return self.NAME + self.COLON + self.TypeExpr @rule def TypeExpr(self): return self.NAME @rule def Block(self): return self.LCURLY + opt(self._statements) + self.RCURLY @rule def _statements(self): return self.Stmt | self._statements + self.Stmt @rule def Stmt(self): return self.StmtExpr | self.StmtLet | self.StmtReturn @rule def StmtExpr(self): return self.Expr + self.SEMICOLON @rule def StmtLet(self): return self.LET + self.NAME + self.EQUAL + self.Expr + self.SEMICOLON @rule def StmtReturn(self): return self.RETURN + self.Expr + self.SEMICOLON @rule def Expr(self): return self.ExprLiteral | self.ExprName | self.ExprParen | self.ExprBinary | self.ExprCall @rule def ExprLiteral(self): return self.INT | self.TRUE | self.FALSE @rule def ExprName(self): return self.NAME @rule def ExprParen(self): return self.LPAREN + self.Expr + self.RPAREN @rule def ExprBinary(self): return self.Expr + (self.PLUS | self.MINUS | self.STAR | self.SLASH) + self.Expr @rule def ExprCall(self): return self.Expr + self.ArgList @rule def ArgList(self): return self.LPAREN + opt(self._arg_star) + self.RPAREN @rule def _arg_star(self): # Again, a deviation from the original. See _parameters. return (self.Expr + opt(self.COMMA)) | (self.Expr + self.COMMA + self._arg_star) BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) TRUE = Terminal("true") FALSE = Terminal("false") INT = Terminal(Re.set(("0", "9")).plus()) FN = Terminal("fn") ARROW = Terminal("->") COMMA = Terminal(",") LPAREN = Terminal("(") RPAREN = Terminal(")") LCURLY = Terminal("{") RCURLY = Terminal("}") COLON = Terminal(":") SEMICOLON = Terminal(";") LET = Terminal("let") EQUAL = Terminal("=") RETURN = Terminal("return") PLUS = Terminal("+") MINUS = Terminal("-") STAR = Terminal("*") SLASH = Terminal("/") NAME = Terminal( Re.seq( Re.set(("a", "z"), ("A", "Z"), "_"), Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), ), ) L_PARSE_TABLE = LGrammar().build_table() L_LEXER_TABLE = LGrammar().compile_lexer() def test_matklad_one(): """This is the motivating example from the post. CPCT+ finds the correct sequence of tokens to resynchronize the parse. """ text = """ fn fib_rec(f1: u32, fn fib(n: u32) -> u32 { return fib_rec(1, 1, n); } """ tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text) assert len(errors) > 0, "We ought to have caught at least one error" assert tree is not None, "Gee we ought to have had *something* from this parse" assert ( tree.format(text, ignore_error=True) == """ File [1, 74) Function [1, 24) FN:'fn' [1, 3) NAME:'fib_rec' [4, 11) ParamList [11, 24) LPAREN:'(' [11, 12) Param [12, 19) NAME:'f1' [12, 14) COLON:':' [14, 15) TypeExpr [16, 19) NAME:'u32' [16, 19) COMMA:',' [19, 20) Function [22, 74) FN:'fn' [22, 24) NAME:'fib' [25, 28) ParamList [28, 36) LPAREN:'(' [28, 29) Param [29, 35) NAME:'n' [29, 30) COLON:':' [30, 31) TypeExpr [32, 35) NAME:'u32' [32, 35) RPAREN:')' [35, 36) ARROW:'->' [37, 39) TypeExpr [40, 43) NAME:'u32' [40, 43) Block [44, 74) LCURLY:'{' [44, 45) Stmt [48, 72) StmtReturn [48, 72) RETURN:'return' [48, 54) Expr [55, 71) ExprCall [55, 71) Expr [55, 62) ExprName [55, 62) NAME:'fib_rec' [55, 62) ArgList [62, 71) LPAREN:'(' [62, 63) Expr [63, 64) ExprLiteral [63, 64) INT:'1' [63, 64) COMMA:',' [64, 65) Expr [66, 67) ExprLiteral [66, 67) INT:'1' [66, 67) COMMA:',' [67, 68) Expr [69, 70) ExprName [69, 70) NAME:'n' [69, 70) RPAREN:')' [70, 71) SEMICOLON:';' [71, 72) RCURLY:'}' [73, 74) """.strip() ) def test_matklad_two(): """Second example. CPCT+ discovers that deleting the extra comma is the right way to correct the parse, and we get a nice parse tree with all three functions visible. """ text = """ fn f1(x: i32, fn f2(x: i32,, z: i32) {} fn f3() {} """ tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text) assert len(errors) > 0, "We ought to have caught at least one error" assert tree is not None, "Gee we ought to have had *something* from this parse" assert ( tree.format(text, ignore_error=True) == """ File [1, 53) Function [1, 18) FN:'fn' [1, 3) NAME:'f1' [4, 6) ParamList [6, 18) LPAREN:'(' [6, 7) Param [7, 13) NAME:'x' [7, 8) COLON:':' [8, 9) TypeExpr [10, 13) NAME:'i32' [10, 13) COMMA:',' [13, 14) Function [16, 41) FN:'fn' [16, 18) NAME:'f2' [19, 21) ParamList [21, 38) LPAREN:'(' [21, 22) Param [22, 28) NAME:'x' [22, 23) COLON:':' [23, 24) TypeExpr [25, 28) NAME:'i32' [25, 28) COMMA:',' [28, 29) Param [31, 37) NAME:'z' [31, 32) COLON:':' [32, 33) TypeExpr [34, 37) NAME:'i32' [34, 37) RPAREN:')' [37, 38) Block [39, 41) LCURLY:'{' [39, 40) RCURLY:'}' [40, 41) Function [43, 53) FN:'fn' [43, 45) NAME:'f3' [46, 48) ParamList [48, 50) LPAREN:'(' [48, 49) RPAREN:')' [49, 50) Block [51, 53) LCURLY:'{' [51, 52) RCURLY:'}' [52, 53) """.strip() ) def test_matklad_three(): """Third example. CPCT+ just... resynchronizes perfectly. I didn't have to do any kind of grammar tweaking at all. """ text = """ fn f() { g(1, let x = } fn g() {} """ tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text) assert len(errors) > 0, "We ought to have caught at least one error" assert tree is not None, "Gee we ought to have had *something* from this parse" assert ( tree.format(text, ignore_error=True) == """ File [1, 39) Function [1, 28) FN:'fn' [1, 3) NAME:'f' [4, 5) ParamList [5, 7) LPAREN:'(' [5, 6) RPAREN:')' [6, 7) Block [8, 28) LCURLY:'{' [8, 9) Stmt [12, 22) StmtExpr [12, 22) Expr [12, 22) ExprCall [12, 22) Expr [12, 13) ExprName [12, 13) NAME:'g' [12, 13) ArgList [13, 22) LPAREN:'(' [13, 14) Expr [14, 15) ExprLiteral [14, 15) INT:'1' [14, 15) COMMA:',' [15, 16) Stmt [19, 28) StmtLet [19, 28) LET:'let' [19, 22) NAME:'x' [23, 24) EQUAL:'=' [25, 26) RCURLY:'}' [27, 28) Function [30, 39) FN:'fn' [30, 32) NAME:'g' [33, 34) ParamList [34, 36) LPAREN:'(' [34, 35) RPAREN:')' [35, 36) Block [37, 39) LCURLY:'{' [37, 38) RCURLY:'}' [38, 39) """.strip() ) def test_matklad_four(): """Fourth example. Again, CPCT+ resynchronizes the tree. (Funny enough, it synchronizes by completing that broken `let` into `let x = 1 + FALSE;` which, sure, why not?) """ text = """ fn f() { let x = 1 + let y = 2 } """ tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text) assert len(errors) > 0, "We ought to have caught at least one error" assert tree is not None, "Gee we ought to have had *something* from this parse" assert ( tree.format(text, ignore_error=True) == """ File [1, 37) Function [1, 37) FN:'fn' [1, 3) NAME:'f' [4, 5) ParamList [5, 7) LPAREN:'(' [5, 6) RPAREN:')' [6, 7) Block [8, 37) LCURLY:'{' [8, 9) Stmt [12, 29) StmtLet [12, 29) LET:'let' [12, 15) NAME:'x' [16, 17) EQUAL:'=' [18, 19) Expr [20, 29) ExprBinary [20, 29) Expr [20, 21) ExprLiteral [20, 21) INT:'1' [20, 21) PLUS:'+' [22, 23) Stmt [26, 37) StmtLet [26, 37) LET:'let' [26, 29) NAME:'y' [30, 31) EQUAL:'=' [32, 33) Expr [34, 35) ExprLiteral [34, 35) INT:'2' [34, 35) RCURLY:'}' [36, 37) """.strip() )