403 lines
9.4 KiB
Python
403 lines
9.4 KiB
Python
from parser.parser import (
|
|
Grammar,
|
|
Re,
|
|
Terminal,
|
|
rule,
|
|
opt,
|
|
Assoc,
|
|
)
|
|
import parser.runtime as runtime
|
|
|
|
|
|
# Tests based on
|
|
# https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html
|
|
|
|
BLANKS = Terminal("BLANKS", Re.set(" ", "\t", "\r", "\n").plus())
|
|
|
|
TRUE = Terminal("TRUE", "true")
|
|
FALSE = Terminal("FALSE", "false")
|
|
INT = Terminal("INT", Re.set(("0", "9")).plus())
|
|
FN = Terminal("FN", "fn")
|
|
ARROW = Terminal("ARROW", "->")
|
|
COMMA = Terminal("COMMA", ",")
|
|
LPAREN = Terminal("LPAREN", "(")
|
|
RPAREN = Terminal("RPAREN", ")")
|
|
LCURLY = Terminal("LCURLY", "{")
|
|
RCURLY = Terminal("RCURLY", "}")
|
|
COLON = Terminal("COLON", ":")
|
|
SEMICOLON = Terminal("SEMICOLON", ";")
|
|
LET = Terminal("LET", "let")
|
|
EQUAL = Terminal("EQUAL", "=")
|
|
RETURN = Terminal("RETURN", "return")
|
|
PLUS = Terminal("PLUS", "+")
|
|
MINUS = Terminal("MINUS", "-")
|
|
STAR = Terminal("STAR", "*")
|
|
SLASH = Terminal("SLASH", "/")
|
|
|
|
NAME = Terminal(
|
|
"NAME",
|
|
Re.seq(
|
|
Re.set(("a", "z"), ("A", "Z"), "_"),
|
|
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
|
|
),
|
|
)
|
|
|
|
|
|
@rule
|
|
def File():
|
|
# TODO: Make lists easier
|
|
return _functions
|
|
|
|
@rule
|
|
def _functions():
|
|
return Function | (_functions + Function)
|
|
|
|
@rule
|
|
def Function():
|
|
return FN + NAME + ParamList + opt(ARROW + TypeExpr) + Block
|
|
|
|
@rule
|
|
def ParamList():
|
|
return LPAREN + opt(_parameters) + RPAREN
|
|
|
|
@rule
|
|
def _parameters():
|
|
# NOTE: The ungrammar in the reference does not talk about commas
|
|
# required between parameters so this massages it to make them
|
|
# required. Commas are in the list not the param, which is more
|
|
# awkward for processing but not terminally so.
|
|
return (Param + opt(COMMA)) | (Param + COMMA + _parameters)
|
|
|
|
@rule
|
|
def Param():
|
|
return NAME + COLON + TypeExpr
|
|
|
|
@rule
|
|
def TypeExpr():
|
|
return NAME
|
|
|
|
@rule
|
|
def Block():
|
|
return LCURLY + opt(_statements) + RCURLY
|
|
|
|
@rule
|
|
def _statements():
|
|
return Stmt | _statements + Stmt
|
|
|
|
@rule
|
|
def Stmt():
|
|
return StmtExpr | StmtLet | StmtReturn
|
|
|
|
@rule
|
|
def StmtExpr():
|
|
return Expr + SEMICOLON
|
|
|
|
@rule
|
|
def StmtLet():
|
|
return LET + NAME + EQUAL + Expr + SEMICOLON
|
|
|
|
@rule
|
|
def StmtReturn():
|
|
return RETURN + Expr + SEMICOLON
|
|
|
|
@rule
|
|
def Expr():
|
|
return ExprLiteral | ExprName | ExprParen | ExprBinary | ExprCall
|
|
|
|
@rule
|
|
def ExprLiteral():
|
|
return INT | TRUE | FALSE
|
|
|
|
@rule
|
|
def ExprName():
|
|
return NAME
|
|
|
|
@rule
|
|
def ExprParen():
|
|
return LPAREN + Expr + RPAREN
|
|
|
|
@rule
|
|
def ExprBinary():
|
|
return Expr + (PLUS | MINUS | STAR | SLASH) + Expr
|
|
|
|
@rule
|
|
def ExprCall():
|
|
return Expr + ArgList
|
|
|
|
@rule
|
|
def ArgList():
|
|
return LPAREN + opt(_arg_star) + RPAREN
|
|
|
|
@rule
|
|
def _arg_star():
|
|
# Again, a deviation from the original. See _parameters.
|
|
return (Expr + opt(COMMA)) | (Expr + COMMA + _arg_star)
|
|
|
|
LGrammar = Grammar(
|
|
start=File,
|
|
trivia=[BLANKS],
|
|
# Need a little bit of disambiguation for the symbol involved.
|
|
precedence = [
|
|
(Assoc.LEFT, [PLUS, MINUS]),
|
|
(Assoc.LEFT, [STAR, SLASH]),
|
|
(Assoc.LEFT, [LPAREN]),
|
|
],
|
|
)
|
|
|
|
L_PARSE_TABLE = LGrammar.build_table()
|
|
L_LEXER_TABLE = LGrammar.compile_lexer()
|
|
|
|
|
|
def test_matklad_one():
|
|
"""This is the motivating example from the post.
|
|
|
|
CPCT+ finds the correct sequence of tokens to resynchronize the parse.
|
|
"""
|
|
text = """
|
|
fn fib_rec(f1: u32,
|
|
|
|
fn fib(n: u32) -> u32 {
|
|
return fib_rec(1, 1, n);
|
|
}
|
|
"""
|
|
tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)
|
|
assert len(errors) > 0, "We ought to have caught at least one error"
|
|
assert tree is not None, "Gee we ought to have had *something* from this parse"
|
|
assert (
|
|
tree.format(text, ignore_error=True)
|
|
== """
|
|
File [1, 74)
|
|
Function [1, 24)
|
|
FN:'fn' [1, 3)
|
|
NAME:'fib_rec' [4, 11)
|
|
ParamList [11, 24)
|
|
LPAREN:'(' [11, 12)
|
|
Param [12, 19)
|
|
NAME:'f1' [12, 14)
|
|
COLON:':' [14, 15)
|
|
TypeExpr [16, 19)
|
|
NAME:'u32' [16, 19)
|
|
COMMA:',' [19, 20)
|
|
Function [22, 74)
|
|
FN:'fn' [22, 24)
|
|
NAME:'fib' [25, 28)
|
|
ParamList [28, 36)
|
|
LPAREN:'(' [28, 29)
|
|
Param [29, 35)
|
|
NAME:'n' [29, 30)
|
|
COLON:':' [30, 31)
|
|
TypeExpr [32, 35)
|
|
NAME:'u32' [32, 35)
|
|
RPAREN:')' [35, 36)
|
|
ARROW:'->' [37, 39)
|
|
TypeExpr [40, 43)
|
|
NAME:'u32' [40, 43)
|
|
Block [44, 74)
|
|
LCURLY:'{' [44, 45)
|
|
Stmt [48, 72)
|
|
StmtReturn [48, 72)
|
|
RETURN:'return' [48, 54)
|
|
Expr [55, 71)
|
|
ExprCall [55, 71)
|
|
Expr [55, 62)
|
|
ExprName [55, 62)
|
|
NAME:'fib_rec' [55, 62)
|
|
ArgList [62, 71)
|
|
LPAREN:'(' [62, 63)
|
|
Expr [63, 64)
|
|
ExprLiteral [63, 64)
|
|
INT:'1' [63, 64)
|
|
COMMA:',' [64, 65)
|
|
Expr [66, 67)
|
|
ExprLiteral [66, 67)
|
|
INT:'1' [66, 67)
|
|
COMMA:',' [67, 68)
|
|
Expr [69, 70)
|
|
ExprName [69, 70)
|
|
NAME:'n' [69, 70)
|
|
RPAREN:')' [70, 71)
|
|
SEMICOLON:';' [71, 72)
|
|
RCURLY:'}' [73, 74)
|
|
""".strip()
|
|
)
|
|
|
|
|
|
def test_matklad_two():
|
|
"""Second example.
|
|
|
|
CPCT+ discovers that deleting the extra comma is the right way to correct
|
|
the parse, and we get a nice parse tree with all three functions visible.
|
|
"""
|
|
text = """
|
|
fn f1(x: i32,
|
|
|
|
fn f2(x: i32,, z: i32) {}
|
|
|
|
fn f3() {}
|
|
"""
|
|
tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)
|
|
assert len(errors) > 0, "We ought to have caught at least one error"
|
|
assert tree is not None, "Gee we ought to have had *something* from this parse"
|
|
assert (
|
|
tree.format(text, ignore_error=True)
|
|
== """
|
|
File [1, 53)
|
|
Function [1, 18)
|
|
FN:'fn' [1, 3)
|
|
NAME:'f1' [4, 6)
|
|
ParamList [6, 18)
|
|
LPAREN:'(' [6, 7)
|
|
Param [7, 13)
|
|
NAME:'x' [7, 8)
|
|
COLON:':' [8, 9)
|
|
TypeExpr [10, 13)
|
|
NAME:'i32' [10, 13)
|
|
COMMA:',' [13, 14)
|
|
Function [16, 41)
|
|
FN:'fn' [16, 18)
|
|
NAME:'f2' [19, 21)
|
|
ParamList [21, 38)
|
|
LPAREN:'(' [21, 22)
|
|
Param [22, 28)
|
|
NAME:'x' [22, 23)
|
|
COLON:':' [23, 24)
|
|
TypeExpr [25, 28)
|
|
NAME:'i32' [25, 28)
|
|
COMMA:',' [28, 29)
|
|
Param [31, 37)
|
|
NAME:'z' [31, 32)
|
|
COLON:':' [32, 33)
|
|
TypeExpr [34, 37)
|
|
NAME:'i32' [34, 37)
|
|
RPAREN:')' [37, 38)
|
|
Block [39, 41)
|
|
LCURLY:'{' [39, 40)
|
|
RCURLY:'}' [40, 41)
|
|
Function [43, 53)
|
|
FN:'fn' [43, 45)
|
|
NAME:'f3' [46, 48)
|
|
ParamList [48, 50)
|
|
LPAREN:'(' [48, 49)
|
|
RPAREN:')' [49, 50)
|
|
Block [51, 53)
|
|
LCURLY:'{' [51, 52)
|
|
RCURLY:'}' [52, 53)
|
|
""".strip()
|
|
)
|
|
|
|
|
|
def test_matklad_three():
|
|
"""Third example.
|
|
|
|
CPCT+ just... resynchronizes perfectly. I didn't have to do any kind of
|
|
grammar tweaking at all.
|
|
"""
|
|
|
|
text = """
|
|
fn f() {
|
|
g(1,
|
|
let x =
|
|
}
|
|
|
|
fn g() {}
|
|
"""
|
|
# TODO: Error reporting here is wild.
|
|
tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)
|
|
assert len(errors) > 0, "We ought to have caught at least one error"
|
|
assert tree is not None, "Gee we ought to have had *something* from this parse"
|
|
assert (
|
|
tree.format(text, ignore_error=True)
|
|
== """
|
|
File [1, 39)
|
|
Function [1, 28)
|
|
FN:'fn' [1, 3)
|
|
NAME:'f' [4, 5)
|
|
ParamList [5, 7)
|
|
LPAREN:'(' [5, 6)
|
|
RPAREN:')' [6, 7)
|
|
Block [8, 28)
|
|
LCURLY:'{' [8, 9)
|
|
Stmt [12, 22)
|
|
StmtExpr [12, 22)
|
|
Expr [12, 22)
|
|
ExprCall [12, 22)
|
|
Expr [12, 13)
|
|
ExprName [12, 13)
|
|
NAME:'g' [12, 13)
|
|
ArgList [13, 22)
|
|
LPAREN:'(' [13, 14)
|
|
Expr [14, 15)
|
|
ExprLiteral [14, 15)
|
|
INT:'1' [14, 15)
|
|
COMMA:',' [15, 16)
|
|
Stmt [19, 28)
|
|
StmtLet [19, 28)
|
|
LET:'let' [19, 22)
|
|
NAME:'x' [23, 24)
|
|
EQUAL:'=' [25, 26)
|
|
RCURLY:'}' [27, 28)
|
|
Function [30, 39)
|
|
FN:'fn' [30, 32)
|
|
NAME:'g' [33, 34)
|
|
ParamList [34, 36)
|
|
LPAREN:'(' [34, 35)
|
|
RPAREN:')' [35, 36)
|
|
Block [37, 39)
|
|
LCURLY:'{' [37, 38)
|
|
RCURLY:'}' [38, 39)
|
|
""".strip()
|
|
)
|
|
|
|
|
|
def test_matklad_four():
|
|
"""Fourth example.
|
|
|
|
Again, CPCT+ resynchronizes the tree. (Funny enough, it synchronizes by
|
|
completing that broken `let` into `let x = 1 + FALSE;` which, sure, why
|
|
not?)
|
|
"""
|
|
|
|
text = """
|
|
fn f() {
|
|
let x = 1 +
|
|
let y = 2
|
|
}
|
|
"""
|
|
# TODO: Error reporting here is weird.
|
|
tree, errors = runtime.parse(L_PARSE_TABLE, L_LEXER_TABLE, text)
|
|
assert len(errors) > 0, "We ought to have caught at least one error"
|
|
assert tree is not None, "Gee we ought to have had *something* from this parse"
|
|
assert (
|
|
tree.format(text, ignore_error=True)
|
|
== """
|
|
File [1, 37)
|
|
Function [1, 37)
|
|
FN:'fn' [1, 3)
|
|
NAME:'f' [4, 5)
|
|
ParamList [5, 7)
|
|
LPAREN:'(' [5, 6)
|
|
RPAREN:')' [6, 7)
|
|
Block [8, 37)
|
|
LCURLY:'{' [8, 9)
|
|
Stmt [12, 29)
|
|
StmtLet [12, 29)
|
|
LET:'let' [12, 15)
|
|
NAME:'x' [16, 17)
|
|
EQUAL:'=' [18, 19)
|
|
Expr [20, 29)
|
|
ExprBinary [20, 29)
|
|
Expr [20, 21)
|
|
ExprLiteral [20, 21)
|
|
INT:'1' [20, 21)
|
|
PLUS:'+' [22, 23)
|
|
Stmt [26, 37)
|
|
StmtLet [26, 37)
|
|
LET:'let' [26, 29)
|
|
NAME:'y' [30, 31)
|
|
EQUAL:'=' [32, 33)
|
|
Expr [34, 35)
|
|
ExprLiteral [34, 35)
|
|
INT:'2' [34, 35)
|
|
RCURLY:'}' [36, 37)
|
|
""".strip()
|
|
)
|