diff --git a/grammar.py b/grammar.py new file mode 100644 index 0000000..abb1b04 --- /dev/null +++ b/grammar.py @@ -0,0 +1,418 @@ +import parser_faster +import sys +import typing + +from parser_faster import Assoc + +class Token: + value: str + + def __init__(self, value): + self.value = sys.intern(value) + +Symbol = Token | str + +def desugar( + grammar: dict[str, list[list[Symbol]]], + precedence: list[typing.Tuple[Assoc, list[Symbol]]], +): + nonterminal_refs = set() + nonterminals = set() + terminals = set() + + result: list[typing.Tuple[str, list[str]]] = [] + for (k, v) in grammar.items(): + nonterminals.add(k) + + for rule in v: + assert isinstance(rule, list) + result_rule: list[str] = [] + for symbol in rule: + if isinstance(symbol, Token): + result_rule.append(symbol.value) + terminals.add(symbol.value) + else: + result_rule.append(symbol) + nonterminal_refs.add(symbol) + + result.append((k, result_rule)) + + unknown_rules = nonterminal_refs - nonterminals + if len(unknown_rules) > 0: + undefined = "\n ".join(unknown_rules) + raise Exception(f"The following rules are not defined:\n {undefined}") + + overlap_rules = nonterminals & terminals + if len(overlap_rules) > 0: + overlap = "\n ".join(overlap_rules) + raise Exception(f"The following symbols are both tokens and rules:\n {overlap}") + + result_precedence = { + (symbol.value if isinstance(symbol, Token) else symbol):(associativity, precedence + 1) + for precedence, (associativity, symbols) in enumerate(precedence) + for symbol in symbols + } + + return result, result_precedence + +def dump_yacc(grammar): + tokens = set() + for rules in grammar.values(): + for rule in rules: + for symbol in rule: + if symbol.startswith("token:"): + symbol = symbol[6:].upper() + tokens.add(symbol) + for token in sorted(tokens): + print(f"%token {token}") + + print() + print("%%") + + for name, rules in grammar.items(): + print(f"{name} : ", end=''); + for i,rule in enumerate(rules): + if i != 0: + print(f"{' ' * len(name)} | ", end='') + + parts = [] + for symbol in rule: + if symbol.startswith("token:"): + symbol = symbol[6:].upper() + parts.append(symbol) + print(' '.join(parts)) + print() + + print("%%") + + +ARROW = Token("Arrow") +AS = Token("As") +BAR = Token("Bar") +CLASS = Token("Class") +COLON = Token("Colon") +ELSE = Token("Else") +FOR = Token("For") +FUN = Token("Fun") +IDENTIFIER = Token("Identifier") +IF = Token("If") +IMPORT = Token("Import") +IN = Token("In") +LCURLY = Token("LeftBrace") +LET = Token("Let") +RCURLY = Token("RightBrace") +RETURN = Token("Return") +SEMICOLON = Token("Semicolon") +STRING = Token("String") +WHILE = Token("While") +EQUAL = Token("Equal") +LPAREN = Token("LeftParen") +RPAREN = Token("RightParen") +COMMA = Token("Comma") +SELF = Token("Selff") +OR = Token("Or") +IS = Token("Is") +AND = Token("And") +EQUALEQUAL = Token("EqualEqual") +BANGEQUAL = Token("BangEqual") +LESS = Token("Less") +GREATER = Token("Greater") +LESSEQUAL = Token("LessEqual") +GREATEREQUAL = Token("GreaterEqual") +PLUS = Token("Plus") +MINUS = Token("Minus") +STAR = Token("Star") +SLASH = Token("Slash") +NUMBER = Token("Number") +TRUE = Token("True") +FALSE = Token("False") +BANG = Token("Bang") +DOT = Token("Dot") +MATCH = Token("Match") +EXPORT = Token("Export") +UNDERSCORE = Token("Underscore") +NEW = Token("New") + +# fmt: off +precedence = [ + (Assoc.RIGHT, [EQUAL]), + (Assoc.LEFT, [OR]), + (Assoc.LEFT, [IS]), + (Assoc.LEFT, [AND]), + (Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]), + (Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]), + (Assoc.LEFT, [PLUS, MINUS]), + (Assoc.LEFT, [STAR, SLASH]), + (Assoc.LEFT, [LPAREN]), + (Assoc.LEFT, [DOT]), + + # If there's a confusion about whether to make an IF statement or an + # expression, prefer the statement. + (Assoc.NONE, ["IfStatement"]), +] + +grammar = { + "File": [ + ["FileStatementList"], + ], + "FileStatementList": [ + ["FileStatement"], + ["FileStatement", "FileStatementList"], + ], + "FileStatement": [ + ["ImportStatement"], + ["ClassDeclaration"], + ["ExportStatement"], + ["Statement"], + ], + + "ImportStatement": [ + [IMPORT, STRING, AS, IDENTIFIER, SEMICOLON], + ], + + # Classes + "ClassDeclaration": [ + [CLASS, IDENTIFIER, "ClassBody"], + ], + "ClassBody": [ + [LCURLY, RCURLY], + [LCURLY, "ClassMembers", RCURLY], + ], + "ClassMembers": [ + ["ClassMember"], + ["ClassMembers", "ClassMember"], + ], + "ClassMember": [ + ["FieldDeclaration"], + ["FunctionDeclaration"], + ], + "FieldDeclaration": [ + [IDENTIFIER, COLON, "TypeExpression", SEMICOLON], + ], + + # Types + "TypeExpression": [ + ["AlternateType"], + ["TypeIdentifier"], + ], + "AlternateType": [ + ["TypeExpression", BAR, "TypeIdentifier"], + ], + "TypeIdentifier": [ + [IDENTIFIER], + ], + + "ExportStatement": [ + [EXPORT, "ClassDeclaration"], + [EXPORT, "FunctionDeclaration"], + # [EXPORT, "LetStatement"], + [EXPORT, "ExportList", SEMICOLON], + ], + "ExportList": [ + [], + [IDENTIFIER], + [IDENTIFIER, COMMA, "ExportList"], + ], + + # Functions + "FunctionDeclaration": [ + [FUN, IDENTIFIER, "FunctionParameters", "Block"], + [FUN, IDENTIFIER, "FunctionParameters", ARROW, "TypeExpression", "Block"], + ], + "FunctionParameters": [ + [LPAREN, RPAREN], + [LPAREN, "FirstParameter", RPAREN], + [LPAREN, "FirstParameter", COMMA, "ParameterList", RPAREN], + ], + "FirstParameter": [ + [SELF], + ["Parameter"], + ], + "ParameterList": [ + [], + ["Parameter"], + ["Parameter", COMMA, "ParameterList"], + ], + "Parameter": [ + [IDENTIFIER, COLON, "TypeExpression"], + ], + + # Block + "Block": [ + [LCURLY, RCURLY], + [LCURLY, "StatementList", RCURLY], + [LCURLY, "StatementList", "Expression", RCURLY], + ], + "StatementList": [ + ["Statement"], + ["StatementList", "Statement"], + ], + + "Statement": [ + ["FunctionDeclaration"], + ["LetStatement"], + # ["ReturnStatement"], + # ["ForStatement"], + ["IfStatement"], + # ["WhileStatement"], + # ["ExpressionStatement"], + ], + + "LetStatement": [ + [LET, IDENTIFIER, EQUAL, "Expression", SEMICOLON], + ], + + # "ReturnStatement": [ + # [RETURN, "Expression", SEMICOLON], + # ], + + # "ForStatement": [ + # [FOR, "IteratorVariable", IN, "Expression", "Block"], + # ], + # "IteratorVariable": [[IDENTIFIER]], + + "IfStatement": [["ConditionalExpression"]], + + # "WhileStatement": [ + # [WHILE, "Expression", "Block"], + # ], + + # "ExpressionStatement": [ + # ["Expression", SEMICOLON], + # ], + + # Expressions + "Expression": [["AssignmentExpression"]], + + "AssignmentExpression": [ + ["OrExpression", EQUAL, "AssignmentExpression"], + ["OrExpression"], + ], + "OrExpression": [ + ["OrExpression", OR, "IsExpression"], + ["IsExpression"], + ], + "IsExpression": [ + # ["IsExpression", IS, "Pattern"], + ["AndExpression"], + ], + "AndExpression": [ + ["AndExpression", AND, "EqualityExpression"], + ["EqualityExpression"], + ], + "EqualityExpression": [ + ["EqualityExpression", EQUALEQUAL, "RelationExpression"], + ["EqualityExpression", BANGEQUAL, "RelationExpression"], + ["RelationExpression"], + ], + "RelationExpression": [ + ["RelationExpression", LESS, "AdditiveExpression"], + ["RelationExpression", LESSEQUAL, "AdditiveExpression"], + ["RelationExpression", GREATER, "AdditiveExpression"], + ["RelationExpression", GREATEREQUAL, "AdditiveExpression"], + ["AdditiveExpression"], + ], + "AdditiveExpression": [ + ["AdditiveExpression", PLUS, "MultiplicationExpression"], + ["AdditiveExpression", MINUS, "MultiplicationExpression"], + ["MultiplicationExpression"], + ], + "MultiplicationExpression": [ + ["MultiplicationExpression", STAR, "PrimaryExpression"], + ["MultiplicationExpression", SLASH, "PrimaryExpression"], + ["PrimaryExpression"], + ], + "PrimaryExpression": [ + [IDENTIFIER], + [SELF], + [NUMBER], + [STRING], + [TRUE], + [FALSE], + [BANG, "PrimaryExpression"], + [MINUS, "PrimaryExpression"], + + ["Block"], + ["ConditionalExpression"], + # ["ListConstructorExpression"], + # ["ObjectConstructorExpression"], + # ["MatchExpression"], + + # ["PrimaryExpression", LPAREN, "ExpressionList", RPAREN], + # ["PrimaryExpression", DOT, IDENTIFIER], + + [LPAREN, "Expression", RPAREN], + ], + + "ConditionalExpression": [ + [IF, "Expression", "Block"], + [IF, "Expression", "Block", ELSE, "ConditionalExpression"], + [IF, "Expression", "Block", ELSE, "Block"], + ], + + # "ListConstructorExpression": [ + # [LCURLY, "ExpressionList", RCURLY], + # ], + + # "ExpressionList": [ + # [], + # ["Expression"], + # ["Expression", COMMA, "ExpressionList"], + # ], + + # # Match Expression + # "MatchExpression": [ + # [MATCH, "MatchBody"], + # ], + # "MatchBody": [ + # [LCURLY, "MatchArms", RCURLY], + # ], + # "MatchArms": [ + # [], + # ["MatchArm"], + # ["MatchArm", COMMA, "MatchArms"], + # ], + # "MatchArm": [ + # ["Pattern", ARROW, "Expression"], + # ], + + # # Pattern + # "Pattern": [ + # ["VariableBinding", "PatternCore", AND, "AndExpression"], + # ["VariableBinding", "PatternCore"], + # ["PatternCore", AND, "AndExpression"], + # ["PatternCore"], + # ], + # "PatternCore": [ + # ["TypeExpression"], + # ["WildcardPattern"], + # ], + # "WildcardPattern": [[UNDERSCORE]], + # "VariableBinding": [[IDENTIFIER, COLON]], + + # # Object Constructor + # "ObjectConstructorExpression": [ + # [NEW, "TypeIdentifier", "FieldList"], + # ], + # "FieldList": [ + # [LCURLY, "FieldValues", RCURLY], + # ], + # "FieldValues": [ + # [], + # ["FieldValue"], + # ["FieldValue", COMMA, "FieldValues"], + # ], + # "FieldValue": [ + # [IDENTIFIER], + # [IDENTIFIER, COLON, "Expression"], + # ], +} +# fmt: on + +# dump_yacc(grammar) +grammar, precedence = desugar(grammar, precedence) +gen = parser_faster.GenerateLR1("File", grammar, precedence=precedence) +table = gen.gen_table() +print(parser_faster.format_table(gen, table)) +print() +# tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"])