faster: Snapshot a big grammar I'm playing with

This commit is contained in:
John Doty 2024-04-17 11:07:56 -07:00
parent d0be3ea267
commit 7147557e2b

418
grammar.py Normal file
View file

@ -0,0 +1,418 @@
import parser_faster
import sys
import typing
from parser_faster import Assoc
class Token:
value: str
def __init__(self, value):
self.value = sys.intern(value)
Symbol = Token | str
def desugar(
grammar: dict[str, list[list[Symbol]]],
precedence: list[typing.Tuple[Assoc, list[Symbol]]],
):
nonterminal_refs = set()
nonterminals = set()
terminals = set()
result: list[typing.Tuple[str, list[str]]] = []
for (k, v) in grammar.items():
nonterminals.add(k)
for rule in v:
assert isinstance(rule, list)
result_rule: list[str] = []
for symbol in rule:
if isinstance(symbol, Token):
result_rule.append(symbol.value)
terminals.add(symbol.value)
else:
result_rule.append(symbol)
nonterminal_refs.add(symbol)
result.append((k, result_rule))
unknown_rules = nonterminal_refs - nonterminals
if len(unknown_rules) > 0:
undefined = "\n ".join(unknown_rules)
raise Exception(f"The following rules are not defined:\n {undefined}")
overlap_rules = nonterminals & terminals
if len(overlap_rules) > 0:
overlap = "\n ".join(overlap_rules)
raise Exception(f"The following symbols are both tokens and rules:\n {overlap}")
result_precedence = {
(symbol.value if isinstance(symbol, Token) else symbol):(associativity, precedence + 1)
for precedence, (associativity, symbols) in enumerate(precedence)
for symbol in symbols
}
return result, result_precedence
def dump_yacc(grammar):
tokens = set()
for rules in grammar.values():
for rule in rules:
for symbol in rule:
if symbol.startswith("token:"):
symbol = symbol[6:].upper()
tokens.add(symbol)
for token in sorted(tokens):
print(f"%token {token}")
print()
print("%%")
for name, rules in grammar.items():
print(f"{name} : ", end='');
for i,rule in enumerate(rules):
if i != 0:
print(f"{' ' * len(name)} | ", end='')
parts = []
for symbol in rule:
if symbol.startswith("token:"):
symbol = symbol[6:].upper()
parts.append(symbol)
print(' '.join(parts))
print()
print("%%")
ARROW = Token("Arrow")
AS = Token("As")
BAR = Token("Bar")
CLASS = Token("Class")
COLON = Token("Colon")
ELSE = Token("Else")
FOR = Token("For")
FUN = Token("Fun")
IDENTIFIER = Token("Identifier")
IF = Token("If")
IMPORT = Token("Import")
IN = Token("In")
LCURLY = Token("LeftBrace")
LET = Token("Let")
RCURLY = Token("RightBrace")
RETURN = Token("Return")
SEMICOLON = Token("Semicolon")
STRING = Token("String")
WHILE = Token("While")
EQUAL = Token("Equal")
LPAREN = Token("LeftParen")
RPAREN = Token("RightParen")
COMMA = Token("Comma")
SELF = Token("Selff")
OR = Token("Or")
IS = Token("Is")
AND = Token("And")
EQUALEQUAL = Token("EqualEqual")
BANGEQUAL = Token("BangEqual")
LESS = Token("Less")
GREATER = Token("Greater")
LESSEQUAL = Token("LessEqual")
GREATEREQUAL = Token("GreaterEqual")
PLUS = Token("Plus")
MINUS = Token("Minus")
STAR = Token("Star")
SLASH = Token("Slash")
NUMBER = Token("Number")
TRUE = Token("True")
FALSE = Token("False")
BANG = Token("Bang")
DOT = Token("Dot")
MATCH = Token("Match")
EXPORT = Token("Export")
UNDERSCORE = Token("Underscore")
NEW = Token("New")
# fmt: off
precedence = [
(Assoc.RIGHT, [EQUAL]),
(Assoc.LEFT, [OR]),
(Assoc.LEFT, [IS]),
(Assoc.LEFT, [AND]),
(Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]),
(Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]),
(Assoc.LEFT, [PLUS, MINUS]),
(Assoc.LEFT, [STAR, SLASH]),
(Assoc.LEFT, [LPAREN]),
(Assoc.LEFT, [DOT]),
# If there's a confusion about whether to make an IF statement or an
# expression, prefer the statement.
(Assoc.NONE, ["IfStatement"]),
]
grammar = {
"File": [
["FileStatementList"],
],
"FileStatementList": [
["FileStatement"],
["FileStatement", "FileStatementList"],
],
"FileStatement": [
["ImportStatement"],
["ClassDeclaration"],
["ExportStatement"],
["Statement"],
],
"ImportStatement": [
[IMPORT, STRING, AS, IDENTIFIER, SEMICOLON],
],
# Classes
"ClassDeclaration": [
[CLASS, IDENTIFIER, "ClassBody"],
],
"ClassBody": [
[LCURLY, RCURLY],
[LCURLY, "ClassMembers", RCURLY],
],
"ClassMembers": [
["ClassMember"],
["ClassMembers", "ClassMember"],
],
"ClassMember": [
["FieldDeclaration"],
["FunctionDeclaration"],
],
"FieldDeclaration": [
[IDENTIFIER, COLON, "TypeExpression", SEMICOLON],
],
# Types
"TypeExpression": [
["AlternateType"],
["TypeIdentifier"],
],
"AlternateType": [
["TypeExpression", BAR, "TypeIdentifier"],
],
"TypeIdentifier": [
[IDENTIFIER],
],
"ExportStatement": [
[EXPORT, "ClassDeclaration"],
[EXPORT, "FunctionDeclaration"],
# [EXPORT, "LetStatement"],
[EXPORT, "ExportList", SEMICOLON],
],
"ExportList": [
[],
[IDENTIFIER],
[IDENTIFIER, COMMA, "ExportList"],
],
# Functions
"FunctionDeclaration": [
[FUN, IDENTIFIER, "FunctionParameters", "Block"],
[FUN, IDENTIFIER, "FunctionParameters", ARROW, "TypeExpression", "Block"],
],
"FunctionParameters": [
[LPAREN, RPAREN],
[LPAREN, "FirstParameter", RPAREN],
[LPAREN, "FirstParameter", COMMA, "ParameterList", RPAREN],
],
"FirstParameter": [
[SELF],
["Parameter"],
],
"ParameterList": [
[],
["Parameter"],
["Parameter", COMMA, "ParameterList"],
],
"Parameter": [
[IDENTIFIER, COLON, "TypeExpression"],
],
# Block
"Block": [
[LCURLY, RCURLY],
[LCURLY, "StatementList", RCURLY],
[LCURLY, "StatementList", "Expression", RCURLY],
],
"StatementList": [
["Statement"],
["StatementList", "Statement"],
],
"Statement": [
["FunctionDeclaration"],
["LetStatement"],
# ["ReturnStatement"],
# ["ForStatement"],
["IfStatement"],
# ["WhileStatement"],
# ["ExpressionStatement"],
],
"LetStatement": [
[LET, IDENTIFIER, EQUAL, "Expression", SEMICOLON],
],
# "ReturnStatement": [
# [RETURN, "Expression", SEMICOLON],
# ],
# "ForStatement": [
# [FOR, "IteratorVariable", IN, "Expression", "Block"],
# ],
# "IteratorVariable": [[IDENTIFIER]],
"IfStatement": [["ConditionalExpression"]],
# "WhileStatement": [
# [WHILE, "Expression", "Block"],
# ],
# "ExpressionStatement": [
# ["Expression", SEMICOLON],
# ],
# Expressions
"Expression": [["AssignmentExpression"]],
"AssignmentExpression": [
["OrExpression", EQUAL, "AssignmentExpression"],
["OrExpression"],
],
"OrExpression": [
["OrExpression", OR, "IsExpression"],
["IsExpression"],
],
"IsExpression": [
# ["IsExpression", IS, "Pattern"],
["AndExpression"],
],
"AndExpression": [
["AndExpression", AND, "EqualityExpression"],
["EqualityExpression"],
],
"EqualityExpression": [
["EqualityExpression", EQUALEQUAL, "RelationExpression"],
["EqualityExpression", BANGEQUAL, "RelationExpression"],
["RelationExpression"],
],
"RelationExpression": [
["RelationExpression", LESS, "AdditiveExpression"],
["RelationExpression", LESSEQUAL, "AdditiveExpression"],
["RelationExpression", GREATER, "AdditiveExpression"],
["RelationExpression", GREATEREQUAL, "AdditiveExpression"],
["AdditiveExpression"],
],
"AdditiveExpression": [
["AdditiveExpression", PLUS, "MultiplicationExpression"],
["AdditiveExpression", MINUS, "MultiplicationExpression"],
["MultiplicationExpression"],
],
"MultiplicationExpression": [
["MultiplicationExpression", STAR, "PrimaryExpression"],
["MultiplicationExpression", SLASH, "PrimaryExpression"],
["PrimaryExpression"],
],
"PrimaryExpression": [
[IDENTIFIER],
[SELF],
[NUMBER],
[STRING],
[TRUE],
[FALSE],
[BANG, "PrimaryExpression"],
[MINUS, "PrimaryExpression"],
["Block"],
["ConditionalExpression"],
# ["ListConstructorExpression"],
# ["ObjectConstructorExpression"],
# ["MatchExpression"],
# ["PrimaryExpression", LPAREN, "ExpressionList", RPAREN],
# ["PrimaryExpression", DOT, IDENTIFIER],
[LPAREN, "Expression", RPAREN],
],
"ConditionalExpression": [
[IF, "Expression", "Block"],
[IF, "Expression", "Block", ELSE, "ConditionalExpression"],
[IF, "Expression", "Block", ELSE, "Block"],
],
# "ListConstructorExpression": [
# [LCURLY, "ExpressionList", RCURLY],
# ],
# "ExpressionList": [
# [],
# ["Expression"],
# ["Expression", COMMA, "ExpressionList"],
# ],
# # Match Expression
# "MatchExpression": [
# [MATCH, "MatchBody"],
# ],
# "MatchBody": [
# [LCURLY, "MatchArms", RCURLY],
# ],
# "MatchArms": [
# [],
# ["MatchArm"],
# ["MatchArm", COMMA, "MatchArms"],
# ],
# "MatchArm": [
# ["Pattern", ARROW, "Expression"],
# ],
# # Pattern
# "Pattern": [
# ["VariableBinding", "PatternCore", AND, "AndExpression"],
# ["VariableBinding", "PatternCore"],
# ["PatternCore", AND, "AndExpression"],
# ["PatternCore"],
# ],
# "PatternCore": [
# ["TypeExpression"],
# ["WildcardPattern"],
# ],
# "WildcardPattern": [[UNDERSCORE]],
# "VariableBinding": [[IDENTIFIER, COLON]],
# # Object Constructor
# "ObjectConstructorExpression": [
# [NEW, "TypeIdentifier", "FieldList"],
# ],
# "FieldList": [
# [LCURLY, "FieldValues", RCURLY],
# ],
# "FieldValues": [
# [],
# ["FieldValue"],
# ["FieldValue", COMMA, "FieldValues"],
# ],
# "FieldValue": [
# [IDENTIFIER],
# [IDENTIFIER, COLON, "Expression"],
# ],
}
# fmt: on
# dump_yacc(grammar)
grammar, precedence = desugar(grammar, precedence)
gen = parser_faster.GenerateLR1("File", grammar, precedence=precedence)
table = gen.gen_table()
print(parser_faster.format_table(gen, table))
print()
# tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"])