From 5064a768e7df5446ffd739bbd0fc29b56cda30af Mon Sep 17 00:00:00 2001 From: John Doty Date: Sat, 9 Nov 2024 11:21:30 -0800 Subject: [PATCH] [all] A whole new style for grammars Say good by to the sea of `self.`! --- grammar.py | 915 +++++++++++++++++------------------ parser/emacs.py | 15 +- parser/parser.py | 391 +++++++-------- parser/tree_sitter.py | 17 +- parser/wadler/builder.py | 25 +- sql.py | 223 +++++---- tests/test_error_recovery.py | 259 +++++----- tests/test_grammar.py | 301 ++---------- tests/test_lexer.py | 80 +-- tests/test_wadler.py | 189 ++++---- 10 files changed, 1097 insertions(+), 1318 deletions(-) diff --git a/grammar.py b/grammar.py index aee5f78..0981e73 100644 --- a/grammar.py +++ b/grammar.py @@ -20,503 +20,498 @@ from parser import ( sp, ) +@rule("File") +def file() -> Rule: + return _file_statement_list -class FineGrammar(Grammar): - # generator = parser.GenerateLR1 - # generator = parser.GeneratePager - start = "File" +@rule +def _file_statement_list() -> Rule: + return alt( + _file_statement, + _file_statement_list + nl + _file_statement, + ) - trivia = ["BLANKS", "LINE_BREAK", "COMMENT"] +@rule +def _file_statement() -> Rule: + return ( + import_statement | class_declaration | export_statement | _statement + ) - pretty_indent = " " +@rule +def import_statement() -> Rule: + return group( + IMPORT, sp, STRING, sp, AS, sp, IDENTIFIER, sp, SEMICOLON + ) - def __init__(self): - super().__init__( - precedence=[ - (Assoc.RIGHT, [self.EQUAL]), - (Assoc.LEFT, [self.OR]), - (Assoc.LEFT, [self.IS]), - (Assoc.LEFT, [self.AND]), - (Assoc.LEFT, [self.EQUALEQUAL, self.BANGEQUAL]), - (Assoc.LEFT, [self.LESS, self.GREATER, self.GREATEREQUAL, self.LESSEQUAL]), - (Assoc.LEFT, [self.PLUS, self.MINUS]), - (Assoc.LEFT, [self.STAR, self.SLASH]), - (Assoc.LEFT, [self.primary_expression]), - (Assoc.LEFT, [self.LPAREN]), - (Assoc.LEFT, [self.DOT]), - # - # If there's a confusion about whether to make an IF - # statement or an expression, prefer the statement. 
- # - (Assoc.NONE, [self.if_statement]), - ], - ) +@rule("ClassDeclaration") +def class_declaration() -> Rule: + return seq( + group( + CLASS, + sp, + mark(IDENTIFIER, field="name", highlight=highlight.entity.name.type), + sp, + LCURLY, + ), + indent(nl, mark(opt(class_body), field="body")), + nl, + RCURLY, + nl, # Extra newline at the end of the class + ) - @rule("File") - def file(self) -> Rule: - return self._file_statement_list +@rule("ClassBody") +def class_body() -> Rule: + return _class_members - @rule - def _file_statement_list(self) -> Rule: - return alt( - self._file_statement, - self._file_statement_list + nl + self._file_statement, - ) +@rule +def _class_members() -> Rule: + return _class_member | seq(_class_members, nl, _class_member) - @rule - def _file_statement(self) -> Rule: - return ( - self.import_statement | self.class_declaration | self.export_statement | self._statement - ) +@rule +def _class_member() -> Rule: + return field_declaration | function_declaration - @rule - def import_statement(self) -> Rule: - return group( - self.IMPORT, sp, self.STRING, sp, self.AS, sp, self.IDENTIFIER, sp, self.SEMICOLON - ) +@rule("FieldDecl") +def field_declaration() -> Rule: + return group(IDENTIFIER, COLON, sp, type_expression, SEMICOLON) - @rule("ClassDeclaration") - def class_declaration(self) -> Rule: - return seq( - group( - self.CLASS, - sp, - mark(self.IDENTIFIER, field="name", highlight=highlight.entity.name.type), - sp, - self.LCURLY, - ), - indent(nl, mark(opt(self.class_body), field="body")), - nl, - self.RCURLY, - nl, # Extra newline at the end of the class - ) +# Types +@rule("TypeExpression") +def type_expression() -> Rule: + return alternate_type | type_identifier - @rule("ClassBody") - def class_body(self) -> Rule: - return self._class_members +@rule("AlternateType") +def alternate_type() -> Rule: + return group(type_expression, sp, OR, sp, type_identifier) - @rule - def _class_members(self) -> Rule: - return self._class_member | 
seq(self._class_members, nl, self._class_member) +@rule("TypeIdentifier") +def type_identifier() -> Rule: + return mark(IDENTIFIER, field="id", highlight=highlight.entity.name.type) - @rule - def _class_member(self) -> Rule: - return self.field_declaration | self.function_declaration +@rule +def export_statement() -> Rule: + return alt( + group(EXPORT, sp, class_declaration), + group(EXPORT, sp, function_declaration), + group(EXPORT, sp, let_statement), + group(EXPORT, sp, export_list, SEMICOLON), + ) - @rule("FieldDecl") - def field_declaration(self) -> Rule: - return group(self.IDENTIFIER, self.COLON, sp, self.type_expression, self.SEMICOLON) +@rule +def export_list() -> Rule: + return IDENTIFIER | seq(IDENTIFIER, COMMA, sp, export_list) - # Types - @rule("TypeExpression") - def type_expression(self) -> Rule: - return self.alternate_type | self.type_identifier - - @rule("AlternateType") - def alternate_type(self) -> Rule: - return group(self.type_expression, sp, self.OR, sp, self.type_identifier) - - @rule("TypeIdentifier") - def type_identifier(self) -> Rule: - return mark(self.IDENTIFIER, field="id", highlight=highlight.entity.name.type) - - @rule - def export_statement(self) -> Rule: - return alt( - group(self.EXPORT, sp, self.class_declaration), - group(self.EXPORT, sp, self.function_declaration), - group(self.EXPORT, sp, self.let_statement), - group(self.EXPORT, sp, self.export_list, self.SEMICOLON), - ) - - @rule - def export_list(self) -> Rule: - return self.IDENTIFIER | seq(self.IDENTIFIER, self.COMMA, sp, self.export_list) - - # Functions - @rule("FunctionDecl") - def function_declaration(self) -> Rule: - return seq( +# Functions +@rule("FunctionDecl") +def function_declaration() -> Rule: + return seq( + group( group( group( - group( - self.FUN, - sp, - mark( - self.IDENTIFIER, - field="name", - highlight=highlight.entity.name.function, - ), + FUN, + sp, + mark( + IDENTIFIER, + field="name", + highlight=highlight.entity.name.function, ), - nl, - 
mark(self.function_parameters, field="parameters"), ), - mark( - opt(indent(sp, group(self.ARROW, sp, self.type_expression))), - field="return_type", - ), - ), - sp, - mark(self.block, field="body"), - nl, - ) - - @rule("ParamList") - def function_parameters(self) -> Rule: - return group( - self.LPAREN, - indent( nl, - opt( - self._first_parameter - | seq(self._first_parameter, self.COMMA) - | group(self._first_parameter, self.COMMA, sp, self._parameter_list) - ), + mark(function_parameters, field="parameters"), ), + mark( + opt(indent(sp, group(ARROW, sp, type_expression))), + field="return_type", + ), + ), + sp, + mark(block, field="body"), + nl, + ) + +@rule("ParamList") +def function_parameters() -> Rule: + return group( + LPAREN, + indent( nl, - self.RPAREN, - ) - - @rule - def _first_parameter(self) -> Rule: - return self.SELF | self.parameter - - @rule - def _parameter_list(self) -> Rule: - return self.parameter | seq(self.parameter, self.COMMA, sp, self._parameter_list) - - @rule("Parameter") - def parameter(self) -> Rule: - return group(self.IDENTIFIER, self.COLON, sp, self.type_expression) - - # Block - @rule("Block") - def block(self) -> Rule: - return alt( - group(self.LCURLY, nl, self.RCURLY), - group(self.LCURLY, indent(br, self.block_body), sp, self.RCURLY), - ) - - @rule("BlockBody") - def block_body(self) -> Rule: - return alt( - self.expression, - self._statement_list, - seq(self._statement_list, br, self.expression), - ) - - @rule - def _statement_list(self) -> Rule: - return self._statement | seq(self._statement_list, br, self._statement) - - @rule - def _statement(self) -> Rule: - return ( - self.function_declaration - | self.let_statement - | self.return_statement - | self.for_statement - | self.if_statement - | self.while_statement - | self.expression_statement - ) - - @rule("LetStatement") - def let_statement(self) -> Rule: - return group( - group( - self.LET, - sp, - self.IDENTIFIER, - sp, - self.EQUAL, + opt( + _first_parameter + | 
seq(_first_parameter, COMMA) + | group(_first_parameter, COMMA, sp, _parameter_list) ), - indent(sp, self.expression, self.SEMICOLON), - ) + ), + nl, + RPAREN, + ) - @rule("ReturnStatement") - def return_statement(self) -> Rule: - return alt( - group(self.RETURN, indent(sp, group(self.expression, self.SEMICOLON))), - group(self.RETURN, self.SEMICOLON), - ) +@rule +def _first_parameter() -> Rule: + return SELF | parameter - @rule("ForStatement") - def for_statement(self) -> Rule: - return group( - group(self.FOR, sp, self.iterator_variable, sp, self.IN, sp, group(self.expression)), - self.block, - ) +@rule +def _parameter_list() -> Rule: + return parameter | seq(parameter, COMMA, sp, _parameter_list) - @rule("IteratorVariable") - def iterator_variable(self) -> Rule: - return self.IDENTIFIER +@rule("Parameter") +def parameter() -> Rule: + return group(IDENTIFIER, COLON, sp, type_expression) - @rule("IfStatement") - def if_statement(self) -> Rule: - return self.conditional_expression +# Block +@rule("Block") +def block() -> Rule: + return alt( + group(LCURLY, nl, RCURLY), + group(LCURLY, indent(br, block_body), sp, RCURLY), + ) - @rule - def while_statement(self) -> Rule: - return group(group(self.WHILE, sp, self.expression), sp, self.block) +@rule("BlockBody") +def block_body() -> Rule: + return alt( + expression, + _statement_list, + seq(_statement_list, br, expression), + ) - @rule - def expression_statement(self) -> Rule: - return seq(self.expression, self.SEMICOLON) +@rule +def _statement_list() -> Rule: + return _statement | seq(_statement_list, br, _statement) - # Expressions - @rule(transparent=True) - def expression(self) -> Rule: - return self.binary_expression | self.is_expression | self.primary_expression +@rule +def _statement() -> Rule: + return ( + function_declaration + | let_statement + | return_statement + | for_statement + | if_statement + | while_statement + | expression_statement + ) - @rule("BinaryExpression") - def binary_expression(self) -> 
Rule: - return alt( - # Assignment gets special indentation. - group(group(self.expression, sp, self.EQUAL), indent(sp, self.expression)), - # Other ones do not. - group(group(self.expression, sp, self.OR), sp, self.expression), - group(group(self.expression, sp, self.AND), sp, self.expression), - group(group(self.expression, sp, self.EQUALEQUAL), sp, self.expression), - group(group(self.expression, sp, self.BANGEQUAL), sp, self.expression), - group(group(self.expression, sp, self.LESS), sp, self.expression), - group(group(self.expression, sp, self.LESSEQUAL), sp, self.expression), - group(group(self.expression, sp, self.GREATER), sp, self.expression), - group(group(self.expression, sp, self.GREATEREQUAL), sp, self.expression), - group(group(self.expression, sp, self.PLUS), sp, self.expression), - group(group(self.expression, sp, self.MINUS), sp, self.expression), - group(group(self.expression, sp, self.STAR), sp, self.expression), - group(group(self.expression, sp, self.SLASH), sp, self.expression), - ) - - @rule("IsExpression") - def is_expression(self) -> Rule: - return group(self.expression, sp, self.IS, indent(sp, self.pattern)) - - @rule - def primary_expression(self) -> Rule: - return ( - self.identifier_expression - | self.literal_expression - | self.SELF - | seq(self.BANG, self.primary_expression) - | seq(self.MINUS, self.primary_expression) - | self.block - | self.conditional_expression - | self.list_constructor_expression - | self.object_constructor_expression - | self.match_expression - | seq(self.primary_expression, self.LPAREN, self.RPAREN) - | group( - self.primary_expression, - self.LPAREN, - indent(nl, self._expression_list), - nl, - self.RPAREN, - ) - | group(self.primary_expression, indent(nl, self.DOT, self.IDENTIFIER)) - | group(self.LPAREN, indent(nl, self.expression), nl, self.RPAREN) - ) - - @rule("IdentifierExpression") - def identifier_expression(self): - return self.IDENTIFIER - - @rule("Literal") - def literal_expression(self): - return 
self.NUMBER | self.STRING | self.TRUE | self.FALSE - - @rule("ConditionalExpression") - def conditional_expression(self) -> Rule: - return ( - seq(group(self.IF, sp, self.expression), sp, self.block) - | seq( - group(self.IF, sp, self.expression), - sp, - self.block, - sp, - self.ELSE, - sp, - self.conditional_expression, - ) - | seq( - group(self.IF, sp, self.expression), sp, self.block, sp, self.ELSE, sp, self.block - ) - ) - - @rule - def list_constructor_expression(self) -> Rule: - return alt( - group(self.LSQUARE, nl, self.RSQUARE), - group(self.LSQUARE, indent(nl, self._expression_list), nl, self.RSQUARE), - ) - - @rule - def _expression_list(self) -> Rule: - return ( - self.expression - | seq(self.expression, self.COMMA) - | seq(self.expression, self.COMMA, sp, self._expression_list) - ) - - @rule - def match_expression(self) -> Rule: - return group( - group(self.MATCH, sp, self.expression, sp, self.LCURLY), - indent(sp, self.match_arms), +@rule("LetStatement") +def let_statement() -> Rule: + return group( + group( + LET, sp, - self.RCURLY, - ) - - @rule("MatchArms") - def match_arms(self) -> Rule: - return self._match_arms - - @rule - def _match_arms(self) -> Rule: - return ( - self.match_arm - | seq(self.match_arm, self.COMMA) - | seq(self.match_arm, self.COMMA, br, self._match_arms) - ) - - @rule("MatchArm") - def match_arm(self) -> Rule: - return group(self.pattern, sp, self.ARROW, sp, self.expression) - - @rule("Pattern") - def pattern(self) -> Rule: - return ( - group(self.variable_binding, self._pattern_core, sp, self.AND, sp, self.expression) - | group(self.variable_binding, self._pattern_core) - | self._pattern_core - ) - - @rule - def _pattern_core(self) -> Rule: - return self.type_expression | self.wildcard_pattern - - @rule("WildcardPattern") - def wildcard_pattern(self) -> Rule: - return self.UNDERSCORE - - @rule("VariableBinding") - def variable_binding(self) -> Rule: - return seq(self.IDENTIFIER, self.COLON) - - @rule - def 
object_constructor_expression(self) -> Rule: - return group(self.NEW, sp, self.type_identifier, sp, self.field_list) - - @rule - def field_list(self) -> Rule: - return alt( - seq(self.LCURLY, self.RCURLY), - group(self.LCURLY, indent(nl, self.field_values), nl, self.RCURLY), - ) - - @rule - def field_values(self) -> Rule: - return ( - self.field_value - | seq(self.field_value, self.COMMA) - | seq(self.field_value, self.COMMA, sp, self.field_values) - ) - - @rule - def field_value(self) -> Rule: - return self.IDENTIFIER | group(self.IDENTIFIER, self.COLON, indent(sp, self.expression)) - - BLANKS = Terminal(Re.set(" ", "\t").plus()) - LINE_BREAK = Terminal(Re.set("\r", "\n"), trivia_mode=TriviaMode.NewLine) - COMMENT = Terminal( - Re.seq(Re.literal("//"), Re.set("\n").invert().star()), - highlight=highlight.comment.line, - trivia_mode=TriviaMode.LineComment, - ) - - ARROW = Terminal("->", highlight=highlight.keyword.operator) - AS = Terminal("as", highlight=highlight.keyword.operator.expression) - BAR = Terminal("|", highlight=highlight.keyword.operator.expression) - CLASS = Terminal("class", highlight=highlight.storage.type.klass) - COLON = Terminal(":", highlight=highlight.punctuation.separator) - ELSE = Terminal("else", highlight=highlight.keyword.control.conditional) - FOR = Terminal("for", highlight=highlight.keyword.control) - FUN = Terminal("fun", highlight=highlight.storage.type.function) - IDENTIFIER = Terminal( - Re.seq( - Re.set(("a", "z"), ("A", "Z"), "_"), - Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), + IDENTIFIER, + sp, + EQUAL, ), + indent(sp, expression, SEMICOLON), ) - IF = Terminal("if", highlight=highlight.keyword.control.conditional) - IMPORT = Terminal("import", highlight=highlight.keyword.other) - IN = Terminal("in", highlight=highlight.keyword.operator) - LCURLY = Terminal("{", highlight=highlight.punctuation.curly_brace.open) - RCURLY = Terminal("}", highlight=highlight.punctuation.curly_brace.close) - LET = Terminal("let", 
highlight=highlight.keyword.other) - RETURN = Terminal("return", highlight=highlight.keyword.control) - SEMICOLON = Terminal(";", highlight=highlight.punctuation.separator) - STRING = Terminal( - # Double-quoted string. - Re.seq( - Re.literal('"'), - (~Re.set('"', "\\") | (Re.set("\\") + Re.any())).star(), - Re.literal('"'), + +@rule("ReturnStatement") +def return_statement() -> Rule: + return alt( + group(RETURN, indent(sp, group(expression, SEMICOLON))), + group(RETURN, SEMICOLON), + ) + +@rule("ForStatement") +def for_statement() -> Rule: + return group( + group(FOR, sp, iterator_variable, sp, IN, sp, group(expression)), + block, + ) + +@rule("IteratorVariable") +def iterator_variable() -> Rule: + return IDENTIFIER + +@rule("IfStatement") +def if_statement() -> Rule: + return conditional_expression + +@rule +def while_statement() -> Rule: + return group(group(WHILE, sp, expression), sp, block) + +@rule +def expression_statement() -> Rule: + return seq(expression, SEMICOLON) + +# Expressions +@rule(transparent=True) +def expression() -> Rule: + return binary_expression | is_expression | primary_expression + +@rule("BinaryExpression") +def binary_expression() -> Rule: + return alt( + # Assignment gets special indentation. + group(group(expression, sp, EQUAL), indent(sp, expression)), + # Other ones do not. 
+ group(group(expression, sp, OR), sp, expression), + group(group(expression, sp, AND), sp, expression), + group(group(expression, sp, EQUALEQUAL), sp, expression), + group(group(expression, sp, BANGEQUAL), sp, expression), + group(group(expression, sp, LESS), sp, expression), + group(group(expression, sp, LESSEQUAL), sp, expression), + group(group(expression, sp, GREATER), sp, expression), + group(group(expression, sp, GREATEREQUAL), sp, expression), + group(group(expression, sp, PLUS), sp, expression), + group(group(expression, sp, MINUS), sp, expression), + group(group(expression, sp, STAR), sp, expression), + group(group(expression, sp, SLASH), sp, expression), + ) + +@rule("IsExpression") +def is_expression() -> Rule: + return group(expression, sp, IS, indent(sp, pattern)) + +@rule +def primary_expression() -> Rule: + return ( + identifier_expression + | literal_expression + | SELF + | seq(BANG, primary_expression) + | seq(MINUS, primary_expression) + | block + | conditional_expression + | list_constructor_expression + | object_constructor_expression + | match_expression + | seq(primary_expression, LPAREN, RPAREN) + | group( + primary_expression, + LPAREN, + indent(nl, _expression_list), + nl, + RPAREN, ) - # Single-quoted string. 
- | Re.seq( - Re.literal("'"), - (~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(), - Re.literal("'"), - ), - highlight=highlight.string.quoted, + | group(primary_expression, indent(nl, DOT, IDENTIFIER)) + | group(LPAREN, indent(nl, expression), nl, RPAREN) ) - WHILE = Terminal("while", highlight=highlight.keyword.control) - EQUAL = Terminal("=", highlight=highlight.keyword.operator.expression) - LPAREN = Terminal("(", highlight=highlight.punctuation.parenthesis.open) - RPAREN = Terminal(")", highlight=highlight.punctuation.parenthesis.close) - COMMA = Terminal(",", highlight=highlight.punctuation.separator) - SELF = Terminal("self", name="SELFF", highlight=highlight.variable.language) - OR = Terminal("or", highlight=highlight.keyword.operator.expression) - IS = Terminal("is", highlight=highlight.keyword.operator.expression) - AND = Terminal("and", highlight=highlight.keyword.operator.expression) - EQUALEQUAL = Terminal("==", highlight=highlight.keyword.operator.expression) - BANGEQUAL = Terminal("!=", highlight=highlight.keyword.operator.expression) - LESS = Terminal("<", highlight=highlight.keyword.operator.expression) - GREATER = Terminal(">", highlight=highlight.keyword.operator.expression) - LESSEQUAL = Terminal("<=", highlight=highlight.keyword.operator.expression) - GREATEREQUAL = Terminal(">=", highlight=highlight.keyword.operator.expression) - PLUS = Terminal("+", highlight=highlight.keyword.operator.expression) - MINUS = Terminal("-", highlight=highlight.keyword.operator.expression) - STAR = Terminal("*", highlight=highlight.keyword.operator.expression) - SLASH = Terminal("/", highlight=highlight.keyword.operator.expression) - NUMBER = Terminal( + +@rule("IdentifierExpression") +def identifier_expression(): + return IDENTIFIER + +@rule("Literal") +def literal_expression(): + return NUMBER | STRING | TRUE | FALSE + +@rule("ConditionalExpression") +def conditional_expression() -> Rule: + return ( + seq(group(IF, sp, expression), sp, block) + | seq( + 
group(IF, sp, expression), + sp, + block, + sp, + ELSE, + sp, + conditional_expression, + ) + | seq( + group(IF, sp, expression), sp, block, sp, ELSE, sp, block + ) + ) + +@rule +def list_constructor_expression() -> Rule: + return alt( + group(LSQUARE, nl, RSQUARE), + group(LSQUARE, indent(nl, _expression_list), nl, RSQUARE), + ) + +@rule +def _expression_list() -> Rule: + return ( + expression + | seq(expression, COMMA) + | seq(expression, COMMA, sp, _expression_list) + ) + +@rule +def match_expression() -> Rule: + return group( + group(MATCH, sp, expression, sp, LCURLY), + indent(sp, match_arms), + sp, + RCURLY, + ) + +@rule("MatchArms") +def match_arms() -> Rule: + return _match_arms + +@rule +def _match_arms() -> Rule: + return ( + match_arm + | seq(match_arm, COMMA) + | seq(match_arm, COMMA, br, _match_arms) + ) + +@rule("MatchArm") +def match_arm() -> Rule: + return group(pattern, sp, ARROW, sp, expression) + +@rule("Pattern") +def pattern() -> Rule: + return ( + group(variable_binding, _pattern_core, sp, AND, sp, expression) + | group(variable_binding, _pattern_core) + | _pattern_core + ) + +@rule +def _pattern_core() -> Rule: + return type_expression | wildcard_pattern + +@rule("WildcardPattern") +def wildcard_pattern() -> Rule: + return UNDERSCORE + +@rule("VariableBinding") +def variable_binding() -> Rule: + return seq(IDENTIFIER, COLON) + +@rule +def object_constructor_expression() -> Rule: + return group(NEW, sp, type_identifier, sp, field_list) + +@rule +def field_list() -> Rule: + return alt( + seq(LCURLY, RCURLY), + group(LCURLY, indent(nl, field_values), nl, RCURLY), + ) + +@rule +def field_values() -> Rule: + return ( + field_value + | seq(field_value, COMMA) + | seq(field_value, COMMA, sp, field_values) + ) + +@rule +def field_value() -> Rule: + return IDENTIFIER | group(IDENTIFIER, COLON, indent(sp, expression)) + +BLANKS = Terminal("BLANKS", Re.set(" ", "\t").plus()) +LINE_BREAK = Terminal("LINE_BREAK", Re.set("\r", "\n"), 
trivia_mode=TriviaMode.NewLine) +COMMENT = Terminal( + "COMMENT", + Re.seq(Re.literal("//"), Re.set("\n").invert().star()), + highlight=highlight.comment.line, + trivia_mode=TriviaMode.LineComment, +) + +ARROW = Terminal("ARROW", "->", highlight=highlight.keyword.operator) +AS = Terminal("AS", "as", highlight=highlight.keyword.operator.expression) +BAR = Terminal("BAR", "|", highlight=highlight.keyword.operator.expression) +CLASS = Terminal("CLASS", "class", highlight=highlight.storage.type.klass) +COLON = Terminal("COLON", ":", highlight=highlight.punctuation.separator) +ELSE = Terminal("ELSE", "else", highlight=highlight.keyword.control.conditional) +FOR = Terminal("FOR", "for", highlight=highlight.keyword.control) +FUN = Terminal("FUN", "fun", highlight=highlight.storage.type.function) +IDENTIFIER = Terminal( + "IDENTIFIER", + Re.seq( + Re.set(("a", "z"), ("A", "Z"), "_"), + Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), + ), +) +IF = Terminal("IF", "if", highlight=highlight.keyword.control.conditional) +IMPORT = Terminal("IMPORT", "import", highlight=highlight.keyword.other) +IN = Terminal("IN", "in", highlight=highlight.keyword.operator) +LCURLY = Terminal("LCURLY", "{", highlight=highlight.punctuation.curly_brace.open) +RCURLY = Terminal("RCURLY", "}", highlight=highlight.punctuation.curly_brace.close) +LET = Terminal("LET", "let", highlight=highlight.keyword.other) +RETURN = Terminal("RETURN", "return", highlight=highlight.keyword.control) +SEMICOLON = Terminal("SEMICOLON", ";", highlight=highlight.punctuation.separator) +STRING = Terminal( + "STRING", + # Double-quoted string. + Re.seq( + Re.literal('"'), + (~Re.set('"', "\\") | (Re.set("\\") + Re.any())).star(), + Re.literal('"'), + ) + # Single-quoted string. 
+ | Re.seq( + Re.literal("'"), + (~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(), + Re.literal("'"), + ), + highlight=highlight.string.quoted, +) +WHILE = Terminal("WHILE", "while", highlight=highlight.keyword.control) +EQUAL = Terminal("EQUAL", "=", highlight=highlight.keyword.operator.expression) +LPAREN = Terminal("LPAREN", "(", highlight=highlight.punctuation.parenthesis.open) +RPAREN = Terminal("RPAREN", ")", highlight=highlight.punctuation.parenthesis.close) +COMMA = Terminal("COMMA", ",", highlight=highlight.punctuation.separator) +SELF = Terminal("SELFF", "self", highlight=highlight.variable.language) +OR = Terminal("OR", "or", highlight=highlight.keyword.operator.expression) +IS = Terminal("IS", "is", highlight=highlight.keyword.operator.expression) +AND = Terminal("AND", "and", highlight=highlight.keyword.operator.expression) +EQUALEQUAL = Terminal("EQUALEQUAL", "==", highlight=highlight.keyword.operator.expression) +BANGEQUAL = Terminal("BANGEQUAL", "!=", highlight=highlight.keyword.operator.expression) +LESS = Terminal("LESS", "<", highlight=highlight.keyword.operator.expression) +GREATER = Terminal("GREATER", ">", highlight=highlight.keyword.operator.expression) +LESSEQUAL = Terminal("LESSEQUAL", "<=", highlight=highlight.keyword.operator.expression) +GREATEREQUAL = Terminal("GREATEREQUAL", ">=", highlight=highlight.keyword.operator.expression) +PLUS = Terminal("PLUS", "+", highlight=highlight.keyword.operator.expression) +MINUS = Terminal("MINUS", "-", highlight=highlight.keyword.operator.expression) +STAR = Terminal("STAR", "*", highlight=highlight.keyword.operator.expression) +SLASH = Terminal("SLASH", "/", highlight=highlight.keyword.operator.expression) +NUMBER = Terminal( + "NUMBER", + Re.seq( + Re.set(("0", "9")).plus(), Re.seq( + Re.literal("."), Re.set(("0", "9")).plus(), - Re.seq( - Re.literal("."), - Re.set(("0", "9")).plus(), - ).question(), - Re.seq( - Re.set("e", "E"), - Re.set("+", "-").question(), - Re.set(("0", "9")).plus(), - 
).question(), - ), - highlight=highlight.constant.numeric, - ) - TRUE = Terminal("true", highlight=highlight.constant.language) - FALSE = Terminal("false", highlight=highlight.constant.language) - BANG = Terminal("!", highlight=highlight.keyword.operator.expression) - DOT = Terminal(".", highlight=highlight.punctuation.separator) - MATCH = Terminal("match", highlight=highlight.keyword.other) - EXPORT = Terminal("export", highlight=highlight.keyword.other) - UNDERSCORE = Terminal("_", highlight=highlight.variable.language) - NEW = Terminal("new", highlight=highlight.keyword.operator) - LSQUARE = Terminal("[", highlight=highlight.punctuation.square_bracket.open) - RSQUARE = Terminal("]", highlight=highlight.punctuation.square_bracket.close) + ).question(), + Re.seq( + Re.set("e", "E"), + Re.set("+", "-").question(), + Re.set(("0", "9")).plus(), + ).question(), + ), + highlight=highlight.constant.numeric, +) +TRUE = Terminal("TRUE", "true", highlight=highlight.constant.language) +FALSE = Terminal("FALSE", "false", highlight=highlight.constant.language) +BANG = Terminal("BANG", "!", highlight=highlight.keyword.operator.expression) +DOT = Terminal("DOT", ".", highlight=highlight.punctuation.separator) +MATCH = Terminal("MATCH", "match", highlight=highlight.keyword.other) +EXPORT = Terminal("EXPORT", "export", highlight=highlight.keyword.other) +UNDERSCORE = Terminal("UNDERSCORE", "_", highlight=highlight.variable.language) +NEW = Terminal("NEW", "new", highlight=highlight.keyword.operator) +LSQUARE = Terminal("LSQUARE", "[", highlight=highlight.punctuation.square_bracket.open) +RSQUARE = Terminal("RSQUARE", "]", highlight=highlight.punctuation.square_bracket.close) +FineGrammar=Grammar( + start=file, + trivia=[BLANKS, LINE_BREAK, COMMENT], + pretty_indent=" ", + precedence=[ + (Assoc.RIGHT, [EQUAL]), + (Assoc.LEFT, [OR]), + (Assoc.LEFT, [IS]), + (Assoc.LEFT, [AND]), + (Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]), + (Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]), + 
(Assoc.LEFT, [PLUS, MINUS]), + (Assoc.LEFT, [STAR, SLASH]), + (Assoc.LEFT, [primary_expression]), + (Assoc.LEFT, [LPAREN]), + (Assoc.LEFT, [DOT]), + # + # If there's a confusion about whether to make an IF + # statement or an expression, prefer the statement. + # + (Assoc.NONE, [if_statement]), + ], +) if __name__ == "__main__": from pathlib import Path @@ -525,7 +520,7 @@ if __name__ == "__main__": from parser.tree_sitter import emit_tree_sitter_grammar, emit_tree_sitter_queries # TODO: Actually generate a lexer/parser for some runtime. - grammar = FineGrammar() + grammar = FineGrammar table = grammar.build_table() # print(table.format()) diff --git a/parser/emacs.py b/parser/emacs.py index 1a73d88..6ab3036 100644 --- a/parser/emacs.py +++ b/parser/emacs.py @@ -25,8 +25,6 @@ class FaceQuery: def gather_faces(grammar: parser.Grammar): - nts = {nt.name: nt for nt in grammar.non_terminals()} - def scoop(node: str, input: parser.FlattenedWithMetadata, visited: set[str]) -> list[FaceQuery]: parts = [] for item in input: @@ -52,13 +50,12 @@ def gather_faces(grammar: parser.Grammar): ) ) - elif isinstance(item, str): - nt = nts[item] - if nt.transparent: - if nt.name in visited: + elif isinstance(item, parser.NonTerminal): + if item.transparent: + if item.name in visited: continue - visited.add(nt.name) - body = nt.fn(grammar) + visited.add(item.name) + body = item.definition for production in body.flatten(with_metadata=True): parts.extend(scoop(node, production, visited)) @@ -69,7 +66,7 @@ def gather_faces(grammar: parser.Grammar): if rule.transparent: continue - body = rule.fn(grammar) + body = rule.definition for production in body.flatten(with_metadata=True): queries.extend(scoop(rule.name, production, set())) diff --git a/parser/parser.py b/parser/parser.py index 320ce79..a54da0f 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -17,25 +17,24 @@ the thing that processes the tables. 
## Making Grammars -To get started, create a grammar that derives from the `Grammar` class. Create -one method per nonterminal, decorated with the `rule` decorator. Here's an -example: +Define a series of terminals (with `Terminal`) and rules (as functions decorated +with `@rule`), and then pass the starting rule to the constructor of a `Grammar` +object: + @rule + def expression(): + return seq(expression, PLUS, term) | term - class SimpleGrammar(Grammar): - @rule - def expression(self): - return seq(self.expression, self.PLUS, self.term) | self.term + @rule + def term(): + return seq(LPAREN, expression, RPAREN) | ID - @rule - def term(self): - return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID - - PLUS = Terminal('+') - LPAREN = Terminal('(') - RPAREN = Terminal(')') - ID = Terminal('id') + PLUS = Terminal('PLUS', '+') + LPAREN = Terminal('LPAREN', '(') + RPAREN = Terminal('RPAREN', ')') + ID = Terminal('ID', 'id') + grammar = Grammar(start=expression) ## Using grammars @@ -1533,7 +1532,9 @@ class ParserGenerator: return builder.flush(config_sets) -FlattenedWithMetadata = list["str|Terminal|tuple[dict[str,typing.Any],FlattenedWithMetadata]"] +FlattenedWithMetadata = list[ + "NonTerminal|Terminal|tuple[dict[str,typing.Any],FlattenedWithMetadata]" +] ############################################################################### @@ -1578,26 +1579,32 @@ class Rule: class Terminal(Rule): """A token, or terminal symbol in the grammar.""" - name: str | None + name: str pattern: "str | Re" meta: dict[str, typing.Any] regex: bool error_name: str | None + definition_location: str def __init__( self, + name: str, pattern: "str|Re", *, - name: str | None = None, error_name: str | None = None, **kwargs, ): + # TODO: Consider identifying the name from some kind of globals + # dictionary or something if necessary.
self.name = name self.pattern = pattern self.meta = kwargs self.regex = isinstance(pattern, Re) self.error_name = error_name + caller = inspect.stack()[1] + self.definition_location = f"{caller.filename}:{caller.lineno}" + def flatten( self, with_metadata: bool = False ) -> typing.Generator[FlattenedWithMetadata, None, None]: @@ -1617,14 +1624,17 @@ class NonTerminal(Rule): grammar class. """ - fn: typing.Callable[["Grammar"], Rule] + fn: typing.Callable[[], Rule] name: str transparent: bool error_name: str | None + definition_location: str + _definition: Rule | None + _body: "list[list[NonTerminal | Terminal]] | None" def __init__( self, - fn: typing.Callable[["Grammar"], Rule], + fn: typing.Callable[[], Rule], name: str | None = None, transparent: bool = False, error_name: str | None = None, @@ -1645,22 +1655,37 @@ class NonTerminal(Rule): self.name = name or fn.__name__ self.transparent = transparent self.error_name = error_name + self._definition = None + self._body = None - def generate_body(self, grammar) -> list[list[str | Terminal]]: - """Generate the body of the non-terminal. + caller = inspect.stack()[1] + self.definition_location = f"{caller.filename}:{caller.lineno}" - We do this by first calling the associated function in order to get a - Rule, and then flattening the Rule into the associated set of - productions. We strip the metadata from the flattened result to make - life a little easier for the caller. + @property + def definition(self) -> Rule: + """The rule that is the definition of this nonterminal. + + (As opposed this rule itself, which is... itself.) + """ + if self._definition is None: + self._definition = self.fn() + return self._definition + + @property + def body(self) -> "list[list[NonTerminal | Terminal]]": + """The flattened body of the nonterminal: a list of productions where + each production is a sequence of Terminals and NonTerminals. 
""" - def without_metadata(result: FlattenedWithMetadata) -> list[str | Terminal]: + def without_metadata(result: FlattenedWithMetadata) -> list[NonTerminal | Terminal]: for item in result: assert not isinstance(item, tuple) - return typing.cast(list[str | Terminal], result) + return typing.cast(list[NonTerminal | Terminal], result) - return [without_metadata(rule) for rule in self.fn(grammar).flatten(with_metadata=False)] + if self._body is None: + self._body = [without_metadata(rule) for rule in self.fn().flatten(with_metadata=False)] + + return self._body def flatten( self, with_metadata: bool = False @@ -1669,7 +1694,7 @@ class NonTerminal(Rule): # the context of some other production. Yield ourselves, and trust that # in time we will be asked to generate our body. del with_metadata - yield [self.name] + yield [self] class AlternativeRule(Rule): @@ -1775,7 +1800,7 @@ def mark(rule: Rule, **kwargs) -> Rule: @typing.overload -def rule(f: typing.Callable, /) -> Rule: ... +def rule(f: typing.Callable, /) -> NonTerminal: ... @typing.overload @@ -1783,16 +1808,15 @@ def rule( name: str | None = None, transparent: bool | None = None, error_name: str | None = None, -) -> typing.Callable[[typing.Callable[[typing.Any], Rule]], Rule]: ... +) -> typing.Callable[[typing.Callable[[], Rule]], NonTerminal]: ... def rule( name: str | None | typing.Callable = None, transparent: bool | None = None, error_name: str | None = None, -) -> Rule | typing.Callable[[typing.Callable[[typing.Any], Rule]], Rule]: - """The decorator that marks a method in a Grammar object as a nonterminal - rule. +) -> NonTerminal | typing.Callable[[typing.Callable[[], Rule]], NonTerminal]: + """The decorator that marks a function as a nonterminal rule. As with all the best decorators, it can be called with or without arguments. 
If called with one argument, that argument is a name that overrides the name @@ -1801,7 +1825,7 @@ def rule( if callable(name): return rule()(name) - def wrapper(f: typing.Callable[[typing.Any], Rule]): + def wrapper(f: typing.Callable[[], Rule]): nonlocal name nonlocal transparent nonlocal error_name @@ -2746,145 +2770,150 @@ class TriviaMode(enum.Enum): ############################################################################### -# Finally, the base class for grammars +# Finally, the grammar class. ############################################################################### -PrecedenceList = list[typing.Tuple[Assoc, list[Rule | str]]] +PrecedenceList = list[typing.Tuple[Assoc, list[Terminal|NonTerminal]]] + +def gather_grammar(start: NonTerminal, trivia: list[Terminal]) -> tuple[dict[str,NonTerminal], dict[str,Terminal]]: + """Starting from the given NonTerminal, gather all of the symbols + (NonTerminals and Terminals) that make up the grammar. + """ + # NOTE: We use a dummy dictionary here to preserve insertion order. + # That way the first element in named_rules is always the start + # symbol! + rules: dict[NonTerminal, int] = {} + terminals: dict[Terminal, int] = {} + + # STEP 1 is to just gather all of the symbols that we can find. + queue: list[NonTerminal] = [start] + while len(queue) > 0: + nt = queue.pop() + if nt in rules: + continue + + # TODO: Here we can track modules (via the funcitons that make up + # nonterminals, maybe) and maybe use that to infer terminal + # names. + rules[nt] = len(rules) + + for rule in nt.body: + for symbol in rule: + if isinstance(symbol, NonTerminal): + if symbol not in rules: + queue.append(symbol) + + elif isinstance(symbol, Terminal): + terminals[symbol] = len(terminals) + + else: + typing.assert_never(symbol) + + # (Terminals are also reachable!) + for symbol in trivia: + terminals[symbol] = len(terminals) + + # Step 2 is to organize all of these things and check them for errors. 
+ named_rules: dict[str, NonTerminal] = {} + for rule in rules: + existing = named_rules.get(rule.name) + if existing is not None: + # TODO TEST + raise ValueError(f"""Found more than one rule named {rule.name}: +- {existing.definition_location} +- {rule.definition_location}""") + named_rules[rule.name] = rule + + named_terminals: dict[str, Terminal] = {} + for terminal in terminals: + existing = named_terminals.get(terminal.name) + if existing is not None: + # TODO TEST + raise ValueError(f"""Found more than one terminal named {terminal.name}: +- {existing.definition_location} +- {terminal.definition_location}""") + + existing_rule = named_rules.get(terminal.name) + if existing_rule is not None: + # TODO TEST + raise ValueError(f"""Found a terminal and a rule both named {terminal.name}: +- The rule was defined at {existing_rule.definition_location} +- The terminal was defined at {terminal.definition_location}""") + + named_terminals[terminal.name] = terminal + + return (named_rules, named_terminals) class Grammar: - """The base class for defining a grammar. - - Inherit from this, and and define members for your nonterminals, and then - use the `build_table` method to construct the parse tables. - + """A container that holds all the terminals and nonterminals for a + given grammar. The terminals and nonterminals are defined elsewhere; + provide the starting rule and this object will build the grammar from + everything accessible. 
Here's an example of a simple grammar: - class SimpleGrammar(Grammar): - @rule - def expression(self): - return seq(self.expression, self.PLUS, self.term) | self.term + @rule + def expression(self): + return seq(self.expression, self.PLUS, self.term) | self.term - @rule - def term(self): - return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID + @rule + def term(self): + return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID - PLUS = Terminal('+') - LPAREN = Terminal('(') - RPAREN = Terminal(')') - ID = Terminal('id') + PLUS = Terminal('+') + LPAREN = Terminal('(') + RPAREN = Terminal(')') + ID = Terminal('id') + grammar = Grammar(start=expression) Not very exciting, perhaps, but it's something. """ - _precedence: dict[str, typing.Tuple[Assoc, int]] - _generator: type[ParserGenerator] + start: NonTerminal + name: str + pretty_indent: str | None _terminals: dict[str, Terminal] _nonterminals: dict[str, NonTerminal] _trivia: list[Terminal] + _precedence: dict[str, typing.Tuple[Assoc, int]] def __init__( self, - start: str | NonTerminal | None = None, + start: NonTerminal, precedence: PrecedenceList | None = None, - generator: type[ParserGenerator] | None = None, - trivia: list[str | Terminal] | None = None, + trivia: list[Terminal] | None = None, name: str | None = None, + pretty_indent: str | None = None, ): - if start is None: - start = getattr(self, "start", None) - if start is None: - raise ValueError( - "The default start rule must either be specified in the constructor or as an " - "attribute in the class." 
- ) - if isinstance(start, NonTerminal): - start = start.name + if start.transparent: + # TODO: TEST + raise ValueError("The start rule cannot be transparent") if precedence is None: - precedence = getattr(self, "precedence", []) + precedence = [] assert precedence is not None - if generator is None: - generator = getattr(self, "generator", ParserGenerator) - assert generator is not None - if trivia is None: - trivia = getattr(self, "trivia", []) + trivia = [] assert trivia is not None - # Fixup terminal names with the name of the member that declared it. - terminals = {} - for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)): - if t.name is None: - t.name = n - - if n in terminals: - raise ValueError(f"More than one terminal has the name '{n}'") - terminals[n] = t - - # Get the nonterminals. - nonterminals = {} - for _, nt in inspect.getmembers(self, lambda x: isinstance(x, NonTerminal)): - if nt.name in nonterminals: - raise ValueError(f"More than one nonterminal found with the name '{nt.name}'") - - if nt.name in terminals: - raise ValueError( - f"'{nt.name}' is the name of both a Terminal and a NonTerminal rule" - ) - - nonterminals[nt.name] = nt - - # Resolve the trivia declarations correctly. - resolved_trivia: list[Terminal] = [] - for t in trivia: - if isinstance(t, str): - resolved = terminals.get(t) - if resolved is None: - raise ValueError(f"The trivia '{t}' is not a terminal name") - resolved_trivia.append(resolved) - elif isinstance(t, Terminal): - resolved_trivia.append(t) - else: - raise ValueError(f"{t} must be either a terminal name or literally a terminal") - # Fix up the precedence table. 
precedence_table = {} for prec, (associativity, symbols) in enumerate(precedence): for symbol in symbols: - key = None - if isinstance(symbol, Terminal): - key = symbol.name - if key is None: - raise ValueError(f"{symbol} is a terminal that has not had a name set yet") - elif isinstance(symbol, NonTerminal): - key = symbol.name - elif isinstance(symbol, str): - if symbol in terminals or symbol in nonterminals: - key = symbol - - if key is None: - raise ValueError( - f"{symbol} must be either a Token or a NonTerminal, or the name of one" - ) - - precedence_table[key] = (associativity, prec + 1) + precedence_table[symbol.name] = (associativity, prec + 1) if name is None: - name = getattr(self, "name", None) - if name is None: - name = self.__class__.__name__.removesuffix("Grammar").lower() + name = "unknown" - self._precedence = precedence_table self.start = start - self._generator = generator - self._terminals = terminals - self._nonterminals = nonterminals - self._trivia = resolved_trivia self.name = name + self._nonterminals, self._terminals = gather_grammar(start, trivia) + self._trivia = trivia + self._precedence = precedence_table + self.pretty_indent = pretty_indent def terminals(self) -> list[Terminal]: return list(self._terminals.values()) @@ -2898,55 +2927,7 @@ class Grammar: def get_precedence(self, name: str) -> None | tuple[Assoc, int]: return self._precedence.get(name) - # TODO: The flattened form should retain NonTerminal, not just str. - def generate_nonterminal_dict( - self, start: str | None = None - ) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]: - """Convert the rules into a dictionary of productions, and a set of - the names of transparent nonterminals. - - Our table generators work on a very flat set of productions. 
This is the - first step in flattening the productions from the members: walk the rules - starting from the given start rule and flatten them, one by one, into a - dictionary that maps nonterminal rule name to its associated list of - productions. - """ - if start is None: - start = self.start - - nonterminals = self._nonterminals - transparents = {rule.name for rule in nonterminals.values() if rule.transparent} - - grammar = {} - - rule = nonterminals.get(start) - if rule is None: - raise ValueError(f"Cannot find a rule named '{start}'") - if rule.transparent: - raise ValueError("The start rule cannot be transparent") - queue = [rule] - while len(queue) > 0: - rule = queue.pop() - if rule.name in grammar: - continue - - body = rule.generate_body(self) - for clause in body: - for symbol in clause: - if not isinstance(symbol, Terminal): - assert isinstance(symbol, str) - nonterminal = nonterminals.get(symbol) - if nonterminal is None: - raise ValueError(f"While processing {rule.name}: cannot find {symbol}") - queue.append(nonterminal) - - grammar[rule.name] = body - - return (grammar, transparents) - - def desugar( - self, start: str | None = None - ) -> typing.Tuple[list[typing.Tuple[str, list[str]]], set[str]]: + def desugar(self) -> typing.Tuple[list[typing.Tuple[str, list[str]]], set[str]]: """Convert the rules into a flat list of productions. Our table generators work from a very flat set of productions. The form @@ -2954,37 +2935,27 @@ class Grammar: generate_nonterminal_dict- less useful to people, probably, but it is the input form needed by the Generator. 
""" - temp_grammar, transparents = self.generate_nonterminal_dict(start) + grammar: list[tuple[str,list[str]]] = [ + (rule.name, [s.name for s in production]) + for rule in self._nonterminals.values() + for production in rule.body + ] + assert grammar[0][0] == self.start.name - grammar = [] - for rule_name, clauses in temp_grammar.items(): - for clause in clauses: - new_clause = [] - for symbol in clause: - if isinstance(symbol, Terminal): - if symbol.name in temp_grammar: - raise ValueError( - f"'{symbol.name}' is the name of both a Terminal and a NonTerminal rule. This will cause problems." - ) - new_clause.append(symbol.name) - else: - new_clause.append(symbol) - - grammar.append((rule_name, new_clause)) + transparents = {name for name, rule in self._nonterminals.items() if rule.transparent} return grammar, transparents - def build_table(self, start: str | None = None, generator=None) -> ParseTable: - """Construct a parse table for this grammar, starting at the named - nonterminal rule. 
- """ - if start is None: - start = self.start - desugared, transparents = self.desugar(start) + def build_table(self) -> ParseTable: + """Construct a parse table for this grammar.""" + desugared, transparents = self.desugar() - if generator is None: - generator = self._generator - gen = generator(start, desugared, precedence=self._precedence, transparents=transparents) + gen = ParserGenerator( + self.start.name, + desugared, + precedence=self._precedence, + transparents=transparents, + ) table = gen.gen_table() for t in self._trivia: diff --git a/parser/tree_sitter.py b/parser/tree_sitter.py index 7f9d231..683ea16 100644 --- a/parser/tree_sitter.py +++ b/parser/tree_sitter.py @@ -263,8 +263,7 @@ def emit_tree_sitter_grammar(grammar: parser.Grammar, path: pathlib.Path | str): if rule.transparent: rule_name = "_" + rule_name - body = rule.fn(grammar) - rule_definition = convert_to_tree_sitter(body, grammar) + rule_definition = convert_to_tree_sitter(rule.definition, grammar) if rule_definition is None: raise Exception(f"Tree-sitter does not support the empty rule {rule_name}") rule_definition = apply_precedence(rule_definition, rule.name, grammar) @@ -283,7 +282,6 @@ def emit_tree_sitter_grammar(grammar: parser.Grammar, path: pathlib.Path | str): def emit_tree_sitter_queries(grammar: parser.Grammar, path: pathlib.Path | str): - nts = {nt.name: nt for nt in grammar.non_terminals()} scope_suffix = "." 
+ grammar.name def scoop(input: parser.FlattenedWithMetadata, visited: set[str]) -> list[str]: @@ -300,13 +298,12 @@ def emit_tree_sitter_queries(grammar: parser.Grammar, path: pathlib.Path | str): raise Exception("Highlight must come with a field name") # TODO parts.append(f"{field_name}: _ @{highlight.scope}{scope_suffix}") - elif isinstance(item, str): - nt = nts[item] - if nt.transparent: - if nt.name in visited: + elif isinstance(item, parser.NonTerminal): + if item.transparent: + if item.name in visited: continue - visited.add(nt.name) - body = nt.fn(grammar) + visited.add(item.name) + body = item.definition for production in body.flatten(with_metadata=True): parts.extend(scoop(production, visited)) @@ -317,7 +314,7 @@ def emit_tree_sitter_queries(grammar: parser.Grammar, path: pathlib.Path | str): if rule.transparent: continue - body = rule.fn(grammar) + body = rule.definition patterns = set() for production in body.flatten(with_metadata=True): # Scoop up the meta... diff --git a/parser/wadler/builder.py b/parser/wadler/builder.py index fa2e23c..1d2ea95 100644 --- a/parser/wadler/builder.py +++ b/parser/wadler/builder.py @@ -79,11 +79,7 @@ class MatcherTable: newline_replace: dict[str, str] -def _compile_nonterminal_matcher( - grammar: parser.Grammar, - nonterminals: dict[str, parser.NonTerminal], - rule: parser.NonTerminal, -) -> MatcherTable: +def _compile_nonterminal_matcher(rule: parser.NonTerminal) -> MatcherTable: """Generate a matcher table for a single nonterminal. See the docs for [MatcherTable] to understand the result. 
@@ -111,7 +107,7 @@ def _compile_nonterminal_matcher( def compile_nonterminal(name: str, rule: parser.NonTerminal): if name not in visited: visited.add(name) - for production in rule.fn(grammar).flatten(with_metadata=True): + for production in rule.fn().flatten(with_metadata=True): trans_prod = compile_production(production) generated_grammar.append((name, trans_prod)) @@ -126,19 +122,18 @@ def _compile_nonterminal_matcher( result = [] for item in production: - if isinstance(item, str): - nt = nonterminals[item] - if nt.transparent: + if isinstance(item, parser.NonTerminal): + if item.transparent: # If it's transparent then we make a new set of # productions that covers the contents of the # transparent nonterminal. - name = "xxx_" + nt.name - compile_nonterminal(name, nt) + name = "xxx_" + item.name + compile_nonterminal(name, item) result.append(name) else: # Otherwise it's a "token" in our input, named # "tree_{whatever}". - result.append(f"tree_{item}") + result.append(f"tree_{item.name}") elif isinstance(item, parser.Terminal): # If it's a terminal it will appear in our input as @@ -257,7 +252,7 @@ def _compile_nonterminal_matcher( start_name = f"yyy_{rule.name}" compile_nonterminal(start_name, rule) - gen = grammar._generator(start_name, generated_grammar) + gen = parser.ParserGenerator(start_name, generated_grammar) parse_table = gen.gen_table() for (_, replacement), rule_name in newlines.items(): @@ -296,7 +291,7 @@ def compile_pretty_table(grammar: parser.Grammar, indent: str | None = None) -> matchers = {} if indent is None: - indent = getattr(grammar, "pretty_indent", None) + indent = grammar.pretty_indent if indent is None: indent = " " @@ -307,7 +302,7 @@ def compile_pretty_table(grammar: parser.Grammar, indent: str | None = None) -> trivia_mode[t.name] = mode for name, rule in nonterminals.items(): - matchers[name] = _compile_nonterminal_matcher(grammar, nonterminals, rule) + matchers[name] = _compile_nonterminal_matcher(rule) return PrettyTable( 
indent, diff --git a/sql.py b/sql.py index 4ed749d..f849631 100644 --- a/sql.py +++ b/sql.py @@ -2,6 +2,7 @@ from parser import * NAME = Terminal( + "NAME", Re.seq( Re.set(("a", "z"), ("A", "Z"), "_"), Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), @@ -9,6 +10,7 @@ NAME = Terminal( ) STRING = Terminal( + "STRING", Re.seq( Re.literal("'"), (~Re.set("'", "\\") | (Re.set("\\") + Re.any())).star(), @@ -18,6 +20,7 @@ STRING = Terminal( ) NUMBER = Terminal( + "NUMBER", Re.seq( Re.set(("0", "9")).plus(), Re.seq( @@ -33,118 +36,118 @@ NUMBER = Terminal( highlight=highlight.constant.numeric, ) -OR = Terminal("or") -AND = Terminal("and") -NOT = Terminal("not") +OR = Terminal("OR", "or") +AND = Terminal("AND", "and") +NOT = Terminal("NOT", "not") COMPARISON = Terminal( + "COMPARISON", Re.literal("=") | Re.literal("<>") | Re.literal("<") | Re.literal(">") | Re.literal("<=") - | Re.literal(">=") + | Re.literal(">="), ) -PLUS = Terminal("+") -MINUS = Terminal("-") -STAR = Terminal("*") -SLASH = Terminal("/") +PLUS = Terminal("PLUS", "+") +MINUS = Terminal("MINUS", "-") +STAR = Terminal("STAR", "*") +SLASH = Terminal("SLASH", "/") -precedence = [ - (Assoc.LEFT, ["OR"]), - (Assoc.LEFT, ["AND"]), - (Assoc.LEFT, ["NOT"]), - (Assoc.LEFT, ["COMPARISON"]), - (Assoc.LEFT, ["PLUS", "MINUS"]), - (Assoc.LEFT, ["STAR", "SLASH"]), - # TODO: Unary minus -] +ALL = Terminal("ALL", "all") +AMMSC = Terminal("AMMSC", "ammsc") +ANY = Terminal("ANY", "any") +AS = Terminal("AS", "as") +ASC = Terminal("ASC", "asc") +AUTHORIZATION = Terminal("AUTHORIZATION", "authorization") +BETWEEN = Terminal("BETWEEN", "between") +BY = Terminal("BY", "by") +CHARACTER = Terminal("CHARACTER", "character") +CHECK = Terminal("CHECK", "check") +CLOSE = Terminal("CLOSE", "close") +COMMIT = Terminal("COMMIT", "commit") +CONTINUE = Terminal("CONTINUE", "continue") +CREATE = Terminal("CREATE", "create") +CURRENT = Terminal("CURRENT", "current") +CURSOR = Terminal("CURSOR", "cursor") +DECIMAL = Terminal("DECIMAL", 
"decimal") +DECLARE = Terminal("DECLARE", "declare") +DEFAULT = Terminal("DEFAULT", "default") +DELETE = Terminal("DELETE", "delete") +DESC = Terminal("DESC", "desc") +DISTINCT = Terminal("DISTINCT", "distinct") +DOUBLE = Terminal("DOUBLE", "double") +ESCAPE = Terminal("ESCAPE", "escape") +EXISTS = Terminal("EXISTS", "exists") +FETCH = Terminal("FETCH", "fetch") +FLOAT = Terminal("FLOAT", "float") +FOR = Terminal("FOR", "for") +FOREIGN = Terminal("FOREIGN", "foreign") +FOUND = Terminal("FOUND", "found") +FROM = Terminal("FROM", "from") +GOTO = Terminal("GOTO", "goto") +GRANT = Terminal("GRANT", "grant") +GROUP = Terminal("GROUP", "group") +HAVING = Terminal("HAVING", "having") +IN = Terminal("IN", "in") +INDICATOR = Terminal("INDICATOR", "indicator") +INSERT = Terminal("INSERT", "insert") +INTEGER = Terminal("INTEGER", "integer") +INTO = Terminal("INTO", "into") +IS = Terminal("IS", "is") +KEY = Terminal("KEY", "key") +LANGUAGE = Terminal("LANGUAGE", "language") +LIKE = Terminal("LIKE", "like") +NULL = Terminal("NULL", "null") +NUMERIC = Terminal("NUMERIC", "numeric") +OF = Terminal("OF", "of") +ON = Terminal("ON", "on") +OPEN = Terminal("OPEN", "open") +OPTION = Terminal("OPTION", "option") +ORDER = Terminal("ORDER", "order") +PARAMETER = Terminal("PARAMETER", "parameter") +PRECISION = Terminal("PRECISION", "precision") +PRIMARY = Terminal("PRIMARY", "primary") +PRIVILEGES = Terminal("PRIVILEGES", "privileges") +PROCEDURE = Terminal("PROCEDURE", "procedure") +PUBLIC = Terminal("PUBLIC", "public") +REAL = Terminal("REAL", "real") +REFERENCES = Terminal("REFERENCES", "references") +ROLLBACK = Terminal("ROLLBACK", "rollback") +SCHEMA = Terminal("SCHEMA", "schema") +SELECT = Terminal("SELECT", "select") +SET = Terminal("SET", "set") +SMALLINT = Terminal("SMALLINT", "smallint") +SOME = Terminal("SOME", "some") +SQLCODE = Terminal("SQLCODE", "sqlcode") +SQLERROR = Terminal("SQLERROR", "sqlerror") +TABLE = Terminal("TABLE", "table") +TO = Terminal("TO", "to") +UNION = 
Terminal("UNION", "union") +UNIQUE = Terminal("UNIQUE", "unique") +UPDATE = Terminal("UPDATE", "update") +USER = Terminal("USER", "user") +VALUES = Terminal("VALUES", "values") +VIEW = Terminal("VIEW", "view") +WHENEVER = Terminal("WHENEVER", "whenever") +WHERE = Terminal("WHERE", "where") +WITH = Terminal("WITH", "with") +WORK = Terminal("WORK", "work") -ALL = Terminal("all") -AMMSC = Terminal("ammsc") -ANY = Terminal("any") -ASC = Terminal("asc") -AUTHORIZATION = Terminal("authorization") -BETWEEN = Terminal("between") -BY = Terminal("by") -CHARACTER = Terminal("character") -CHECK = Terminal("check") -CLOSE = Terminal("close") -COMMIT = Terminal("commit") -CONTINUE = Terminal("continue") -CREATE = Terminal("create") -CURRENT = Terminal("current") -CURSOR = Terminal("cursor") -DECIMAL = Terminal("decimal") -DECLARE = Terminal("declare") -DEFAULT = Terminal("default") -DELETE = Terminal("delete") -DESC = Terminal("desc") -DISTINCT = Terminal("distinct") -DOUBLE = Terminal("double") -ESCAPE = Terminal("escape") -EXISTS = Terminal("exists") -FETCH = Terminal("fetch") -FLOAT = Terminal("float") -FOR = Terminal("for") -FOREIGN = Terminal("foreign") -FOUND = Terminal("found") -FROM = Terminal("from") -GOTO = Terminal("goto") -GRANT = Terminal("grant") -GROUP = Terminal("group") -HAVING = Terminal("having") -IN = Terminal("in") -INDICATOR = Terminal("indicator") -INSERT = Terminal("insert") -INTEGER = Terminal("integer") -INTO = Terminal("into") -IS = Terminal("is") -KEY = Terminal("key") -LANGUAGE = Terminal("language") -LIKE = Terminal("like") -NULL = Terminal("null") -NUMERIC = Terminal("numeric") -OF = Terminal("of") -ON = Terminal("on") -OPEN = Terminal("open") -OPTION = Terminal("option") -ORDER = Terminal("order") -PARAMETER = Terminal("parameter") -PRECISION = Terminal("precision") -PRIMARY = Terminal("primary") -PRIVILEGES = Terminal("privileges") -PROCEDURE = Terminal("procedure") -PUBLIC = Terminal("public") -REAL = Terminal("real") -REFERENCES = 
Terminal("references") -ROLLBACK = Terminal("rollback") -SCHEMA = Terminal("schema") -SELECT = Terminal("select") -SET = Terminal("set") -SMALLINT = Terminal("smallint") -SOME = Terminal("some") -SQLCODE = Terminal("sqlcode") -SQLERROR = Terminal("sqlerror") -TABLE = Terminal("table") -TO = Terminal("to") -UNION = Terminal("union") -UNIQUE = Terminal("unique") -UPDATE = Terminal("update") -USER = Terminal("user") -VALUES = Terminal("values") -VIEW = Terminal("view") -WHENEVER = Terminal("whenever") -WHERE = Terminal("where") -WITH = Terminal("with") -WORK = Terminal("work") +SEMICOLON = Terminal("SEMICOLON", ";") +LPAREN = Terminal("LPAREN", "(") +RPAREN = Terminal("RPAREN", ")") +COMMA = Terminal("COMMA", ",") +EQUAL = Terminal("EQUAL", "=") +DOT = Terminal("DOT", ".") -SEMICOLON = Terminal(";") -LPAREN = Terminal("(") -RPAREN = Terminal(")") -COMMA = Terminal(",") -EQUAL = Terminal("=") -DOT = Terminal(".") -AS = Terminal("as") +BLANKS = Terminal("BLANKS", Re.set(" ", "\t").plus()) +LINE_BREAK = Terminal("LINE_BREAK", Re.set("\r", "\n"), trivia_mode=TriviaMode.NewLine) +COMMENT = Terminal( + "COMMENT", + Re.seq(Re.literal("--"), Re.set("\n").invert().star()), + highlight=highlight.comment.line, + trivia_mode=TriviaMode.LineComment, +) @rule @@ -740,3 +743,19 @@ def user(): @rule def when_action(): return (GOTO + NAME) | CONTINUE + + +SQL = Grammar( + start=sql_list, + precedence=[ + (Assoc.LEFT, [OR]), + (Assoc.LEFT, [AND]), + (Assoc.LEFT, [NOT]), + (Assoc.LEFT, [COMPARISON]), + (Assoc.LEFT, [PLUS, MINUS]), + (Assoc.LEFT, [STAR, SLASH]), + # TODO: Unary minus + ], + trivia=[BLANKS, COMMENT, LINE_BREAK], + name="SQL", +) diff --git a/tests/test_error_recovery.py b/tests/test_error_recovery.py index 96b5c49..dbb254a 100644 --- a/tests/test_error_recovery.py +++ b/tests/test_error_recovery.py @@ -11,138 +11,141 @@ import parser.runtime as runtime # Tests based on # https://matklad.github.io/2023/05/21/resilient-ll-parsing-tutorial.html -class LGrammar(Grammar): - 
start = "File" - trivia = ["BLANKS"] +BLANKS = Terminal("BLANKS", Re.set(" ", "\t", "\r", "\n").plus()) + +TRUE = Terminal("TRUE", "true") +FALSE = Terminal("FALSE", "false") +INT = Terminal("INT", Re.set(("0", "9")).plus()) +FN = Terminal("FN", "fn") +ARROW = Terminal("ARROW", "->") +COMMA = Terminal("COMMA", ",") +LPAREN = Terminal("LPAREN", "(") +RPAREN = Terminal("RPAREN", ")") +LCURLY = Terminal("LCURLY", "{") +RCURLY = Terminal("RCURLY", "}") +COLON = Terminal("COLON", ":") +SEMICOLON = Terminal("SEMICOLON", ";") +LET = Terminal("LET", "let") +EQUAL = Terminal("EQUAL", "=") +RETURN = Terminal("RETURN", "return") +PLUS = Terminal("PLUS", "+") +MINUS = Terminal("MINUS", "-") +STAR = Terminal("STAR", "*") +SLASH = Terminal("SLASH", "/") + +NAME = Terminal( + "NAME", + Re.seq( + Re.set(("a", "z"), ("A", "Z"), "_"), + Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), + ), +) + + +@rule +def File(): + # TODO: Make lists easier + return _functions + +@rule +def _functions(): + return Function | (_functions + Function) + +@rule +def Function(): + return FN + NAME + ParamList + opt(ARROW + TypeExpr) + Block + +@rule +def ParamList(): + return LPAREN + opt(_parameters) + RPAREN + +@rule +def _parameters(): + # NOTE: The ungrammar in the reference does not talk about commas + # required between parameters so this massages it to make them + # required. Commas are in the list not the param, which is more + # awkward for processing but not terminally so. 
+ return (Param + opt(COMMA)) | (Param + COMMA + _parameters) + +@rule +def Param(): + return NAME + COLON + TypeExpr + +@rule +def TypeExpr(): + return NAME + +@rule +def Block(): + return LCURLY + opt(_statements) + RCURLY + +@rule +def _statements(): + return Stmt | _statements + Stmt + +@rule +def Stmt(): + return StmtExpr | StmtLet | StmtReturn + +@rule +def StmtExpr(): + return Expr + SEMICOLON + +@rule +def StmtLet(): + return LET + NAME + EQUAL + Expr + SEMICOLON + +@rule +def StmtReturn(): + return RETURN + Expr + SEMICOLON + +@rule +def Expr(): + return ExprLiteral | ExprName | ExprParen | ExprBinary | ExprCall + +@rule +def ExprLiteral(): + return INT | TRUE | FALSE + +@rule +def ExprName(): + return NAME + +@rule +def ExprParen(): + return LPAREN + Expr + RPAREN + +@rule +def ExprBinary(): + return Expr + (PLUS | MINUS | STAR | SLASH) + Expr + +@rule +def ExprCall(): + return Expr + ArgList + +@rule +def ArgList(): + return LPAREN + opt(_arg_star) + RPAREN + +@rule +def _arg_star(): + # Again, a deviation from the original. See _parameters. + return (Expr + opt(COMMA)) | (Expr + COMMA + _arg_star) + +LGrammar = Grammar( + start=File, + trivia=[BLANKS], # Need a little bit of disambiguation for the symbol involved. 
precedence = [ - (Assoc.LEFT, ["PLUS", "MINUS"]), - (Assoc.LEFT, ["STAR", "SLASH"]), - (Assoc.LEFT, ["LPAREN"]), - ] + (Assoc.LEFT, [PLUS, MINUS]), + (Assoc.LEFT, [STAR, SLASH]), + (Assoc.LEFT, [LPAREN]), + ], +) - @rule - def File(self): - # TODO: Make lists easier - return self._functions - - @rule - def _functions(self): - return self.Function | (self._functions + self.Function) - - @rule - def Function(self): - return self.FN + self.NAME + self.ParamList + opt(self.ARROW + self.TypeExpr) + self.Block - - @rule - def ParamList(self): - return self.LPAREN + opt(self._parameters) + self.RPAREN - - @rule - def _parameters(self): - # NOTE: The ungrammar in the reference does not talk about commas required between parameters - # so this massages it to make them required. Commas are in the list not the param, which - # is more awkward for processing but not terminally so. - return (self.Param + opt(self.COMMA)) | (self.Param + self.COMMA + self._parameters) - - @rule - def Param(self): - return self.NAME + self.COLON + self.TypeExpr - - @rule - def TypeExpr(self): - return self.NAME - - @rule - def Block(self): - return self.LCURLY + opt(self._statements) + self.RCURLY - - @rule - def _statements(self): - return self.Stmt | self._statements + self.Stmt - - @rule - def Stmt(self): - return self.StmtExpr | self.StmtLet | self.StmtReturn - - @rule - def StmtExpr(self): - return self.Expr + self.SEMICOLON - - @rule - def StmtLet(self): - return self.LET + self.NAME + self.EQUAL + self.Expr + self.SEMICOLON - - @rule - def StmtReturn(self): - return self.RETURN + self.Expr + self.SEMICOLON - - @rule - def Expr(self): - return self.ExprLiteral | self.ExprName | self.ExprParen | self.ExprBinary | self.ExprCall - - @rule - def ExprLiteral(self): - return self.INT | self.TRUE | self.FALSE - - @rule - def ExprName(self): - return self.NAME - - @rule - def ExprParen(self): - return self.LPAREN + self.Expr + self.RPAREN - - @rule - def ExprBinary(self): - return self.Expr + 
(self.PLUS | self.MINUS | self.STAR | self.SLASH) + self.Expr - - @rule - def ExprCall(self): - return self.Expr + self.ArgList - - @rule - def ArgList(self): - return self.LPAREN + opt(self._arg_star) + self.RPAREN - - @rule - def _arg_star(self): - # Again, a deviation from the original. See _parameters. - return (self.Expr + opt(self.COMMA)) | (self.Expr + self.COMMA + self._arg_star) - - BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) - - TRUE = Terminal("true") - FALSE = Terminal("false") - INT = Terminal(Re.set(("0", "9")).plus()) - FN = Terminal("fn") - ARROW = Terminal("->") - COMMA = Terminal(",") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - LCURLY = Terminal("{") - RCURLY = Terminal("}") - COLON = Terminal(":") - SEMICOLON = Terminal(";") - LET = Terminal("let") - EQUAL = Terminal("=") - RETURN = Terminal("return") - PLUS = Terminal("+") - MINUS = Terminal("-") - STAR = Terminal("*") - SLASH = Terminal("/") - - NAME = Terminal( - Re.seq( - Re.set(("a", "z"), ("A", "Z"), "_"), - Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), - ), - ) - - -L_PARSE_TABLE = LGrammar().build_table() -L_LEXER_TABLE = LGrammar().compile_lexer() +L_PARSE_TABLE = LGrammar.build_table() +L_LEXER_TABLE = LGrammar.compile_lexer() def test_matklad_one(): diff --git a/tests/test_grammar.py b/tests/test_grammar.py index c12380b..398b416 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -1,6 +1,5 @@ import pytest -import parser import parser.runtime as runtime from parser import Grammar, seq, rule, Terminal @@ -40,117 +39,68 @@ def _tree(treeform, count=0) -> runtime.Tree | runtime.TokenValue: def test_lr0_lr0(): """An LR0 grammar should work with an LR0 generator.""" - class G(Grammar): - start = "E" - # generator = parser.GenerateLR0 + PLUS = Terminal("+", "+") + LPAREN = Terminal("(", "(") + RPAREN = Terminal(")", ")") + IDENTIFIER = Terminal("id", "id") - @rule - def E(self): - return seq(self.E, self.PLUS, self.T) | self.T + @rule + def E(): + 
return seq(E, PLUS, T) | T - @rule - def T(self): - return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER + @rule + def T(): + return seq(LPAREN, E, RPAREN) | IDENTIFIER - PLUS = Terminal("+", name="+") - LPAREN = Terminal("(", name="(") - RPAREN = Terminal(")", name=")") - IDENTIFIER = Terminal("id", name="id") + G = Grammar(start=E) - table = G().build_table() - tree, errors = runtime.Parser(table).parse( - Tokens(G.IDENTIFIER, G.PLUS, G.LPAREN, G.IDENTIFIER, G.RPAREN) - ) + table = G.build_table() + tree, errors = runtime.Parser(table).parse(Tokens(IDENTIFIER, PLUS, LPAREN, IDENTIFIER, RPAREN)) assert errors == [] assert tree == _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")"))) -def test_all_generators(): - """This grammar should work with everything honestly.""" - - class G(Grammar): - start = "E" - - @rule - def E(self): - return seq(self.E, self.PLUS, self.T) | self.T - - @rule - def T(self): - return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER - - PLUS = Terminal("+", name="+") - LPAREN = Terminal("(", name="(") - RPAREN = Terminal(")", name=")") - IDENTIFIER = Terminal("id", name="id") - - GENERATORS = [ - # parser.GenerateLR0, - # parser.GeneratePager, - parser.ParserGenerator, - ] - for generator in GENERATORS: - table = G().build_table(generator=generator) - tree, errors = runtime.Parser(table).parse( - Tokens(G.IDENTIFIER, G.PLUS, G.LPAREN, G.IDENTIFIER, G.RPAREN) - ) - - print("\n") - print(generator) - print(f"{table.format()}") - - assert errors == [] - assert tree == _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")"))) - def test_grammar_aho_ullman_2(): - class TestGrammar(Grammar): - start = "S" + @rule + def S(): + return seq(X, X) - @rule - def S(self): - return seq(self.X, self.X) + @rule + def X(): + return seq(A, X) | B - @rule - def X(self): - return seq(self.A, self.X) | self.B + A = Terminal("A", "a") + B = Terminal("B", "b") - A = Terminal("a") - B = Terminal("b") - - 
TestGrammar().build_table(generator=parser.ParserGenerator) - # TestGrammar().build_table(generator=parser.GeneratePager) + Grammar(start=S).build_table() def test_fun_lalr(): + @rule + def S(): + return seq(V, E) - class TestGrammar(Grammar): - start = "S" + @rule + def E(): + return F | seq(E, PLUS, F) - @rule - def S(self): - return seq(self.V, self.E) + @rule + def F(): + return V | INT | seq(LPAREN, E, RPAREN) - @rule - def E(self): - return self.F | seq(self.E, self.PLUS, self.F) + @rule + def V(): + return ID - @rule - def F(self): - return self.V | self.INT | seq(self.LPAREN, self.E, self.RPAREN) + PLUS = Terminal("PLUS", "+") + INT = Terminal("INT", "int") + ID = Terminal("ID", "id") + LPAREN = Terminal("LPAREN", "(") + RPAREN = Terminal("RPAREN", ")") - @rule - def V(self): - return self.ID - - PLUS = Terminal("+") - INT = Terminal("int") - ID = Terminal("id") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - - TestGrammar().build_table() + Grammar(start=S).build_table() def test_conflicting_names(): @@ -167,43 +117,28 @@ def test_conflicting_names(): to understand. 
""" - class TestGrammar(Grammar): - start = "IDENTIFIER" + @rule("IDENTIFIER") + def identifier(): + return IDENTIFIER - @rule("IDENTIFIER") - def identifier(self): - return self.IDENTIFIER - - IDENTIFIER = Terminal("Identifier") + IDENTIFIER = Terminal("IDENTIFIER", "Identifier") with pytest.raises(ValueError): - TestGrammar().build_table() + Grammar(start=identifier).build_table() def test_grammar_ignore_trivia(): - class G(Grammar): - start = "sentence" + @rule + def sentence(): + return WORD | seq(sentence, WORD) - trivia = ["BLANK"] + WORD = Terminal("WORD", "blah") + BLANK = Terminal("BLANK", " ") - @rule - def sentence(self): - return self.WORD | seq(self.sentence, self.WORD) - - WORD = Terminal("blah") - BLANK = Terminal(" ") - - table = G().build_table() + table = Grammar(start=sentence, trivia=[BLANK]).build_table() assert "BLANK" in table.trivia - tree, errors = runtime.Parser(table).parse( - Tokens( - G.WORD, - G.BLANK, - G.WORD, - G.BLANK, - ) - ) + tree, errors = runtime.Parser(table).parse(Tokens(WORD, BLANK, WORD, BLANK)) assert errors == [] assert tree == runtime.Tree( @@ -234,135 +169,3 @@ def test_grammar_ignore_trivia(): ), ), ) - - -def test_grammar_unknown_trivia(): - class G(Grammar): - start = "sentence" - - trivia = ["BLANK"] - - @rule - def sentence(self): - return self.WORD | seq(self.sentence, self.WORD) - - WORD = Terminal("blah") - - with pytest.raises(ValueError): - G().build_table() - - -def test_grammar_trivia_symbol(): - class G(Grammar): - start = "sentence" - - @rule - def sentence(self): - return self.WORD | seq(self.sentence, self.WORD) - - WORD = Terminal("blah") - BLANK = Terminal(" ") - - trivia = [BLANK] - - table = G().build_table() - assert "BLANK" in table.trivia - - -def test_grammar_trivia_constructor(): - class G(Grammar): - start = "sentence" - - def __init__(self): - super().__init__(trivia=[self.BLANK]) - - @rule - def sentence(self): - return self.WORD | seq(self.sentence, self.WORD) - - WORD = Terminal("blah") - 
BLANK = Terminal(" ") - - table = G().build_table() - assert "BLANK" in table.trivia - - -def test_grammar_trivia_constructor_string(): - class G(Grammar): - start = "sentence" - - def __init__(self): - super().__init__(trivia=["BLANK"]) - - @rule - def sentence(self): - return self.WORD | seq(self.sentence, self.WORD) - - WORD = Terminal("blah") - BLANK = Terminal(" ") - - table = G().build_table() - assert "BLANK" in table.trivia - - -def test_grammar_trivia_constructor_string_unknown(): - class G(Grammar): - start = "sentence" - - def __init__(self): - super().__init__(trivia=["BLANK"]) - - @rule - def sentence(self): - return self.WORD | seq(self.sentence, self.WORD) - - WORD = Terminal("blah") - - with pytest.raises(ValueError): - G().build_table() - - -def test_grammar_name_implicit(): - class FooGrammar(Grammar): - start = "x" - - @rule - def x(self): - return self.WORD - - WORD = Terminal("blah") - - assert FooGrammar().name == "foo" - - -def test_grammar_name_explicit_member(): - class FooGrammar(Grammar): - start = "x" - - name = "bar" - - @rule - def x(self): - return self.WORD - - WORD = Terminal("blah") - - assert FooGrammar().name == "bar" - - -def test_grammar_name_explicit_constructor(): - class FooGrammar(Grammar): - start = "x" - - name = "bar" - - def __init__(self): - super().__init__(name="baz") - - @rule - def x(self): - return self.WORD - - WORD = Terminal("blah") - - assert FooGrammar().name == "baz" diff --git a/tests/test_lexer.py b/tests/test_lexer.py index ffff192..79fa499 100644 --- a/tests/test_lexer.py +++ b/tests/test_lexer.py @@ -354,32 +354,33 @@ def test_edge_list_always_sorted(points: list[tuple[int, int]]): def test_lexer_compile(): - class LexTest(Grammar): - @rule - def foo(self): - return self.IS + @rule + def foo(): + # NOTE: This is a hack to ensure the terminals are reachable. 
:P + return IS | AS | IDENTIFIER - start = "foo" - - IS = Terminal("is") - AS = Terminal("as") - IDENTIFIER = Terminal( - Re.seq( - Re.set(("a", "z"), ("A", "Z"), "_"), - Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), - ) + IS = Terminal("IS", "is") + AS = Terminal("AS", "as") + IDENTIFIER = Terminal( + "IDENTIFIER", + Re.seq( + Re.set(("a", "z"), ("A", "Z"), "_"), + Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), ) - BLANKS = Terminal(Re.set("\r", "\n", "\t", " ").plus()) + ) + BLANKS = Terminal("BLANKS", Re.set("\r", "\n", "\t", " ").plus()) - lexer = LexTest().compile_lexer() + + LexTest = Grammar(start=foo, trivia=[BLANKS]) + lexer = LexTest.compile_lexer() dump_lexer_table(lexer) tokens = list(generic_tokenize("xy is ass", lexer)) assert tokens == [ - (LexTest.IDENTIFIER, 0, 2), - (LexTest.BLANKS, 2, 1), - (LexTest.IS, 3, 2), - (LexTest.BLANKS, 5, 1), - (LexTest.IDENTIFIER, 6, 3), + (IDENTIFIER, 0, 2), + (BLANKS, 2, 1), + (IS, 3, 2), + (BLANKS, 5, 1), + (IDENTIFIER, 6, 3), ] @@ -387,34 +388,35 @@ def test_lexer_compile(): def test_lexer_numbers(n: float): assume(math.isfinite(n)) - class LexTest(Grammar): - @rule - def number(self): - return self.NUMBER + @rule + def number(): + return NUMBER - start = "number" - - NUMBER = Terminal( + NUMBER = Terminal( + "NUMBER", + Re.seq( + Re.set(("0", "9")).plus(), Re.seq( + Re.literal("."), Re.set(("0", "9")).plus(), - Re.seq( - Re.literal("."), - Re.set(("0", "9")).plus(), - ).question(), - Re.seq( - Re.set("e", "E"), - Re.set("+", "-").question(), - Re.set(("0", "9")).plus(), - ).question(), - ) + ).question(), + Re.seq( + Re.set("e", "E"), + Re.set("+", "-").question(), + Re.set(("0", "9")).plus(), + ).question(), ) + ) - lexer = LexTest().compile_lexer() + + LexTest = Grammar(start=number) + + lexer = LexTest.compile_lexer() dump_lexer_table(lexer) number_string = str(n) tokens = list(generic_tokenize(number_string, lexer)) assert tokens == [ - (LexTest.NUMBER, 0, len(number_string)), + (NUMBER, 0, 
len(number_string)), ] diff --git a/tests/test_wadler.py b/tests/test_wadler.py index e66c29d..bf52824 100644 --- a/tests/test_wadler.py +++ b/tests/test_wadler.py @@ -23,69 +23,66 @@ import parser.wadler.builder as builder import parser.wadler.runtime as runtime -class JsonGrammar(Grammar): - start = "root" - - trivia = ["BLANKS"] - +def make_json_grammar(): @rule - def root(self): - return self.value + def root(): + return value @rule(transparent=True) - def value(self): + def value(): return ( - self.object - | self.array - | self.NUMBER - | self.TRUE - | self.FALSE - | self.NULL - | self.STRING + object + | array + | NUMBER + | TRUE + | FALSE + | NULL + | STRING ) @rule - def object(self): + def object(): return group( - self.LCURLY + opt(indent(newline() + self._object_pairs)) + newline() + self.RCURLY + LCURLY + opt(indent(newline() + _object_pairs)) + newline() + RCURLY ) @rule - def _object_pairs(self): + def _object_pairs(): return alt( - self.object_pair, - self.object_pair + self.COMMA + newline(" ") + self._object_pairs, + object_pair, + object_pair + COMMA + newline(" ") + _object_pairs, ) @rule - def object_pair(self): - return group(self.STRING + self.COLON + indent(newline(" ") + self.value)) + def object_pair(): + return group(STRING + COLON + indent(newline(" ") + value)) @rule - def array(self): + def array(): return group( - self.LSQUARE + opt(indent(newline() + self._array_items)) + newline() + self.RSQUARE + LSQUARE + opt(indent(newline() + _array_items)) + newline() + RSQUARE ) @rule - def _array_items(self): + def _array_items(): return alt( - self.value, - self.value + self.COMMA + newline(" ") + self._array_items, + value, + value + COMMA + newline(" ") + _array_items, ) - BLANKS = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) + BLANKS = Terminal("BLANKS", Re.set(" ", "\t", "\r", "\n").plus()) - LCURLY = Terminal("{") - RCURLY = Terminal("}") - COMMA = Terminal(",") - COLON = Terminal(":") - LSQUARE = Terminal("[") - RSQUARE = 
Terminal("]") - TRUE = Terminal("true") - FALSE = Terminal("false") - NULL = Terminal("null") + LCURLY = Terminal("LCURLY", "{") + RCURLY = Terminal("RCURLY", "}") + COMMA = Terminal("COMMA", ",") + COLON = Terminal("COLON", ":") + LSQUARE = Terminal("LSQUARE", "[") + RSQUARE = Terminal("RSQUARE", "]") + TRUE = Terminal("TRUE", "true") + FALSE = Terminal("FALSE", "false") + NULL = Terminal("NULL", "null") NUMBER = Terminal( + "NUMBER", Re.seq( Re.set(("0", "9")).plus(), Re.seq( @@ -100,6 +97,7 @@ class JsonGrammar(Grammar): ), ) STRING = Terminal( + "STRING", Re.seq( Re.literal('"'), (~Re.set('"', "\\") | (Re.set("\\") + Re.any())).star(), @@ -107,8 +105,9 @@ class JsonGrammar(Grammar): ) ) + return Grammar(start=root, trivia=[BLANKS]) -JSON = JsonGrammar() +JSON = make_json_grammar() JSON_PARSER = JSON.build_table() JSON_LEXER = JSON.compile_lexer() @@ -228,47 +227,49 @@ def test_layout_basic(): ) -class TG(Grammar): - start = "root" - trivia = ["BLANKS", "LINE_BREAK", "COMMENT"] +def make_test_grammar(): + @rule + def root(): + return _expression @rule - def root(self): - return self._expression + def _expression(): + return word | list @rule - def _expression(self): - return self.word | self.list + def list(): + return group(LPAREN, indent(nl, _expressions), nl, RPAREN) @rule - def list(self): - return group(self.LPAREN, indent(nl, self._expressions), nl, self.RPAREN) + def _expressions(): + return _expression | seq(_expressions, sp, _expression) @rule - def _expressions(self): - return self._expression | seq(self._expressions, sp, self._expression) + def word(): + return OK | seq(BREAK, br, BREAK) - @rule - def word(self): - return self.OK | seq(self.BREAK, br, self.BREAK) + LPAREN = Terminal("LPAREN", "(") + RPAREN = Terminal("RPAREN", ")") + OK = Terminal("OK", "ok") + BREAK = Terminal("BREAK", "break") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - OK = Terminal("ok") - BREAK = Terminal("break") - - BLANKS = Terminal(Re.set(" ", "\t").plus()) - 
LINE_BREAK = Terminal(Re.set("\r", "\n"), trivia_mode=TriviaMode.NewLine) + BLANKS = Terminal("BLANKS", Re.set(" ", "\t").plus()) + LINE_BREAK = Terminal("LINE_BREAK", Re.set("\r", "\n"), trivia_mode=TriviaMode.NewLine) COMMENT = Terminal( + "COMMENT", Re.seq(Re.literal(";"), Re.set("\n").invert().star()), trivia_mode=TriviaMode.LineComment, ) + return Grammar(start=root, trivia=[BLANKS, LINE_BREAK, COMMENT], pretty_indent=" ") + +TG = make_test_grammar() + + def test_forced_break(): - g = TG() - g_lexer = g.compile_lexer() - g_parser = g.build_table() + g_lexer = TG.compile_lexer() + g_parser = TG.build_table() text = "((ok ok) (ok break break ok) (ok ok ok ok))" @@ -276,29 +277,28 @@ def test_forced_break(): assert errors == [] assert tree is not None - printer = runtime.Printer(builder.compile_pretty_table(g)) + printer = runtime.Printer(builder.compile_pretty_table(TG)) result = printer.format_tree(tree, text, 200).apply_to_source(text) assert result == _output( """ ( - (ok ok) - ( - ok - break - break - ok - ) - (ok ok ok ok) + (ok ok) + ( + ok + break + break + ok + ) + (ok ok ok ok) ) """ ) def test_maintaining_line_breaks(): - g = TG() - g_lexer = g.compile_lexer() - g_parser = g.build_table() + g_lexer = TG.compile_lexer() + g_parser = TG.build_table() text = """((ok ok) ; Don't break here. @@ -316,30 +316,29 @@ def test_maintaining_line_breaks(): assert errors == [] assert tree is not None - printer = runtime.Printer(builder.compile_pretty_table(g)) + printer = runtime.Printer(builder.compile_pretty_table(TG)) result = printer.format_tree(tree, text, 200).apply_to_source(text) assert result == _output( """ ( - (ok ok) - ; Don't break here. - (ok) -*SPACE* - ; ^ Do keep this break though. - (ok) -*SPACE* - ; ^ This should only be one break. - (ok) + (ok ok) + ; Don't break here. + (ok) +*SPACE**SPACE* + ; ^ Do keep this break though. + (ok) +*SPACE**SPACE* + ; ^ This should only be one break. 
+ (ok) ) """ ) def test_trailing_trivia(): - g = TG() - g_lexer = g.compile_lexer() - g_parser = g.build_table() + g_lexer = TG.compile_lexer() + g_parser = TG.build_table() text = """((ok ok)); Don't lose this! @@ -350,7 +349,7 @@ def test_trailing_trivia(): assert errors == [] assert tree is not None - printer = runtime.Printer(builder.compile_pretty_table(g)) + printer = runtime.Printer(builder.compile_pretty_table(TG)) result = printer.format_tree(tree, text, 200).apply_to_source(text) assert result == _output( @@ -363,9 +362,8 @@ def test_trailing_trivia(): def test_trailing_trivia_two(): - g = TG() - g_lexer = g.compile_lexer() - g_parser = g.build_table() + g_lexer = TG.compile_lexer() + g_parser = TG.build_table() text = """((ok ok)) @@ -376,7 +374,7 @@ def test_trailing_trivia_two(): assert errors == [] assert tree is not None - printer = runtime.Printer(builder.compile_pretty_table(g)) + printer = runtime.Printer(builder.compile_pretty_table(TG)) result = printer.format_tree(tree, text, 200).apply_to_source(text) assert result == _output( @@ -389,9 +387,8 @@ def test_trailing_trivia_two(): def test_trailing_trivia_split(): - g = TG() - g_lexer = g.compile_lexer() - g_parser = g.build_table() + g_lexer = TG.compile_lexer() + g_parser = TG.build_table() text = """((ok ok)); Don't lose this! @@ -432,7 +429,7 @@ def test_trailing_trivia_split(): print(f"{mode:25} {t.kind:10} {repr(text[t.start:t.end])}") trivia_doc = runtime.Matcher( - builder.MatcherTable(ParseTable([], [], set()), {}, {}), + builder.MatcherTable(ParseTable([], [], set(), {}), {}, {}), TRIVIA_MODES, ).apply_post_trivia( token.post_trivia,