diff --git a/grammar.py b/grammar.py index 69f4de8..38299e9 100644 --- a/grammar.py +++ b/grammar.py @@ -2,17 +2,57 @@ import re import typing -from parser import ( - Assoc, - Grammar, - Nothing, - rule, - seq, - Rule, - Terminal, - Re, -) -from parser.parser import compile_lexer, dump_lexer_table +import parser +from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule + +ARROW = Terminal("Arrow") +AS = Terminal("As") +BAR = Terminal("Bar") +CLASS = Terminal("Class") +COLON = Terminal("Colon") +ELSE = Terminal("Else") +FOR = Terminal("For") +FUN = Terminal("Fun") +IDENTIFIER = Terminal("Identifier") +IF = Terminal("If") +IMPORT = Terminal("Import") +IN = Terminal("In") +LCURLY = Terminal("LeftBrace") +LET = Terminal("Let") +RCURLY = Terminal("RightBrace") +RETURN = Terminal("Return") +SEMICOLON = Terminal("Semicolon") +STRING = Terminal("String") +WHILE = Terminal("While") +EQUAL = Terminal("Equal") +LPAREN = Terminal("LeftParen") +RPAREN = Terminal("RightParen") +COMMA = Terminal("Comma") +SELF = Terminal("Selff") +OR = Terminal("Or") +IS = Terminal("Is") +AND = Terminal("And") +EQUALEQUAL = Terminal("EqualEqual") +BANGEQUAL = Terminal("BangEqual") +LESS = Terminal("Less") +GREATER = Terminal("Greater") +LESSEQUAL = Terminal("LessEqual") +GREATEREQUAL = Terminal("GreaterEqual") +PLUS = Terminal("Plus") +MINUS = Terminal("Minus") +STAR = Terminal("Star") +SLASH = Terminal("Slash") +NUMBER = Terminal("Number") +TRUE = Terminal("True") +FALSE = Terminal("False") +BANG = Terminal("Bang") +DOT = Terminal("Dot") +MATCH = Terminal("Match") +EXPORT = Terminal("Export") +UNDERSCORE = Terminal("Underscore") +NEW = Terminal("New") +LSQUARE = Terminal("LeftBracket") +RSQUARE = Terminal("RightBracket") class FineGrammar(Grammar): @@ -22,17 +62,17 @@ class FineGrammar(Grammar): def __init__(self): super().__init__( precedence=[ - (Assoc.RIGHT, [self.EQUAL]), - (Assoc.LEFT, [self.OR]), - (Assoc.LEFT, [self.IS]), - (Assoc.LEFT, [self.AND]), - (Assoc.LEFT, [self.EQUALEQUAL, self.BANGEQUAL]), - (Assoc.LEFT, [self.LESS, self.GREATER, self.GREATEREQUAL, self.LESSEQUAL]), - (Assoc.LEFT, [self.PLUS, self.MINUS]), - (Assoc.LEFT, [self.STAR, self.SLASH]), + (Assoc.RIGHT, [EQUAL]), + (Assoc.LEFT, [OR]), + (Assoc.LEFT, [IS]), + (Assoc.LEFT, [AND]), + (Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]), + (Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]), + (Assoc.LEFT, [PLUS, MINUS]), + (Assoc.LEFT, [STAR, SLASH]), (Assoc.LEFT, [self.primary_expression]), - (Assoc.LEFT, [self.LPAREN]), - (Assoc.LEFT, [self.DOT]), + (Assoc.LEFT, [LPAREN]), + (Assoc.LEFT, [DOT]), # # If there's a confusion about whether to make an IF # statement or an expression, prefer the statement. @@ -57,15 +97,15 @@ class FineGrammar(Grammar): @rule def import_statement(self) -> Rule: - return seq(self.IMPORT, self.STRING, self.AS, self.IDENTIFIER, self.SEMICOLON) + return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON) @rule("ClassDeclaration") def class_declaration(self) -> Rule: - return seq(self.CLASS, self.IDENTIFIER, self._class_body) + return seq(CLASS, IDENTIFIER, self._class_body) @rule def _class_body(self) -> Rule: - return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._class_members, self.RCURLY) + return seq(LCURLY, RCURLY) | seq(LCURLY, self._class_members, RCURLY) @rule def _class_members(self) -> Rule: @@ -77,7 +117,7 @@ class FineGrammar(Grammar): @rule("FieldDecl") def field_declaration(self) -> Rule: - return seq(self.IDENTIFIER, self.COLON, self.type_expression, self.SEMICOLON) + return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON) # Types @rule("TypeExpression") @@ -86,65 +126,60 @@ class FineGrammar(Grammar): @rule("AlternateType") def alternate_type(self) -> Rule: - return seq(self.type_expression, self.OR, self.type_identifier) + return seq(self.type_expression, OR, self.type_identifier) @rule("TypeIdentifier") def type_identifier(self) -> Rule: - return self.IDENTIFIER + return IDENTIFIER @rule def export_statement(self) -> Rule: return ( - seq(self.EXPORT, self.class_declaration) - | seq(self.EXPORT, self.function_declaration) - | seq(self.EXPORT, self.let_statement) - | seq(self.EXPORT, self.export_list, self.SEMICOLON) + seq(EXPORT, self.class_declaration) + | seq(EXPORT, self.function_declaration) + | seq(EXPORT, self.let_statement) + | seq(EXPORT, self.export_list, SEMICOLON) ) @rule def export_list(self) -> Rule: - return Nothing | self.IDENTIFIER | seq(self.IDENTIFIER, self.COMMA, self.export_list) + return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list) # Functions @rule("FunctionDecl") def function_declaration(self) -> Rule: - return seq(self.FUN, self.IDENTIFIER, self.function_parameters, self.block) | seq( - self.FUN, - self.IDENTIFIER, - self.function_parameters, - self.ARROW, - self.type_expression, - self.block, + return seq(FUN, IDENTIFIER, self.function_parameters, self.block) | seq( + FUN, IDENTIFIER, self.function_parameters, ARROW, self.type_expression, self.block ) @rule("ParamList") def function_parameters(self) -> Rule: return ( - seq(self.LPAREN, self.RPAREN) - | seq(self.LPAREN, self._first_parameter, self.RPAREN) - | seq(self.LPAREN, self._first_parameter, self.COMMA, self._parameter_list, self.RPAREN) + seq(LPAREN, RPAREN) + | seq(LPAREN, self._first_parameter, RPAREN) + | seq(LPAREN, self._first_parameter, COMMA, self._parameter_list, RPAREN) ) @rule def _first_parameter(self) -> Rule: - return self.SELF | self.parameter + return SELF | self.parameter @rule def _parameter_list(self) -> Rule: - return Nothing | self.parameter | seq(self.parameter, self.COMMA, self._parameter_list) + return Nothing | self.parameter | seq(self.parameter, COMMA, self._parameter_list) @rule("Parameter") def parameter(self) -> Rule: - return seq(self.IDENTIFIER, self.COLON, self.type_expression) + return seq(IDENTIFIER, COLON, self.type_expression) # Block @rule("Block") def block(self) -> Rule: return ( - seq(self.LCURLY, self.RCURLY) - | seq(self.LCURLY, self.expression, self.RCURLY) - | seq(self.LCURLY, self._statement_list, self.RCURLY) - | seq(self.LCURLY, self._statement_list, self.expression, self.RCURLY) + seq(LCURLY, RCURLY) + | seq(LCURLY, self.expression, RCURLY) + | seq(LCURLY, self._statement_list, RCURLY) + | seq(LCURLY, self._statement_list, self.expression, RCURLY) ) @rule @@ -165,19 +200,19 @@ class FineGrammar(Grammar): @rule("LetStatement") def let_statement(self) -> Rule: - return seq(self.LET, self.IDENTIFIER, self.EQUAL, self.expression, self.SEMICOLON) + return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON) @rule("ReturnStatement") def return_statement(self) -> Rule: - return seq(self.RETURN, self.expression, self.SEMICOLON) | seq(self.RETURN, self.SEMICOLON) + return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON) @rule("ForStatement") def for_statement(self) -> Rule: - return seq(self.FOR, self.iterator_variable, self.IN, self.expression, self.block) + return seq(FOR, self.iterator_variable, IN, self.expression, self.block) @rule("IteratorVariable") def iterator_variable(self) -> Rule: - return self.IDENTIFIER + return IDENTIFIER @rule("IfStatement") def if_statement(self) -> Rule: @@ -185,11 +220,11 @@ class FineGrammar(Grammar): @rule def while_statement(self) -> Rule: - return seq(self.WHILE, self.expression, self.block) + return seq(WHILE, self.expression, self.block) @rule def expression_statement(self) -> Rule: - return seq(self.expression, self.SEMICOLON) + return seq(self.expression, SEMICOLON) # Expressions @rule(transparent=True) @@ -199,93 +234,91 @@ class FineGrammar(Grammar): @rule("BinaryExpression") def binary_expression(self) -> Rule: return ( - seq(self.expression, self.EQUAL, self.expression) - | seq(self.expression, self.OR, self.expression) - | seq(self.expression, self.AND, self.expression) - | seq(self.expression, self.EQUALEQUAL, self.expression) - | seq(self.expression, self.BANGEQUAL, self.expression) - | seq(self.expression, self.LESS, self.expression) - | seq(self.expression, self.LESSEQUAL, self.expression) - | seq(self.expression, self.GREATER, self.expression) - | seq(self.expression, self.GREATEREQUAL, self.expression) - | seq(self.expression, self.PLUS, self.expression) - | seq(self.expression, self.MINUS, self.expression) - | seq(self.expression, self.STAR, self.expression) - | seq(self.expression, self.SLASH, self.expression) + seq(self.expression, EQUAL, self.expression) + | seq(self.expression, OR, self.expression) + | seq(self.expression, AND, self.expression) + | seq(self.expression, EQUALEQUAL, self.expression) + | seq(self.expression, BANGEQUAL, self.expression) + | seq(self.expression, LESS, self.expression) + | seq(self.expression, LESSEQUAL, self.expression) + | seq(self.expression, GREATER, self.expression) + | seq(self.expression, GREATEREQUAL, self.expression) + | seq(self.expression, PLUS, self.expression) + | seq(self.expression, MINUS, self.expression) + | seq(self.expression, STAR, self.expression) + | seq(self.expression, SLASH, self.expression) ) @rule("IsExpression") def is_expression(self) -> Rule: - return seq(self.expression, self.IS, self.pattern) + return seq(self.expression, IS, self.pattern) @rule def primary_expression(self) -> Rule: return ( self.identifier_expression | self.literal_expression - | self.SELF - | seq(self.BANG, self.primary_expression) - | seq(self.MINUS, self.primary_expression) + | SELF + | seq(BANG, self.primary_expression) + | seq(MINUS, self.primary_expression) | self.block | self.conditional_expression | self.list_constructor_expression | self.object_constructor_expression | self.match_expression - | seq(self.primary_expression, self.LPAREN, self.RPAREN) - | seq(self.primary_expression, self.LPAREN, self._expression_list, self.RPAREN) - | seq(self.primary_expression, self.DOT, self.IDENTIFIER) - | seq(self.LPAREN, self.expression, self.RPAREN) + | seq(self.primary_expression, LPAREN, RPAREN) + | seq(self.primary_expression, LPAREN, self._expression_list, RPAREN) + | seq(self.primary_expression, DOT, IDENTIFIER) + | seq(LPAREN, self.expression, RPAREN) ) @rule("IdentifierExpression") def identifier_expression(self): - return self.IDENTIFIER + return IDENTIFIER @rule("Literal") def literal_expression(self): - return self.NUMBER | self.STRING | self.TRUE | self.FALSE + return NUMBER | STRING | TRUE | FALSE @rule("ConditionalExpression") def conditional_expression(self) -> Rule: return ( - seq(self.IF, self.expression, self.block) - | seq(self.IF, self.expression, self.block, self.ELSE, self.conditional_expression) - | seq(self.IF, self.expression, self.block, self.ELSE, self.block) + seq(IF, self.expression, self.block) + | seq(IF, self.expression, self.block, ELSE, self.conditional_expression) + | seq(IF, self.expression, self.block, ELSE, self.block) ) @rule def list_constructor_expression(self) -> Rule: - return seq(self.LSQUARE, self.RSQUARE) | seq( - self.LSQUARE, self._expression_list, self.RSQUARE - ) + return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self._expression_list, RSQUARE) @rule def _expression_list(self) -> Rule: return ( self.expression - | seq(self.expression, self.COMMA) - | seq(self.expression, self.COMMA, self._expression_list) + | seq(self.expression, COMMA) + | seq(self.expression, COMMA, self._expression_list) ) @rule def match_expression(self) -> Rule: - return seq(self.MATCH, self.expression, self.match_body) + return seq(MATCH, self.expression, self.match_body) @rule("MatchBody") def match_body(self) -> Rule: - return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._match_arms, self.RCURLY) + return seq(LCURLY, RCURLY) | seq(LCURLY, self._match_arms, RCURLY) @rule def _match_arms(self) -> Rule: return ( self.match_arm - | seq(self.match_arm, self.COMMA) - | seq(self.match_arm, self.COMMA, self._match_arms) + | seq(self.match_arm, COMMA) + | seq(self.match_arm, COMMA, self._match_arms) ) @rule("MatchArm") def match_arm(self) -> Rule: - return seq(self.pattern, self.ARROW, self.expression) + return seq(self.pattern, ARROW, self.expression) @rule("Pattern") def pattern(self) -> Rule: @@ -297,7 +330,7 @@ class FineGrammar(Grammar): @rule def _pattern_predicate(self) -> Rule: - return seq(self.AND, self.expression) + return seq(AND, self.expression) @rule def _pattern_core(self) -> Rule: @@ -305,120 +338,60 @@ class FineGrammar(Grammar): @rule("WildcardPattern") def wildcard_pattern(self) -> Rule: - return self.UNDERSCORE + return UNDERSCORE @rule("VariableBinding") def variable_binding(self) -> Rule: - return seq(self.IDENTIFIER, self.COLON) + return seq(IDENTIFIER, COLON) @rule def object_constructor_expression(self) -> Rule: - return seq(self.NEW, self.type_identifier, self.field_list) + return seq(NEW, self.type_identifier, self.field_list) @rule def field_list(self) -> Rule: - return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self.field_values, self.RCURLY) + return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY) @rule def field_values(self) -> Rule: return ( self.field_value - | seq(self.field_value, self.COMMA) - | seq(self.field_value, self.COMMA, self.field_values) + | seq(self.field_value, COMMA) + | seq(self.field_value, COMMA, self.field_values) ) @rule def field_value(self) -> Rule: - return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression) - - BLANK = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) - - ARROW = Terminal("->") - AS = Terminal("as") - BAR = Terminal("bar") - CLASS = Terminal("class") - COLON = Terminal("colon") - COMMENT = Terminal("comment") - ELSE = Terminal("else") - FOR = Terminal("for") - FUN = Terminal("fun") - IDENTIFIER = Terminal( - Re.seq( - Re.set(("a", "z"), ("A", "Z"), "_"), - Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), - ) - ) - IF = Terminal("if") - IMPORT = Terminal("import") - IN = Terminal("in") - LCURLY = Terminal("{") - LET = Terminal("Let") - RCURLY = Terminal("}") - RETURN = Terminal("return") - SEMICOLON = Terminal(";") - STRING = Terminal('""') # TODO - WHILE = Terminal("while") - EQUAL = Terminal("=") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - COMMA = Terminal(",") - SELF = Terminal("self", name="SELFF") - OR = Terminal("or") - IS = Terminal("is") - AND = Terminal("and") - EQUALEQUAL = Terminal("==") - BANGEQUAL = Terminal("!=") - LESS = Terminal("<") - GREATER = Terminal(">") - LESSEQUAL = Terminal("<=") - GREATEREQUAL = Terminal(">=") - PLUS = Terminal("+") - MINUS = Terminal("-") - STAR = Terminal("*") - SLASH = Terminal("/") - NUMBER = Terminal(Re.set(("0", "9")).plus()) - TRUE = Terminal("true") - FALSE = Terminal("false") - BANG = Terminal("!") - DOT = Terminal(".") - MATCH = Terminal("match") - EXPORT = Terminal("export") - UNDERSCORE = Terminal("_") - NEW = Terminal("new") - LSQUARE = Terminal("[") - RSQUARE = Terminal("]") + return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression) # ----------------------------------------------------------------------------- # DORKY LEXER # ----------------------------------------------------------------------------- -import bisect - - NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?") IDENTIFIER_RE = re.compile("[_A-Za-z][_A-Za-z0-9]*") KEYWORD_TABLE = { - "_": FineGrammar.UNDERSCORE, - "and": FineGrammar.AND, - "as": FineGrammar.AS, - "class": FineGrammar.CLASS, - "else": FineGrammar.ELSE, - "export": FineGrammar.EXPORT, - "false": FineGrammar.FALSE, - "for": FineGrammar.FOR, - "fun": FineGrammar.FUN, - "if": FineGrammar.IF, - "import": FineGrammar.IMPORT, - "in": FineGrammar.IN, - "is": FineGrammar.IS, - "let": FineGrammar.LET, - "match": FineGrammar.MATCH, - "new": FineGrammar.NEW, - "or": FineGrammar.OR, - "return": FineGrammar.RETURN, - "self": FineGrammar.SELF, - "true": FineGrammar.TRUE, - "while": FineGrammar.WHILE, + "_": UNDERSCORE, + "and": AND, + "as": AS, + "class": CLASS, + "else": ELSE, + "export": EXPORT, + "false": FALSE, + "for": FOR, + "fun": FUN, + "if": IF, + "import": IMPORT, + "in": IN, + "is": IS, + "let": LET, + "match": MATCH, + "new": NEW, + "or": OR, + "return": RETURN, + "self": SELF, + "true": TRUE, + "while": WHILE, } @@ -433,63 +406,63 @@ def tokenize(src: str): token = None if ch == "-": if src[pos : pos + 2] == "->": - token = (FineGrammar.ARROW, pos, 2) + token = (ARROW, pos, 2) else: - token = (FineGrammar.MINUS, pos, 1) + token = (MINUS, pos, 1) elif ch == "|": - token = (FineGrammar.BAR, pos, 1) + token = (BAR, pos, 1) elif ch == ":": - token = (FineGrammar.COLON, pos, 1) + token = (COLON, pos, 1) elif ch == "{": - token = (FineGrammar.LCURLY, pos, 1) + token = (LCURLY, pos, 1) elif ch == "}": - token = (FineGrammar.RCURLY, pos, 1) + token = (RCURLY, pos, 1) elif ch == ";": - token = (FineGrammar.SEMICOLON, pos, 1) + token = (SEMICOLON, pos, 1) elif ch == "=": if src[pos : pos + 2] == "==": - token = (FineGrammar.EQUALEQUAL, pos, 2) + token = (EQUALEQUAL, pos, 2) else: - token = (FineGrammar.EQUAL, pos, 1) + token = (EQUAL, pos, 1) elif ch == "(": - token = (FineGrammar.LPAREN, pos, 1) + token = (LPAREN, pos, 1) elif ch == ")": - token = (FineGrammar.RPAREN, pos, 1) + token = (RPAREN, pos, 1) elif ch == ",": - token = (FineGrammar.COMMA, pos, 1) + token = (COMMA, pos, 1) elif ch == "!": if src[pos : pos + 2] == "!=": - token = (FineGrammar.BANGEQUAL, pos, 2) + token = (BANGEQUAL, pos, 2) else: - token = (FineGrammar.BANG, pos, 1) + token = (BANG, pos, 1) elif ch == "<": if src[pos : pos + 2] == "<=": - token = (FineGrammar.LESSEQUAL, pos, 2) + token = (LESSEQUAL, pos, 2) else: - token = (FineGrammar.LESS, pos, 1) + token = (LESS, pos, 1) elif ch == ">": if src[pos : pos + 2] == ">=": - token = (FineGrammar.GREATEREQUAL, pos, 2) + token = (GREATEREQUAL, pos, 2) else: - token = (FineGrammar.GREATER, pos, 1) + token = (GREATER, pos, 1) elif ch == "+": - token = (FineGrammar.PLUS, pos, 1) + token = (PLUS, pos, 1) elif ch == "*": - token = (FineGrammar.STAR, pos, 1) + token = (STAR, pos, 1) elif ch == "/": if src[pos : pos + 2] == "//": @@ -497,16 +470,16 @@ def tokenize(src: str): pos = pos + 1 continue - token = (FineGrammar.SLASH, pos, 1) + token = (SLASH, pos, 1) elif ch == ".": - token = (FineGrammar.DOT, pos, 1) + token = (DOT, pos, 1) elif ch == "[": - token = (FineGrammar.LSQUARE, pos, 1) + token = (LSQUARE, pos, 1) elif ch == "]": - token = (FineGrammar.RSQUARE, pos, 1) + token = (RSQUARE, pos, 1) elif ch == '"' or ch == "'": end = pos + 1 @@ -517,12 +490,12 @@ def tokenize(src: str): if end == len(src): raise Exception(f"Unterminated string constant at {pos}") end += 1 - token = (FineGrammar.STRING, pos, end - pos) + token = (STRING, pos, end - pos) else: number_match = NUMBER_RE.match(src, pos) if number_match: - token = (FineGrammar.NUMBER, pos, number_match.end() - pos) + token = (NUMBER, pos, number_match.end() - pos) else: id_match = IDENTIFIER_RE.match(src, pos) if id_match: @@ -531,7 +504,7 @@ def tokenize(src: str): if keyword: token = (keyword, pos, len(fragment)) else: - token = (FineGrammar.IDENTIFIER, pos, len(fragment)) + token = (IDENTIFIER, pos, len(fragment)) if token is None: raise Exception("Token error") @@ -539,6 +512,9 @@ def tokenize(src: str): pos += token[2] +import bisect + + class FineTokens: def __init__(self, src: str): self.src = src @@ -570,8 +546,4 @@ class FineTokens: if __name__ == "__main__": - grammar = FineGrammar() - grammar.build_table() - - lexer = compile_lexer(grammar) - dump_lexer_table(lexer) + FineGrammar().build_table() diff --git a/parser/parser.py b/parser/parser.py index 8a23d4e..d0cb1fc 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -21,20 +21,19 @@ To get started, create a grammar that derives from the `Grammar` class. Create one method per nonterminal, decorated with the `rule` decorator. Here's an example: + PLUS = Terminal('+') + LPAREN = Terminal('(') + RPAREN = Terminal(')') + ID = Terminal('id') class SimpleGrammar(Grammar): @rule def expression(self): - return seq(self.expression, self.PLUS, self.term) | self.term + return seq(self.expression, PLUS, self.term) | self.term @rule def term(self): - return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID - - PLUS = Terminal('+') - LPAREN = Terminal('(') - RPAREN = Terminal(')') - ID = Terminal('id') + return seq(LPAREN, self.expression, RPAREN) | ID ## Using grammars @@ -131,13 +130,13 @@ May 2024 """ import abc -import bisect import collections import dataclasses import enum import functools import inspect import json +import sys import typing @@ -1606,20 +1605,15 @@ class Rule: class Terminal(Rule): """A token, or terminal symbol in the grammar.""" - value: str | None - pattern: "str | Re" + value: str - def __init__(self, pattern, name=None): - self.value = name - self.pattern = pattern + def __init__(self, value): + self.value = sys.intern(value) def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]: # We are just ourselves when flattened. yield [self] - def __repr__(self) -> str: - return self.value or "???" - class NonTerminal(Rule): """A non-terminal, or a production, in the grammar. @@ -1772,20 +1766,19 @@ class Grammar: Here's an example of a simple grammar: + PLUS = Terminal('+') + LPAREN = Terminal('(') + RPAREN = Terminal(')') + ID = Terminal('id') + class SimpleGrammar(Grammar): @rule def expression(self): - return seq(self.expression, self.PLUS, self.term) | self.term + return seq(self.expression, PLUS, self.term) | self.term @rule def term(self): - return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID - - PLUS = Terminal('+') - LPAREN = Terminal('(') - RPAREN = Terminal(')') - ID = Terminal('id') - + return seq(LPAREN, self.expression, RPAREN) | ID Not very exciting, perhaps, but it's something. """ @@ -1793,7 +1786,6 @@ class Grammar: _precedence: dict[str, typing.Tuple[Assoc, int]] _start: str _generator: type[GenerateLR0] - _terminals: list[Terminal] def __init__( self, @@ -1817,14 +1809,6 @@ class Grammar: generator = getattr(self, "generator", GenerateLALR) assert generator is not None - # Fixup terminal names with the name of the member that declared it. - terminals = [] - for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)): - if t.value is None: - t.value = n - terminals.append(t) - - # Fix up the precedence table. precedence_table = {} for prec, (associativity, symbols) in enumerate(precedence): for symbol in symbols: @@ -1840,11 +1824,6 @@ class Grammar: self._precedence = precedence_table self._start = start self._generator = generator - self._terminals = terminals - - @property - def terminals(self) -> list[Terminal]: - return self._terminals def generate_nonterminal_dict( self, start: str | None = None @@ -1932,526 +1911,3 @@ class Grammar: gen = generator(start, desugared, precedence=self._precedence, transparents=transparents) table = gen.gen_table() return table - - -############################################################################### -# Lexer support -############################################################################### -# For machine-generated lexers - - -@dataclasses.dataclass(frozen=True, slots=True) -class Span: - lower: int # inclusive - upper: int # exclusive - - @classmethod - def from_str(cls, lower: str, upper: str | None = None) -> "Span": - lo = ord(lower) - if upper is None: - hi = lo + 1 - else: - hi = ord(upper) + 1 - - return Span(lower=lo, upper=hi) - - def __len__(self) -> int: - return self.upper - self.lower - - def intersects(self, other: "Span") -> bool: - """Determine if this span intersects the other span.""" - return self.lower < other.upper and self.upper > other.lower - - def split(self, other: "Span") -> tuple["Span|None", "Span|None", "Span|None"]: - """Split two possibly-intersecting spans into three regions: a low - region, which covers just the lower part of the union, a mid region, - which covers the intersection, and a hi region, which covers just the - upper part of the union. - - Together, low and high cover the union of the two spans. Mid covers - the intersection. The implication is that if both spans are identical - then the low and high regions will both be None and mid will be equal - to both. - - Graphically, given two spans A and B: - - [ B ) - [ A ) - [ lo )[ mid )[ hi ) - - If the lower bounds align then the `lo` region is empty: - - [ B ) - [ A ) - [ mid )[ hi ) - - If the upper bounds align then the `hi` region is empty: - - [ B ) - [ A ) - [ lo )[ mid ) - - If both bounds align then both are empty: - - [ B ) - [ A ) - [ mid ) - - split is reflexive: it doesn't matter which order you split things in, - you will always get the same output spans, in the same order. - """ - if not self.intersects(other): - if self.lower < other.lower: - return (self, None, other) - else: - return (other, None, self) - - first = min(self.lower, other.lower) - second = max(self.lower, other.lower) - third = min(self.upper, other.upper) - fourth = max(self.upper, other.upper) - - low = Span(first, second) if first != second else None - mid = Span(second, third) - hi = Span(third, fourth) if third != fourth else None - - return (low, mid, hi) - - def __str__(self) -> str: - return f"[{self.lower}-{self.upper})" - - -ET = typing.TypeVar("ET") - - -class EdgeList[ET]: - """A list of edge transitions, keyed by *span*.""" - - _edges: list[tuple[Span, list[ET]]] - - def __init__(self): - self._edges = [] - - def __iter__(self) -> typing.Iterator[tuple[Span, list[ET]]]: - return iter(self._edges) - - def __repr__(self) -> str: - return f"EdgeList[{','.join(str(s[0]) + '->' + repr(s[1]) for s in self._edges)}]" - - def add_edge(self, c: Span, s: ET): - """Add an edge for the given span to the list. If there are already - spans that overlap this one, split and generating multiple distinct - edges. - """ - our_targets = [s] - - # Look to see where we would put this span based solely on a sort of - # lower bounds: find the lowest upper bound that is greater than the - # lower bound of the incoming span. - point = bisect.bisect_right(self._edges, c.lower, key=lambda x: x[0].upper) - - # We might need to run this in multiple iterations because we keep - # splitting against the *lowest* matching span. - next_span: Span | None = c - while next_span is not None: - c = next_span - next_span = None - - # print(f" incoming: {self} @ {point} <- {c}->[{s}]") - - # Check to see if we've run off the end of the list of spans. - if point == len(self._edges): - self._edges.insert(point, (c, [s])) - # print(f" trivial end: {self}") - return - - # Nope, pull out the span to the right of us. - right_span, right_targets = self._edges[point] - - # Because we intersect at least a little bit we know that we need to - # split and keep processing. - del self._edges[point] - lo, mid, hi = c.split(right_span) # Remember the semantics - # print(f" -> {c} splits {right_span} -> {lo}, {mid}, {hi} @{point}") - - # We do this from lo to hi, lo first. - if lo is not None: - # NOTE: lo will never intersect both no matter what. - if lo.intersects(right_span): - assert not lo.intersects(c) - targets = right_targets - else: - assert lo.intersects(c) - targets = our_targets - - self._edges.insert(point, (lo, targets)) - point += 1 # Adjust the insertion point, important for us to keep running. - - if mid is not None: - # If mid exists it is known to intersect with both so we can just - # do it. - self._edges.insert(point, (mid, right_targets + our_targets)) - point += 1 # Adjust the insertion point, important for us to keep running. - - if hi is not None: - # NOTE: Just like lo, hi will never intersect both no matter what. - if hi.intersects(right_span): - # If hi intersects the right span then we're done, no - # need to keep running. - assert not hi.intersects(c) - self._edges.insert(point, (hi, right_targets)) - - else: - # BUT! If hi intersects the incoming span then what we - # need to do is to replace the incoming span with hi - # (having chopped off the lower part of the incoming - # span) and continue to execute with only the upper part - # of the incoming span. - # - # Why? Because the upper part of the incoming span might - # intersect *more* spans, in which case we need to keep - # splitting and merging targets. - assert hi.intersects(c) - next_span = hi - - # print(f" result: {self}") - - -class NFAState: - """An NFA state. Each state can be the accept state, with one or more - Terminals as the result.""" - - accept: list[Terminal] - epsilons: list["NFAState"] - _edges: EdgeList["NFAState"] - - def __init__(self): - self.accept = [] - self.epsilons = [] - self._edges = EdgeList() - - def __repr__(self): - return f"State{id(self)}" - - def edges(self) -> typing.Iterable[tuple[Span, list["NFAState"]]]: - return self._edges - - def add_edge(self, c: Span, s: "NFAState") -> "NFAState": - self._edges.add_edge(c, s) - return s - - def dump_graph(self, name="nfa.dot"): - with open(name, "w", encoding="utf8") as f: - f.write("digraph G {\n") - - stack: list[NFAState] = [self] - visited = set() - while len(stack) > 0: - state = stack.pop() - if state in visited: - continue - visited.add(state) - - label = ", ".join([t.value for t in state.accept if t.value is not None]) - f.write(f' {id(state)} [label="{label}"];\n') - for target in state.epsilons: - stack.append(target) - f.write(f' {id(state)} -> {id(target)} [label="\u03B5"];\n') - - for span, targets in state.edges(): - label = str(span).replace('"', '\\"') - for target in targets: - stack.append(target) - f.write(f' {id(state)} -> {id(target)} [label="{label}"];\n') - - f.write("}\n") - - -@dataclasses.dataclass -class Re: - def to_nfa(self, start: NFAState) -> NFAState: - del start - raise NotImplementedError() - - def __str__(self) -> str: - raise NotImplementedError() - - @classmethod - def seq(cls, *values: "Re") -> "Re": - result = values[0] - for v in values[1:]: - result = RegexSequence(result, v) - return result - - @classmethod - def literal(cls, value: str) -> "Re": - return cls.seq(*[RegexLiteral.from_ranges(c) for c in value]) - - @classmethod - def set(cls, *args: str | tuple[str, str]) -> "Re": - return RegexLiteral.from_ranges(*args) - - def plus(self) -> "Re": - return RegexPlus(self) - - def star(self) -> "Re": - return RegexStar(self) - - def question(self) -> "Re": - return RegexQuestion(self) - - def __or__(self, value: "Re", /) -> "Re": - return RegexAlternation(self, value) - - -@dataclasses.dataclass -class RegexLiteral(Re): - values: list[Span] - - @classmethod - def from_ranges(cls, *args: str | tuple[str, str]) -> "RegexLiteral": - values = [] - for a in args: - if isinstance(a, str): - values.append(Span.from_str(a)) - else: - values.append(Span.from_str(a[0], a[1])) - - return RegexLiteral(values) - - def to_nfa(self, start: NFAState) -> NFAState: - end = NFAState() - for span in self.values: - start.add_edge(span, end) - return end - - def __str__(self) -> str: - if len(self.values) == 1: - span = self.values[0] - if len(span) == 1: - return chr(span.lower) - - ranges = [] - for span in self.values: - start = chr(span.lower) - end = chr(span.upper - 1) - if start == end: - ranges.append(start) - else: - ranges.append(f"{start}-{end}") - return "[{}]".format("".join(ranges)) - - -@dataclasses.dataclass -class RegexPlus(Re): - child: Re - - def to_nfa(self, start: NFAState) -> NFAState: - end = self.child.to_nfa(start) - end.epsilons.append(start) - return end - - def __str__(self) -> str: - return f"({self.child})+" - - -@dataclasses.dataclass -class RegexStar(Re): - child: Re - - def to_nfa(self, start: NFAState) -> NFAState: - end = self.child.to_nfa(start) - end.epsilons.append(start) - start.epsilons.append(end) - return end - - def __str__(self) -> str: - return f"({self.child})*" - - -@dataclasses.dataclass -class RegexQuestion(Re): - child: Re - - def to_nfa(self, start: NFAState) -> NFAState: - end = self.child.to_nfa(start) - start.epsilons.append(end) - return end - - def __str__(self) -> str: - return f"({self.child})?" - - -@dataclasses.dataclass -class RegexSequence(Re): - left: Re - right: Re - - def to_nfa(self, start: NFAState) -> NFAState: - mid = self.left.to_nfa(start) - return self.right.to_nfa(mid) - - def __str__(self) -> str: - return f"{self.left}{self.right}" - - -@dataclasses.dataclass -class RegexAlternation(Re): - left: Re - right: Re - - def to_nfa(self, start: NFAState) -> NFAState: - left_start = NFAState() - start.epsilons.append(left_start) - left_end = self.left.to_nfa(left_start) - - right_start = NFAState() - start.epsilons.append(right_start) - right_end = self.right.to_nfa(right_start) - - end = NFAState() - left_end.epsilons.append(end) - right_end.epsilons.append(end) - - return end - - def __str__(self) -> str: - return f"(({self.left})||({self.right}))" - - -LexerTable = list[tuple[Terminal | None, list[tuple[Span, int]]]] - - -class NFASuperState: - states: frozenset[NFAState] - - def __init__(self, states: typing.Iterable[NFAState]): - # Close over the given states, including every state that is - # reachable by epsilon-transition. - stack = list(states) - result = set() - while len(stack) > 0: - st = stack.pop() - if st in result: - continue - result.add(st) - stack.extend(st.epsilons) - - self.states = frozenset(result) - - def __eq__(self, other): - if not isinstance(other, NFASuperState): - return False - return self.states == other.states - - def __hash__(self) -> int: - return hash(self.states) - - def edges(self) -> list[tuple[Span, "NFASuperState"]]: - working: EdgeList[list[NFAState]] = EdgeList() - for st in self.states: - for span, targets in st.edges(): - working.add_edge(span, targets) - - # EdgeList maps span to list[list[State]] which we want to flatten. - last_upper = None - result = [] - for span, stateses in working: - if last_upper is not None: - assert last_upper <= span.lower - last_upper = span.upper - - s: list[NFAState] = [] - for states in stateses: - s.extend(states) - - result.append((span, NFASuperState(s))) - - if len(result) > 0: - for i in range(0, len(result) - 1): - span = result[i][0] - next_span = result[i + 1][0] - assert span.upper <= next_span.lower - - # TODO: Merge spans that are adjacent and go to the same state. - - return result - - def accept_terminal(self) -> Terminal | None: - accept = None - for st in self.states: - for ac in st.accept: - if accept is None: - accept = ac - elif accept.value != ac.value: - accept_regex = isinstance(accept.pattern, Re) - ac_regex = isinstance(ac.pattern, Re) - - if accept_regex and not ac_regex: - accept = ac - elif ac_regex and not accept_regex: - pass - else: - raise ValueError( - f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')" - ) - - return accept - - -def compile_lexer(x: Grammar) -> LexerTable: - # Parse the terminals all together into a big NFA rooted at `NFA`. - NFA = NFAState() - for terminal in x.terminals: - start = NFAState() - NFA.epsilons.append(start) - - pattern = terminal.pattern - if isinstance(pattern, Re): - ending = pattern.to_nfa(start) - else: - ending = start - for c in pattern: - ending = ending.add_edge(Span.from_str(c), NFAState()) - - ending.accept.append(terminal) - - NFA.dump_graph() - - # Convert the NFA into a DFA in the most straightforward way (by tracking - # sets of state closures, called SuperStates.) - DFA: dict[NFASuperState, tuple[int, list[tuple[Span, NFASuperState]]]] = {} - - stack = [NFASuperState([NFA])] - while len(stack) > 0: - ss = stack.pop() - if ss in DFA: - continue - - edges = ss.edges() - - DFA[ss] = (len(DFA), edges) - for _, target in edges: - stack.append(target) - - return [ - ( - ss.accept_terminal(), - [(k, DFA[v][0]) for k, v in edges], - ) - for ss, (_, edges) in DFA.items() - ] - - -def dump_lexer_table(table: LexerTable): - with open("lexer.dot", "w", encoding="utf-8") as f: - f.write("digraph G {\n") - for index, (accept, edges) in enumerate(table): - label = accept.value if accept is not None else "" - f.write(f' {index} [label="{label}"];\n') - for span, target in edges: - label = str(span).replace('"', '\\"') - f.write(f' {index} -> {target} [label="{label}"];\n') - - pass - f.write("}\n") diff --git a/parser/runtime.py b/parser/runtime.py index 124bc7b..f5be3a4 100644 --- a/parser/runtime.py +++ b/parser/runtime.py @@ -430,58 +430,3 @@ class Parser: error_strings.append(f"{line_index}:{column_index}: {parse_error.message}") return (result, error_strings) - - -def generic_tokenize( - src: str, table: parser.LexerTable -) -> typing.Iterable[tuple[parser.Terminal, int, int]]: - pos = 0 - state = 0 - start = 0 - last_accept = None - last_accept_pos = 0 - - print(f"LEXING: {src} ({len(src)})") - - while pos < len(src): - while state is not None: - accept, edges = table[state] - if accept is not None: - last_accept = accept - last_accept_pos = pos - - print(f" @ {pos} state: {state} ({accept})") - if pos >= len(src): - break - - char = ord(src[pos]) - print(f" -> char: {char} ({repr(src[pos])})") - - # Find the index of the span where the upper value is the tightest - # bound on the character. - state = None - index = bisect.bisect_right(edges, char, key=lambda x: x[0].upper) - print(f" -> {index}") - if index < len(edges): - span, target = edges[index] - print(f" -> {span}, {target}") - if char >= span.lower: - print(f" -> target: {target}") - state = target - pos += 1 - - else: - print(f" Nope (outside range)") - else: - print(f" Nope (at end)") - - if last_accept is None: - raise Exception(f"Token error at {pos}") - - yield (last_accept, start, last_accept_pos - start) - - print(f" Yield: {last_accept}, reset to {last_accept_pos}") - last_accept = None - pos = last_accept_pos - start = pos - state = 0 diff --git a/pdm.lock b/pdm.lock index a937da9..b80bf6d 100644 --- a/pdm.lock +++ b/pdm.lock @@ -3,26 +3,9 @@ [metadata] groups = ["default", "dev"] -strategy = ["inherit_metadata"] -lock_version = "4.5.0" -content_hash = "sha256:c4fec06f95402db1e9843df4a8a4a275273c6ec4f41f192f30d8a92ee52d15ea" - -[[metadata.targets]] -requires_python = ">=3.12" - -[[package]] -name = "attrs" -version = "24.2.0" -requires_python = ">=3.7" -summary = "Classes Without Boilerplate" -groups = ["dev"] -dependencies = [ - "importlib-metadata; python_version < \"3.8\"", -] -files = [ - {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, - {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, -] +strategy = ["cross_platform", "inherit_metadata"] +lock_version = "4.4.1" +content_hash = "sha256:143b06c001132ba589a47b2b3a498dd54f4840d95d216c794068089fcea48d4d" [[package]] name = "colorama" @@ -36,22 +19,6 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] -[[package]] -name = "hypothesis" -version = "6.111.1" -requires_python = ">=3.8" -summary = "A library for property-based testing" -groups = ["dev"] -dependencies = [ - "attrs>=22.2.0", - "exceptiongroup>=1.0.0; python_version < \"3.11\"", - "sortedcontainers<3.0.0,>=2.1.0", -] -files = [ - {file = "hypothesis-6.111.1-py3-none-any.whl", hash = "sha256:9422adbac4b2104f6cf92dc6604b5c9df975efc08ffc7145ecc39bc617243835"}, - {file = "hypothesis-6.111.1.tar.gz", hash = "sha256:6ab6185a858fa692bf125c0d0a936134edc318bee01c05e407c71c9ead0b61c5"}, -] - [[package]] name = "iniconfig" version = "2.0.0" @@ -93,23 +60,11 @@ summary = "pytest: simple powerful testing with Python" groups = ["dev"] dependencies = [ "colorama; sys_platform == \"win32\"", - "exceptiongroup>=1.0.0rc8; python_version < \"3.11\"", "iniconfig", "packaging", "pluggy<2.0,>=1.5", - "tomli>=1; python_version < \"3.11\"", ] files = [ {file = "pytest-8.2.2-py3-none-any.whl", hash = "sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343"}, {file = "pytest-8.2.2.tar.gz", hash = "sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977"}, ] - -[[package]] -name = "sortedcontainers" -version = "2.4.0" -summary = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" -groups = ["dev"] -files = [ - {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, - {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, -] diff --git a/pyproject.toml b/pyproject.toml index c7721e1..1e28adc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,6 @@ distribution = true [tool.pdm.dev-dependencies] dev = [ "pytest>=8.2.2", - "hypothesis>=6.111.1", ] [tool.pyright] diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 26e5057..a320e06 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -38,27 +38,25 @@ def _tree(treeform) -> runtime.Tree | runtime.TokenValue: def test_lr0_lr0(): """An LR0 grammar should work with an LR0 generator.""" - class G(Grammar): + PLUS = Terminal("+") + LPAREN = Terminal("(") + RPAREN = Terminal(")") + IDENTIFIER = Terminal("id") + + class LR0Grammar(Grammar): start = "E" generator = parser.GenerateLR0 @rule def E(self): - return seq(self.E, self.PLUS, self.T) | self.T + return seq(self.E, PLUS, self.T) | self.T @rule def T(self): - return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER + return seq(LPAREN, self.E, RPAREN) | IDENTIFIER - PLUS = Terminal("+", name="+") - LPAREN = Terminal("(", name="(") - RPAREN = Terminal(")", name=")") - IDENTIFIER = Terminal("id", name="id") - - table = G().build_table() - tree, errors = runtime.Parser(table).parse( - Tokens(G.IDENTIFIER, G.PLUS, G.LPAREN, G.IDENTIFIER, G.RPAREN) - ) + table = LR0Grammar().build_table() + tree, errors = runtime.Parser(table).parse(Tokens(IDENTIFIER, PLUS, LPAREN, IDENTIFIER, RPAREN)) assert errors == [] assert tree == _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")"))) @@ -67,114 +65,114 @@ def test_lr0_lr0(): def test_lr0_shift_reduce(): """This one should not work in LR0- it has a shift/reduce conflict, but works in SLR1.""" - class G(Grammar): + PLUS = Terminal("+") + LPAREN = Terminal("(") + RPAREN = Terminal(")") + LSQUARE = Terminal("[") + RSQUARE = Terminal("]") + IDENTIFIER = Terminal("id") + + class TestGrammar(Grammar): start = "E" generator = parser.GenerateLR0 @rule def E(self): - return seq(self.E, self.PLUS, self.T) | self.T + return seq(self.E, PLUS, self.T) | self.T @rule def T(self): return ( - seq(self.LPAREN, self.E, self.RPAREN) - | self.IDENTIFIER - | seq(self.IDENTIFIER, self.LSQUARE, self.E, self.RSQUARE) + seq(LPAREN, self.E, RPAREN) | IDENTIFIER | seq(IDENTIFIER, LSQUARE, self.E, RSQUARE) ) - PLUS = Terminal("+") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - LSQUARE = Terminal("[") - RSQUARE = Terminal("]") - IDENTIFIER = Terminal("id") - with pytest.raises(parser.AmbiguityError): - G().build_table() + TestGrammar().build_table() - G().build_table(generator=parser.GenerateSLR1) + TestGrammar().build_table(generator=parser.GenerateSLR1) def test_lr0_reduce_reduce(): """This one should not work, it has a reduce-reduce conflict.""" - class G(Grammar): + PLUS = Terminal("+") + EQUAL = Terminal("=") + LPAREN = Terminal("(") + RPAREN = Terminal(")") + IDENTIFIER = Terminal("id") + + class TestGrammar(Grammar): start = "E" generator = parser.GenerateLR0 @rule def E(self): - return seq(self.E, self.PLUS, self.T) | self.T | seq(self.V, self.EQUAL, self.E) + return seq(self.E, PLUS, self.T) | self.T | seq(self.V, EQUAL, self.E) @rule def T(self): - return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER + return seq(LPAREN, self.E, RPAREN) | IDENTIFIER @rule def V(self): - return self.IDENTIFIER - - PLUS = Terminal("+") - EQUAL = Terminal("=") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - IDENTIFIER = Terminal("id") + return IDENTIFIER with pytest.raises(parser.AmbiguityError): - G().build_table() + TestGrammar().build_table() def test_lr0_empty(): """LR0 can't handle empty productions because it doesn't know when to reduce.""" + BOOP = Terminal("boop") + BEEP = Terminal("beep") - class G(Grammar): + class TestGrammar(Grammar): start = "E" generator = parser.GenerateLR0 @rule def E(self): - return seq(self.F, self.BOOP) + return seq(self.F, BOOP) @rule def F(self): - return self.BEEP | parser.Nothing - - BOOP = Terminal("boop") - BEEP = Terminal("beep") + return BEEP | parser.Nothing with pytest.raises(parser.AmbiguityError): - G().build_table() + TestGrammar().build_table() def test_grammar_aho_ullman_1(): - class G(Grammar): + EQUAL = Terminal("=") + STAR = Terminal("*") + ID = Terminal("id") + + class TestGrammar(Grammar): start = "S" generator = parser.GenerateSLR1 @rule def S(self): - return seq(self.L, self.EQUAL, self.R) | self.R + return seq(self.L, EQUAL, self.R) | self.R @rule def L(self): - return seq(self.STAR, self.R) | self.ID + return seq(STAR, self.R) | ID @rule def R(self): return self.L - EQUAL = Terminal("=") - STAR = Terminal("*") - ID = Terminal("id") - with pytest.raises(parser.AmbiguityError): - G().build_table() + TestGrammar().build_table() - G().build_table(generator=parser.GenerateLR1) + TestGrammar().build_table(generator=parser.GenerateLR1) def test_grammar_aho_ullman_2(): + A = Terminal("a") + B = Terminal("b") + class TestGrammar(Grammar): start = "S" generator = parser.GenerateSLR1 @@ -185,10 +183,7 @@ def test_grammar_aho_ullman_2(): @rule def X(self): - return seq(self.A, self.X) | self.B - - A = Terminal("a") - B = Terminal("b") + return seq(A, self.X) | B TestGrammar().build_table() TestGrammar().build_table(generator=parser.GenerateLR1) @@ -196,6 +191,11 @@ def test_grammar_aho_ullman_2(): def test_fun_lalr(): + PLUS = Terminal("+") + INT = Terminal("int") + ID = Terminal("id") + LPAREN = Terminal("(") + RPAREN = Terminal(")") class TestGrammar(Grammar): start = "S" @@ -207,21 +207,15 @@ def test_fun_lalr(): @rule def E(self): - return self.F | seq(self.E, self.PLUS, self.F) + return self.F | seq(self.E, PLUS, self.F) @rule def F(self): - return self.V | self.INT | seq(self.LPAREN, self.E, self.RPAREN) + return self.V | INT | seq(LPAREN, self.E, RPAREN) @rule def V(self): - return self.ID - - PLUS = Terminal("+") - INT = Terminal("int") - ID = Terminal("id") - LPAREN = Terminal("(") - RPAREN = Terminal(")") + return ID TestGrammar().build_table() @@ -240,14 +234,14 @@ def test_conflicting_names(): to understand. """ + IDENTIFIER = Terminal("Identifier") + class TestGrammar(Grammar): - start = "IDENTIFIER" + start = "Identifier" - @rule("IDENTIFIER") + @rule("Identifier") def identifier(self): - return self.IDENTIFIER - - IDENTIFIER = Terminal("Identifier") + return IDENTIFIER with pytest.raises(ValueError): TestGrammar().build_table() diff --git a/tests/test_lexer.py b/tests/test_lexer.py deleted file mode 100644 index fe442d8..0000000 --- a/tests/test_lexer.py +++ /dev/null @@ -1,384 +0,0 @@ -import collections - -from hypothesis import assume, example, given -from hypothesis.strategies import integers, lists, tuples - -import pytest - -from parser import ( - EdgeList, - Span, - Grammar, - rule, - Terminal, - compile_lexer, - dump_lexer_table, - Re, -) - -from parser.runtime import generic_tokenize - - -def test_span_intersection(): - pairs = [ - ((1, 3), (2, 4)), - ((1, 3), (2, 3)), - ((1, 3), (1, 2)), - ((1, 3), (0, 2)), - ((1, 3), (0, 4)), - ] - - for a, b in pairs: - left = Span(*a) - right = Span(*b) - assert left.intersects(right) - assert right.intersects(left) - - -def test_span_no_intersection(): - pairs = [ - ((1, 2), (3, 4)), - ] - - for a, b in pairs: - left = Span(*a) - right = Span(*b) - assert not left.intersects(right) - assert not right.intersects(left) - - -def test_span_split(): - TC = collections.namedtuple("TC", ["left", "right", "expected"]) - cases = [ - TC( - left=Span(1, 4), - right=Span(2, 3), - expected=(Span(1, 2), Span(2, 3), Span(3, 4)), - ), - TC( - left=Span(1, 4), - right=Span(1, 2), - expected=(None, Span(1, 2), Span(2, 4)), - ), - TC( - left=Span(1, 4), - right=Span(3, 4), - expected=(Span(1, 3), Span(3, 4), None), - ), - TC( - left=Span(1, 4), - right=Span(1, 4), - expected=(None, Span(1, 4), None), - ), - ] - - for left, right, expected in cases: - result = left.split(right) - assert result == expected - - result = right.split(left) - assert result == expected - - -@given(integers(), integers()) -def test_equal_span_mid_only(x, y): - """Splitting spans against themselves results in an empty lo and hi bound.""" - assume(x < y) - span = Span(x, y) - lo, mid, hi = span.split(span) - assert lo is None - assert hi is None - assert mid == span - - -three_distinct_points = lists( - integers(), - min_size=3, - max_size=3, - unique=True, -).map(sorted) - - -@given(three_distinct_points) -def test_span_low_align_lo_none(vals): - """Splitting spans with aligned lower bounds results in an empty lo bound.""" - # x y z - # [ a ) - # [ b ) - x, y, z = vals - - a = Span(x, y) - b = Span(x, z) - lo, _, _ = a.split(b) - - assert lo is None - - -@given(three_distinct_points) -def test_span_high_align_hi_none(vals): - """Splitting spans with aligned lower bounds results in an empty lo bound.""" - # x y z - # [ a ) - # [ b ) - x, y, z = vals - - a = Span(y, z) - b = Span(x, z) - _, _, hi = a.split(b) - - assert hi is None - - -four_distinct_points = lists( - integers(), - min_size=4, - max_size=4, - unique=True, -).map(sorted) - - -@given(four_distinct_points) -def test_span_split_overlapping_lo_left(vals): - """Splitting two overlapping spans results in lo overlapping left.""" - a, b, c, d = vals - - left = Span(a, c) - right = Span(b, d) - - lo, _, _ = left.split(right) - assert lo is not None - assert lo.intersects(left) - - -@given(four_distinct_points) -def test_span_split_overlapping_lo_not_right(vals): - """Splitting two overlapping spans results in lo NOT overlapping right.""" - a, b, c, d = vals - - left = Span(a, c) - right = Span(b, d) - - lo, _, _ = left.split(right) - assert lo is not None - assert not lo.intersects(right) - - -@given(four_distinct_points) -def test_span_split_overlapping_mid_left(vals): - """Splitting two overlapping spans results in mid overlapping left.""" - a, b, c, d = vals - - left = Span(a, c) - right = Span(b, d) - - _, mid, _ = left.split(right) - assert mid is not None - assert mid.intersects(left) - - -@given(four_distinct_points) -def test_span_split_overlapping_mid_right(vals): - """Splitting two overlapping spans results in mid overlapping right.""" - a, b, c, d = vals - - left = Span(a, c) - right = Span(b, d) - - _, mid, _ = left.split(right) - assert mid is not None - assert mid.intersects(right) - - -@given(four_distinct_points) -def test_span_split_overlapping_hi_right(vals): - """Splitting two overlapping spans results in hi overlapping right.""" - a, b, c, d = vals - - left = Span(a, c) - right = Span(b, d) - - _, _, hi = left.split(right) - assert hi is not None - assert hi.intersects(right) - - -@given(four_distinct_points) -def test_span_split_overlapping_hi_not_left(vals): - """Splitting two overlapping spans results in hi NOT overlapping left.""" - a, b, c, d = vals - - left = Span(a, c) - right = Span(b, d) - - _, _, hi = left.split(right) - assert hi is not None - assert not hi.intersects(left) - - -@given(four_distinct_points) -def test_span_split_embedded(vals): - """Splitting two spans where one overlaps the other.""" - a, b, c, d = vals - - outer = Span(a, d) - inner = Span(b, c) - - lo, mid, hi = outer.split(inner) - - assert lo is not None - assert mid is not None - assert hi is not None - - assert lo.intersects(outer) - assert not lo.intersects(inner) - - assert mid.intersects(outer) - assert mid.intersects(inner) - - assert hi.intersects(outer) - assert not hi.intersects(inner) - - -def test_edge_list_single(): - el: EdgeList[str] = EdgeList() - el.add_edge(Span(1, 4), "A") - - edges = list(el) - assert edges == [ - (Span(1, 4), ["A"]), - ] - - -def test_edge_list_fully_enclosed(): - el: EdgeList[str] = EdgeList() - el.add_edge(Span(1, 4), "A") - el.add_edge(Span(2, 3), "B") - - edges = list(el) - assert edges == [ - (Span(1, 2), ["A"]), - (Span(2, 3), ["A", "B"]), - (Span(3, 4), ["A"]), - ] - - -def test_edge_list_overlap(): - el: EdgeList[str] = EdgeList() - el.add_edge(Span(1, 4), "A") - el.add_edge(Span(2, 5), "B") - - edges = list(el) - assert edges == [ - (Span(1, 2), ["A"]), - (Span(2, 4), ["A", "B"]), - (Span(4, 5), ["B"]), - ] - - -def test_edge_list_no_overlap(): - el: EdgeList[str] = EdgeList() - el.add_edge(Span(1, 4), "A") - el.add_edge(Span(5, 8), "B") - - edges = list(el) - assert edges == [ - (Span(1, 4), ["A"]), - (Span(5, 8), ["B"]), - ] - - -def test_edge_list_no_overlap_ordered(): - el: EdgeList[str] = EdgeList() - el.add_edge(Span(5, 8), "B") - el.add_edge(Span(1, 4), "A") - - edges = list(el) - assert edges == [ - (Span(1, 4), ["A"]), - (Span(5, 8), ["B"]), - ] - - -def test_edge_list_overlap_span(): - el: EdgeList[str] = EdgeList() - el.add_edge(Span(1, 3), "A") - el.add_edge(Span(4, 6), "B") - el.add_edge(Span(2, 5), "C") - - edges = list(el) - assert edges == [ - (Span(1, 2), ["A"]), - (Span(2, 3), ["A", "C"]), - (Span(3, 4), ["C"]), - (Span(4, 5), ["B", "C"]), - (Span(5, 6), ["B"]), - ] - - -def test_edge_list_overlap_span_big(): - el: EdgeList[str] = EdgeList() - el.add_edge(Span(2, 3), "A") - el.add_edge(Span(4, 5), "B") - el.add_edge(Span(6, 7), "C") - el.add_edge(Span(1, 8), "D") - - edges = list(el) - assert edges == [ - (Span(1, 2), ["D"]), - (Span(2, 3), ["A", "D"]), - (Span(3, 4), ["D"]), - (Span(4, 5), ["B", "D"]), - (Span(5, 6), ["D"]), - (Span(6, 7), ["C", "D"]), - (Span(7, 8), ["D"]), - ] - - -@given(lists(lists(integers(), min_size=2, max_size=2, unique=True), min_size=1)) -@example(points=[[0, 1], [1, 2]]) -def test_edge_list_always_sorted(points: list[tuple[int, int]]): - # OK this is weird but stick with me. - el: EdgeList[str] = EdgeList() - for i, (a, b) in enumerate(points): - lower = min(a, b) - upper = max(a, b) - - span = Span(lower, upper) - - el.add_edge(span, str(i)) - - last_upper = None - for span, _ in el: - if last_upper is not None: - assert last_upper <= span.lower, "Edges from list are not sorted" - last_upper = span.upper - - -def test_lexer_compile(): - class LexTest(Grammar): - @rule - def foo(self): - return self.IS - - start = foo - - IS = Terminal("is") - AS = Terminal("as") - IDENTIFIER = Terminal( - Re.seq( - Re.set(("a", "z"), ("A", "Z"), "_"), - Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), - ) - ) - BLANKS = Terminal(Re.set("\r", "\n", "\t", " ").plus()) - - lexer = compile_lexer(LexTest()) - dump_lexer_table(lexer) - tokens = list(generic_tokenize("xy is ass", lexer)) - assert tokens == [ - (LexTest.IDENTIFIER, 0, 2), - (LexTest.BLANKS, 2, 1), - (LexTest.IS, 3, 2), - (LexTest.BLANKS, 5, 1), - (LexTest.IDENTIFIER, 6, 3), - ]