diff --git a/grammar.py b/grammar.py index 38299e9..69f4de8 100644 --- a/grammar.py +++ b/grammar.py @@ -2,57 +2,17 @@ import re import typing -import parser -from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule - -ARROW = Terminal("Arrow") -AS = Terminal("As") -BAR = Terminal("Bar") -CLASS = Terminal("Class") -COLON = Terminal("Colon") -ELSE = Terminal("Else") -FOR = Terminal("For") -FUN = Terminal("Fun") -IDENTIFIER = Terminal("Identifier") -IF = Terminal("If") -IMPORT = Terminal("Import") -IN = Terminal("In") -LCURLY = Terminal("LeftBrace") -LET = Terminal("Let") -RCURLY = Terminal("RightBrace") -RETURN = Terminal("Return") -SEMICOLON = Terminal("Semicolon") -STRING = Terminal("String") -WHILE = Terminal("While") -EQUAL = Terminal("Equal") -LPAREN = Terminal("LeftParen") -RPAREN = Terminal("RightParen") -COMMA = Terminal("Comma") -SELF = Terminal("Selff") -OR = Terminal("Or") -IS = Terminal("Is") -AND = Terminal("And") -EQUALEQUAL = Terminal("EqualEqual") -BANGEQUAL = Terminal("BangEqual") -LESS = Terminal("Less") -GREATER = Terminal("Greater") -LESSEQUAL = Terminal("LessEqual") -GREATEREQUAL = Terminal("GreaterEqual") -PLUS = Terminal("Plus") -MINUS = Terminal("Minus") -STAR = Terminal("Star") -SLASH = Terminal("Slash") -NUMBER = Terminal("Number") -TRUE = Terminal("True") -FALSE = Terminal("False") -BANG = Terminal("Bang") -DOT = Terminal("Dot") -MATCH = Terminal("Match") -EXPORT = Terminal("Export") -UNDERSCORE = Terminal("Underscore") -NEW = Terminal("New") -LSQUARE = Terminal("LeftBracket") -RSQUARE = Terminal("RightBracket") +from parser import ( + Assoc, + Grammar, + Nothing, + rule, + seq, + Rule, + Terminal, + Re, +) +from parser.parser import compile_lexer, dump_lexer_table class FineGrammar(Grammar): @@ -62,17 +22,17 @@ class FineGrammar(Grammar): def __init__(self): super().__init__( precedence=[ - (Assoc.RIGHT, [EQUAL]), - (Assoc.LEFT, [OR]), - (Assoc.LEFT, [IS]), - (Assoc.LEFT, [AND]), - (Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]), - 
(Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]), - (Assoc.LEFT, [PLUS, MINUS]), - (Assoc.LEFT, [STAR, SLASH]), + (Assoc.RIGHT, [self.EQUAL]), + (Assoc.LEFT, [self.OR]), + (Assoc.LEFT, [self.IS]), + (Assoc.LEFT, [self.AND]), + (Assoc.LEFT, [self.EQUALEQUAL, self.BANGEQUAL]), + (Assoc.LEFT, [self.LESS, self.GREATER, self.GREATEREQUAL, self.LESSEQUAL]), + (Assoc.LEFT, [self.PLUS, self.MINUS]), + (Assoc.LEFT, [self.STAR, self.SLASH]), (Assoc.LEFT, [self.primary_expression]), - (Assoc.LEFT, [LPAREN]), - (Assoc.LEFT, [DOT]), + (Assoc.LEFT, [self.LPAREN]), + (Assoc.LEFT, [self.DOT]), # # If there's a confusion about whether to make an IF # statement or an expression, prefer the statement. @@ -97,15 +57,15 @@ class FineGrammar(Grammar): @rule def import_statement(self) -> Rule: - return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON) + return seq(self.IMPORT, self.STRING, self.AS, self.IDENTIFIER, self.SEMICOLON) @rule("ClassDeclaration") def class_declaration(self) -> Rule: - return seq(CLASS, IDENTIFIER, self._class_body) + return seq(self.CLASS, self.IDENTIFIER, self._class_body) @rule def _class_body(self) -> Rule: - return seq(LCURLY, RCURLY) | seq(LCURLY, self._class_members, RCURLY) + return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._class_members, self.RCURLY) @rule def _class_members(self) -> Rule: @@ -117,7 +77,7 @@ class FineGrammar(Grammar): @rule("FieldDecl") def field_declaration(self) -> Rule: - return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON) + return seq(self.IDENTIFIER, self.COLON, self.type_expression, self.SEMICOLON) # Types @rule("TypeExpression") @@ -126,60 +86,65 @@ class FineGrammar(Grammar): @rule("AlternateType") def alternate_type(self) -> Rule: - return seq(self.type_expression, OR, self.type_identifier) + return seq(self.type_expression, self.OR, self.type_identifier) @rule("TypeIdentifier") def type_identifier(self) -> Rule: - return IDENTIFIER + return self.IDENTIFIER @rule def export_statement(self) -> Rule: 
return ( - seq(EXPORT, self.class_declaration) - | seq(EXPORT, self.function_declaration) - | seq(EXPORT, self.let_statement) - | seq(EXPORT, self.export_list, SEMICOLON) + seq(self.EXPORT, self.class_declaration) + | seq(self.EXPORT, self.function_declaration) + | seq(self.EXPORT, self.let_statement) + | seq(self.EXPORT, self.export_list, self.SEMICOLON) ) @rule def export_list(self) -> Rule: - return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list) + return Nothing | self.IDENTIFIER | seq(self.IDENTIFIER, self.COMMA, self.export_list) # Functions @rule("FunctionDecl") def function_declaration(self) -> Rule: - return seq(FUN, IDENTIFIER, self.function_parameters, self.block) | seq( - FUN, IDENTIFIER, self.function_parameters, ARROW, self.type_expression, self.block + return seq(self.FUN, self.IDENTIFIER, self.function_parameters, self.block) | seq( + self.FUN, + self.IDENTIFIER, + self.function_parameters, + self.ARROW, + self.type_expression, + self.block, ) @rule("ParamList") def function_parameters(self) -> Rule: return ( - seq(LPAREN, RPAREN) - | seq(LPAREN, self._first_parameter, RPAREN) - | seq(LPAREN, self._first_parameter, COMMA, self._parameter_list, RPAREN) + seq(self.LPAREN, self.RPAREN) + | seq(self.LPAREN, self._first_parameter, self.RPAREN) + | seq(self.LPAREN, self._first_parameter, self.COMMA, self._parameter_list, self.RPAREN) ) @rule def _first_parameter(self) -> Rule: - return SELF | self.parameter + return self.SELF | self.parameter @rule def _parameter_list(self) -> Rule: - return Nothing | self.parameter | seq(self.parameter, COMMA, self._parameter_list) + return Nothing | self.parameter | seq(self.parameter, self.COMMA, self._parameter_list) @rule("Parameter") def parameter(self) -> Rule: - return seq(IDENTIFIER, COLON, self.type_expression) + return seq(self.IDENTIFIER, self.COLON, self.type_expression) # Block @rule("Block") def block(self) -> Rule: return ( - seq(LCURLY, RCURLY) - | seq(LCURLY, self.expression, RCURLY) - | 
seq(LCURLY, self._statement_list, RCURLY) - | seq(LCURLY, self._statement_list, self.expression, RCURLY) + seq(self.LCURLY, self.RCURLY) + | seq(self.LCURLY, self.expression, self.RCURLY) + | seq(self.LCURLY, self._statement_list, self.RCURLY) + | seq(self.LCURLY, self._statement_list, self.expression, self.RCURLY) ) @rule @@ -200,19 +165,19 @@ class FineGrammar(Grammar): @rule("LetStatement") def let_statement(self) -> Rule: - return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON) + return seq(self.LET, self.IDENTIFIER, self.EQUAL, self.expression, self.SEMICOLON) @rule("ReturnStatement") def return_statement(self) -> Rule: - return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON) + return seq(self.RETURN, self.expression, self.SEMICOLON) | seq(self.RETURN, self.SEMICOLON) @rule("ForStatement") def for_statement(self) -> Rule: - return seq(FOR, self.iterator_variable, IN, self.expression, self.block) + return seq(self.FOR, self.iterator_variable, self.IN, self.expression, self.block) @rule("IteratorVariable") def iterator_variable(self) -> Rule: - return IDENTIFIER + return self.IDENTIFIER @rule("IfStatement") def if_statement(self) -> Rule: @@ -220,11 +185,11 @@ class FineGrammar(Grammar): @rule def while_statement(self) -> Rule: - return seq(WHILE, self.expression, self.block) + return seq(self.WHILE, self.expression, self.block) @rule def expression_statement(self) -> Rule: - return seq(self.expression, SEMICOLON) + return seq(self.expression, self.SEMICOLON) # Expressions @rule(transparent=True) @@ -234,91 +199,93 @@ class FineGrammar(Grammar): @rule("BinaryExpression") def binary_expression(self) -> Rule: return ( - seq(self.expression, EQUAL, self.expression) - | seq(self.expression, OR, self.expression) - | seq(self.expression, AND, self.expression) - | seq(self.expression, EQUALEQUAL, self.expression) - | seq(self.expression, BANGEQUAL, self.expression) - | seq(self.expression, LESS, self.expression) - | seq(self.expression, 
LESSEQUAL, self.expression) - | seq(self.expression, GREATER, self.expression) - | seq(self.expression, GREATEREQUAL, self.expression) - | seq(self.expression, PLUS, self.expression) - | seq(self.expression, MINUS, self.expression) - | seq(self.expression, STAR, self.expression) - | seq(self.expression, SLASH, self.expression) + seq(self.expression, self.EQUAL, self.expression) + | seq(self.expression, self.OR, self.expression) + | seq(self.expression, self.AND, self.expression) + | seq(self.expression, self.EQUALEQUAL, self.expression) + | seq(self.expression, self.BANGEQUAL, self.expression) + | seq(self.expression, self.LESS, self.expression) + | seq(self.expression, self.LESSEQUAL, self.expression) + | seq(self.expression, self.GREATER, self.expression) + | seq(self.expression, self.GREATEREQUAL, self.expression) + | seq(self.expression, self.PLUS, self.expression) + | seq(self.expression, self.MINUS, self.expression) + | seq(self.expression, self.STAR, self.expression) + | seq(self.expression, self.SLASH, self.expression) ) @rule("IsExpression") def is_expression(self) -> Rule: - return seq(self.expression, IS, self.pattern) + return seq(self.expression, self.IS, self.pattern) @rule def primary_expression(self) -> Rule: return ( self.identifier_expression | self.literal_expression - | SELF - | seq(BANG, self.primary_expression) - | seq(MINUS, self.primary_expression) + | self.SELF + | seq(self.BANG, self.primary_expression) + | seq(self.MINUS, self.primary_expression) | self.block | self.conditional_expression | self.list_constructor_expression | self.object_constructor_expression | self.match_expression - | seq(self.primary_expression, LPAREN, RPAREN) - | seq(self.primary_expression, LPAREN, self._expression_list, RPAREN) - | seq(self.primary_expression, DOT, IDENTIFIER) - | seq(LPAREN, self.expression, RPAREN) + | seq(self.primary_expression, self.LPAREN, self.RPAREN) + | seq(self.primary_expression, self.LPAREN, self._expression_list, self.RPAREN) + | 
seq(self.primary_expression, self.DOT, self.IDENTIFIER) + | seq(self.LPAREN, self.expression, self.RPAREN) ) @rule("IdentifierExpression") def identifier_expression(self): - return IDENTIFIER + return self.IDENTIFIER @rule("Literal") def literal_expression(self): - return NUMBER | STRING | TRUE | FALSE + return self.NUMBER | self.STRING | self.TRUE | self.FALSE @rule("ConditionalExpression") def conditional_expression(self) -> Rule: return ( - seq(IF, self.expression, self.block) - | seq(IF, self.expression, self.block, ELSE, self.conditional_expression) - | seq(IF, self.expression, self.block, ELSE, self.block) + seq(self.IF, self.expression, self.block) + | seq(self.IF, self.expression, self.block, self.ELSE, self.conditional_expression) + | seq(self.IF, self.expression, self.block, self.ELSE, self.block) ) @rule def list_constructor_expression(self) -> Rule: - return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self._expression_list, RSQUARE) + return seq(self.LSQUARE, self.RSQUARE) | seq( + self.LSQUARE, self._expression_list, self.RSQUARE + ) @rule def _expression_list(self) -> Rule: return ( self.expression - | seq(self.expression, COMMA) - | seq(self.expression, COMMA, self._expression_list) + | seq(self.expression, self.COMMA) + | seq(self.expression, self.COMMA, self._expression_list) ) @rule def match_expression(self) -> Rule: - return seq(MATCH, self.expression, self.match_body) + return seq(self.MATCH, self.expression, self.match_body) @rule("MatchBody") def match_body(self) -> Rule: - return seq(LCURLY, RCURLY) | seq(LCURLY, self._match_arms, RCURLY) + return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._match_arms, self.RCURLY) @rule def _match_arms(self) -> Rule: return ( self.match_arm - | seq(self.match_arm, COMMA) - | seq(self.match_arm, COMMA, self._match_arms) + | seq(self.match_arm, self.COMMA) + | seq(self.match_arm, self.COMMA, self._match_arms) ) @rule("MatchArm") def match_arm(self) -> Rule: - return seq(self.pattern, ARROW, 
self.expression) + return seq(self.pattern, self.ARROW, self.expression) @rule("Pattern") def pattern(self) -> Rule: @@ -330,7 +297,7 @@ class FineGrammar(Grammar): @rule def _pattern_predicate(self) -> Rule: - return seq(AND, self.expression) + return seq(self.AND, self.expression) @rule def _pattern_core(self) -> Rule: @@ -338,60 +305,120 @@ @rule("WildcardPattern") def wildcard_pattern(self) -> Rule: - return UNDERSCORE + return self.UNDERSCORE @rule("VariableBinding") def variable_binding(self) -> Rule: - return seq(IDENTIFIER, COLON) + return seq(self.IDENTIFIER, self.COLON) @rule def object_constructor_expression(self) -> Rule: - return seq(NEW, self.type_identifier, self.field_list) + return seq(self.NEW, self.type_identifier, self.field_list) @rule def field_list(self) -> Rule: - return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY) + return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self.field_values, self.RCURLY) @rule def field_values(self) -> Rule: return ( self.field_value - | seq(self.field_value, COMMA) - | seq(self.field_value, COMMA, self.field_values) + | seq(self.field_value, self.COMMA) + | seq(self.field_value, self.COMMA, self.field_values) ) @rule def field_value(self) -> Rule: - return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression) + return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression) + + BLANK = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) + + ARROW = Terminal("->") + AS = Terminal("as") + BAR = Terminal("|") + CLASS = Terminal("class") + COLON = Terminal(":") + COMMENT = Terminal("comment") + ELSE = Terminal("else") + FOR = Terminal("for") + FUN = Terminal("fun") + IDENTIFIER = Terminal( + Re.seq( + Re.set(("a", "z"), ("A", "Z"), "_"), + Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), + ) + ) + IF = Terminal("if") + IMPORT = Terminal("import") + IN = Terminal("in") + LCURLY = Terminal("{") + LET = Terminal("let") + RCURLY = Terminal("}") + RETURN = 
Terminal("return") + SEMICOLON = Terminal(";") + STRING = Terminal('""') # TODO + WHILE = Terminal("while") + EQUAL = Terminal("=") + LPAREN = Terminal("(") + RPAREN = Terminal(")") + COMMA = Terminal(",") + SELF = Terminal("self", name="SELFF") + OR = Terminal("or") + IS = Terminal("is") + AND = Terminal("and") + EQUALEQUAL = Terminal("==") + BANGEQUAL = Terminal("!=") + LESS = Terminal("<") + GREATER = Terminal(">") + LESSEQUAL = Terminal("<=") + GREATEREQUAL = Terminal(">=") + PLUS = Terminal("+") + MINUS = Terminal("-") + STAR = Terminal("*") + SLASH = Terminal("/") + NUMBER = Terminal(Re.set(("0", "9")).plus()) + TRUE = Terminal("true") + FALSE = Terminal("false") + BANG = Terminal("!") + DOT = Terminal(".") + MATCH = Terminal("match") + EXPORT = Terminal("export") + UNDERSCORE = Terminal("_") + NEW = Terminal("new") + LSQUARE = Terminal("[") + RSQUARE = Terminal("]") # ----------------------------------------------------------------------------- # DORKY LEXER # ----------------------------------------------------------------------------- +import bisect + + NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?") IDENTIFIER_RE = re.compile("[_A-Za-z][_A-Za-z0-9]*") KEYWORD_TABLE = { - "_": UNDERSCORE, - "and": AND, - "as": AS, - "class": CLASS, - "else": ELSE, - "export": EXPORT, - "false": FALSE, - "for": FOR, - "fun": FUN, - "if": IF, - "import": IMPORT, - "in": IN, - "is": IS, - "let": LET, - "match": MATCH, - "new": NEW, - "or": OR, - "return": RETURN, - "self": SELF, - "true": TRUE, - "while": WHILE, + "_": FineGrammar.UNDERSCORE, + "and": FineGrammar.AND, + "as": FineGrammar.AS, + "class": FineGrammar.CLASS, + "else": FineGrammar.ELSE, + "export": FineGrammar.EXPORT, + "false": FineGrammar.FALSE, + "for": FineGrammar.FOR, + "fun": FineGrammar.FUN, + "if": FineGrammar.IF, + "import": FineGrammar.IMPORT, + "in": FineGrammar.IN, + "is": FineGrammar.IS, + "let": FineGrammar.LET, + "match": FineGrammar.MATCH, + "new": FineGrammar.NEW, + "or": 
FineGrammar.OR, + "return": FineGrammar.RETURN, + "self": FineGrammar.SELF, + "true": FineGrammar.TRUE, + "while": FineGrammar.WHILE, } @@ -406,63 +433,63 @@ def tokenize(src: str): token = None if ch == "-": if src[pos : pos + 2] == "->": - token = (ARROW, pos, 2) + token = (FineGrammar.ARROW, pos, 2) else: - token = (MINUS, pos, 1) + token = (FineGrammar.MINUS, pos, 1) elif ch == "|": - token = (BAR, pos, 1) + token = (FineGrammar.BAR, pos, 1) elif ch == ":": - token = (COLON, pos, 1) + token = (FineGrammar.COLON, pos, 1) elif ch == "{": - token = (LCURLY, pos, 1) + token = (FineGrammar.LCURLY, pos, 1) elif ch == "}": - token = (RCURLY, pos, 1) + token = (FineGrammar.RCURLY, pos, 1) elif ch == ";": - token = (SEMICOLON, pos, 1) + token = (FineGrammar.SEMICOLON, pos, 1) elif ch == "=": if src[pos : pos + 2] == "==": - token = (EQUALEQUAL, pos, 2) + token = (FineGrammar.EQUALEQUAL, pos, 2) else: - token = (EQUAL, pos, 1) + token = (FineGrammar.EQUAL, pos, 1) elif ch == "(": - token = (LPAREN, pos, 1) + token = (FineGrammar.LPAREN, pos, 1) elif ch == ")": - token = (RPAREN, pos, 1) + token = (FineGrammar.RPAREN, pos, 1) elif ch == ",": - token = (COMMA, pos, 1) + token = (FineGrammar.COMMA, pos, 1) elif ch == "!": if src[pos : pos + 2] == "!=": - token = (BANGEQUAL, pos, 2) + token = (FineGrammar.BANGEQUAL, pos, 2) else: - token = (BANG, pos, 1) + token = (FineGrammar.BANG, pos, 1) elif ch == "<": if src[pos : pos + 2] == "<=": - token = (LESSEQUAL, pos, 2) + token = (FineGrammar.LESSEQUAL, pos, 2) else: - token = (LESS, pos, 1) + token = (FineGrammar.LESS, pos, 1) elif ch == ">": if src[pos : pos + 2] == ">=": - token = (GREATEREQUAL, pos, 2) + token = (FineGrammar.GREATEREQUAL, pos, 2) else: - token = (GREATER, pos, 1) + token = (FineGrammar.GREATER, pos, 1) elif ch == "+": - token = (PLUS, pos, 1) + token = (FineGrammar.PLUS, pos, 1) elif ch == "*": - token = (STAR, pos, 1) + token = (FineGrammar.STAR, pos, 1) elif ch == "/": if src[pos : pos + 2] == "//": @@ 
-470,16 +497,16 @@ def tokenize(src: str): pos = pos + 1 continue - token = (SLASH, pos, 1) + token = (FineGrammar.SLASH, pos, 1) elif ch == ".": - token = (DOT, pos, 1) + token = (FineGrammar.DOT, pos, 1) elif ch == "[": - token = (LSQUARE, pos, 1) + token = (FineGrammar.LSQUARE, pos, 1) elif ch == "]": - token = (RSQUARE, pos, 1) + token = (FineGrammar.RSQUARE, pos, 1) elif ch == '"' or ch == "'": end = pos + 1 @@ -490,12 +517,12 @@ def tokenize(src: str): if end == len(src): raise Exception(f"Unterminated string constant at {pos}") end += 1 - token = (STRING, pos, end - pos) + token = (FineGrammar.STRING, pos, end - pos) else: number_match = NUMBER_RE.match(src, pos) if number_match: - token = (NUMBER, pos, number_match.end() - pos) + token = (FineGrammar.NUMBER, pos, number_match.end() - pos) else: id_match = IDENTIFIER_RE.match(src, pos) if id_match: @@ -504,7 +531,7 @@ def tokenize(src: str): if keyword: token = (keyword, pos, len(fragment)) else: - token = (IDENTIFIER, pos, len(fragment)) + token = (FineGrammar.IDENTIFIER, pos, len(fragment)) if token is None: raise Exception("Token error") @@ -512,9 +539,6 @@ def tokenize(src: str): pos += token[2] -import bisect - - class FineTokens: def __init__(self, src: str): self.src = src @@ -546,4 +570,8 @@ class FineTokens: if __name__ == "__main__": - FineGrammar().build_table() + grammar = FineGrammar() + grammar.build_table() + + lexer = compile_lexer(grammar) + dump_lexer_table(lexer) diff --git a/parser/parser.py b/parser/parser.py index d0cb1fc..8a23d4e 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -21,19 +21,20 @@ To get started, create a grammar that derives from the `Grammar` class. Create one method per nonterminal, decorated with the `rule` decorator. 
Here's an example: - PLUS = Terminal('+') - LPAREN = Terminal('(') - RPAREN = Terminal(')') - ID = Terminal('id') class SimpleGrammar(Grammar): @rule def expression(self): - return seq(self.expression, PLUS, self.term) | self.term + return seq(self.expression, self.PLUS, self.term) | self.term @rule def term(self): - return seq(LPAREN, self.expression, RPAREN) | ID + return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID + + PLUS = Terminal('+') + LPAREN = Terminal('(') + RPAREN = Terminal(')') + ID = Terminal('id') ## Using grammars @@ -130,13 +131,13 @@ May 2024 """ import abc +import bisect import collections import dataclasses import enum import functools import inspect import json -import sys import typing @@ -1605,15 +1606,20 @@ class Rule: class Terminal(Rule): """A token, or terminal symbol in the grammar.""" - value: str + value: str | None + pattern: "str | Re" - def __init__(self, value): - self.value = sys.intern(value) + def __init__(self, pattern, name=None): + self.value = name + self.pattern = pattern def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]: # We are just ourselves when flattened. yield [self] + def __repr__(self) -> str: + return self.value or "???" + class NonTerminal(Rule): """A non-terminal, or a production, in the grammar. @@ -1766,19 +1772,20 @@ class Grammar: Here's an example of a simple grammar: - PLUS = Terminal('+') - LPAREN = Terminal('(') - RPAREN = Terminal(')') - ID = Terminal('id') - class SimpleGrammar(Grammar): @rule def expression(self): - return seq(self.expression, PLUS, self.term) | self.term + return seq(self.expression, self.PLUS, self.term) | self.term @rule def term(self): - return seq(LPAREN, self.expression, RPAREN) | ID + return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID + + PLUS = Terminal('+') + LPAREN = Terminal('(') + RPAREN = Terminal(')') + ID = Terminal('id') + Not very exciting, perhaps, but it's something. 
""" @@ -1786,6 +1793,7 @@ class Grammar: _precedence: dict[str, typing.Tuple[Assoc, int]] _start: str _generator: type[GenerateLR0] + _terminals: list[Terminal] def __init__( self, @@ -1809,6 +1817,14 @@ class Grammar: generator = getattr(self, "generator", GenerateLALR) assert generator is not None + # Fixup terminal names with the name of the member that declared it. + terminals = [] + for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)): + if t.value is None: + t.value = n + terminals.append(t) + + # Fix up the precedence table. precedence_table = {} for prec, (associativity, symbols) in enumerate(precedence): for symbol in symbols: @@ -1824,6 +1840,11 @@ class Grammar: self._precedence = precedence_table self._start = start self._generator = generator + self._terminals = terminals + + @property + def terminals(self) -> list[Terminal]: + return self._terminals def generate_nonterminal_dict( self, start: str | None = None @@ -1911,3 +1932,526 @@ class Grammar: gen = generator(start, desugared, precedence=self._precedence, transparents=transparents) table = gen.gen_table() return table + + +############################################################################### +# Lexer support +############################################################################### +# For machine-generated lexers + + +@dataclasses.dataclass(frozen=True, slots=True) +class Span: + lower: int # inclusive + upper: int # exclusive + + @classmethod + def from_str(cls, lower: str, upper: str | None = None) -> "Span": + lo = ord(lower) + if upper is None: + hi = lo + 1 + else: + hi = ord(upper) + 1 + + return Span(lower=lo, upper=hi) + + def __len__(self) -> int: + return self.upper - self.lower + + def intersects(self, other: "Span") -> bool: + """Determine if this span intersects the other span.""" + return self.lower < other.upper and self.upper > other.lower + + def split(self, other: "Span") -> tuple["Span|None", "Span|None", "Span|None"]: + """Split two 
possibly-intersecting spans into three regions: a low + region, which covers just the lower part of the union, a mid region, + which covers the intersection, and a hi region, which covers just the + upper part of the union. + + Together, low and high cover the union of the two spans. Mid covers + the intersection. The implication is that if both spans are identical + then the low and high regions will both be None and mid will be equal + to both. + + Graphically, given two spans A and B: + + [ B ) + [ A ) + [ lo )[ mid )[ hi ) + + If the lower bounds align then the `lo` region is empty: + + [ B ) + [ A ) + [ mid )[ hi ) + + If the upper bounds align then the `hi` region is empty: + + [ B ) + [ A ) + [ lo )[ mid ) + + If both bounds align then both are empty: + + [ B ) + [ A ) + [ mid ) + + split is reflexive: it doesn't matter which order you split things in, + you will always get the same output spans, in the same order. + """ + if not self.intersects(other): + if self.lower < other.lower: + return (self, None, other) + else: + return (other, None, self) + + first = min(self.lower, other.lower) + second = max(self.lower, other.lower) + third = min(self.upper, other.upper) + fourth = max(self.upper, other.upper) + + low = Span(first, second) if first != second else None + mid = Span(second, third) + hi = Span(third, fourth) if third != fourth else None + + return (low, mid, hi) + + def __str__(self) -> str: + return f"[{self.lower}-{self.upper})" + + +ET = typing.TypeVar("ET") + + +class EdgeList[ET]: + """A list of edge transitions, keyed by *span*.""" + + _edges: list[tuple[Span, list[ET]]] + + def __init__(self): + self._edges = [] + + def __iter__(self) -> typing.Iterator[tuple[Span, list[ET]]]: + return iter(self._edges) + + def __repr__(self) -> str: + return f"EdgeList[{','.join(str(s[0]) + '->' + repr(s[1]) for s in self._edges)}]" + + def add_edge(self, c: Span, s: ET): + """Add an edge for the given span to the list. 
If there are already + spans that overlap this one, split and generating multiple distinct + edges. + """ + our_targets = [s] + + # Look to see where we would put this span based solely on a sort of + # lower bounds: find the lowest upper bound that is greater than the + # lower bound of the incoming span. + point = bisect.bisect_right(self._edges, c.lower, key=lambda x: x[0].upper) + + # We might need to run this in multiple iterations because we keep + # splitting against the *lowest* matching span. + next_span: Span | None = c + while next_span is not None: + c = next_span + next_span = None + + # print(f" incoming: {self} @ {point} <- {c}->[{s}]") + + # Check to see if we've run off the end of the list of spans. + if point == len(self._edges): + self._edges.insert(point, (c, [s])) + # print(f" trivial end: {self}") + return + + # Nope, pull out the span to the right of us. + right_span, right_targets = self._edges[point] + + # Because we intersect at least a little bit we know that we need to + # split and keep processing. + del self._edges[point] + lo, mid, hi = c.split(right_span) # Remember the semantics + # print(f" -> {c} splits {right_span} -> {lo}, {mid}, {hi} @{point}") + + # We do this from lo to hi, lo first. + if lo is not None: + # NOTE: lo will never intersect both no matter what. + if lo.intersects(right_span): + assert not lo.intersects(c) + targets = right_targets + else: + assert lo.intersects(c) + targets = our_targets + + self._edges.insert(point, (lo, targets)) + point += 1 # Adjust the insertion point, important for us to keep running. + + if mid is not None: + # If mid exists it is known to intersect with both so we can just + # do it. + self._edges.insert(point, (mid, right_targets + our_targets)) + point += 1 # Adjust the insertion point, important for us to keep running. + + if hi is not None: + # NOTE: Just like lo, hi will never intersect both no matter what. 
+ if hi.intersects(right_span): + # If hi intersects the right span then we're done, no + # need to keep running. + assert not hi.intersects(c) + self._edges.insert(point, (hi, right_targets)) + + else: + # BUT! If hi intersects the incoming span then what we + # need to do is to replace the incoming span with hi + # (having chopped off the lower part of the incoming + # span) and continue to execute with only the upper part + # of the incoming span. + # + # Why? Because the upper part of the incoming span might + # intersect *more* spans, in which case we need to keep + # splitting and merging targets. + assert hi.intersects(c) + next_span = hi + + # print(f" result: {self}") + + +class NFAState: + """An NFA state. Each state can be the accept state, with one or more + Terminals as the result.""" + + accept: list[Terminal] + epsilons: list["NFAState"] + _edges: EdgeList["NFAState"] + + def __init__(self): + self.accept = [] + self.epsilons = [] + self._edges = EdgeList() + + def __repr__(self): + return f"State{id(self)}" + + def edges(self) -> typing.Iterable[tuple[Span, list["NFAState"]]]: + return self._edges + + def add_edge(self, c: Span, s: "NFAState") -> "NFAState": + self._edges.add_edge(c, s) + return s + + def dump_graph(self, name="nfa.dot"): + with open(name, "w", encoding="utf8") as f: + f.write("digraph G {\n") + + stack: list[NFAState] = [self] + visited = set() + while len(stack) > 0: + state = stack.pop() + if state in visited: + continue + visited.add(state) + + label = ", ".join([t.value for t in state.accept if t.value is not None]) + f.write(f' {id(state)} [label="{label}"];\n') + for target in state.epsilons: + stack.append(target) + f.write(f' {id(state)} -> {id(target)} [label="\u03B5"];\n') + + for span, targets in state.edges(): + label = str(span).replace('"', '\\"') + for target in targets: + stack.append(target) + f.write(f' {id(state)} -> {id(target)} [label="{label}"];\n') + + f.write("}\n") + + +@dataclasses.dataclass +class Re: + 
def to_nfa(self, start: NFAState) -> NFAState: + del start + raise NotImplementedError() + + def __str__(self) -> str: + raise NotImplementedError() + + @classmethod + def seq(cls, *values: "Re") -> "Re": + result = values[0] + for v in values[1:]: + result = RegexSequence(result, v) + return result + + @classmethod + def literal(cls, value: str) -> "Re": + return cls.seq(*[RegexLiteral.from_ranges(c) for c in value]) + + @classmethod + def set(cls, *args: str | tuple[str, str]) -> "Re": + return RegexLiteral.from_ranges(*args) + + def plus(self) -> "Re": + return RegexPlus(self) + + def star(self) -> "Re": + return RegexStar(self) + + def question(self) -> "Re": + return RegexQuestion(self) + + def __or__(self, value: "Re", /) -> "Re": + return RegexAlternation(self, value) + + +@dataclasses.dataclass +class RegexLiteral(Re): + values: list[Span] + + @classmethod + def from_ranges(cls, *args: str | tuple[str, str]) -> "RegexLiteral": + values = [] + for a in args: + if isinstance(a, str): + values.append(Span.from_str(a)) + else: + values.append(Span.from_str(a[0], a[1])) + + return RegexLiteral(values) + + def to_nfa(self, start: NFAState) -> NFAState: + end = NFAState() + for span in self.values: + start.add_edge(span, end) + return end + + def __str__(self) -> str: + if len(self.values) == 1: + span = self.values[0] + if len(span) == 1: + return chr(span.lower) + + ranges = [] + for span in self.values: + start = chr(span.lower) + end = chr(span.upper - 1) + if start == end: + ranges.append(start) + else: + ranges.append(f"{start}-{end}") + return "[{}]".format("".join(ranges)) + + +@dataclasses.dataclass +class RegexPlus(Re): + child: Re + + def to_nfa(self, start: NFAState) -> NFAState: + end = self.child.to_nfa(start) + end.epsilons.append(start) + return end + + def __str__(self) -> str: + return f"({self.child})+" + + +@dataclasses.dataclass +class RegexStar(Re): + child: Re + + def to_nfa(self, start: NFAState) -> NFAState: + end = 
self.child.to_nfa(start) + end.epsilons.append(start) + start.epsilons.append(end) + return end + + def __str__(self) -> str: + return f"({self.child})*" + + +@dataclasses.dataclass +class RegexQuestion(Re): + child: Re + + def to_nfa(self, start: NFAState) -> NFAState: + end = self.child.to_nfa(start) + start.epsilons.append(end) + return end + + def __str__(self) -> str: + return f"({self.child})?" + + +@dataclasses.dataclass +class RegexSequence(Re): + left: Re + right: Re + + def to_nfa(self, start: NFAState) -> NFAState: + mid = self.left.to_nfa(start) + return self.right.to_nfa(mid) + + def __str__(self) -> str: + return f"{self.left}{self.right}" + + +@dataclasses.dataclass +class RegexAlternation(Re): + left: Re + right: Re + + def to_nfa(self, start: NFAState) -> NFAState: + left_start = NFAState() + start.epsilons.append(left_start) + left_end = self.left.to_nfa(left_start) + + right_start = NFAState() + start.epsilons.append(right_start) + right_end = self.right.to_nfa(right_start) + + end = NFAState() + left_end.epsilons.append(end) + right_end.epsilons.append(end) + + return end + + def __str__(self) -> str: + return f"(({self.left})||({self.right}))" + + +LexerTable = list[tuple[Terminal | None, list[tuple[Span, int]]]] + + +class NFASuperState: + states: frozenset[NFAState] + + def __init__(self, states: typing.Iterable[NFAState]): + # Close over the given states, including every state that is + # reachable by epsilon-transition. 
+ stack = list(states) + result = set() + while len(stack) > 0: + st = stack.pop() + if st in result: + continue + result.add(st) + stack.extend(st.epsilons) + + self.states = frozenset(result) + + def __eq__(self, other): + if not isinstance(other, NFASuperState): + return False + return self.states == other.states + + def __hash__(self) -> int: + return hash(self.states) + + def edges(self) -> list[tuple[Span, "NFASuperState"]]: + working: EdgeList[list[NFAState]] = EdgeList() + for st in self.states: + for span, targets in st.edges(): + working.add_edge(span, targets) + + # EdgeList maps span to list[list[State]] which we want to flatten. + last_upper = None + result = [] + for span, stateses in working: + if last_upper is not None: + assert last_upper <= span.lower + last_upper = span.upper + + s: list[NFAState] = [] + for states in stateses: + s.extend(states) + + result.append((span, NFASuperState(s))) + + if len(result) > 0: + for i in range(0, len(result) - 1): + span = result[i][0] + next_span = result[i + 1][0] + assert span.upper <= next_span.lower + + # TODO: Merge spans that are adjacent and go to the same state. + + return result + + def accept_terminal(self) -> Terminal | None: + accept = None + for st in self.states: + for ac in st.accept: + if accept is None: + accept = ac + elif accept.value != ac.value: + accept_regex = isinstance(accept.pattern, Re) + ac_regex = isinstance(ac.pattern, Re) + + if accept_regex and not ac_regex: + accept = ac + elif ac_regex and not accept_regex: + pass + else: + raise ValueError( + f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')" + ) + + return accept + + +def compile_lexer(x: Grammar) -> LexerTable: + # Parse the terminals all together into a big NFA rooted at `NFA`. 
+ NFA = NFAState() + for terminal in x.terminals: + start = NFAState() + NFA.epsilons.append(start) + + pattern = terminal.pattern + if isinstance(pattern, Re): + ending = pattern.to_nfa(start) + else: + ending = start + for c in pattern: + ending = ending.add_edge(Span.from_str(c), NFAState()) + + ending.accept.append(terminal) + + NFA.dump_graph() + + # Convert the NFA into a DFA in the most straightforward way (by tracking + # sets of state closures, called SuperStates.) + DFA: dict[NFASuperState, tuple[int, list[tuple[Span, NFASuperState]]]] = {} + + stack = [NFASuperState([NFA])] + while len(stack) > 0: + ss = stack.pop() + if ss in DFA: + continue + + edges = ss.edges() + + DFA[ss] = (len(DFA), edges) + for _, target in edges: + stack.append(target) + + return [ + ( + ss.accept_terminal(), + [(k, DFA[v][0]) for k, v in edges], + ) + for ss, (_, edges) in DFA.items() + ] + + +def dump_lexer_table(table: LexerTable): + with open("lexer.dot", "w", encoding="utf-8") as f: + f.write("digraph G {\n") + for index, (accept, edges) in enumerate(table): + label = accept.value if accept is not None else "" + f.write(f' {index} [label="{label}"];\n') + for span, target in edges: + label = str(span).replace('"', '\\"') + f.write(f' {index} -> {target} [label="{label}"];\n') + + pass + f.write("}\n") diff --git a/parser/runtime.py b/parser/runtime.py index f5be3a4..124bc7b 100644 --- a/parser/runtime.py +++ b/parser/runtime.py @@ -430,3 +430,58 @@ class Parser: error_strings.append(f"{line_index}:{column_index}: {parse_error.message}") return (result, error_strings) + + +def generic_tokenize( + src: str, table: parser.LexerTable +) -> typing.Iterable[tuple[parser.Terminal, int, int]]: + pos = 0 + state = 0 + start = 0 + last_accept = None + last_accept_pos = 0 + + print(f"LEXING: {src} ({len(src)})") + + while pos < len(src): + while state is not None: + accept, edges = table[state] + if accept is not None: + last_accept = accept + last_accept_pos = pos + + print(f" @ 
{pos} state: {state} ({accept})") + if pos >= len(src): + break + + char = ord(src[pos]) + print(f" -> char: {char} ({repr(src[pos])})") + + # Find the index of the span where the upper value is the tightest + # bound on the character. + state = None + index = bisect.bisect_right(edges, char, key=lambda x: x[0].upper) + print(f" -> {index}") + if index < len(edges): + span, target = edges[index] + print(f" -> {span}, {target}") + if char >= span.lower: + print(f" -> target: {target}") + state = target + pos += 1 + + else: + print(f" Nope (outside range)") + else: + print(f" Nope (at end)") + + if last_accept is None: + raise Exception(f"Token error at {pos}") + + yield (last_accept, start, last_accept_pos - start) + + print(f" Yield: {last_accept}, reset to {last_accept_pos}") + last_accept = None + pos = last_accept_pos + start = pos + state = 0 diff --git a/pdm.lock b/pdm.lock index b80bf6d..a937da9 100644 --- a/pdm.lock +++ b/pdm.lock @@ -3,9 +3,26 @@ [metadata] groups = ["default", "dev"] -strategy = ["cross_platform", "inherit_metadata"] -lock_version = "4.4.1" -content_hash = "sha256:143b06c001132ba589a47b2b3a498dd54f4840d95d216c794068089fcea48d4d" +strategy = ["inherit_metadata"] +lock_version = "4.5.0" +content_hash = "sha256:c4fec06f95402db1e9843df4a8a4a275273c6ec4f41f192f30d8a92ee52d15ea" + +[[metadata.targets]] +requires_python = ">=3.12" + +[[package]] +name = "attrs" +version = "24.2.0" +requires_python = ">=3.7" +summary = "Classes Without Boilerplate" +groups = ["dev"] +dependencies = [ + "importlib-metadata; python_version < \"3.8\"", +] +files = [ + {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, + {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, +] [[package]] name = "colorama" @@ -19,6 +36,22 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = 
"sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "hypothesis" +version = "6.111.1" +requires_python = ">=3.8" +summary = "A library for property-based testing" +groups = ["dev"] +dependencies = [ + "attrs>=22.2.0", + "exceptiongroup>=1.0.0; python_version < \"3.11\"", + "sortedcontainers<3.0.0,>=2.1.0", +] +files = [ + {file = "hypothesis-6.111.1-py3-none-any.whl", hash = "sha256:9422adbac4b2104f6cf92dc6604b5c9df975efc08ffc7145ecc39bc617243835"}, + {file = "hypothesis-6.111.1.tar.gz", hash = "sha256:6ab6185a858fa692bf125c0d0a936134edc318bee01c05e407c71c9ead0b61c5"}, +] + [[package]] name = "iniconfig" version = "2.0.0" @@ -60,11 +93,23 @@ summary = "pytest: simple powerful testing with Python" groups = ["dev"] dependencies = [ "colorama; sys_platform == \"win32\"", + "exceptiongroup>=1.0.0rc8; python_version < \"3.11\"", "iniconfig", "packaging", "pluggy<2.0,>=1.5", + "tomli>=1; python_version < \"3.11\"", ] files = [ {file = "pytest-8.2.2-py3-none-any.whl", hash = "sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343"}, {file = "pytest-8.2.2.tar.gz", hash = "sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977"}, ] + +[[package]] +name = "sortedcontainers" +version = "2.4.0" +summary = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +groups = ["dev"] +files = [ + {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, + {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, +] diff --git a/pyproject.toml b/pyproject.toml index 1e28adc..c7721e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ distribution = true [tool.pdm.dev-dependencies] dev = [ "pytest>=8.2.2", + "hypothesis>=6.111.1", ] [tool.pyright] diff --git a/tests/test_grammar.py b/tests/test_grammar.py index a320e06..26e5057 100644 --- 
a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -38,25 +38,27 @@ def _tree(treeform) -> runtime.Tree | runtime.TokenValue: def test_lr0_lr0(): """An LR0 grammar should work with an LR0 generator.""" - PLUS = Terminal("+") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - IDENTIFIER = Terminal("id") - - class LR0Grammar(Grammar): + class G(Grammar): start = "E" generator = parser.GenerateLR0 @rule def E(self): - return seq(self.E, PLUS, self.T) | self.T + return seq(self.E, self.PLUS, self.T) | self.T @rule def T(self): - return seq(LPAREN, self.E, RPAREN) | IDENTIFIER + return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER - table = LR0Grammar().build_table() - tree, errors = runtime.Parser(table).parse(Tokens(IDENTIFIER, PLUS, LPAREN, IDENTIFIER, RPAREN)) + PLUS = Terminal("+", name="+") + LPAREN = Terminal("(", name="(") + RPAREN = Terminal(")", name=")") + IDENTIFIER = Terminal("id", name="id") + + table = G().build_table() + tree, errors = runtime.Parser(table).parse( + Tokens(G.IDENTIFIER, G.PLUS, G.LPAREN, G.IDENTIFIER, G.RPAREN) + ) assert errors == [] assert tree == _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")"))) @@ -65,114 +67,114 @@ def test_lr0_lr0(): def test_lr0_shift_reduce(): """This one should not work in LR0- it has a shift/reduce conflict, but works in SLR1.""" - PLUS = Terminal("+") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - LSQUARE = Terminal("[") - RSQUARE = Terminal("]") - IDENTIFIER = Terminal("id") - - class TestGrammar(Grammar): + class G(Grammar): start = "E" generator = parser.GenerateLR0 @rule def E(self): - return seq(self.E, PLUS, self.T) | self.T + return seq(self.E, self.PLUS, self.T) | self.T @rule def T(self): return ( - seq(LPAREN, self.E, RPAREN) | IDENTIFIER | seq(IDENTIFIER, LSQUARE, self.E, RSQUARE) + seq(self.LPAREN, self.E, self.RPAREN) + | self.IDENTIFIER + | seq(self.IDENTIFIER, self.LSQUARE, self.E, self.RSQUARE) ) - with pytest.raises(parser.AmbiguityError): - 
TestGrammar().build_table() + PLUS = Terminal("+") + LPAREN = Terminal("(") + RPAREN = Terminal(")") + LSQUARE = Terminal("[") + RSQUARE = Terminal("]") + IDENTIFIER = Terminal("id") - TestGrammar().build_table(generator=parser.GenerateSLR1) + with pytest.raises(parser.AmbiguityError): + G().build_table() + + G().build_table(generator=parser.GenerateSLR1) def test_lr0_reduce_reduce(): """This one should not work, it has a reduce-reduce conflict.""" - PLUS = Terminal("+") - EQUAL = Terminal("=") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - IDENTIFIER = Terminal("id") - - class TestGrammar(Grammar): + class G(Grammar): start = "E" generator = parser.GenerateLR0 @rule def E(self): - return seq(self.E, PLUS, self.T) | self.T | seq(self.V, EQUAL, self.E) + return seq(self.E, self.PLUS, self.T) | self.T | seq(self.V, self.EQUAL, self.E) @rule def T(self): - return seq(LPAREN, self.E, RPAREN) | IDENTIFIER + return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER @rule def V(self): - return IDENTIFIER + return self.IDENTIFIER + + PLUS = Terminal("+") + EQUAL = Terminal("=") + LPAREN = Terminal("(") + RPAREN = Terminal(")") + IDENTIFIER = Terminal("id") with pytest.raises(parser.AmbiguityError): - TestGrammar().build_table() + G().build_table() def test_lr0_empty(): """LR0 can't handle empty productions because it doesn't know when to reduce.""" - BOOP = Terminal("boop") - BEEP = Terminal("beep") - class TestGrammar(Grammar): + class G(Grammar): start = "E" generator = parser.GenerateLR0 @rule def E(self): - return seq(self.F, BOOP) + return seq(self.F, self.BOOP) @rule def F(self): - return BEEP | parser.Nothing + return self.BEEP | parser.Nothing + + BOOP = Terminal("boop") + BEEP = Terminal("beep") with pytest.raises(parser.AmbiguityError): - TestGrammar().build_table() + G().build_table() def test_grammar_aho_ullman_1(): - EQUAL = Terminal("=") - STAR = Terminal("*") - ID = Terminal("id") - - class TestGrammar(Grammar): + class G(Grammar): start = "S" 
generator = parser.GenerateSLR1 @rule def S(self): - return seq(self.L, EQUAL, self.R) | self.R + return seq(self.L, self.EQUAL, self.R) | self.R @rule def L(self): - return seq(STAR, self.R) | ID + return seq(self.STAR, self.R) | self.ID @rule def R(self): return self.L - with pytest.raises(parser.AmbiguityError): - TestGrammar().build_table() + EQUAL = Terminal("=") + STAR = Terminal("*") + ID = Terminal("id") - TestGrammar().build_table(generator=parser.GenerateLR1) + with pytest.raises(parser.AmbiguityError): + G().build_table() + + G().build_table(generator=parser.GenerateLR1) def test_grammar_aho_ullman_2(): - A = Terminal("a") - B = Terminal("b") - class TestGrammar(Grammar): start = "S" generator = parser.GenerateSLR1 @@ -183,7 +185,10 @@ def test_grammar_aho_ullman_2(): @rule def X(self): - return seq(A, self.X) | B + return seq(self.A, self.X) | self.B + + A = Terminal("a") + B = Terminal("b") TestGrammar().build_table() TestGrammar().build_table(generator=parser.GenerateLR1) @@ -191,11 +196,6 @@ def test_grammar_aho_ullman_2(): def test_fun_lalr(): - PLUS = Terminal("+") - INT = Terminal("int") - ID = Terminal("id") - LPAREN = Terminal("(") - RPAREN = Terminal(")") class TestGrammar(Grammar): start = "S" @@ -207,15 +207,21 @@ def test_fun_lalr(): @rule def E(self): - return self.F | seq(self.E, PLUS, self.F) + return self.F | seq(self.E, self.PLUS, self.F) @rule def F(self): - return self.V | INT | seq(LPAREN, self.E, RPAREN) + return self.V | self.INT | seq(self.LPAREN, self.E, self.RPAREN) @rule def V(self): - return ID + return self.ID + + PLUS = Terminal("+") + INT = Terminal("int") + ID = Terminal("id") + LPAREN = Terminal("(") + RPAREN = Terminal(")") TestGrammar().build_table() @@ -234,14 +240,14 @@ def test_conflicting_names(): to understand. 
""" - IDENTIFIER = Terminal("Identifier") - class TestGrammar(Grammar): - start = "Identifier" + start = "IDENTIFIER" - @rule("Identifier") + @rule("IDENTIFIER") def identifier(self): - return IDENTIFIER + return self.IDENTIFIER + + IDENTIFIER = Terminal("Identifier") with pytest.raises(ValueError): TestGrammar().build_table() diff --git a/tests/test_lexer.py b/tests/test_lexer.py new file mode 100644 index 0000000..fe442d8 --- /dev/null +++ b/tests/test_lexer.py @@ -0,0 +1,384 @@ +import collections + +from hypothesis import assume, example, given +from hypothesis.strategies import integers, lists, tuples + +import pytest + +from parser import ( + EdgeList, + Span, + Grammar, + rule, + Terminal, + compile_lexer, + dump_lexer_table, + Re, +) + +from parser.runtime import generic_tokenize + + +def test_span_intersection(): + pairs = [ + ((1, 3), (2, 4)), + ((1, 3), (2, 3)), + ((1, 3), (1, 2)), + ((1, 3), (0, 2)), + ((1, 3), (0, 4)), + ] + + for a, b in pairs: + left = Span(*a) + right = Span(*b) + assert left.intersects(right) + assert right.intersects(left) + + +def test_span_no_intersection(): + pairs = [ + ((1, 2), (3, 4)), + ] + + for a, b in pairs: + left = Span(*a) + right = Span(*b) + assert not left.intersects(right) + assert not right.intersects(left) + + +def test_span_split(): + TC = collections.namedtuple("TC", ["left", "right", "expected"]) + cases = [ + TC( + left=Span(1, 4), + right=Span(2, 3), + expected=(Span(1, 2), Span(2, 3), Span(3, 4)), + ), + TC( + left=Span(1, 4), + right=Span(1, 2), + expected=(None, Span(1, 2), Span(2, 4)), + ), + TC( + left=Span(1, 4), + right=Span(3, 4), + expected=(Span(1, 3), Span(3, 4), None), + ), + TC( + left=Span(1, 4), + right=Span(1, 4), + expected=(None, Span(1, 4), None), + ), + ] + + for left, right, expected in cases: + result = left.split(right) + assert result == expected + + result = right.split(left) + assert result == expected + + +@given(integers(), integers()) +def test_equal_span_mid_only(x, y): + 
"""Splitting spans against themselves results in an empty lo and hi bound.""" + assume(x < y) + span = Span(x, y) + lo, mid, hi = span.split(span) + assert lo is None + assert hi is None + assert mid == span + + +three_distinct_points = lists( + integers(), + min_size=3, + max_size=3, + unique=True, +).map(sorted) + + +@given(three_distinct_points) +def test_span_low_align_lo_none(vals): + """Splitting spans with aligned lower bounds results in an empty lo bound.""" + # x y z + # [ a ) + # [ b ) + x, y, z = vals + + a = Span(x, y) + b = Span(x, z) + lo, _, _ = a.split(b) + + assert lo is None + + +@given(three_distinct_points) +def test_span_high_align_hi_none(vals): + """Splitting spans with aligned lower bounds results in an empty lo bound.""" + # x y z + # [ a ) + # [ b ) + x, y, z = vals + + a = Span(y, z) + b = Span(x, z) + _, _, hi = a.split(b) + + assert hi is None + + +four_distinct_points = lists( + integers(), + min_size=4, + max_size=4, + unique=True, +).map(sorted) + + +@given(four_distinct_points) +def test_span_split_overlapping_lo_left(vals): + """Splitting two overlapping spans results in lo overlapping left.""" + a, b, c, d = vals + + left = Span(a, c) + right = Span(b, d) + + lo, _, _ = left.split(right) + assert lo is not None + assert lo.intersects(left) + + +@given(four_distinct_points) +def test_span_split_overlapping_lo_not_right(vals): + """Splitting two overlapping spans results in lo NOT overlapping right.""" + a, b, c, d = vals + + left = Span(a, c) + right = Span(b, d) + + lo, _, _ = left.split(right) + assert lo is not None + assert not lo.intersects(right) + + +@given(four_distinct_points) +def test_span_split_overlapping_mid_left(vals): + """Splitting two overlapping spans results in mid overlapping left.""" + a, b, c, d = vals + + left = Span(a, c) + right = Span(b, d) + + _, mid, _ = left.split(right) + assert mid is not None + assert mid.intersects(left) + + +@given(four_distinct_points) +def 
test_span_split_overlapping_mid_right(vals): + """Splitting two overlapping spans results in mid overlapping right.""" + a, b, c, d = vals + + left = Span(a, c) + right = Span(b, d) + + _, mid, _ = left.split(right) + assert mid is not None + assert mid.intersects(right) + + +@given(four_distinct_points) +def test_span_split_overlapping_hi_right(vals): + """Splitting two overlapping spans results in hi overlapping right.""" + a, b, c, d = vals + + left = Span(a, c) + right = Span(b, d) + + _, _, hi = left.split(right) + assert hi is not None + assert hi.intersects(right) + + +@given(four_distinct_points) +def test_span_split_overlapping_hi_not_left(vals): + """Splitting two overlapping spans results in hi NOT overlapping left.""" + a, b, c, d = vals + + left = Span(a, c) + right = Span(b, d) + + _, _, hi = left.split(right) + assert hi is not None + assert not hi.intersects(left) + + +@given(four_distinct_points) +def test_span_split_embedded(vals): + """Splitting two spans where one overlaps the other.""" + a, b, c, d = vals + + outer = Span(a, d) + inner = Span(b, c) + + lo, mid, hi = outer.split(inner) + + assert lo is not None + assert mid is not None + assert hi is not None + + assert lo.intersects(outer) + assert not lo.intersects(inner) + + assert mid.intersects(outer) + assert mid.intersects(inner) + + assert hi.intersects(outer) + assert not hi.intersects(inner) + + +def test_edge_list_single(): + el: EdgeList[str] = EdgeList() + el.add_edge(Span(1, 4), "A") + + edges = list(el) + assert edges == [ + (Span(1, 4), ["A"]), + ] + + +def test_edge_list_fully_enclosed(): + el: EdgeList[str] = EdgeList() + el.add_edge(Span(1, 4), "A") + el.add_edge(Span(2, 3), "B") + + edges = list(el) + assert edges == [ + (Span(1, 2), ["A"]), + (Span(2, 3), ["A", "B"]), + (Span(3, 4), ["A"]), + ] + + +def test_edge_list_overlap(): + el: EdgeList[str] = EdgeList() + el.add_edge(Span(1, 4), "A") + el.add_edge(Span(2, 5), "B") + + edges = list(el) + assert edges == [ + (Span(1, 
2), ["A"]), + (Span(2, 4), ["A", "B"]), + (Span(4, 5), ["B"]), + ] + + +def test_edge_list_no_overlap(): + el: EdgeList[str] = EdgeList() + el.add_edge(Span(1, 4), "A") + el.add_edge(Span(5, 8), "B") + + edges = list(el) + assert edges == [ + (Span(1, 4), ["A"]), + (Span(5, 8), ["B"]), + ] + + +def test_edge_list_no_overlap_ordered(): + el: EdgeList[str] = EdgeList() + el.add_edge(Span(5, 8), "B") + el.add_edge(Span(1, 4), "A") + + edges = list(el) + assert edges == [ + (Span(1, 4), ["A"]), + (Span(5, 8), ["B"]), + ] + + +def test_edge_list_overlap_span(): + el: EdgeList[str] = EdgeList() + el.add_edge(Span(1, 3), "A") + el.add_edge(Span(4, 6), "B") + el.add_edge(Span(2, 5), "C") + + edges = list(el) + assert edges == [ + (Span(1, 2), ["A"]), + (Span(2, 3), ["A", "C"]), + (Span(3, 4), ["C"]), + (Span(4, 5), ["B", "C"]), + (Span(5, 6), ["B"]), + ] + + +def test_edge_list_overlap_span_big(): + el: EdgeList[str] = EdgeList() + el.add_edge(Span(2, 3), "A") + el.add_edge(Span(4, 5), "B") + el.add_edge(Span(6, 7), "C") + el.add_edge(Span(1, 8), "D") + + edges = list(el) + assert edges == [ + (Span(1, 2), ["D"]), + (Span(2, 3), ["A", "D"]), + (Span(3, 4), ["D"]), + (Span(4, 5), ["B", "D"]), + (Span(5, 6), ["D"]), + (Span(6, 7), ["C", "D"]), + (Span(7, 8), ["D"]), + ] + + +@given(lists(lists(integers(), min_size=2, max_size=2, unique=True), min_size=1)) +@example(points=[[0, 1], [1, 2]]) +def test_edge_list_always_sorted(points: list[tuple[int, int]]): + # OK this is weird but stick with me. 
+ el: EdgeList[str] = EdgeList() + for i, (a, b) in enumerate(points): + lower = min(a, b) + upper = max(a, b) + + span = Span(lower, upper) + + el.add_edge(span, str(i)) + + last_upper = None + for span, _ in el: + if last_upper is not None: + assert last_upper <= span.lower, "Edges from list are not sorted" + last_upper = span.upper + + +def test_lexer_compile(): + class LexTest(Grammar): + @rule + def foo(self): + return self.IS + + start = foo + + IS = Terminal("is") + AS = Terminal("as") + IDENTIFIER = Terminal( + Re.seq( + Re.set(("a", "z"), ("A", "Z"), "_"), + Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), + ) + ) + BLANKS = Terminal(Re.set("\r", "\n", "\t", " ").plus()) + + lexer = compile_lexer(LexTest()) + dump_lexer_table(lexer) + tokens = list(generic_tokenize("xy is ass", lexer)) + assert tokens == [ + (LexTest.IDENTIFIER, 0, 2), + (LexTest.BLANKS, 2, 1), + (LexTest.IS, 3, 2), + (LexTest.BLANKS, 5, 1), + (LexTest.IDENTIFIER, 6, 3), + ]