diff --git a/grammar.py b/grammar.py
index 38299e9..502c924 100644
--- a/grammar.py
+++ b/grammar.py
@@ -2,57 +2,7 @@
 import re
 import typing

-import parser
-from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule
-
-ARROW = Terminal("Arrow")
-AS = Terminal("As")
-BAR = Terminal("Bar")
-CLASS = Terminal("Class")
-COLON = Terminal("Colon")
-ELSE = Terminal("Else")
-FOR = Terminal("For")
-FUN = Terminal("Fun")
-IDENTIFIER = Terminal("Identifier")
-IF = Terminal("If")
-IMPORT = Terminal("Import")
-IN = Terminal("In")
-LCURLY = Terminal("LeftBrace")
-LET = Terminal("Let")
-RCURLY = Terminal("RightBrace")
-RETURN = Terminal("Return")
-SEMICOLON = Terminal("Semicolon")
-STRING = Terminal("String")
-WHILE = Terminal("While")
-EQUAL = Terminal("Equal")
-LPAREN = Terminal("LeftParen")
-RPAREN = Terminal("RightParen")
-COMMA = Terminal("Comma")
-SELF = Terminal("Selff")
-OR = Terminal("Or")
-IS = Terminal("Is")
-AND = Terminal("And")
-EQUALEQUAL = Terminal("EqualEqual")
-BANGEQUAL = Terminal("BangEqual")
-LESS = Terminal("Less")
-GREATER = Terminal("Greater")
-LESSEQUAL = Terminal("LessEqual")
-GREATEREQUAL = Terminal("GreaterEqual")
-PLUS = Terminal("Plus")
-MINUS = Terminal("Minus")
-STAR = Terminal("Star")
-SLASH = Terminal("Slash")
-NUMBER = Terminal("Number")
-TRUE = Terminal("True")
-FALSE = Terminal("False")
-BANG = Terminal("Bang")
-DOT = Terminal("Dot")
-MATCH = Terminal("Match")
-EXPORT = Terminal("Export")
-UNDERSCORE = Terminal("Underscore")
-NEW = Terminal("New")
-LSQUARE = Terminal("LeftBracket")
-RSQUARE = Terminal("RightBracket")
+from parser import Assoc, Grammar, Nothing, rule, seq, Rule, Terminal


 class FineGrammar(Grammar):
@@ -62,17 +12,17 @@ class FineGrammar(Grammar):
     def __init__(self):
         super().__init__(
             precedence=[
-                (Assoc.RIGHT, [EQUAL]),
-                (Assoc.LEFT, [OR]),
-                (Assoc.LEFT, [IS]),
-                (Assoc.LEFT, [AND]),
-                (Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]),
-                (Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]),
-                (Assoc.LEFT, [PLUS, MINUS]),
-                (Assoc.LEFT, [STAR, SLASH]),
+                (Assoc.RIGHT, [self.EQUAL]),
+                (Assoc.LEFT, [self.OR]),
+                (Assoc.LEFT, [self.IS]),
+                (Assoc.LEFT, [self.AND]),
+                (Assoc.LEFT, [self.EQUALEQUAL, self.BANGEQUAL]),
+                (Assoc.LEFT, [self.LESS, self.GREATER, self.GREATEREQUAL, self.LESSEQUAL]),
+                (Assoc.LEFT, [self.PLUS, self.MINUS]),
+                (Assoc.LEFT, [self.STAR, self.SLASH]),
                 (Assoc.LEFT, [self.primary_expression]),
-                (Assoc.LEFT, [LPAREN]),
-                (Assoc.LEFT, [DOT]),
+                (Assoc.LEFT, [self.LPAREN]),
+                (Assoc.LEFT, [self.DOT]),
                 #
                 # If there's a confusion about whether to make an IF
                 # statement or an expression, prefer the statement.
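Seen in one place, the style this hunk moves toward: terminals become class attributes on the `Grammar` subclass and are referenced through `self`, so the precedence rows and the rules share one set of declarations. A minimal sketch with a hypothetical `TinyGrammar`, assuming nothing beyond what `FineGrammar` and the tests below already use:

    from parser import Assoc, Grammar, Rule, Terminal, rule, seq

    class TinyGrammar(Grammar):
        start = "expr"

        def __init__(self):
            # Precedence rows run from lowest to highest, as in FineGrammar.
            super().__init__(
                precedence=[
                    (Assoc.LEFT, [self.PLUS]),
                    (Assoc.LEFT, [self.STAR]),
                ]
            )

        @rule
        def expr(self) -> Rule:
            return (
                seq(self.expr, self.PLUS, self.expr)
                | seq(self.expr, self.STAR, self.expr)
                | self.NUMBER
            )

        # Declared without `name=`: Grammar.__init__ fills in the member
        # name ("PLUS", "STAR", "NUMBER") as each terminal's value.
        PLUS = Terminal("+")
        STAR = Terminal("*")
        NUMBER = Terminal("[0-9]+", regex=True)

Building a table then works exactly as in the tests: `table = TinyGrammar().build_table()`.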
@@ -97,15 +47,15 @@ class FineGrammar(Grammar):

     @rule
     def import_statement(self) -> Rule:
-        return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON)
+        return seq(self.IMPORT, self.STRING, self.AS, self.IDENTIFIER, self.SEMICOLON)

     @rule("ClassDeclaration")
     def class_declaration(self) -> Rule:
-        return seq(CLASS, IDENTIFIER, self._class_body)
+        return seq(self.CLASS, self.IDENTIFIER, self._class_body)

     @rule
     def _class_body(self) -> Rule:
-        return seq(LCURLY, RCURLY) | seq(LCURLY, self._class_members, RCURLY)
+        return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._class_members, self.RCURLY)

     @rule
     def _class_members(self) -> Rule:
@@ -117,7 +67,7 @@ class FineGrammar(Grammar):

     @rule("FieldDecl")
     def field_declaration(self) -> Rule:
-        return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON)
+        return seq(self.IDENTIFIER, self.COLON, self.type_expression, self.SEMICOLON)

     # Types
     @rule("TypeExpression")
@@ -126,60 +76,65 @@ class FineGrammar(Grammar):

     @rule("AlternateType")
     def alternate_type(self) -> Rule:
-        return seq(self.type_expression, OR, self.type_identifier)
+        return seq(self.type_expression, self.OR, self.type_identifier)

     @rule("TypeIdentifier")
     def type_identifier(self) -> Rule:
-        return IDENTIFIER
+        return self.IDENTIFIER

     @rule
     def export_statement(self) -> Rule:
         return (
-            seq(EXPORT, self.class_declaration)
-            | seq(EXPORT, self.function_declaration)
-            | seq(EXPORT, self.let_statement)
-            | seq(EXPORT, self.export_list, SEMICOLON)
+            seq(self.EXPORT, self.class_declaration)
+            | seq(self.EXPORT, self.function_declaration)
+            | seq(self.EXPORT, self.let_statement)
+            | seq(self.EXPORT, self.export_list, self.SEMICOLON)
         )

     @rule
     def export_list(self) -> Rule:
-        return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list)
+        return Nothing | self.IDENTIFIER | seq(self.IDENTIFIER, self.COMMA, self.export_list)

     # Functions
     @rule("FunctionDecl")
     def function_declaration(self) -> Rule:
-        return seq(FUN, IDENTIFIER, self.function_parameters, self.block) | seq(
-            FUN, IDENTIFIER, self.function_parameters, ARROW, self.type_expression, self.block
+        return seq(self.FUN, self.IDENTIFIER, self.function_parameters, self.block) | seq(
+            self.FUN,
+            self.IDENTIFIER,
+            self.function_parameters,
+            self.ARROW,
+            self.type_expression,
+            self.block,
         )

     @rule("ParamList")
     def function_parameters(self) -> Rule:
         return (
-            seq(LPAREN, RPAREN)
-            | seq(LPAREN, self._first_parameter, RPAREN)
-            | seq(LPAREN, self._first_parameter, COMMA, self._parameter_list, RPAREN)
+            seq(self.LPAREN, self.RPAREN)
+            | seq(self.LPAREN, self._first_parameter, self.RPAREN)
+            | seq(self.LPAREN, self._first_parameter, self.COMMA, self._parameter_list, self.RPAREN)
         )

     @rule
     def _first_parameter(self) -> Rule:
-        return SELF | self.parameter
+        return self.SELF | self.parameter

     @rule
     def _parameter_list(self) -> Rule:
-        return Nothing | self.parameter | seq(self.parameter, COMMA, self._parameter_list)
+        return Nothing | self.parameter | seq(self.parameter, self.COMMA, self._parameter_list)

     @rule("Parameter")
     def parameter(self) -> Rule:
-        return seq(IDENTIFIER, COLON, self.type_expression)
+        return seq(self.IDENTIFIER, self.COLON, self.type_expression)

     # Block
     @rule("Block")
     def block(self) -> Rule:
         return (
-            seq(LCURLY, RCURLY)
-            | seq(LCURLY, self.expression, RCURLY)
-            | seq(LCURLY, self._statement_list, RCURLY)
-            | seq(LCURLY, self._statement_list, self.expression, RCURLY)
+            seq(self.LCURLY, self.RCURLY)
+            | seq(self.LCURLY, self.expression, self.RCURLY)
+            | seq(self.LCURLY, self._statement_list, self.RCURLY)
+            | seq(self.LCURLY, self._statement_list, self.expression, self.RCURLY)
         )

     @rule
@@ -200,19 +155,19 @@ class FineGrammar(Grammar):

     @rule("LetStatement")
     def let_statement(self) -> Rule:
-        return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON)
+        return seq(self.LET, self.IDENTIFIER, self.EQUAL, self.expression, self.SEMICOLON)

     @rule("ReturnStatement")
     def return_statement(self) -> Rule:
-        return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON)
+        return seq(self.RETURN, self.expression, self.SEMICOLON) | seq(self.RETURN, self.SEMICOLON)

     @rule("ForStatement")
     def for_statement(self) -> Rule:
-        return seq(FOR, self.iterator_variable, IN, self.expression, self.block)
+        return seq(self.FOR, self.iterator_variable, self.IN, self.expression, self.block)

     @rule("IteratorVariable")
     def iterator_variable(self) -> Rule:
-        return IDENTIFIER
+        return self.IDENTIFIER

     @rule("IfStatement")
     def if_statement(self) -> Rule:
@@ -220,11 +175,11 @@ class FineGrammar(Grammar):

     @rule
     def while_statement(self) -> Rule:
-        return seq(WHILE, self.expression, self.block)
+        return seq(self.WHILE, self.expression, self.block)

     @rule
     def expression_statement(self) -> Rule:
-        return seq(self.expression, SEMICOLON)
+        return seq(self.expression, self.SEMICOLON)

     # Expressions
     @rule(transparent=True)
@@ -234,91 +189,93 @@ class FineGrammar(Grammar):

     @rule("BinaryExpression")
     def binary_expression(self) -> Rule:
         return (
-            seq(self.expression, EQUAL, self.expression)
-            | seq(self.expression, OR, self.expression)
-            | seq(self.expression, AND, self.expression)
-            | seq(self.expression, EQUALEQUAL, self.expression)
-            | seq(self.expression, BANGEQUAL, self.expression)
-            | seq(self.expression, LESS, self.expression)
-            | seq(self.expression, LESSEQUAL, self.expression)
-            | seq(self.expression, GREATER, self.expression)
-            | seq(self.expression, GREATEREQUAL, self.expression)
-            | seq(self.expression, PLUS, self.expression)
-            | seq(self.expression, MINUS, self.expression)
-            | seq(self.expression, STAR, self.expression)
-            | seq(self.expression, SLASH, self.expression)
+            seq(self.expression, self.EQUAL, self.expression)
+            | seq(self.expression, self.OR, self.expression)
+            | seq(self.expression, self.AND, self.expression)
+            | seq(self.expression, self.EQUALEQUAL, self.expression)
+            | seq(self.expression, self.BANGEQUAL, self.expression)
+            | seq(self.expression, self.LESS, self.expression)
+            | seq(self.expression, self.LESSEQUAL, self.expression)
+            | seq(self.expression, self.GREATER, self.expression)
+            | seq(self.expression, self.GREATEREQUAL, self.expression)
+            | seq(self.expression, self.PLUS, self.expression)
+            | seq(self.expression, self.MINUS, self.expression)
+            | seq(self.expression, self.STAR, self.expression)
+            | seq(self.expression, self.SLASH, self.expression)
         )

     @rule("IsExpression")
     def is_expression(self) -> Rule:
-        return seq(self.expression, IS, self.pattern)
+        return seq(self.expression, self.IS, self.pattern)

     @rule
     def primary_expression(self) -> Rule:
         return (
             self.identifier_expression
             | self.literal_expression
-            | SELF
-            | seq(BANG, self.primary_expression)
-            | seq(MINUS, self.primary_expression)
+            | self.SELF
+            | seq(self.BANG, self.primary_expression)
+            | seq(self.MINUS, self.primary_expression)
             | self.block
             | self.conditional_expression
             | self.list_constructor_expression
             | self.object_constructor_expression
             | self.match_expression
-            | seq(self.primary_expression, LPAREN, RPAREN)
-            | seq(self.primary_expression, LPAREN, self._expression_list, RPAREN)
-            | seq(self.primary_expression, DOT, IDENTIFIER)
-            | seq(LPAREN, self.expression, RPAREN)
+            | seq(self.primary_expression, self.LPAREN, self.RPAREN)
+            | seq(self.primary_expression, self.LPAREN, self._expression_list, self.RPAREN)
+            | seq(self.primary_expression, self.DOT, self.IDENTIFIER)
+            | seq(self.LPAREN, self.expression, self.RPAREN)
         )

     @rule("IdentifierExpression")
     def identifier_expression(self):
-        return IDENTIFIER
+        return self.IDENTIFIER

     @rule("Literal")
     def literal_expression(self):
-        return NUMBER | STRING | TRUE | FALSE
+        return self.NUMBER | self.STRING | self.TRUE | self.FALSE

     @rule("ConditionalExpression")
     def conditional_expression(self) -> Rule:
         return (
-            seq(IF, self.expression, self.block)
-            | seq(IF, self.expression, self.block, ELSE, self.conditional_expression)
-            | seq(IF, self.expression, self.block, ELSE, self.block)
+            seq(self.IF, self.expression, self.block)
+            | seq(self.IF, self.expression, self.block, self.ELSE, self.conditional_expression)
+            | seq(self.IF, self.expression, self.block, self.ELSE, self.block)
         )

     @rule
     def list_constructor_expression(self) -> Rule:
-        return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self._expression_list, RSQUARE)
+        return seq(self.LSQUARE, self.RSQUARE) | seq(
+            self.LSQUARE, self._expression_list, self.RSQUARE
+        )

     @rule
     def _expression_list(self) -> Rule:
         return (
             self.expression
-            | seq(self.expression, COMMA)
-            | seq(self.expression, COMMA, self._expression_list)
+            | seq(self.expression, self.COMMA)
+            | seq(self.expression, self.COMMA, self._expression_list)
         )

     @rule
     def match_expression(self) -> Rule:
-        return seq(MATCH, self.expression, self.match_body)
+        return seq(self.MATCH, self.expression, self.match_body)

     @rule("MatchBody")
     def match_body(self) -> Rule:
-        return seq(LCURLY, RCURLY) | seq(LCURLY, self._match_arms, RCURLY)
+        return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._match_arms, self.RCURLY)

     @rule
     def _match_arms(self) -> Rule:
         return (
             self.match_arm
-            | seq(self.match_arm, COMMA)
-            | seq(self.match_arm, COMMA, self._match_arms)
+            | seq(self.match_arm, self.COMMA)
+            | seq(self.match_arm, self.COMMA, self._match_arms)
         )

     @rule("MatchArm")
     def match_arm(self) -> Rule:
-        return seq(self.pattern, ARROW, self.expression)
+        return seq(self.pattern, self.ARROW, self.expression)

     @rule("Pattern")
     def pattern(self) -> Rule:
@@ -330,7 +287,7 @@ class FineGrammar(Grammar):

     @rule
     def _pattern_predicate(self) -> Rule:
-        return seq(AND, self.expression)
+        return seq(self.AND, self.expression)

     @rule
     def _pattern_core(self) -> Rule:
@@ -338,60 +295,116 @@ class FineGrammar(Grammar):

     @rule("WildcardPattern")
     def wildcard_pattern(self) -> Rule:
-        return UNDERSCORE
+        return self.UNDERSCORE

     @rule("VariableBinding")
     def variable_binding(self) -> Rule:
-        return seq(IDENTIFIER, COLON)
+        return seq(self.IDENTIFIER, self.COLON)

     @rule
     def object_constructor_expression(self) -> Rule:
-        return seq(NEW, self.type_identifier, self.field_list)
+        return seq(self.NEW, self.type_identifier, self.field_list)

     @rule
     def field_list(self) -> Rule:
-        return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY)
+        return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self.field_values, self.RCURLY)

     @rule
     def field_values(self) -> Rule:
         return (
             self.field_value
-            | seq(self.field_value, COMMA)
-            | seq(self.field_value, COMMA, self.field_values)
+            | seq(self.field_value, self.COMMA)
+            | seq(self.field_value, self.COMMA, self.field_values)
         )

     @rule
     def field_value(self) -> Rule:
-        return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression)
+        return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression)
+
+    BLANK = Terminal("[ \t\r\n]+", regex=True)
+
+    ARROW = Terminal("->")
+    AS = Terminal("as")
+    BAR = Terminal("|")
+    CLASS = Terminal("class")
+    COLON = Terminal(":")
+    COMMENT = Terminal("comment")  # TODO: placeholder pattern
+    ELSE = Terminal("else")
+    FOR = Terminal("for")
+    FUN = Terminal("fun")
+    IDENTIFIER = Terminal("[A-Za-z_][A-Za-z0-9_]*", regex=True)
+    IF = Terminal("if")
+    IMPORT = Terminal("import")
+    IN = Terminal("in")
+    LCURLY = Terminal("{")
+    LET = Terminal("let")
+    RCURLY = Terminal("}")
+    RETURN = Terminal("return")
+    SEMICOLON = Terminal(";")
+    STRING = Terminal('""', regex=True)  # TODO: placeholder pattern
+    WHILE = Terminal("while")
+    EQUAL = Terminal("=")
+    LPAREN = Terminal("(")
+    RPAREN = Terminal(")")
+    COMMA = Terminal(",")
+    SELF = Terminal("self", name="SELFF")
+    OR = Terminal("or")
+    IS = Terminal("is")
+    AND = Terminal("and")
+    EQUALEQUAL = Terminal("==")
+    BANGEQUAL = Terminal("!=")
+    LESS = Terminal("<")
+    GREATER = Terminal(">")
+    LESSEQUAL = Terminal("<=")
+    GREATEREQUAL = Terminal(">=")
+    PLUS = Terminal("+")
+    MINUS = Terminal("-")
+    STAR = Terminal("*")
+    SLASH = Terminal("/")
+    NUMBER = Terminal("[0-9]+", regex=True)
+    TRUE = Terminal("true")
+    FALSE = Terminal("false")
+    BANG = Terminal("!")
+    DOT = Terminal(".")
+    MATCH = Terminal("match")
+    EXPORT = Terminal("export")
+    UNDERSCORE = Terminal("_")
+    NEW = Terminal("new")
+    LSQUARE = Terminal("[")
+    RSQUARE = Terminal("]")


 # -----------------------------------------------------------------------------
 # DORKY LEXER
 # -----------------------------------------------------------------------------
+import bisect
+import dataclasses
+
+
 NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?")
 IDENTIFIER_RE = re.compile("[_A-Za-z][_A-Za-z0-9]*")

 KEYWORD_TABLE = {
-    "_": UNDERSCORE,
-    "and": AND,
-    "as": AS,
-    "class": CLASS,
-    "else": ELSE,
-    "export": EXPORT,
-    "false": FALSE,
-    "for": FOR,
-    "fun": FUN,
-    "if": IF,
-    "import": IMPORT,
-    "in": IN,
-    "is": IS,
-    "let": LET,
-    "match": MATCH,
-    "new": NEW,
-    "or": OR,
-    "return": RETURN,
-    "self": SELF,
-    "true": TRUE,
-    "while": WHILE,
+    "_": FineGrammar.UNDERSCORE,
+    "and": FineGrammar.AND,
+    "as": FineGrammar.AS,
+    "class": FineGrammar.CLASS,
+    "else": FineGrammar.ELSE,
+    "export": FineGrammar.EXPORT,
+    "false": FineGrammar.FALSE,
+    "for": FineGrammar.FOR,
+    "fun": FineGrammar.FUN,
+    "if": FineGrammar.IF,
+    "import": FineGrammar.IMPORT,
+    "in": FineGrammar.IN,
+    "is": FineGrammar.IS,
+    "let": FineGrammar.LET,
+    "match": FineGrammar.MATCH,
+    "new": FineGrammar.NEW,
+    "or": FineGrammar.OR,
+    "return": FineGrammar.RETURN,
+    "self": FineGrammar.SELF,
+    "true": FineGrammar.TRUE,
+    "while": FineGrammar.WHILE,
 }

@@ -406,63 +419,63 @@ def tokenize(src: str):
         token = None
         if ch == "-":
             if src[pos : pos + 2] == "->":
-                token = (ARROW, pos, 2)
+                token = (FineGrammar.ARROW, pos, 2)
             else:
-                token = (MINUS, pos, 1)
+                token = (FineGrammar.MINUS, pos, 1)

         elif ch == "|":
-            token = (BAR, pos, 1)
+            token = (FineGrammar.BAR, pos, 1)

         elif ch == ":":
-            token = (COLON, pos, 1)
+            token = (FineGrammar.COLON, pos, 1)

         elif ch == "{":
-            token = (LCURLY, pos, 1)
+            token = (FineGrammar.LCURLY, pos, 1)

         elif ch == "}":
-            token = (RCURLY, pos, 1)
+            token = (FineGrammar.RCURLY, pos, 1)

         elif ch == ";":
-            token = (SEMICOLON, pos, 1)
+            token = (FineGrammar.SEMICOLON, pos, 1)

         elif ch == "=":
             if src[pos : pos + 2] == "==":
-                token = (EQUALEQUAL, pos, 2)
+                token = (FineGrammar.EQUALEQUAL, pos, 2)
             else:
-                token = (EQUAL, pos, 1)
+                token = (FineGrammar.EQUAL, pos, 1)

         elif ch == "(":
-            token = (LPAREN, pos, 1)
+            token = (FineGrammar.LPAREN, pos, 1)

         elif ch == ")":
-            token = (RPAREN, pos, 1)
+            token = (FineGrammar.RPAREN, pos, 1)

         elif ch == ",":
-            token = (COMMA, pos, 1)
+            token = (FineGrammar.COMMA, pos, 1)

         elif ch == "!":
             if src[pos : pos + 2] == "!=":
-                token = (BANGEQUAL, pos, 2)
+                token = (FineGrammar.BANGEQUAL, pos, 2)
             else:
-                token = (BANG, pos, 1)
+                token = (FineGrammar.BANG, pos, 1)

         elif ch == "<":
             if src[pos : pos + 2] == "<=":
-                token = (LESSEQUAL, pos, 2)
+                token = (FineGrammar.LESSEQUAL, pos, 2)
             else:
-                token = (LESS, pos, 1)
+                token = (FineGrammar.LESS, pos, 1)

         elif ch == ">":
             if src[pos : pos + 2] == ">=":
-                token = (GREATEREQUAL, pos, 2)
+                token = (FineGrammar.GREATEREQUAL, pos, 2)
             else:
-                token = (GREATER, pos, 1)
+                token = (FineGrammar.GREATER, pos, 1)

         elif ch == "+":
-            token = (PLUS, pos, 1)
+            token = (FineGrammar.PLUS, pos, 1)

         elif ch == "*":
-            token = (STAR, pos, 1)
+            token = (FineGrammar.STAR, pos, 1)

         elif ch == "/":
             if src[pos : pos + 2] == "//":
@@ -470,16 +483,16 @@ def tokenize(src: str):
                 pos = pos + 1
                 continue

-            token = (SLASH, pos, 1)
+            token = (FineGrammar.SLASH, pos, 1)

         elif ch == ".":
-            token = (DOT, pos, 1)
+            token = (FineGrammar.DOT, pos, 1)

         elif ch == "[":
-            token = (LSQUARE, pos, 1)
+            token = (FineGrammar.LSQUARE, pos, 1)

         elif ch == "]":
-            token = (RSQUARE, pos, 1)
+            token = (FineGrammar.RSQUARE, pos, 1)

         elif ch == '"' or ch == "'":
             end = pos + 1
@@ -490,12 +503,12 @@ def tokenize(src: str):
                 if end == len(src):
                     raise Exception(f"Unterminated string constant at {pos}")
                 end += 1
-            token = (STRING, pos, end - pos)
+            token = (FineGrammar.STRING, pos, end - pos)

         else:
             number_match = NUMBER_RE.match(src, pos)
             if number_match:
-                token = (NUMBER, pos, number_match.end() - pos)
+                token = (FineGrammar.NUMBER, pos, number_match.end() - pos)
             else:
                 id_match = IDENTIFIER_RE.match(src, pos)
                 if id_match:
@@ -504,7 +517,7 @@ def tokenize(src: str):
                     if keyword:
                         token = (keyword, pos, len(fragment))
                     else:
-                        token = (IDENTIFIER, pos, len(fragment))
+                        token = (FineGrammar.IDENTIFIER, pos, len(fragment))

         if token is None:
             raise Exception("Token error")
@@ -512,9 +525,6 @@ def tokenize(src: str):
         pos += token[2]


-import bisect
-
-
 class FineTokens:
     def __init__(self, src: str):
         self.src = src
@@ -546,4 +556,22 @@ class FineTokens:


 if __name__ == "__main__":
-    FineGrammar().build_table()
+    grammar = FineGrammar()
+    grammar.build_table()
+
+    class LexTest(Grammar):
+        @rule
+        def foo(self):
+            return self.IS
+
+        start = foo
+
+        IS = Terminal("is")
+        AS = Terminal("as")
+        IDENTIFIER = Terminal("[a-z]+", regex=True)
+        # IDENTIFIER = Terminal("[A-Za-z_][A-Za-z0-9_]*", regex=True)
+
+    # TODO: compile_lexer/dump_lexer_table exist only as a commented-out
+    # draft in tests/test_lexer.py and are not exported from `parser` yet.
+    # lexer = compile_lexer(LexTest())
+    # dump_lexer_table(lexer)
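The hand-rolled `tokenize` above yields `(Terminal, start, length)` triples built from `FineGrammar`'s class-level terminals. A quick usage sketch (hypothetical input; it assumes the elided parts of `tokenize` still skip whitespace and yield each token):

    from grammar import FineGrammar, tokenize

    # Instantiating the grammar runs the member-name fixup in
    # Grammar.__init__, so each terminal's .value gets filled in.
    FineGrammar()

    src = "let x = 3 + 4;"
    for terminal, start, length in tokenize(src):
        print(terminal.value, repr(src[start : start + length]))
    # Expected kinds: LET, IDENTIFIER, EQUAL, NUMBER, PLUS, NUMBER, SEMICOLON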
diff --git a/parser/parser.py b/parser/parser.py
index d0cb1fc..4d19e29 100644
--- a/parser/parser.py
+++ b/parser/parser.py
@@ -21,19 +21,20 @@
 To get started, create a grammar that derives from the `Grammar` class.
 Create one method per nonterminal, decorated with the `rule` decorator.
 Here's an example:

-    PLUS = Terminal('+')
-    LPAREN = Terminal('(')
-    RPAREN = Terminal(')')
-    ID = Terminal('id')
-
     class SimpleGrammar(Grammar):
         @rule
         def expression(self):
-            return seq(self.expression, PLUS, self.term) | self.term
+            return seq(self.expression, self.PLUS, self.term) | self.term

         @rule
         def term(self):
-            return seq(LPAREN, self.expression, RPAREN) | ID
+            return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
+
+        PLUS = Terminal('+')
+        LPAREN = Terminal('(')
+        RPAREN = Terminal(')')
+        ID = Terminal('id')

 ## Using grammars
@@ -1605,10 +1606,14 @@ class Rule:
 class Terminal(Rule):
     """A token, or terminal symbol in the grammar."""

-    value: str
+    value: str | None
+    pattern: str
+    regex: bool

-    def __init__(self, value):
-        self.value = sys.intern(value)
+    def __init__(self, pattern, name=None, regex=False):
+        self.value = name
+        self.pattern = pattern
+        self.regex = regex

     def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]:
         # We are just ourselves when flattened.
@@ -1766,19 +1771,20 @@ class Grammar:
     Here's an example of a simple grammar:

-        PLUS = Terminal('+')
-        LPAREN = Terminal('(')
-        RPAREN = Terminal(')')
-        ID = Terminal('id')
-
         class SimpleGrammar(Grammar):
             @rule
             def expression(self):
-                return seq(self.expression, PLUS, self.term) | self.term
+                return seq(self.expression, self.PLUS, self.term) | self.term

             @rule
             def term(self):
-                return seq(LPAREN, self.expression, RPAREN) | ID
+                return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
+
+            PLUS = Terminal('+')
+            LPAREN = Terminal('(')
+            RPAREN = Terminal(')')
+            ID = Terminal('id')
+
     Not very exciting, perhaps, but it's something.

@@ -1786,6 +1792,7 @@ class Grammar:
     _precedence: dict[str, typing.Tuple[Assoc, int]]
     _start: str
     _generator: type[GenerateLR0]
+    _terminals: list[Terminal]

     def __init__(
         self,
@@ -1809,6 +1816,14 @@ class Grammar:
         generator = getattr(self, "generator", GenerateLALR)
         assert generator is not None

+        # Fix up terminal names with the name of the member that declared them.
+        terminals = []
+        for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)):
+            if t.value is None:
+                t.value = n
+            terminals.append(t)
+
+        # Fix up the precedence table.
         precedence_table = {}
         for prec, (associativity, symbols) in enumerate(precedence):
             for symbol in symbols:
@@ -1824,6 +1839,11 @@ class Grammar:
         self._precedence = precedence_table
         self._start = start
         self._generator = generator
+        self._terminals = terminals
+
+    @property
+    def terminals(self) -> list[Terminal]:
+        return self._terminals

     def generate_nonterminal_dict(
         self, start: str | None = None
@@ -1911,3 +1931,158 @@ class Grammar:
         gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
         table = gen.gen_table()
         return table
+
+
+###############################################################################
+# Lexer support
+###############################################################################
+# For machine-generated lexers
+
+
+@dataclasses.dataclass(frozen=True, slots=True)
+class Span:
+    lower: int  # inclusive
+    upper: int  # exclusive
+
+    @classmethod
+    def from_str(cls, c: str) -> "Span":
+        return Span(lower=ord(c), upper=ord(c) + 1)
+
+    def intersects(self, other: "Span") -> bool:
+        return self.lower < other.upper and self.upper > other.lower
+
+    def split(self, other: "Span") -> tuple["Span|None", "Span", "Span|None"]:
+        assert self.intersects(other)
+
+        first = min(self.lower, other.lower)
+        second = max(self.lower, other.lower)
+        third = min(self.upper, other.upper)
+        fourth = max(self.upper, other.upper)
+
+        low = Span(first, second) if first != second else None
+        mid = Span(second, third)
+        hi = Span(third, fourth) if third != fourth else None
+
+        return (low, mid, hi)
+
+    def __str__(self) -> str:
+        if self.upper - self.lower == 1:
+            return str(self.lower)
+
+        lower = str(self.lower)
+        upper = str(self.upper)
+        return f"[{lower}-{upper})"
+
+    def __lt__(self, other: "Span") -> bool:
+        return self.lower < other.lower
+
+
+ET = typing.TypeVar("ET")
+
+
+class EdgeList(typing.Generic[ET]):
+    """A list of edge transitions, keyed by *span*. A given span can have
+    multiple targets, because this supports NFAs."""
+
+    _edges: list[tuple[Span, list[ET]]]
+
+    def __init__(self):
+        self._edges = []
+
+    def __iter__(self) -> typing.Iterator[tuple[Span, list[ET]]]:
+        return iter(self._edges)
+
+    def __repr__(self) -> str:
+        return f"EdgeList[{','.join(str(s[0]) + '->' + repr(s[1]) for s in self._edges)}]"
+
+    def add_edge(self, c: Span, s: ET):
+        """Add an edge for the given span to the list. If existing spans
+        overlap this one, they are split, generating multiple distinct
+        edges.
+        """
+        # print(f"  Adding {c}->{s} to {self}...")
+        # Look to see where we would put this span based solely on a
+        # sort of lower bounds.
+        point = bisect.bisect_left(self._edges, c, key=lambda x: x[0])
+
+        # If this is not the first span in the list then we might
+        # overlap with the span to our left....
+        if point > 0:
+            left_point = point - 1
+            left_span, left_targets = self._edges[left_point]
+            if c.intersects(left_span):
+                # ...if we intersect with the span to our left then we
+                # must split the span to our left with regards to our
+                # span. Then we have three target spans:
+                #
+                # - The lo one, which just has the targets from the old
+                #   left span. (This may be empty if we overlap the
+                #   left one completely on the left side.)
+                #
+                # - The mid one, which has both the targets from the
+                #   old left and the new target.
+                #
+                # - The hi one, which either extends past the old left
+                #   span (only our target; it replaces the current span
+                #   for further processing) or is the tail of the old
+                #   left span (only the old targets; it gets re-inserted
+                #   and our span is completely subsumed, so we stop).
+                #
+                del self._edges[left_point]
+                lo, mid, hi = c.split(left_span)
+                # print(f"    <- {c} splits {left_span} -> {lo}, {mid}, {hi} @{left_point}")
+                self._edges.insert(left_point, (mid, left_targets + [s]))
+                if lo is not None:
+                    self._edges.insert(left_point, (lo, left_targets))
+                if hi is None:
+                    # Our span exactly covered the rest of the left span.
+                    return
+                if not hi.intersects(c):
+                    # Our span fell entirely inside the left span, so
+                    # `hi` is the tail of the *left* span and keeps the
+                    # old targets. Re-insert it after `mid` and stop.
+                    self._edges.insert(
+                        left_point + (2 if lo is not None else 1),
+                        (hi, left_targets),
+                    )
+                    # print(f"    result: {self} (left out)")
+                    return
+
+                # Continue processing with `c` as the hi split from the
+                # left. If the left and right spans abut each other then
+                # `c` will be subsumed in our right span.
+                c = hi
+
+        # If point is not at the very end of the list then it might
+        # overlap the span to our right...
+        if point < len(self._edges):
+            right_span, right_targets = self._edges[point]
+            if c.intersects(right_span):
+                # ...this is similar to the left case, above, except
+                # that here the lo span carries only our target, etc.
+                del self._edges[point]
+                lo, mid, hi = c.split(right_span)
+                # print(f"    -> {c} splits {right_span} -> {lo}, {mid}, {hi} @{point}")
+                if hi is not None:
+                    self._edges.insert(point, (hi, right_targets))
+                self._edges.insert(point, (mid, right_targets + [s]))
+                if lo is None or not lo.intersects(c):
+                    # Our span is completely subsumed on the lower side
+                    # of the range; there is no lower side that just has
+                    # our targets. Bail now.
+                    # print(f"    result: {self} (right out)")
+                    return
+
+                # Continue processing with `c` as the lo split, since
+                # that's the one that has only the specified state as the
+                # target.
+                c = lo
+
+        # If we made it here then either we have a point that does not
+        # intersect at all, or it only partially intersects on either the
+        # left or right. Either way, we have ensured that:
+        #
+        # - c doesn't intersect with left or right (any more)
+        # - point is where it should go
+        self._edges.insert(point, (c, [s]))
+        # print(f"    result: {self} (done)")
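`Span` and `EdgeList` are the alphabet-partitioning core of the upcoming lexer generator. A small sketch of the splitting behavior (importing `Span` from `parser` matches the new test file; `EdgeList`'s export is an assumption):

    from parser import EdgeList, Span

    edges: EdgeList[str] = EdgeList()
    edges.add_edge(Span(ord("a"), ord("z") + 1), "ident")  # [a-z]
    edges.add_edge(Span.from_str("e"), "kw")               # just "e"

    # The [a-z] edge is split around "e":
    #   [97-101)  -> ['ident']
    #   101       -> ['ident', 'kw']
    #   [102-123) -> ['ident']
    for span, targets in edges:
        print(span, targets)

Note the third line of output depends on the high-tail re-insertion added to `add_edge` above.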
diff --git a/tests/test_grammar.py b/tests/test_grammar.py
index a320e06..26e5057 100644
--- a/tests/test_grammar.py
+++ b/tests/test_grammar.py
@@ -38,25 +38,27 @@ def _tree(treeform) -> runtime.Tree | runtime.TokenValue:

 def test_lr0_lr0():
     """An LR0 grammar should work with an LR0 generator."""

-    PLUS = Terminal("+")
-    LPAREN = Terminal("(")
-    RPAREN = Terminal(")")
-    IDENTIFIER = Terminal("id")
-
-    class LR0Grammar(Grammar):
+    class G(Grammar):
         start = "E"
         generator = parser.GenerateLR0

         @rule
         def E(self):
-            return seq(self.E, PLUS, self.T) | self.T
+            return seq(self.E, self.PLUS, self.T) | self.T

         @rule
         def T(self):
-            return seq(LPAREN, self.E, RPAREN) | IDENTIFIER
+            return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER

-    table = LR0Grammar().build_table()
-    tree, errors = runtime.Parser(table).parse(Tokens(IDENTIFIER, PLUS, LPAREN, IDENTIFIER, RPAREN))
+        PLUS = Terminal("+", name="+")
+        LPAREN = Terminal("(", name="(")
+        RPAREN = Terminal(")", name=")")
+        IDENTIFIER = Terminal("id", name="id")
+
+    table = G().build_table()
+    tree, errors = runtime.Parser(table).parse(
+        Tokens(G.IDENTIFIER, G.PLUS, G.LPAREN, G.IDENTIFIER, G.RPAREN)
+    )

     assert errors == []
     assert tree == _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")")))
@@ -65,114 +67,114 @@ def test_lr0_lr0():

 def test_lr0_shift_reduce():
     """This one should not work in LR0- it has a shift/reduce conflict, but works in SLR1."""

-    PLUS = Terminal("+")
-    LPAREN = Terminal("(")
-    RPAREN = Terminal(")")
-    LSQUARE = Terminal("[")
-    RSQUARE = Terminal("]")
-    IDENTIFIER = Terminal("id")
-
-    class TestGrammar(Grammar):
+    class G(Grammar):
         start = "E"
         generator = parser.GenerateLR0

         @rule
         def E(self):
-            return seq(self.E, PLUS, self.T) | self.T
+            return seq(self.E, self.PLUS, self.T) | self.T

         @rule
         def T(self):
             return (
-                seq(LPAREN, self.E, RPAREN) | IDENTIFIER | seq(IDENTIFIER, LSQUARE, self.E, RSQUARE)
+                seq(self.LPAREN, self.E, self.RPAREN)
+                | self.IDENTIFIER
+                | seq(self.IDENTIFIER, self.LSQUARE, self.E, self.RSQUARE)
             )

-    with pytest.raises(parser.AmbiguityError):
-        TestGrammar().build_table()
+        PLUS = Terminal("+")
+        LPAREN = Terminal("(")
+        RPAREN = Terminal(")")
+        LSQUARE = Terminal("[")
+        RSQUARE = Terminal("]")
+        IDENTIFIER = Terminal("id")

-    TestGrammar().build_table(generator=parser.GenerateSLR1)
+    with pytest.raises(parser.AmbiguityError):
+        G().build_table()
+
+    G().build_table(generator=parser.GenerateSLR1)


 def test_lr0_reduce_reduce():
     """This one should not work, it has a reduce-reduce conflict."""

-    PLUS = Terminal("+")
-    EQUAL = Terminal("=")
-    LPAREN = Terminal("(")
-    RPAREN = Terminal(")")
-    IDENTIFIER = Terminal("id")
-
-    class TestGrammar(Grammar):
+    class G(Grammar):
         start = "E"
         generator = parser.GenerateLR0

         @rule
         def E(self):
-            return seq(self.E, PLUS, self.T) | self.T | seq(self.V, EQUAL, self.E)
+            return seq(self.E, self.PLUS, self.T) | self.T | seq(self.V, self.EQUAL, self.E)

         @rule
         def T(self):
-            return seq(LPAREN, self.E, RPAREN) | IDENTIFIER
+            return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER

         @rule
         def V(self):
-            return IDENTIFIER
+            return self.IDENTIFIER
+
+        PLUS = Terminal("+")
+        EQUAL = Terminal("=")
+        LPAREN = Terminal("(")
+        RPAREN = Terminal(")")
+        IDENTIFIER = Terminal("id")

     with pytest.raises(parser.AmbiguityError):
-        TestGrammar().build_table()
+        G().build_table()


 def test_lr0_empty():
     """LR0 can't handle empty productions because it doesn't know when to reduce."""

-    BOOP = Terminal("boop")
-    BEEP = Terminal("beep")
-
-    class TestGrammar(Grammar):
+    class G(Grammar):
         start = "E"
         generator = parser.GenerateLR0

         @rule
         def E(self):
-            return seq(self.F, BOOP)
+            return seq(self.F, self.BOOP)

         @rule
         def F(self):
-            return BEEP | parser.Nothing
+            return self.BEEP | parser.Nothing
+
+        BOOP = Terminal("boop")
+        BEEP = Terminal("beep")

     with pytest.raises(parser.AmbiguityError):
-        TestGrammar().build_table()
+        G().build_table()


 def test_grammar_aho_ullman_1():
-    EQUAL = Terminal("=")
-    STAR = Terminal("*")
-    ID = Terminal("id")
-
-    class TestGrammar(Grammar):
+    class G(Grammar):
         start = "S"
         generator = parser.GenerateSLR1

         @rule
         def S(self):
-            return seq(self.L, EQUAL, self.R) | self.R
+            return seq(self.L, self.EQUAL, self.R) | self.R

         @rule
         def L(self):
-            return seq(STAR, self.R) | ID
+            return seq(self.STAR, self.R) | self.ID

         @rule
         def R(self):
             return self.L

-    with pytest.raises(parser.AmbiguityError):
-        TestGrammar().build_table()
+        EQUAL = Terminal("=")
+        STAR = Terminal("*")
+        ID = Terminal("id")

-    TestGrammar().build_table(generator=parser.GenerateLR1)
+    with pytest.raises(parser.AmbiguityError):
+        G().build_table()
+
+    G().build_table(generator=parser.GenerateLR1)


 def test_grammar_aho_ullman_2():
-    A = Terminal("a")
-    B = Terminal("b")
-
     class TestGrammar(Grammar):
         start = "S"
         generator = parser.GenerateSLR1
@@ -183,7 +185,10 @@ def test_grammar_aho_ullman_2():

         @rule
         def X(self):
-            return seq(A, self.X) | B
+            return seq(self.A, self.X) | self.B
+
+        A = Terminal("a")
+        B = Terminal("b")

     TestGrammar().build_table()
     TestGrammar().build_table(generator=parser.GenerateLR1)
@@ -191,11 +196,6 @@


 def test_fun_lalr():
-    PLUS = Terminal("+")
-    INT = Terminal("int")
-    ID = Terminal("id")
Terminal("id") - LPAREN = Terminal("(") - RPAREN = Terminal(")") class TestGrammar(Grammar): start = "S" @@ -207,15 +207,21 @@ def test_fun_lalr(): @rule def E(self): - return self.F | seq(self.E, PLUS, self.F) + return self.F | seq(self.E, self.PLUS, self.F) @rule def F(self): - return self.V | INT | seq(LPAREN, self.E, RPAREN) + return self.V | self.INT | seq(self.LPAREN, self.E, self.RPAREN) @rule def V(self): - return ID + return self.ID + + PLUS = Terminal("+") + INT = Terminal("int") + ID = Terminal("id") + LPAREN = Terminal("(") + RPAREN = Terminal(")") TestGrammar().build_table() @@ -234,14 +240,14 @@ def test_conflicting_names(): to understand. """ - IDENTIFIER = Terminal("Identifier") - class TestGrammar(Grammar): - start = "Identifier" + start = "IDENTIFIER" - @rule("Identifier") + @rule("IDENTIFIER") def identifier(self): - return IDENTIFIER + return self.IDENTIFIER + + IDENTIFIER = Terminal("Identifier") with pytest.raises(ValueError): TestGrammar().build_table() diff --git a/tests/test_lexer.py b/tests/test_lexer.py new file mode 100644 index 0000000..b082889 --- /dev/null +++ b/tests/test_lexer.py @@ -0,0 +1,452 @@ +from parser import Span + +# LexerTable = list[tuple[Terminal | None, list[tuple[Span, int]]]] + + +# def compile_lexer(x: Grammar) -> LexerTable: + +# class State: +# """An NFA state. Each state can be the accept state, with one or more +# Terminals as the result.""" + +# accept: list[Terminal] +# epsilons: list["State"] +# _edges: EdgeList["State"] + +# def __init__(self): +# self.accept = [] +# self.epsilons = [] +# self._edges = EdgeList() + +# def __repr__(self): +# return f"State{id(self)}" + +# def edges(self) -> typing.Iterable[tuple[Span, list["State"]]]: +# return self._edges + +# def add_edge(self, c: Span, s: "State") -> "State": +# self._edges.add_edge(c, s) +# return s + +# def dump_graph(self, name="nfa.dot"): +# with open(name, "w", encoding="utf8") as f: +# f.write("digraph G {\n") + +# stack: list[State] = [self] +# visited = set() +# while len(stack) > 0: +# state = stack.pop() +# if state in visited: +# continue +# visited.add(state) + +# label = ", ".join([t.value for t in state.accept if t.value is not None]) +# f.write(f' {id(state)} [label="{label}"];\n') +# for target in state.epsilons: +# stack.append(target) +# f.write(f' {id(state)} -> {id(target)} [label="\u03B5"];\n') + +# for span, targets in state.edges(): +# label = str(span).replace('"', '\\"') +# for target in targets: +# stack.append(target) +# f.write(f' {id(state)} -> {id(target)} [label="{label}"];\n') + +# f.write("}\n") + +# @dataclasses.dataclass +# class RegexNode: +# def to_nfa(self, start: State) -> State: +# del start +# raise NotImplementedError() + +# def __str__(self) -> str: +# raise NotImplementedError() + +# @dataclasses.dataclass +# class RegexLiteral(RegexNode): +# values: list[tuple[str, str]] + +# def to_nfa(self, start: State) -> State: +# end = State() +# for s, e in self.values: +# start.add_edge(Span(ord(s), ord(e)), end) +# return end + +# def __str__(self) -> str: +# if len(self.values) == 1: +# start, end = self.values[0] +# if start == end: +# return start + +# ranges = [] +# for start, end in self.values: +# if start == end: +# ranges.append(start) +# else: +# ranges.append(f"{start}-{end}") +# return "![{}]".format("".join(ranges)) + +# @dataclasses.dataclass +# class RegexPlus(RegexNode): +# child: RegexNode + +# def to_nfa(self, start: State) -> State: +# end = self.child.to_nfa(start) +# end.epsilons.append(start) +# return end + +# def 
diff --git a/tests/test_lexer.py b/tests/test_lexer.py
new file mode 100644
index 0000000..b082889
--- /dev/null
+++ b/tests/test_lexer.py
@@ -0,0 +1,452 @@
+from parser import Span
+
+# LexerTable = list[tuple[Terminal | None, list[tuple[Span, int]]]]
+
+
+# def compile_lexer(x: Grammar) -> LexerTable:
+
+#     class State:
+#         """An NFA state. Each state can be the accept state, with one or more
+#         Terminals as the result."""
+
+#         accept: list[Terminal]
+#         epsilons: list["State"]
+#         _edges: EdgeList["State"]
+
+#         def __init__(self):
+#             self.accept = []
+#             self.epsilons = []
+#             self._edges = EdgeList()
+
+#         def __repr__(self):
+#             return f"State{id(self)}"
+
+#         def edges(self) -> typing.Iterable[tuple[Span, list["State"]]]:
+#             return self._edges
+
+#         def add_edge(self, c: Span, s: "State") -> "State":
+#             self._edges.add_edge(c, s)
+#             return s
+
+#         def dump_graph(self, name="nfa.dot"):
+#             with open(name, "w", encoding="utf8") as f:
+#                 f.write("digraph G {\n")
+
+#                 stack: list[State] = [self]
+#                 visited = set()
+#                 while len(stack) > 0:
+#                     state = stack.pop()
+#                     if state in visited:
+#                         continue
+#                     visited.add(state)
+
+#                     label = ", ".join([t.value for t in state.accept if t.value is not None])
+#                     f.write(f'  {id(state)} [label="{label}"];\n')
+#                     for target in state.epsilons:
+#                         stack.append(target)
+#                         f.write(f'  {id(state)} -> {id(target)} [label="\u03B5"];\n')
+
+#                     for span, targets in state.edges():
+#                         label = str(span).replace('"', '\\"')
+#                         for target in targets:
+#                             stack.append(target)
+#                             f.write(f'  {id(state)} -> {id(target)} [label="{label}"];\n')
+
+#                 f.write("}\n")
+
+#     @dataclasses.dataclass
+#     class RegexNode:
+#         def to_nfa(self, start: State) -> State:
+#             del start
+#             raise NotImplementedError()
+
+#         def __str__(self) -> str:
+#             raise NotImplementedError()
+
+#     @dataclasses.dataclass
+#     class RegexLiteral(RegexNode):
+#         values: list[tuple[str, str]]
+
+#         def to_nfa(self, start: State) -> State:
+#             end = State()
+#             for s, e in self.values:
+#                 # Span.upper is exclusive; the (s, e) pairs are inclusive.
+#                 start.add_edge(Span(ord(s), ord(e) + 1), end)
+#             return end
+
+#         def __str__(self) -> str:
+#             if len(self.values) == 1:
+#                 start, end = self.values[0]
+#                 if start == end:
+#                     return start
+
+#             ranges = []
+#             for start, end in self.values:
+#                 if start == end:
+#                     ranges.append(start)
+#                 else:
+#                     ranges.append(f"{start}-{end}")
+#             return "![{}]".format("".join(ranges))
+
+#     @dataclasses.dataclass
+#     class RegexPlus(RegexNode):
+#         child: RegexNode
+
+#         def to_nfa(self, start: State) -> State:
+#             end = self.child.to_nfa(start)
+#             end.epsilons.append(start)
+#             return end
+
+#         def __str__(self) -> str:
+#             return f"({self.child})+"
+
+#     @dataclasses.dataclass
+#     class RegexStar(RegexNode):
+#         child: RegexNode
+
+#         def to_nfa(self, start: State) -> State:
+#             end = self.child.to_nfa(start)
+#             end.epsilons.append(start)
+#             start.epsilons.append(end)
+#             return end
+
+#         def __str__(self) -> str:
+#             return f"({self.child})*"
+
+#     @dataclasses.dataclass
+#     class RegexQuestion(RegexNode):
+#         child: RegexNode
+
+#         def to_nfa(self, start: State) -> State:
+#             end = self.child.to_nfa(start)
+#             start.epsilons.append(end)
+#             return end
+
+#         def __str__(self) -> str:
+#             return f"({self.child})?"
+
+#     @dataclasses.dataclass
+#     class RegexSequence(RegexNode):
+#         left: RegexNode
+#         right: RegexNode
+
+#         def to_nfa(self, start: State) -> State:
+#             mid = self.left.to_nfa(start)
+#             return self.right.to_nfa(mid)
+
+#         def __str__(self) -> str:
+#             return f"{self.left}{self.right}"
+
+#     @dataclasses.dataclass
+#     class RegexAlternation(RegexNode):
+#         left: RegexNode
+#         right: RegexNode
+
+#         def to_nfa(self, start: State) -> State:
+#             left_start = State()
+#             start.epsilons.append(left_start)
+#             left_end = self.left.to_nfa(left_start)
+
+#             right_start = State()
+#             start.epsilons.append(right_start)
+#             right_end = self.right.to_nfa(right_start)
+
+#             end = State()
+#             left_end.epsilons.append(end)
+#             right_end.epsilons.append(end)
+
+#             return end
+
+#         def __str__(self) -> str:
+#             return f"(({self.left})||({self.right}))"
+
+#     class RegexParser:
+#         # TODO: HANDLE ALTERNATION AND PRECEDENCE (CONCAT HAS HIGHEST PRECEDENCE)
+#         PREFIX: dict[str, typing.Callable[[str], RegexNode]]
+#         POSTFIX: dict[str, typing.Callable[[RegexNode, int], RegexNode]]
+#         BINDING: dict[str, tuple[int, int]]
+
+#         index: int
+#         pattern: str
+
+#         def __init__(self, pattern: str):
+#             self.PREFIX = {
+#                 "(": self.parse_group,
+#                 "[": self.parse_set,
+#             }
+#             self.POSTFIX = {
+#                 "+": self.parse_plus,
+#                 "*": self.parse_star,
+#                 "?": self.parse_question,
+#                 "|": self.parse_alternation,
+#             }
+
+#             self.BINDING = {
+#                 "|": (1, 1),
+#                 "+": (2, 2),
+#                 "*": (2, 2),
+#                 "?": (2, 2),
+#                 ")": (-1, -1),  # Always stop parsing on )
+#             }
+
+#             self.index = 0
+#             self.pattern = pattern
+
+#         def consume(self) -> str:
+#             if self.index >= len(self.pattern):
+#                 raise ValueError(f"Unable to parse regular expression '{self.pattern}'")
+#             result = self.pattern[self.index]
+#             self.index += 1
+#             return result
+
+#         def peek(self) -> str | None:
+#             if self.index >= len(self.pattern):
+#                 return None
+#             return self.pattern[self.index]
+
+#         def eof(self) -> bool:
+#             return self.index >= len(self.pattern)
+
+#         def expect(self, ch: str):
+#             actual = self.consume()
+#             if ch != actual:
+#                 raise ValueError(f"Expected '{ch}'")
+
+#         def parse_regex(self, minimum_binding=0) -> RegexNode:
+#             ch = self.consume()
+#             parser = self.PREFIX.get(ch, self.parse_single)
+#             node = parser(ch)
+
+#             while not self.eof():
+#                 ch = self.peek()
+#                 assert ch is not None
+
+#                 lp, rp = self.BINDING.get(ch, (minimum_binding, minimum_binding))
+#                 if lp < minimum_binding:
+#                     break
+
+#                 parser = self.POSTFIX.get(ch, self.parse_concat)
+#                 node = parser(node, rp)
+
+#             return node
+
+#         def parse_single(self, ch: str) -> RegexNode:
+#             return RegexLiteral(values=[(ch, ch)])
+
+#         def parse_group(self, ch: str) -> RegexNode:
+#             del ch
+
+#             node = self.parse_regex()
+#             self.expect(")")
+#             return node
+
+#         def parse_set(self, ch: str) -> RegexNode:
+#             del ch
+
+#             # TODO: INVERSION?
+#             ranges = []
+#             while self.peek() not in (None, "]"):
+#                 start = self.consume()
+#                 if self.peek() == "-":
+#                     self.consume()
+#                     end = self.consume()
+#                 else:
+#                     end = start
+#                 ranges.append((start, end))
+
+#             self.expect("]")
+#             return RegexLiteral(values=ranges)
+
+#         def parse_alternation(self, node: RegexNode, rp: int) -> RegexNode:
+#             return RegexAlternation(left=node, right=self.parse_regex(rp))
+
+#         def parse_plus(self, left: RegexNode, rp: int) -> RegexNode:
+#             del rp
+#             self.expect("+")
+#             return RegexPlus(child=left)
+
+#         def parse_star(self, left: RegexNode, rp: int) -> RegexNode:
+#             del rp
+#             self.expect("*")
+#             return RegexStar(child=left)
+
+#         def parse_question(self, left: RegexNode, rp: int) -> RegexNode:
+#             del rp
+#             self.expect("?")
+#             return RegexQuestion(child=left)
+
+#         def parse_concat(self, left: RegexNode, rp: int) -> RegexNode:
+#             return RegexSequence(left, self.parse_regex(rp))
+
+#     class SuperState:
+#         states: frozenset[State]
+#         index: int
+
+#         def __init__(self, states: typing.Iterable[State]):
+#             # Close over the given states, including every state that is
+#             # reachable by epsilon-transition.
+#             stack = list(states)
+#             result = set()
+#             while len(stack) > 0:
+#                 st = stack.pop()
+#                 if st in result:
+#                     continue
+#                 result.add(st)
+#                 stack.extend(st.epsilons)
+
+#             self.states = frozenset(result)
+#             self.index = -1
+
+#         def __eq__(self, other):
+#             if not isinstance(other, SuperState):
+#                 return False
+#             return self.states == other.states
+
+#         def __hash__(self) -> int:
+#             return hash(self.states)
+
+#         def edges(self) -> list[tuple[Span, "SuperState"]]:
+#             working: EdgeList[list[State]] = EdgeList()
+#             for st in self.states:
+#                 for span, targets in st.edges():
+#                     working.add_edge(span, targets)
+
+#             # EdgeList maps span to list[list[State]] which we want to flatten.
+#             result = []
+#             for span, stateses in working:
+#                 s: list[State] = []
+#                 for states in stateses:
+#                     s.extend(states)
+
+#                 result.append((span, SuperState(s)))
+
+#             return result
+
+#         def accept_terminal(self) -> Terminal | None:
+#             accept = None
+#             for st in self.states:
+#                 for ac in st.accept:
+#                     if accept is None:
+#                         accept = ac
+#                     elif accept.value != ac.value:
+#                         if accept.regex and not ac.regex:
+#                             accept = ac
+#                         elif ac.regex and not accept.regex:
+#                             pass
+#                         else:
+#                             raise ValueError(
+#                                 f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')"
+#                             )
+
+#             return accept
+
+#     # Parse the terminals all together into a big NFA rooted at `NFA`.
+#     NFA = State()
+#     for token in x.terminals:
+#         start = State()
+#         NFA.epsilons.append(start)
+
+#         if token.regex:
+#             node = RegexParser(token.pattern).parse_regex()
+#             print(f"  Parsed {token.pattern} to {node}")
+#             ending = node.to_nfa(start)
+
+#         else:
+#             ending = start
+#             for c in token.pattern:
+#                 ending = ending.add_edge(Span.from_str(c), State())
+
+#         ending.accept.append(token)
+
+#     NFA.dump_graph()
+
+#     # Convert the NFA into a DFA in the most straightforward way (by tracking
+#     # sets of state closures, called SuperStates.)
+#     DFA: dict[SuperState, list[tuple[Span, SuperState]]] = {}
+#     stack = [SuperState([NFA])]
+#     while len(stack) > 0:
+#         ss = stack.pop()
+#         if ss in DFA:
+#             continue
+
+#         edges = ss.edges()
+
+#         DFA[ss] = edges
+#         for _, target in edges:
+#             stack.append(target)
+
+#     for i, k in enumerate(DFA):
+#         k.index = i
+
+#     return [
+#         (
+#             ss.accept_terminal(),
+#             [(k, v.index) for k, v in edges],
+#         )
+#         for ss, edges in DFA.items()
+#     ]
+
+
+# def dump_lexer_table(table: LexerTable):
+#     with open("lexer.dot", "w", encoding="utf-8") as f:
+#         f.write("digraph G {\n")
+#         for index, (accept, edges) in enumerate(table):
+#             label = accept.value if accept is not None else ""
+#             f.write(f'  {index} [label="{label}"];\n')
+#             for span, target in edges:
+#                 label = str(span).replace('"', '\\"')
+#                 f.write(f'  {index} -> {target} [label="{label}"];\n')
+
+#             pass
+#         f.write("}\n")
+
+
+# def generic_tokenize(src: str, table: LexerTable):
+#     pos = 0
+#     state = 0
+#     start = 0
+#     last_accept = None
+#     last_accept_pos = 0
+
+#     while pos < len(src):
+#         accept, edges = table[state]
+#         if accept is not None:
+#             last_accept = accept
+#             last_accept_pos = pos + 1
+
+#         char = ord(src[pos])
+
+#         # Find the index of the span where the upper value is the tightest
+#         # bound on the character.
+#         index = bisect.bisect_left(edges, char, key=lambda x: x[0].upper)
+#         # If the character is greater than or equal to the lower bound we
+#         # found then we have a hit, otherwise no.
+#         state = edges[index][1] if index < len(edges) and char >= edges[index][0].lower else None
+#         if state is None:
+#             if last_accept is None:
+#                 raise Exception(f"Token error at {pos}")
+
+#             yield (last_accept, start, last_accept_pos - start)
+
+#             last_accept = None
+#             pos = last_accept_pos
+#             start = pos
+#             state = 0
+
+#         else:
+#             pos += 1
+
+
+def test_span_intersection():
+    pairs = [
+        ((1, 3), (2, 4)),
+        ((1, 3), (2, 3)),
+        ((1, 3), (1, 2)),
+        ((1, 3), (0, 2)),
+        ((1, 3), (0, 4)),
+    ]
+
+    for a, b in pairs:
+        left = Span(*a)
+        right = Span(*b)
+        assert left.intersects(right)
+        assert right.intersects(left)
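A natural companion for the intersection test is one for the three-way split contract, verified against the `Span.split` implementation added to parser.py above (a suggested addition, not part of this diff):

    def test_span_split():
        # A span strictly inside another: lo and hi keep the outer
        # span's leftovers, mid is the overlap.
        lo, mid, hi = Span(101, 102).split(Span(97, 123))
        assert (lo, mid, hi) == (Span(97, 101), Span(101, 102), Span(102, 123))

        # Identical spans leave nothing over on either side.
        lo, mid, hi = Span(1, 3).split(Span(1, 3))
        assert lo is None and hi is None and mid == Span(1, 3)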