Move terminals into grammar definition

Starting to work on machine-generated lexers too
John Doty 2024-08-23 07:24:30 -07:00
parent f6bc2ccea8
commit 58c3004702
4 changed files with 917 additions and 267 deletions


@@ -2,57 +2,7 @@
import re
import typing
import parser
from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule
ARROW = Terminal("Arrow")
AS = Terminal("As")
BAR = Terminal("Bar")
CLASS = Terminal("Class")
COLON = Terminal("Colon")
ELSE = Terminal("Else")
FOR = Terminal("For")
FUN = Terminal("Fun")
IDENTIFIER = Terminal("Identifier")
IF = Terminal("If")
IMPORT = Terminal("Import")
IN = Terminal("In")
LCURLY = Terminal("LeftBrace")
LET = Terminal("Let")
RCURLY = Terminal("RightBrace")
RETURN = Terminal("Return")
SEMICOLON = Terminal("Semicolon")
STRING = Terminal("String")
WHILE = Terminal("While")
EQUAL = Terminal("Equal")
LPAREN = Terminal("LeftParen")
RPAREN = Terminal("RightParen")
COMMA = Terminal("Comma")
SELF = Terminal("Selff")
OR = Terminal("Or")
IS = Terminal("Is")
AND = Terminal("And")
EQUALEQUAL = Terminal("EqualEqual")
BANGEQUAL = Terminal("BangEqual")
LESS = Terminal("Less")
GREATER = Terminal("Greater")
LESSEQUAL = Terminal("LessEqual")
GREATEREQUAL = Terminal("GreaterEqual")
PLUS = Terminal("Plus")
MINUS = Terminal("Minus")
STAR = Terminal("Star")
SLASH = Terminal("Slash")
NUMBER = Terminal("Number")
TRUE = Terminal("True")
FALSE = Terminal("False")
BANG = Terminal("Bang")
DOT = Terminal("Dot")
MATCH = Terminal("Match")
EXPORT = Terminal("Export")
UNDERSCORE = Terminal("Underscore")
NEW = Terminal("New")
LSQUARE = Terminal("LeftBracket")
RSQUARE = Terminal("RightBracket")
from parser import Assoc, Grammar, Nothing, rule, seq, Rule, Terminal
class FineGrammar(Grammar):
@@ -62,17 +12,17 @@ class FineGrammar(Grammar):
def __init__(self):
super().__init__(
precedence=[
(Assoc.RIGHT, [EQUAL]),
(Assoc.LEFT, [OR]),
(Assoc.LEFT, [IS]),
(Assoc.LEFT, [AND]),
(Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]),
(Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]),
(Assoc.LEFT, [PLUS, MINUS]),
(Assoc.LEFT, [STAR, SLASH]),
(Assoc.RIGHT, [self.EQUAL]),
(Assoc.LEFT, [self.OR]),
(Assoc.LEFT, [self.IS]),
(Assoc.LEFT, [self.AND]),
(Assoc.LEFT, [self.EQUALEQUAL, self.BANGEQUAL]),
(Assoc.LEFT, [self.LESS, self.GREATER, self.GREATEREQUAL, self.LESSEQUAL]),
(Assoc.LEFT, [self.PLUS, self.MINUS]),
(Assoc.LEFT, [self.STAR, self.SLASH]),
(Assoc.LEFT, [self.primary_expression]),
(Assoc.LEFT, [LPAREN]),
(Assoc.LEFT, [DOT]),
(Assoc.LEFT, [self.LPAREN]),
(Assoc.LEFT, [self.DOT]),
#
# If there's confusion about whether to make an IF
# statement or an expression, prefer the statement.
@@ -97,15 +47,15 @@ class FineGrammar(Grammar):
@rule
def import_statement(self) -> Rule:
return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON)
return seq(self.IMPORT, self.STRING, self.AS, self.IDENTIFIER, self.SEMICOLON)
@rule("ClassDeclaration")
def class_declaration(self) -> Rule:
return seq(CLASS, IDENTIFIER, self._class_body)
return seq(self.CLASS, self.IDENTIFIER, self._class_body)
@rule
def _class_body(self) -> Rule:
return seq(LCURLY, RCURLY) | seq(LCURLY, self._class_members, RCURLY)
return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._class_members, self.RCURLY)
@rule
def _class_members(self) -> Rule:
@@ -117,7 +67,7 @@ class FineGrammar(Grammar):
@rule("FieldDecl")
def field_declaration(self) -> Rule:
return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON)
return seq(self.IDENTIFIER, self.COLON, self.type_expression, self.SEMICOLON)
# Types
@rule("TypeExpression")
@@ -126,60 +76,65 @@ class FineGrammar(Grammar):
@rule("AlternateType")
def alternate_type(self) -> Rule:
return seq(self.type_expression, OR, self.type_identifier)
return seq(self.type_expression, self.OR, self.type_identifier)
@rule("TypeIdentifier")
def type_identifier(self) -> Rule:
return IDENTIFIER
return self.IDENTIFIER
@rule
def export_statement(self) -> Rule:
return (
seq(EXPORT, self.class_declaration)
| seq(EXPORT, self.function_declaration)
| seq(EXPORT, self.let_statement)
| seq(EXPORT, self.export_list, SEMICOLON)
seq(self.EXPORT, self.class_declaration)
| seq(self.EXPORT, self.function_declaration)
| seq(self.EXPORT, self.let_statement)
| seq(self.EXPORT, self.export_list, self.SEMICOLON)
)
@rule
def export_list(self) -> Rule:
return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list)
return Nothing | self.IDENTIFIER | seq(self.IDENTIFIER, self.COMMA, self.export_list)
# Functions
@rule("FunctionDecl")
def function_declaration(self) -> Rule:
return seq(FUN, IDENTIFIER, self.function_parameters, self.block) | seq(
FUN, IDENTIFIER, self.function_parameters, ARROW, self.type_expression, self.block
return seq(self.FUN, self.IDENTIFIER, self.function_parameters, self.block) | seq(
self.FUN,
self.IDENTIFIER,
self.function_parameters,
self.ARROW,
self.type_expression,
self.block,
)
@rule("ParamList")
def function_parameters(self) -> Rule:
return (
seq(LPAREN, RPAREN)
| seq(LPAREN, self._first_parameter, RPAREN)
| seq(LPAREN, self._first_parameter, COMMA, self._parameter_list, RPAREN)
seq(self.LPAREN, self.RPAREN)
| seq(self.LPAREN, self._first_parameter, self.RPAREN)
| seq(self.LPAREN, self._first_parameter, self.COMMA, self._parameter_list, self.RPAREN)
)
@rule
def _first_parameter(self) -> Rule:
return SELF | self.parameter
return self.SELF | self.parameter
@rule
def _parameter_list(self) -> Rule:
return Nothing | self.parameter | seq(self.parameter, COMMA, self._parameter_list)
return Nothing | self.parameter | seq(self.parameter, self.COMMA, self._parameter_list)
@rule("Parameter")
def parameter(self) -> Rule:
return seq(IDENTIFIER, COLON, self.type_expression)
return seq(self.IDENTIFIER, self.COLON, self.type_expression)
# Block
@rule("Block")
def block(self) -> Rule:
return (
seq(LCURLY, RCURLY)
| seq(LCURLY, self.expression, RCURLY)
| seq(LCURLY, self._statement_list, RCURLY)
| seq(LCURLY, self._statement_list, self.expression, RCURLY)
seq(self.LCURLY, self.RCURLY)
| seq(self.LCURLY, self.expression, self.RCURLY)
| seq(self.LCURLY, self._statement_list, self.RCURLY)
| seq(self.LCURLY, self._statement_list, self.expression, self.RCURLY)
)
@rule
@@ -200,19 +155,19 @@ class FineGrammar(Grammar):
@rule("LetStatement")
def let_statement(self) -> Rule:
return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON)
return seq(self.LET, self.IDENTIFIER, self.EQUAL, self.expression, self.SEMICOLON)
@rule("ReturnStatement")
def return_statement(self) -> Rule:
return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON)
return seq(self.RETURN, self.expression, self.SEMICOLON) | seq(self.RETURN, self.SEMICOLON)
@rule("ForStatement")
def for_statement(self) -> Rule:
return seq(FOR, self.iterator_variable, IN, self.expression, self.block)
return seq(self.FOR, self.iterator_variable, self.IN, self.expression, self.block)
@rule("IteratorVariable")
def iterator_variable(self) -> Rule:
return IDENTIFIER
return self.IDENTIFIER
@rule("IfStatement")
def if_statement(self) -> Rule:
@@ -220,11 +175,11 @@ class FineGrammar(Grammar):
@rule
def while_statement(self) -> Rule:
return seq(WHILE, self.expression, self.block)
return seq(self.WHILE, self.expression, self.block)
@rule
def expression_statement(self) -> Rule:
return seq(self.expression, SEMICOLON)
return seq(self.expression, self.SEMICOLON)
# Expressions
@rule(transparent=True)
@@ -234,91 +189,93 @@ class FineGrammar(Grammar):
@rule("BinaryExpression")
def binary_expression(self) -> Rule:
return (
seq(self.expression, EQUAL, self.expression)
| seq(self.expression, OR, self.expression)
| seq(self.expression, AND, self.expression)
| seq(self.expression, EQUALEQUAL, self.expression)
| seq(self.expression, BANGEQUAL, self.expression)
| seq(self.expression, LESS, self.expression)
| seq(self.expression, LESSEQUAL, self.expression)
| seq(self.expression, GREATER, self.expression)
| seq(self.expression, GREATEREQUAL, self.expression)
| seq(self.expression, PLUS, self.expression)
| seq(self.expression, MINUS, self.expression)
| seq(self.expression, STAR, self.expression)
| seq(self.expression, SLASH, self.expression)
seq(self.expression, self.EQUAL, self.expression)
| seq(self.expression, self.OR, self.expression)
| seq(self.expression, self.AND, self.expression)
| seq(self.expression, self.EQUALEQUAL, self.expression)
| seq(self.expression, self.BANGEQUAL, self.expression)
| seq(self.expression, self.LESS, self.expression)
| seq(self.expression, self.LESSEQUAL, self.expression)
| seq(self.expression, self.GREATER, self.expression)
| seq(self.expression, self.GREATEREQUAL, self.expression)
| seq(self.expression, self.PLUS, self.expression)
| seq(self.expression, self.MINUS, self.expression)
| seq(self.expression, self.STAR, self.expression)
| seq(self.expression, self.SLASH, self.expression)
)
@rule("IsExpression")
def is_expression(self) -> Rule:
return seq(self.expression, IS, self.pattern)
return seq(self.expression, self.IS, self.pattern)
@rule
def primary_expression(self) -> Rule:
return (
self.identifier_expression
| self.literal_expression
| SELF
| seq(BANG, self.primary_expression)
| seq(MINUS, self.primary_expression)
| self.SELF
| seq(self.BANG, self.primary_expression)
| seq(self.MINUS, self.primary_expression)
| self.block
| self.conditional_expression
| self.list_constructor_expression
| self.object_constructor_expression
| self.match_expression
| seq(self.primary_expression, LPAREN, RPAREN)
| seq(self.primary_expression, LPAREN, self._expression_list, RPAREN)
| seq(self.primary_expression, DOT, IDENTIFIER)
| seq(LPAREN, self.expression, RPAREN)
| seq(self.primary_expression, self.LPAREN, self.RPAREN)
| seq(self.primary_expression, self.LPAREN, self._expression_list, self.RPAREN)
| seq(self.primary_expression, self.DOT, self.IDENTIFIER)
| seq(self.LPAREN, self.expression, self.RPAREN)
)
@rule("IdentifierExpression")
def identifier_expression(self):
return IDENTIFIER
return self.IDENTIFIER
@rule("Literal")
def literal_expression(self):
return NUMBER | STRING | TRUE | FALSE
return self.NUMBER | self.STRING | self.TRUE | self.FALSE
@rule("ConditionalExpression")
def conditional_expression(self) -> Rule:
return (
seq(IF, self.expression, self.block)
| seq(IF, self.expression, self.block, ELSE, self.conditional_expression)
| seq(IF, self.expression, self.block, ELSE, self.block)
seq(self.IF, self.expression, self.block)
| seq(self.IF, self.expression, self.block, self.ELSE, self.conditional_expression)
| seq(self.IF, self.expression, self.block, self.ELSE, self.block)
)
@rule
def list_constructor_expression(self) -> Rule:
return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self._expression_list, RSQUARE)
return seq(self.LSQUARE, self.RSQUARE) | seq(
self.LSQUARE, self._expression_list, self.RSQUARE
)
@rule
def _expression_list(self) -> Rule:
return (
self.expression
| seq(self.expression, COMMA)
| seq(self.expression, COMMA, self._expression_list)
| seq(self.expression, self.COMMA)
| seq(self.expression, self.COMMA, self._expression_list)
)
@rule
def match_expression(self) -> Rule:
return seq(MATCH, self.expression, self.match_body)
return seq(self.MATCH, self.expression, self.match_body)
@rule("MatchBody")
def match_body(self) -> Rule:
return seq(LCURLY, RCURLY) | seq(LCURLY, self._match_arms, RCURLY)
return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._match_arms, self.RCURLY)
@rule
def _match_arms(self) -> Rule:
return (
self.match_arm
| seq(self.match_arm, COMMA)
| seq(self.match_arm, COMMA, self._match_arms)
| seq(self.match_arm, self.COMMA)
| seq(self.match_arm, self.COMMA, self._match_arms)
)
@rule("MatchArm")
def match_arm(self) -> Rule:
return seq(self.pattern, ARROW, self.expression)
return seq(self.pattern, self.ARROW, self.expression)
@rule("Pattern")
def pattern(self) -> Rule:
@@ -330,7 +287,7 @@ class FineGrammar(Grammar):
@rule
def _pattern_predicate(self) -> Rule:
return seq(AND, self.expression)
return seq(self.AND, self.expression)
@rule
def _pattern_core(self) -> Rule:
@@ -338,60 +295,116 @@ class FineGrammar(Grammar):
@rule("WildcardPattern")
def wildcard_pattern(self) -> Rule:
return UNDERSCORE
return self.UNDERSCORE
@rule("VariableBinding")
def variable_binding(self) -> Rule:
return seq(IDENTIFIER, COLON)
return seq(self.IDENTIFIER, self.COLON)
@rule
def object_constructor_expression(self) -> Rule:
return seq(NEW, self.type_identifier, self.field_list)
return seq(self.NEW, self.type_identifier, self.field_list)
@rule
def field_list(self) -> Rule:
return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY)
return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self.field_values, self.RCURLY)
@rule
def field_values(self) -> Rule:
return (
self.field_value
| seq(self.field_value, COMMA)
| seq(self.field_value, COMMA, self.field_values)
| seq(self.field_value, self.COMMA)
| seq(self.field_value, self.COMMA, self.field_values)
)
@rule
def field_value(self) -> Rule:
return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression)
return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression)
BLANK = Terminal("[ \t\r\n]+", regex=True)
ARROW = Terminal("->")
AS = Terminal("as")
BAR = Terminal("bar")
CLASS = Terminal("class")
COLON = Terminal("colon")
COMMENT = Terminal("comment")
ELSE = Terminal("else")
FOR = Terminal("for")
FUN = Terminal("fun")
IDENTIFIER = Terminal("[A-Za-z_][A-Za-z0-9_]*", regex=True)
IF = Terminal("if")
IMPORT = Terminal("import")
IN = Terminal("in")
LCURLY = Terminal("{")
LET = Terminal("Let")
RCURLY = Terminal("}")
RETURN = Terminal("return")
SEMICOLON = Terminal(";")
STRING = Terminal('""', regex=True)
WHILE = Terminal("while")
EQUAL = Terminal("=")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
COMMA = Terminal(",")
SELF = Terminal("self", name="SELFF")
OR = Terminal("or")
IS = Terminal("is")
AND = Terminal("and")
EQUALEQUAL = Terminal("==")
BANGEQUAL = Terminal("!=")
LESS = Terminal("<")
GREATER = Terminal(">")
LESSEQUAL = Terminal("<=")
GREATEREQUAL = Terminal(">=")
PLUS = Terminal("+")
MINUS = Terminal("-")
STAR = Terminal("*")
SLASH = Terminal("/")
NUMBER = Terminal("[0-9]+", regex=True)
TRUE = Terminal("true")
FALSE = Terminal("false")
BANG = Terminal("!")
DOT = Terminal(".")
MATCH = Terminal("match")
EXPORT = Terminal("export")
UNDERSCORE = Terminal("_")
NEW = Terminal("new")
LSQUARE = Terminal("[")
RSQUARE = Terminal("]")
# -----------------------------------------------------------------------------
# DORKY LEXER
# -----------------------------------------------------------------------------
import bisect
import dataclasses
NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?")
IDENTIFIER_RE = re.compile("[_A-Za-z][_A-Za-z0-9]*")
KEYWORD_TABLE = {
"_": UNDERSCORE,
"and": AND,
"as": AS,
"class": CLASS,
"else": ELSE,
"export": EXPORT,
"false": FALSE,
"for": FOR,
"fun": FUN,
"if": IF,
"import": IMPORT,
"in": IN,
"is": IS,
"let": LET,
"match": MATCH,
"new": NEW,
"or": OR,
"return": RETURN,
"self": SELF,
"true": TRUE,
"while": WHILE,
"_": FineGrammar.UNDERSCORE,
"and": FineGrammar.AND,
"as": FineGrammar.AS,
"class": FineGrammar.CLASS,
"else": FineGrammar.ELSE,
"export": FineGrammar.EXPORT,
"false": FineGrammar.FALSE,
"for": FineGrammar.FOR,
"fun": FineGrammar.FUN,
"if": FineGrammar.IF,
"import": FineGrammar.IMPORT,
"in": FineGrammar.IN,
"is": FineGrammar.IS,
"let": FineGrammar.LET,
"match": FineGrammar.MATCH,
"new": FineGrammar.NEW,
"or": FineGrammar.OR,
"return": FineGrammar.RETURN,
"self": FineGrammar.SELF,
"true": FineGrammar.TRUE,
"while": FineGrammar.WHILE,
}
@@ -406,63 +419,63 @@ def tokenize(src: str):
token = None
if ch == "-":
if src[pos : pos + 2] == "->":
token = (ARROW, pos, 2)
token = (FineGrammar.ARROW, pos, 2)
else:
token = (MINUS, pos, 1)
token = (FineGrammar.MINUS, pos, 1)
elif ch == "|":
token = (BAR, pos, 1)
token = (FineGrammar.BAR, pos, 1)
elif ch == ":":
token = (COLON, pos, 1)
token = (FineGrammar.COLON, pos, 1)
elif ch == "{":
token = (LCURLY, pos, 1)
token = (FineGrammar.LCURLY, pos, 1)
elif ch == "}":
token = (RCURLY, pos, 1)
token = (FineGrammar.RCURLY, pos, 1)
elif ch == ";":
token = (SEMICOLON, pos, 1)
token = (FineGrammar.SEMICOLON, pos, 1)
elif ch == "=":
if src[pos : pos + 2] == "==":
token = (EQUALEQUAL, pos, 2)
token = (FineGrammar.EQUALEQUAL, pos, 2)
else:
token = (EQUAL, pos, 1)
token = (FineGrammar.EQUAL, pos, 1)
elif ch == "(":
token = (LPAREN, pos, 1)
token = (FineGrammar.LPAREN, pos, 1)
elif ch == ")":
token = (RPAREN, pos, 1)
token = (FineGrammar.RPAREN, pos, 1)
elif ch == ",":
token = (COMMA, pos, 1)
token = (FineGrammar.COMMA, pos, 1)
elif ch == "!":
if src[pos : pos + 2] == "!=":
token = (BANGEQUAL, pos, 2)
token = (FineGrammar.BANGEQUAL, pos, 2)
else:
token = (BANG, pos, 1)
token = (FineGrammar.BANG, pos, 1)
elif ch == "<":
if src[pos : pos + 2] == "<=":
token = (LESSEQUAL, pos, 2)
token = (FineGrammar.LESSEQUAL, pos, 2)
else:
token = (LESS, pos, 1)
token = (FineGrammar.LESS, pos, 1)
elif ch == ">":
if src[pos : pos + 2] == ">=":
token = (GREATEREQUAL, pos, 2)
token = (FineGrammar.GREATEREQUAL, pos, 2)
else:
token = (GREATER, pos, 1)
token = (FineGrammar.GREATER, pos, 1)
elif ch == "+":
token = (PLUS, pos, 1)
token = (FineGrammar.PLUS, pos, 1)
elif ch == "*":
token = (STAR, pos, 1)
token = (FineGrammar.STAR, pos, 1)
elif ch == "/":
if src[pos : pos + 2] == "//":
@@ -470,16 +483,16 @@ def tokenize(src: str):
pos = pos + 1
continue
token = (SLASH, pos, 1)
token = (FineGrammar.SLASH, pos, 1)
elif ch == ".":
token = (DOT, pos, 1)
token = (FineGrammar.DOT, pos, 1)
elif ch == "[":
token = (LSQUARE, pos, 1)
token = (FineGrammar.LSQUARE, pos, 1)
elif ch == "]":
token = (RSQUARE, pos, 1)
token = (FineGrammar.RSQUARE, pos, 1)
elif ch == '"' or ch == "'":
end = pos + 1
@@ -490,12 +503,12 @@ def tokenize(src: str):
if end == len(src):
raise Exception(f"Unterminated string constant at {pos}")
end += 1
token = (STRING, pos, end - pos)
token = (FineGrammar.STRING, pos, end - pos)
else:
number_match = NUMBER_RE.match(src, pos)
if number_match:
token = (NUMBER, pos, number_match.end() - pos)
token = (FineGrammar.NUMBER, pos, number_match.end() - pos)
else:
id_match = IDENTIFIER_RE.match(src, pos)
if id_match:
@@ -504,7 +517,7 @@ def tokenize(src: str):
if keyword:
token = (keyword, pos, len(fragment))
else:
token = (IDENTIFIER, pos, len(fragment))
token = (FineGrammar.IDENTIFIER, pos, len(fragment))
if token is None:
raise Exception("Token error")
@@ -512,9 +525,6 @@ def tokenize(src: str):
pos += token[2]
import bisect
class FineTokens:
def __init__(self, src: str):
self.src = src
@@ -546,4 +556,20 @@ class FineTokens:
if __name__ == "__main__":
FineGrammar().build_table()
grammar = FineGrammar()
grammar.build_table()
class LexTest(Grammar):
@rule
def foo(self):
return self.IS
start = foo
IS = Terminal("is")
AS = Terminal("as")
IDENTIFIER = Terminal("[a-z]+", regex=True)
# IDENTIFIER = Terminal("[A-Za-z_][A-Za-z0-9_]*", regex=True)
# NOTE: compile_lexer and dump_lexer_table are the WIP lexer compiler; they are still commented out in tests/test_lexer.py.
lexer = compile_lexer(LexTest())
dump_lexer_table(lexer)


@@ -21,19 +21,20 @@ To get started, create a grammar that derives from the `Grammar` class. Create
one method per nonterminal, decorated with the `rule` decorator. Here's an
example:
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal('id')
class SimpleGrammar(Grammar):
@rule
def expression(self):
return seq(self.expression, PLUS, self.term) | self.term
return seq(self.expression, self.PLUS, self.term) | self.term
@rule
def term(self):
return seq(LPAREN, self.expression, RPAREN) | ID
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal('id')
## Using grammars
@@ -1605,10 +1606,14 @@ class Rule:
class Terminal(Rule):
"""A token, or terminal symbol in the grammar."""
value: str
value: str | None
pattern: str
regex: bool
def __init__(self, value):
self.value = sys.intern(value)
def __init__(self, pattern, name=None, regex=False):
self.value = name
self.pattern = pattern
self.regex = regex
def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]:
# We are just ourselves when flattened.
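With the new signature the first argument is the match pattern, `name` optionally overrides the display name, and `regex` marks the pattern as a regular expression. The forms below are all drawn from this commit's own terminal declarations:

ARROW = Terminal("->")                    # literal pattern; name is filled in from the declaring member
NUMBER = Terminal("[0-9]+", regex=True)   # regular-expression pattern
SELF = Terminal("self", name="SELFF")     # explicit name override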
@@ -1766,19 +1771,20 @@ class Grammar:
Here's an example of a simple grammar:
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal('id')
class SimpleGrammar(Grammar):
@rule
def expression(self):
return seq(self.expression, PLUS, self.term) | self.term
return seq(self.expression, self.PLUS, self.term) | self.term
@rule
def term(self):
return seq(LPAREN, self.expression, RPAREN) | ID
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal('id')
Not very exciting, perhaps, but it's something.
"""
@@ -1786,6 +1792,7 @@ class Grammar:
_precedence: dict[str, typing.Tuple[Assoc, int]]
_start: str
_generator: type[GenerateLR0]
_terminals: list[Terminal]
def __init__(
self,
@@ -1809,6 +1816,14 @@ class Grammar:
generator = getattr(self, "generator", GenerateLALR)
assert generator is not None
# Fixup terminal names with the name of the member that declared it.
terminals = []
for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)):
if t.value is None:
t.value = n
terminals.append(t)
# Fix up the precedence table.
precedence_table = {}
for prec, (associativity, symbols) in enumerate(precedence):
for symbol in symbols:
@@ -1824,6 +1839,11 @@ class Grammar:
self._precedence = precedence_table
self._start = start
self._generator = generator
self._terminals = terminals
@property
def terminals(self) -> list[Terminal]:
return self._terminals
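A minimal sketch of what the fixup buys (assuming, as the tests below do, that a `Grammar` subclass is constructible with no arguments):

class Tiny(Grammar):
    start = "E"
    PLUS = Terminal("+")  # value is None until Grammar.__init__ runs

    @rule
    def E(self):
        return self.PLUS

g = Tiny()
assert g.PLUS.value == "PLUS"                      # named after the declaring member
assert [t.value for t in g.terminals] == ["PLUS"]  # and collected into g.terminals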
def generate_nonterminal_dict(
self, start: str | None = None
@@ -1911,3 +1931,149 @@ class Grammar:
gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
table = gen.gen_table()
return table
###############################################################################
# Lexer support
###############################################################################
# For machine-generated lexers
@dataclasses.dataclass(frozen=True, slots=True)
class Span:
lower: int # inclusive
upper: int # exclusive
@classmethod
def from_str(cls, c: str) -> "Span":
return Span(lower=ord(c), upper=ord(c) + 1)
def intersects(self, other: "Span") -> bool:
return self.lower < other.upper and self.upper > other.lower
def split(self, other: "Span") -> tuple["Span|None", "Span", "Span|None"]:
assert self.intersects(other)
first = min(self.lower, other.lower)
second = max(self.lower, other.lower)
third = min(self.upper, other.upper)
fourth = max(self.upper, other.upper)
low = Span(first, second) if first != second else None
mid = Span(second, third)
hi = Span(third, fourth) if third != fourth else None
return (low, mid, hi)
def __str__(self) -> str:
if self.upper - self.lower == 1:
return str(self.lower)
lower = str(self.lower)
upper = str(self.upper)
return f"[{lower}-{upper})"
def __lt__(self, other: "Span") -> bool:
return self.lower < other.lower
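A worked example of how `split` partitions two overlapping spans (values chosen arbitrarily):

a = Span(1, 5)
b = Span(3, 8)
lo, mid, hi = a.split(b)
# lo  == Span(1, 3): covered by a alone
# mid == Span(3, 5): covered by both
# hi  == Span(5, 8): covered by b alone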
ET = typing.TypeVar("ET")
class EdgeList[ET]:
"""A list of edge transitions, keyed by *span*. A given span can have
multiple targets, because this supports NFAs."""
_edges: list[tuple[Span, list[ET]]]
def __init__(self):
self._edges = []
def __iter__(self) -> typing.Iterator[tuple[Span, list[ET]]]:
return iter(self._edges)
def __repr__(self) -> str:
return f"EdgeList[{','.join(str(s[0]) + '->' + repr(s[1]) for s in self._edges)}]"
def add_edge(self, c: Span, s: ET):
"""Add an edge for the given span to the list. If there are already
spans that overlap this one, split them, generating multiple distinct
edges.
"""
# print(f" Adding {c}->{s} to {self}...")
# Look to see where we would put this span based solely on a
# sort of lower bounds.
point = bisect.bisect_left(self._edges, c, key=lambda x: x[0])
# If this is not the first span in the list then we might
# overlap with the span to our left....
if point > 0:
left_point = point - 1
left_span, left_targets = self._edges[left_point]
if c.intersects(left_span):
# ...if we intersect with the span to our left then we
# must split the span to our left with regards to our
# span. Then we have three target spans:
#
# - The lo one, which just has the targets from the old
# left span. (This may be empty if we overlap the
# left one completely on the left side.)
#
# - The mid one, which has both the targets from the
# old left and the new target.
#
# - The hi one, which carries only our target if the new
# span extends past the old left span, or only the old
# left targets if the new span ended inside it. In the
# first case it replaces the current span for further
# processing; otherwise we re-insert it and stop.
#
del self._edges[left_point]
lo, mid, hi = c.split(left_span)
# print(f" <- {c} splits {left_span} -> {lo}, {mid}, {hi} @{left_point}")
self._edges.insert(left_point, (mid, left_targets + [s]))
if lo is not None:
self._edges.insert(left_point, (lo, left_targets))
# mid (and possibly lo) replaced the old left span, so
# recompute where our right neighbor now sits.
point = left_point + (2 if lo is not None else 1)
if hi is None:
# Yup, completely subsumed.
# print(f"  result: {self} (left out)")
return
if not hi.intersects(c):
# `c` ended inside the old left span: re-insert the upper
# remainder with the original left targets so it isn't lost.
self._edges.insert(point, (hi, left_targets))
return
# Continue processing with `c` as the hi split from the
# left. If the left and right spans abut each other then
# `c` will be subsumed in our right span.
c = hi
# If point is not at the very end of the list then it might
# overlap the span to our right...
if point < len(self._edges):
right_span, right_targets = self._edges[point]
if c.intersects(right_span):
# ...this is similar to the left case, above, except the
# lower bound has the targets that are only ours, etc.
del self._edges[point]
lo, mid, hi = c.split(right_span)
# print(f" -> {c} splits {right_span} -> {lo}, {mid}, {hi} @{point}")
if hi is not None:
# hi keeps the old right targets unless the new span extends
# past the old right span, in which case it is ours alone.
self._edges.insert(point, (hi, [s] if hi.intersects(c) else right_targets))
self._edges.insert(point, (mid, right_targets + [s]))
if lo is None or not lo.intersects(c):
# Our span is completely subsumed on the lower side
# of the range; there is no lower side that just has
# our targets. Bail now.
# print(f" result: {self} (right out)")
return
# Continue processing with `c` as the lo split, since
# that's the one that has only the specified state as the
# target.
c = lo
# If we made it here then either we have a point that does not
# intersect at all, or it only partially intersects on either the
# left or right. Either way, we have ensured that:
#
# - c doesn't intersect with left or right (any more)
# - point is where it should go
self._edges.insert(point, (c, [s]))
# print(f" result: {self} (done)")


@@ -38,25 +38,27 @@ def _tree(treeform) -> runtime.Tree | runtime.TokenValue:
def test_lr0_lr0():
"""An LR0 grammar should work with an LR0 generator."""
PLUS = Terminal("+")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
IDENTIFIER = Terminal("id")
class LR0Grammar(Grammar):
class G(Grammar):
start = "E"
generator = parser.GenerateLR0
@rule
def E(self):
return seq(self.E, PLUS, self.T) | self.T
return seq(self.E, self.PLUS, self.T) | self.T
@rule
def T(self):
return seq(LPAREN, self.E, RPAREN) | IDENTIFIER
return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER
table = LR0Grammar().build_table()
tree, errors = runtime.Parser(table).parse(Tokens(IDENTIFIER, PLUS, LPAREN, IDENTIFIER, RPAREN))
PLUS = Terminal("+", name="+")
LPAREN = Terminal("(", name="(")
RPAREN = Terminal(")", name=")")
IDENTIFIER = Terminal("id", name="id")
table = G().build_table()
tree, errors = runtime.Parser(table).parse(
Tokens(G.IDENTIFIER, G.PLUS, G.LPAREN, G.IDENTIFIER, G.RPAREN)
)
assert errors == []
assert tree == _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")")))
@@ -65,114 +67,114 @@ def test_lr0_lr0():
def test_lr0_shift_reduce():
"""This one should not work in LR0- it has a shift/reduce conflict, but works in SLR1."""
PLUS = Terminal("+")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
LSQUARE = Terminal("[")
RSQUARE = Terminal("]")
IDENTIFIER = Terminal("id")
class TestGrammar(Grammar):
class G(Grammar):
start = "E"
generator = parser.GenerateLR0
@rule
def E(self):
return seq(self.E, PLUS, self.T) | self.T
return seq(self.E, self.PLUS, self.T) | self.T
@rule
def T(self):
return (
seq(LPAREN, self.E, RPAREN) | IDENTIFIER | seq(IDENTIFIER, LSQUARE, self.E, RSQUARE)
seq(self.LPAREN, self.E, self.RPAREN)
| self.IDENTIFIER
| seq(self.IDENTIFIER, self.LSQUARE, self.E, self.RSQUARE)
)
with pytest.raises(parser.AmbiguityError):
TestGrammar().build_table()
PLUS = Terminal("+")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
LSQUARE = Terminal("[")
RSQUARE = Terminal("]")
IDENTIFIER = Terminal("id")
TestGrammar().build_table(generator=parser.GenerateSLR1)
with pytest.raises(parser.AmbiguityError):
G().build_table()
G().build_table(generator=parser.GenerateSLR1)
def test_lr0_reduce_reduce():
"""This one should not work, it has a reduce-reduce conflict."""
PLUS = Terminal("+")
EQUAL = Terminal("=")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
IDENTIFIER = Terminal("id")
class TestGrammar(Grammar):
class G(Grammar):
start = "E"
generator = parser.GenerateLR0
@rule
def E(self):
return seq(self.E, PLUS, self.T) | self.T | seq(self.V, EQUAL, self.E)
return seq(self.E, self.PLUS, self.T) | self.T | seq(self.V, self.EQUAL, self.E)
@rule
def T(self):
return seq(LPAREN, self.E, RPAREN) | IDENTIFIER
return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER
@rule
def V(self):
return IDENTIFIER
return self.IDENTIFIER
PLUS = Terminal("+")
EQUAL = Terminal("=")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
IDENTIFIER = Terminal("id")
with pytest.raises(parser.AmbiguityError):
TestGrammar().build_table()
G().build_table()
def test_lr0_empty():
"""LR0 can't handle empty productions because it doesn't know when to reduce."""
BOOP = Terminal("boop")
BEEP = Terminal("beep")
class TestGrammar(Grammar):
class G(Grammar):
start = "E"
generator = parser.GenerateLR0
@rule
def E(self):
return seq(self.F, BOOP)
return seq(self.F, self.BOOP)
@rule
def F(self):
return BEEP | parser.Nothing
return self.BEEP | parser.Nothing
BOOP = Terminal("boop")
BEEP = Terminal("beep")
with pytest.raises(parser.AmbiguityError):
TestGrammar().build_table()
G().build_table()
def test_grammar_aho_ullman_1():
EQUAL = Terminal("=")
STAR = Terminal("*")
ID = Terminal("id")
class TestGrammar(Grammar):
class G(Grammar):
start = "S"
generator = parser.GenerateSLR1
@rule
def S(self):
return seq(self.L, EQUAL, self.R) | self.R
return seq(self.L, self.EQUAL, self.R) | self.R
@rule
def L(self):
return seq(STAR, self.R) | ID
return seq(self.STAR, self.R) | self.ID
@rule
def R(self):
return self.L
with pytest.raises(parser.AmbiguityError):
TestGrammar().build_table()
EQUAL = Terminal("=")
STAR = Terminal("*")
ID = Terminal("id")
TestGrammar().build_table(generator=parser.GenerateLR1)
with pytest.raises(parser.AmbiguityError):
G().build_table()
G().build_table(generator=parser.GenerateLR1)
def test_grammar_aho_ullman_2():
A = Terminal("a")
B = Terminal("b")
class TestGrammar(Grammar):
start = "S"
generator = parser.GenerateSLR1
@@ -183,7 +185,10 @@ def test_grammar_aho_ullman_2():
@rule
def X(self):
return seq(A, self.X) | B
return seq(self.A, self.X) | self.B
A = Terminal("a")
B = Terminal("b")
TestGrammar().build_table()
TestGrammar().build_table(generator=parser.GenerateLR1)
@@ -191,11 +196,6 @@ def test_grammar_aho_ullman_2():
def test_fun_lalr():
PLUS = Terminal("+")
INT = Terminal("int")
ID = Terminal("id")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
class TestGrammar(Grammar):
start = "S"
@@ -207,15 +207,21 @@ def test_fun_lalr():
@rule
def E(self):
return self.F | seq(self.E, PLUS, self.F)
return self.F | seq(self.E, self.PLUS, self.F)
@rule
def F(self):
return self.V | INT | seq(LPAREN, self.E, RPAREN)
return self.V | self.INT | seq(self.LPAREN, self.E, self.RPAREN)
@rule
def V(self):
return ID
return self.ID
PLUS = Terminal("+")
INT = Terminal("int")
ID = Terminal("id")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
TestGrammar().build_table()
@@ -234,14 +240,14 @@ def test_conflicting_names():
to understand.
"""
IDENTIFIER = Terminal("Identifier")
class TestGrammar(Grammar):
start = "Identifier"
start = "IDENTIFIER"
@rule("Identifier")
@rule("IDENTIFIER")
def identifier(self):
return IDENTIFIER
return self.IDENTIFIER
IDENTIFIER = Terminal("Identifier")
with pytest.raises(ValueError):
TestGrammar().build_table()

tests/test_lexer.py (new file, +452 lines)

@@ -0,0 +1,452 @@
from parser import Span
# LexerTable = list[tuple[Terminal | None, list[tuple[Span, int]]]]
# def compile_lexer(x: Grammar) -> LexerTable:
# class State:
# """An NFA state. Each state can be the accept state, with one or more
# Terminals as the result."""
# accept: list[Terminal]
# epsilons: list["State"]
# _edges: EdgeList["State"]
# def __init__(self):
# self.accept = []
# self.epsilons = []
# self._edges = EdgeList()
# def __repr__(self):
# return f"State{id(self)}"
# def edges(self) -> typing.Iterable[tuple[Span, list["State"]]]:
# return self._edges
# def add_edge(self, c: Span, s: "State") -> "State":
# self._edges.add_edge(c, s)
# return s
# def dump_graph(self, name="nfa.dot"):
# with open(name, "w", encoding="utf8") as f:
# f.write("digraph G {\n")
# stack: list[State] = [self]
# visited = set()
# while len(stack) > 0:
# state = stack.pop()
# if state in visited:
# continue
# visited.add(state)
# label = ", ".join([t.value for t in state.accept if t.value is not None])
# f.write(f' {id(state)} [label="{label}"];\n')
# for target in state.epsilons:
# stack.append(target)
# f.write(f' {id(state)} -> {id(target)} [label="\u03B5"];\n')
# for span, targets in state.edges():
# label = str(span).replace('"', '\\"')
# for target in targets:
# stack.append(target)
# f.write(f' {id(state)} -> {id(target)} [label="{label}"];\n')
# f.write("}\n")
# @dataclasses.dataclass
# class RegexNode:
# def to_nfa(self, start: State) -> State:
# del start
# raise NotImplementedError()
# def __str__(self) -> str:
# raise NotImplementedError()
# @dataclasses.dataclass
# class RegexLiteral(RegexNode):
# values: list[tuple[str, str]]
# def to_nfa(self, start: State) -> State:
# end = State()
# for s, e in self.values:
# start.add_edge(Span(ord(s), ord(e)), end)
# return end
# def __str__(self) -> str:
# if len(self.values) == 1:
# start, end = self.values[0]
# if start == end:
# return start
# ranges = []
# for start, end in self.values:
# if start == end:
# ranges.append(start)
# else:
# ranges.append(f"{start}-{end}")
# return "![{}]".format("".join(ranges))
# @dataclasses.dataclass
# class RegexPlus(RegexNode):
# child: RegexNode
# def to_nfa(self, start: State) -> State:
# end = self.child.to_nfa(start)
# end.epsilons.append(start)
# return end
# def __str__(self) -> str:
# return f"({self.child})+"
# @dataclasses.dataclass
# class RegexStar(RegexNode):
# child: RegexNode
# def to_nfa(self, start: State) -> State:
# end = self.child.to_nfa(start)
# end.epsilons.append(start)
# start.epsilons.append(end)
# return end
# def __str__(self) -> str:
# return f"({self.child})*"
# @dataclasses.dataclass
# class RegexQuestion(RegexNode):
# child: RegexNode
# def to_nfa(self, start: State) -> State:
# end = self.child.to_nfa(start)
# start.epsilons.append(end)
# return end
# def __str__(self) -> str:
# return f"({self.child})?"
# @dataclasses.dataclass
# class RegexSequence(RegexNode):
# left: RegexNode
# right: RegexNode
# def to_nfa(self, start: State) -> State:
# mid = self.left.to_nfa(start)
# return self.right.to_nfa(mid)
# def __str__(self) -> str:
# return f"{self.left}{self.right}"
# @dataclasses.dataclass
# class RegexAlternation(RegexNode):
# left: RegexNode
# right: RegexNode
# def to_nfa(self, start: State) -> State:
# left_start = State()
# start.epsilons.append(left_start)
# left_end = self.left.to_nfa(left_start)
# right_start = State()
# start.epsilons.append(right_start)
# right_end = self.right.to_nfa(right_start)
# end = State()
# left_end.epsilons.append(end)
# right_end.epsilons.append(end)
# return end
# def __str__(self) -> str:
# return f"(({self.left})||({self.right}))"
# class RegexParser:
# # TODO: HANDLE ALTERNATION AND PRECEDENCE (CONCAT HAS HIGHEST PRECEDENCE)
# PREFIX: dict[str, typing.Callable[[str], RegexNode]]
# POSTFIX: dict[str, typing.Callable[[RegexNode, int], RegexNode]]
# BINDING: dict[str, tuple[int, int]]
# index: int
# pattern: str
# def __init__(self, pattern: str):
# self.PREFIX = {
# "(": self.parse_group,
# "[": self.parse_set,
# }
# self.POSTFIX = {
# "+": self.parse_plus,
# "*": self.parse_star,
# "?": self.parse_question,
# "|": self.parse_alternation,
# }
# self.BINDING = {
# "|": (1, 1),
# "+": (2, 2),
# "*": (2, 2),
# "?": (2, 2),
# ")": (-1, -1), # Always stop parsing on )
# }
# self.index = 0
# self.pattern = pattern
# def consume(self) -> str:
# if self.index >= len(self.pattern):
# raise ValueError(f"Unable to parse regular expression '{self.pattern}'")
# result = self.pattern[self.index]
# self.index += 1
# return result
# def peek(self) -> str | None:
# if self.index >= len(self.pattern):
# return None
# return self.pattern[self.index]
# def eof(self) -> bool:
# return self.index >= len(self.pattern)
# def expect(self, ch: str):
# actual = self.consume()
# if ch != actual:
# raise ValueError(f"Expected '{ch}'")
# def parse_regex(self, minimum_binding=0) -> RegexNode:
# ch = self.consume()
# parser = self.PREFIX.get(ch, self.parse_single)
# node = parser(ch)
# while not self.eof():
# ch = self.peek()
# assert ch is not None
# lp, rp = self.BINDING.get(ch, (minimum_binding, minimum_binding))
# if lp < minimum_binding:
# break
# parser = self.POSTFIX.get(ch, self.parse_concat)
# node = parser(node, rp)
# return node
# def parse_single(self, ch: str) -> RegexNode:
# return RegexLiteral(values=[(ch, ch)])
# def parse_group(self, ch: str) -> RegexNode:
# del ch
# node = self.parse_regex()
# self.expect(")")
# return node
# def parse_set(self, ch: str) -> RegexNode:
# del ch
# # TODO: INVERSION?
# ranges = []
# while self.peek() not in (None, "]"):
# start = self.consume()
# if self.peek() == "-":
# self.consume()
# end = self.consume()
# else:
# end = start
# ranges.append((start, end))
# self.expect("]")
# return RegexLiteral(values=ranges)
# def parse_alternation(self, node: RegexNode, rp: int) -> RegexNode:
# return RegexAlternation(left=node, right=self.parse_regex(rp))
# def parse_plus(self, left: RegexNode, rp: int) -> RegexNode:
# del rp
# self.expect("+")
# return RegexPlus(child=left)
# def parse_star(self, left: RegexNode, rp: int) -> RegexNode:
# del rp
# self.expect("*")
# return RegexStar(child=left)
# def parse_question(self, left: RegexNode, rp: int) -> RegexNode:
# del rp
# self.expect("?")
# return RegexQuestion(child=left)
# def parse_concat(self, left: RegexNode, rp: int) -> RegexNode:
# return RegexSequence(left, self.parse_regex(rp))
# class SuperState:
# states: frozenset[State]
# index: int
# def __init__(self, states: typing.Iterable[State]):
# # Close over the given states, including every state that is
# # reachable by epsilon-transition.
# stack = list(states)
# result = set()
# while len(stack) > 0:
# st = stack.pop()
# if st in result:
# continue
# result.add(st)
# stack.extend(st.epsilons)
# self.states = frozenset(result)
# self.index = -1
# def __eq__(self, other):
# if not isinstance(other, SuperState):
# return False
# return self.states == other.states
# def __hash__(self) -> int:
# return hash(self.states)
# def edges(self) -> list[tuple[Span, "SuperState"]]:
# working: EdgeList[list[State]] = EdgeList()
# for st in self.states:
# for span, targets in st.edges():
# working.add_edge(span, targets)
# # EdgeList maps span to list[list[State]] which we want to flatten.
# result = []
# for span, stateses in working:
# s: list[State] = []
# for states in stateses:
# s.extend(states)
# result.append((span, SuperState(s)))
# return result
# def accept_terminal(self) -> Terminal | None:
# accept = None
# for st in self.states:
# for ac in st.accept:
# if accept is None:
# accept = ac
# elif accept.value != ac.value:
# if accept.regex and not ac.regex:
# accept = ac
# elif ac.regex and not accept.regex:
# pass
# else:
# raise ValueError(
# f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')"
# )
# return accept
# # Parse the terminals all together into a big NFA rooted at `NFA`.
# NFA = State()
# for token in x.terminals:
# start = State()
# NFA.epsilons.append(start)
# if token.regex:
# node = RegexParser(token.pattern).parse_regex()
# print(f" Parsed {token.pattern} to {node}")
# ending = node.to_nfa(start)
# else:
# ending = start
# for c in token.pattern:
# ending = ending.add_edge(Span.from_str(c), State())
# ending.accept.append(token)
# NFA.dump_graph()
# # Convert the NFA into a DFA in the most straightforward way (by tracking
# # sets of state closures, called SuperStates.)
# DFA: dict[SuperState, list[tuple[Span, SuperState]]] = {}
# stack = [SuperState([NFA])]
# while len(stack) > 0:
# ss = stack.pop()
# if ss in DFA:
# continue
# edges = ss.edges()
# DFA[ss] = edges
# for _, target in edges:
# stack.append(target)
# for i, k in enumerate(DFA):
# k.index = i
# return [
# (
# ss.accept_terminal(),
# [(k, v.index) for k, v in edges],
# )
# for ss, edges in DFA.items()
# ]
# def dump_lexer_table(table: LexerTable):
# with open("lexer.dot", "w", encoding="utf-8") as f:
# f.write("digraph G {\n")
# for index, (accept, edges) in enumerate(table):
# label = accept.value if accept is not None else ""
# f.write(f' {index} [label="{label}"];\n')
# for span, target in edges:
# label = str(span).replace('"', '\\"')
# f.write(f' {index} -> {target} [label="{label}"];\n')
# pass
# f.write("}\n")
# def generic_tokenize(src: str, table: LexerTable):
# pos = 0
# state = 0
# start = 0
# last_accept = None
# last_accept_pos = 0
# while pos < len(src):
# accept, edges = table[state]
# if accept is not None:
# last_accept = accept
# last_accept_pos = pos + 1
# char = ord(src[pos])
# # Find the index of the span where the upper value is the tightest
# # bound on the character.
# index = bisect.bisect_left(edges, char, key=lambda x: x[0].upper)
# # If the character is greater than or equal to the lower bound we
# # found then we have a hit, otherwise no.
# state = edges[index][1] if index < len(edges) and char >= edges[index][0].lower else None
# if state is None:
# if last_accept is None:
# raise Exception(f"Token error at {pos}")
# yield (last_accept, start, last_accept_pos - start)
# last_accept = None
# pos = last_accept_pos
# start = pos
# state = 0
# else:
# pos += 1
def test_span_intersection():
pairs = [
((1, 3), (2, 4)),
((1, 3), (2, 3)),
((1, 3), (1, 2)),
((1, 3), (0, 2)),
((1, 3), (0, 4)),
]
for a, b in pairs:
left = Span(*a)
right = Span(*b)
assert left.intersects(right)
assert right.intersects(left)
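A companion sketch exercising `EdgeList` splitting could look like this (assuming `EdgeList` is importable from `parser`, just as `Span` is above):

from parser import EdgeList

def test_edge_list_split():
    edges: EdgeList[int] = EdgeList()
    edges.add_edge(Span(1, 10), 1)
    edges.add_edge(Span(5, 15), 2)
    assert list(edges) == [
        (Span(1, 5), [1]),
        (Span(5, 10), [1, 2]),
        (Span(10, 15), [2]),
    ]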