Move terminals into grammar definition

Starting to work on machine-generated lexers too
This commit is contained in:
John Doty 2024-08-23 07:24:30 -07:00
parent f6bc2ccea8
commit 58c3004702
4 changed files with 917 additions and 267 deletions

View file

@ -2,57 +2,7 @@
import re import re
import typing import typing
import parser from parser import Assoc, Grammar, Nothing, rule, seq, Rule, Terminal
from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule
ARROW = Terminal("Arrow")
AS = Terminal("As")
BAR = Terminal("Bar")
CLASS = Terminal("Class")
COLON = Terminal("Colon")
ELSE = Terminal("Else")
FOR = Terminal("For")
FUN = Terminal("Fun")
IDENTIFIER = Terminal("Identifier")
IF = Terminal("If")
IMPORT = Terminal("Import")
IN = Terminal("In")
LCURLY = Terminal("LeftBrace")
LET = Terminal("Let")
RCURLY = Terminal("RightBrace")
RETURN = Terminal("Return")
SEMICOLON = Terminal("Semicolon")
STRING = Terminal("String")
WHILE = Terminal("While")
EQUAL = Terminal("Equal")
LPAREN = Terminal("LeftParen")
RPAREN = Terminal("RightParen")
COMMA = Terminal("Comma")
SELF = Terminal("Selff")
OR = Terminal("Or")
IS = Terminal("Is")
AND = Terminal("And")
EQUALEQUAL = Terminal("EqualEqual")
BANGEQUAL = Terminal("BangEqual")
LESS = Terminal("Less")
GREATER = Terminal("Greater")
LESSEQUAL = Terminal("LessEqual")
GREATEREQUAL = Terminal("GreaterEqual")
PLUS = Terminal("Plus")
MINUS = Terminal("Minus")
STAR = Terminal("Star")
SLASH = Terminal("Slash")
NUMBER = Terminal("Number")
TRUE = Terminal("True")
FALSE = Terminal("False")
BANG = Terminal("Bang")
DOT = Terminal("Dot")
MATCH = Terminal("Match")
EXPORT = Terminal("Export")
UNDERSCORE = Terminal("Underscore")
NEW = Terminal("New")
LSQUARE = Terminal("LeftBracket")
RSQUARE = Terminal("RightBracket")
class FineGrammar(Grammar): class FineGrammar(Grammar):
@ -62,17 +12,17 @@ class FineGrammar(Grammar):
def __init__(self): def __init__(self):
super().__init__( super().__init__(
precedence=[ precedence=[
(Assoc.RIGHT, [EQUAL]), (Assoc.RIGHT, [self.EQUAL]),
(Assoc.LEFT, [OR]), (Assoc.LEFT, [self.OR]),
(Assoc.LEFT, [IS]), (Assoc.LEFT, [self.IS]),
(Assoc.LEFT, [AND]), (Assoc.LEFT, [self.AND]),
(Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]), (Assoc.LEFT, [self.EQUALEQUAL, self.BANGEQUAL]),
(Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]), (Assoc.LEFT, [self.LESS, self.GREATER, self.GREATEREQUAL, self.LESSEQUAL]),
(Assoc.LEFT, [PLUS, MINUS]), (Assoc.LEFT, [self.PLUS, self.MINUS]),
(Assoc.LEFT, [STAR, SLASH]), (Assoc.LEFT, [self.STAR, self.SLASH]),
(Assoc.LEFT, [self.primary_expression]), (Assoc.LEFT, [self.primary_expression]),
(Assoc.LEFT, [LPAREN]), (Assoc.LEFT, [self.LPAREN]),
(Assoc.LEFT, [DOT]), (Assoc.LEFT, [self.DOT]),
# #
# If there's a confusion about whether to make an IF # If there's a confusion about whether to make an IF
# statement or an expression, prefer the statement. # statement or an expression, prefer the statement.
@ -97,15 +47,15 @@ class FineGrammar(Grammar):
@rule @rule
def import_statement(self) -> Rule: def import_statement(self) -> Rule:
return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON) return seq(self.IMPORT, self.STRING, self.AS, self.IDENTIFIER, self.SEMICOLON)
@rule("ClassDeclaration") @rule("ClassDeclaration")
def class_declaration(self) -> Rule: def class_declaration(self) -> Rule:
return seq(CLASS, IDENTIFIER, self._class_body) return seq(self.CLASS, self.IDENTIFIER, self._class_body)
@rule @rule
def _class_body(self) -> Rule: def _class_body(self) -> Rule:
return seq(LCURLY, RCURLY) | seq(LCURLY, self._class_members, RCURLY) return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._class_members, self.RCURLY)
@rule @rule
def _class_members(self) -> Rule: def _class_members(self) -> Rule:
@ -117,7 +67,7 @@ class FineGrammar(Grammar):
@rule("FieldDecl") @rule("FieldDecl")
def field_declaration(self) -> Rule: def field_declaration(self) -> Rule:
return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON) return seq(self.IDENTIFIER, self.COLON, self.type_expression, self.SEMICOLON)
# Types # Types
@rule("TypeExpression") @rule("TypeExpression")
@ -126,60 +76,65 @@ class FineGrammar(Grammar):
@rule("AlternateType") @rule("AlternateType")
def alternate_type(self) -> Rule: def alternate_type(self) -> Rule:
return seq(self.type_expression, OR, self.type_identifier) return seq(self.type_expression, self.OR, self.type_identifier)
@rule("TypeIdentifier") @rule("TypeIdentifier")
def type_identifier(self) -> Rule: def type_identifier(self) -> Rule:
return IDENTIFIER return self.IDENTIFIER
@rule @rule
def export_statement(self) -> Rule: def export_statement(self) -> Rule:
return ( return (
seq(EXPORT, self.class_declaration) seq(self.EXPORT, self.class_declaration)
| seq(EXPORT, self.function_declaration) | seq(self.EXPORT, self.function_declaration)
| seq(EXPORT, self.let_statement) | seq(self.EXPORT, self.let_statement)
| seq(EXPORT, self.export_list, SEMICOLON) | seq(self.EXPORT, self.export_list, self.SEMICOLON)
) )
@rule @rule
def export_list(self) -> Rule: def export_list(self) -> Rule:
return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list) return Nothing | self.IDENTIFIER | seq(self.IDENTIFIER, self.COMMA, self.export_list)
# Functions # Functions
@rule("FunctionDecl") @rule("FunctionDecl")
def function_declaration(self) -> Rule: def function_declaration(self) -> Rule:
return seq(FUN, IDENTIFIER, self.function_parameters, self.block) | seq( return seq(self.FUN, self.IDENTIFIER, self.function_parameters, self.block) | seq(
FUN, IDENTIFIER, self.function_parameters, ARROW, self.type_expression, self.block self.FUN,
self.IDENTIFIER,
self.function_parameters,
self.ARROW,
self.type_expression,
self.block,
) )
@rule("ParamList") @rule("ParamList")
def function_parameters(self) -> Rule: def function_parameters(self) -> Rule:
return ( return (
seq(LPAREN, RPAREN) seq(self.LPAREN, self.RPAREN)
| seq(LPAREN, self._first_parameter, RPAREN) | seq(self.LPAREN, self._first_parameter, self.RPAREN)
| seq(LPAREN, self._first_parameter, COMMA, self._parameter_list, RPAREN) | seq(self.LPAREN, self._first_parameter, self.COMMA, self._parameter_list, self.RPAREN)
) )
@rule @rule
def _first_parameter(self) -> Rule: def _first_parameter(self) -> Rule:
return SELF | self.parameter return self.SELF | self.parameter
@rule @rule
def _parameter_list(self) -> Rule: def _parameter_list(self) -> Rule:
return Nothing | self.parameter | seq(self.parameter, COMMA, self._parameter_list) return Nothing | self.parameter | seq(self.parameter, self.COMMA, self._parameter_list)
@rule("Parameter") @rule("Parameter")
def parameter(self) -> Rule: def parameter(self) -> Rule:
return seq(IDENTIFIER, COLON, self.type_expression) return seq(self.IDENTIFIER, self.COLON, self.type_expression)
# Block # Block
@rule("Block") @rule("Block")
def block(self) -> Rule: def block(self) -> Rule:
return ( return (
seq(LCURLY, RCURLY) seq(self.LCURLY, self.RCURLY)
| seq(LCURLY, self.expression, RCURLY) | seq(self.LCURLY, self.expression, self.RCURLY)
| seq(LCURLY, self._statement_list, RCURLY) | seq(self.LCURLY, self._statement_list, self.RCURLY)
| seq(LCURLY, self._statement_list, self.expression, RCURLY) | seq(self.LCURLY, self._statement_list, self.expression, self.RCURLY)
) )
@rule @rule
@ -200,19 +155,19 @@ class FineGrammar(Grammar):
@rule("LetStatement") @rule("LetStatement")
def let_statement(self) -> Rule: def let_statement(self) -> Rule:
return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON) return seq(self.LET, self.IDENTIFIER, self.EQUAL, self.expression, self.SEMICOLON)
@rule("ReturnStatement") @rule("ReturnStatement")
def return_statement(self) -> Rule: def return_statement(self) -> Rule:
return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON) return seq(self.RETURN, self.expression, self.SEMICOLON) | seq(self.RETURN, self.SEMICOLON)
@rule("ForStatement") @rule("ForStatement")
def for_statement(self) -> Rule: def for_statement(self) -> Rule:
return seq(FOR, self.iterator_variable, IN, self.expression, self.block) return seq(self.FOR, self.iterator_variable, self.IN, self.expression, self.block)
@rule("IteratorVariable") @rule("IteratorVariable")
def iterator_variable(self) -> Rule: def iterator_variable(self) -> Rule:
return IDENTIFIER return self.IDENTIFIER
@rule("IfStatement") @rule("IfStatement")
def if_statement(self) -> Rule: def if_statement(self) -> Rule:
@ -220,11 +175,11 @@ class FineGrammar(Grammar):
@rule @rule
def while_statement(self) -> Rule: def while_statement(self) -> Rule:
return seq(WHILE, self.expression, self.block) return seq(self.WHILE, self.expression, self.block)
@rule @rule
def expression_statement(self) -> Rule: def expression_statement(self) -> Rule:
return seq(self.expression, SEMICOLON) return seq(self.expression, self.SEMICOLON)
# Expressions # Expressions
@rule(transparent=True) @rule(transparent=True)
@ -234,91 +189,93 @@ class FineGrammar(Grammar):
@rule("BinaryExpression") @rule("BinaryExpression")
def binary_expression(self) -> Rule: def binary_expression(self) -> Rule:
return ( return (
seq(self.expression, EQUAL, self.expression) seq(self.expression, self.EQUAL, self.expression)
| seq(self.expression, OR, self.expression) | seq(self.expression, self.OR, self.expression)
| seq(self.expression, AND, self.expression) | seq(self.expression, self.AND, self.expression)
| seq(self.expression, EQUALEQUAL, self.expression) | seq(self.expression, self.EQUALEQUAL, self.expression)
| seq(self.expression, BANGEQUAL, self.expression) | seq(self.expression, self.BANGEQUAL, self.expression)
| seq(self.expression, LESS, self.expression) | seq(self.expression, self.LESS, self.expression)
| seq(self.expression, LESSEQUAL, self.expression) | seq(self.expression, self.LESSEQUAL, self.expression)
| seq(self.expression, GREATER, self.expression) | seq(self.expression, self.GREATER, self.expression)
| seq(self.expression, GREATEREQUAL, self.expression) | seq(self.expression, self.GREATEREQUAL, self.expression)
| seq(self.expression, PLUS, self.expression) | seq(self.expression, self.PLUS, self.expression)
| seq(self.expression, MINUS, self.expression) | seq(self.expression, self.MINUS, self.expression)
| seq(self.expression, STAR, self.expression) | seq(self.expression, self.STAR, self.expression)
| seq(self.expression, SLASH, self.expression) | seq(self.expression, self.SLASH, self.expression)
) )
@rule("IsExpression") @rule("IsExpression")
def is_expression(self) -> Rule: def is_expression(self) -> Rule:
return seq(self.expression, IS, self.pattern) return seq(self.expression, self.IS, self.pattern)
@rule @rule
def primary_expression(self) -> Rule: def primary_expression(self) -> Rule:
return ( return (
self.identifier_expression self.identifier_expression
| self.literal_expression | self.literal_expression
| SELF | self.SELF
| seq(BANG, self.primary_expression) | seq(self.BANG, self.primary_expression)
| seq(MINUS, self.primary_expression) | seq(self.MINUS, self.primary_expression)
| self.block | self.block
| self.conditional_expression | self.conditional_expression
| self.list_constructor_expression | self.list_constructor_expression
| self.object_constructor_expression | self.object_constructor_expression
| self.match_expression | self.match_expression
| seq(self.primary_expression, LPAREN, RPAREN) | seq(self.primary_expression, self.LPAREN, self.RPAREN)
| seq(self.primary_expression, LPAREN, self._expression_list, RPAREN) | seq(self.primary_expression, self.LPAREN, self._expression_list, self.RPAREN)
| seq(self.primary_expression, DOT, IDENTIFIER) | seq(self.primary_expression, self.DOT, self.IDENTIFIER)
| seq(LPAREN, self.expression, RPAREN) | seq(self.LPAREN, self.expression, self.RPAREN)
) )
@rule("IdentifierExpression") @rule("IdentifierExpression")
def identifier_expression(self): def identifier_expression(self):
return IDENTIFIER return self.IDENTIFIER
@rule("Literal") @rule("Literal")
def literal_expression(self): def literal_expression(self):
return NUMBER | STRING | TRUE | FALSE return self.NUMBER | self.STRING | self.TRUE | self.FALSE
@rule("ConditionalExpression") @rule("ConditionalExpression")
def conditional_expression(self) -> Rule: def conditional_expression(self) -> Rule:
return ( return (
seq(IF, self.expression, self.block) seq(self.IF, self.expression, self.block)
| seq(IF, self.expression, self.block, ELSE, self.conditional_expression) | seq(self.IF, self.expression, self.block, self.ELSE, self.conditional_expression)
| seq(IF, self.expression, self.block, ELSE, self.block) | seq(self.IF, self.expression, self.block, self.ELSE, self.block)
) )
@rule @rule
def list_constructor_expression(self) -> Rule: def list_constructor_expression(self) -> Rule:
return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self._expression_list, RSQUARE) return seq(self.LSQUARE, self.RSQUARE) | seq(
self.LSQUARE, self._expression_list, self.RSQUARE
)
@rule @rule
def _expression_list(self) -> Rule: def _expression_list(self) -> Rule:
return ( return (
self.expression self.expression
| seq(self.expression, COMMA) | seq(self.expression, self.COMMA)
| seq(self.expression, COMMA, self._expression_list) | seq(self.expression, self.COMMA, self._expression_list)
) )
@rule @rule
def match_expression(self) -> Rule: def match_expression(self) -> Rule:
return seq(MATCH, self.expression, self.match_body) return seq(self.MATCH, self.expression, self.match_body)
@rule("MatchBody") @rule("MatchBody")
def match_body(self) -> Rule: def match_body(self) -> Rule:
return seq(LCURLY, RCURLY) | seq(LCURLY, self._match_arms, RCURLY) return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._match_arms, self.RCURLY)
@rule @rule
def _match_arms(self) -> Rule: def _match_arms(self) -> Rule:
return ( return (
self.match_arm self.match_arm
| seq(self.match_arm, COMMA) | seq(self.match_arm, self.COMMA)
| seq(self.match_arm, COMMA, self._match_arms) | seq(self.match_arm, self.COMMA, self._match_arms)
) )
@rule("MatchArm") @rule("MatchArm")
def match_arm(self) -> Rule: def match_arm(self) -> Rule:
return seq(self.pattern, ARROW, self.expression) return seq(self.pattern, self.ARROW, self.expression)
@rule("Pattern") @rule("Pattern")
def pattern(self) -> Rule: def pattern(self) -> Rule:
@ -330,7 +287,7 @@ class FineGrammar(Grammar):
@rule @rule
def _pattern_predicate(self) -> Rule: def _pattern_predicate(self) -> Rule:
return seq(AND, self.expression) return seq(self.AND, self.expression)
@rule @rule
def _pattern_core(self) -> Rule: def _pattern_core(self) -> Rule:
@ -338,60 +295,116 @@ class FineGrammar(Grammar):
@rule("WildcardPattern") @rule("WildcardPattern")
def wildcard_pattern(self) -> Rule: def wildcard_pattern(self) -> Rule:
return UNDERSCORE return self.UNDERSCORE
@rule("VariableBinding") @rule("VariableBinding")
def variable_binding(self) -> Rule: def variable_binding(self) -> Rule:
return seq(IDENTIFIER, COLON) return seq(self.IDENTIFIER, self.COLON)
@rule @rule
def object_constructor_expression(self) -> Rule: def object_constructor_expression(self) -> Rule:
return seq(NEW, self.type_identifier, self.field_list) return seq(self.NEW, self.type_identifier, self.field_list)
@rule @rule
def field_list(self) -> Rule: def field_list(self) -> Rule:
return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY) return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self.field_values, self.RCURLY)
@rule @rule
def field_values(self) -> Rule: def field_values(self) -> Rule:
return ( return (
self.field_value self.field_value
| seq(self.field_value, COMMA) | seq(self.field_value, self.COMMA)
| seq(self.field_value, COMMA, self.field_values) | seq(self.field_value, self.COMMA, self.field_values)
) )
@rule @rule
def field_value(self) -> Rule: def field_value(self) -> Rule:
return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression) return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression)
BLANK = Terminal("[ \t\r\n]+", regex=True)
ARROW = Terminal("->")
AS = Terminal("as")
BAR = Terminal("bar")
CLASS = Terminal("class")
COLON = Terminal("colon")
COMMENT = Terminal("comment")
ELSE = Terminal("else")
FOR = Terminal("for")
FUN = Terminal("fun")
IDENTIFIER = Terminal("[A-Za-z_][A-Za-z0-9_]*", regex=True)
IF = Terminal("if")
IMPORT = Terminal("import")
IN = Terminal("in")
LCURLY = Terminal("{")
LET = Terminal("Let")
RCURLY = Terminal("}")
RETURN = Terminal("return")
SEMICOLON = Terminal(";")
STRING = Terminal('""', regex=True)
WHILE = Terminal("while")
EQUAL = Terminal("=")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
COMMA = Terminal(",")
SELF = Terminal("self", name="SELFF")
OR = Terminal("or")
IS = Terminal("is")
AND = Terminal("and")
EQUALEQUAL = Terminal("==")
BANGEQUAL = Terminal("!=")
LESS = Terminal("<")
GREATER = Terminal(">")
LESSEQUAL = Terminal("<=")
GREATEREQUAL = Terminal(">=")
PLUS = Terminal("+")
MINUS = Terminal("-")
STAR = Terminal("*")
SLASH = Terminal("/")
NUMBER = Terminal("[0-9]+", regex=True)
TRUE = Terminal("true")
FALSE = Terminal("false")
BANG = Terminal("!")
DOT = Terminal(".")
MATCH = Terminal("match")
EXPORT = Terminal("export")
UNDERSCORE = Terminal("_")
NEW = Terminal("new")
LSQUARE = Terminal("[")
RSQUARE = Terminal("]")
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# DORKY LEXER # DORKY LEXER
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
import bisect
import dataclasses
NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?") NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?")
IDENTIFIER_RE = re.compile("[_A-Za-z][_A-Za-z0-9]*") IDENTIFIER_RE = re.compile("[_A-Za-z][_A-Za-z0-9]*")
KEYWORD_TABLE = { KEYWORD_TABLE = {
"_": UNDERSCORE, "_": FineGrammar.UNDERSCORE,
"and": AND, "and": FineGrammar.AND,
"as": AS, "as": FineGrammar.AS,
"class": CLASS, "class": FineGrammar.CLASS,
"else": ELSE, "else": FineGrammar.ELSE,
"export": EXPORT, "export": FineGrammar.EXPORT,
"false": FALSE, "false": FineGrammar.FALSE,
"for": FOR, "for": FineGrammar.FOR,
"fun": FUN, "fun": FineGrammar.FUN,
"if": IF, "if": FineGrammar.IF,
"import": IMPORT, "import": FineGrammar.IMPORT,
"in": IN, "in": FineGrammar.IN,
"is": IS, "is": FineGrammar.IS,
"let": LET, "let": FineGrammar.LET,
"match": MATCH, "match": FineGrammar.MATCH,
"new": NEW, "new": FineGrammar.NEW,
"or": OR, "or": FineGrammar.OR,
"return": RETURN, "return": FineGrammar.RETURN,
"self": SELF, "self": FineGrammar.SELF,
"true": TRUE, "true": FineGrammar.TRUE,
"while": WHILE, "while": FineGrammar.WHILE,
} }
@ -406,63 +419,63 @@ def tokenize(src: str):
token = None token = None
if ch == "-": if ch == "-":
if src[pos : pos + 2] == "->": if src[pos : pos + 2] == "->":
token = (ARROW, pos, 2) token = (FineGrammar.ARROW, pos, 2)
else: else:
token = (MINUS, pos, 1) token = (FineGrammar.MINUS, pos, 1)
elif ch == "|": elif ch == "|":
token = (BAR, pos, 1) token = (FineGrammar.BAR, pos, 1)
elif ch == ":": elif ch == ":":
token = (COLON, pos, 1) token = (FineGrammar.COLON, pos, 1)
elif ch == "{": elif ch == "{":
token = (LCURLY, pos, 1) token = (FineGrammar.LCURLY, pos, 1)
elif ch == "}": elif ch == "}":
token = (RCURLY, pos, 1) token = (FineGrammar.RCURLY, pos, 1)
elif ch == ";": elif ch == ";":
token = (SEMICOLON, pos, 1) token = (FineGrammar.SEMICOLON, pos, 1)
elif ch == "=": elif ch == "=":
if src[pos : pos + 2] == "==": if src[pos : pos + 2] == "==":
token = (EQUALEQUAL, pos, 2) token = (FineGrammar.EQUALEQUAL, pos, 2)
else: else:
token = (EQUAL, pos, 1) token = (FineGrammar.EQUAL, pos, 1)
elif ch == "(": elif ch == "(":
token = (LPAREN, pos, 1) token = (FineGrammar.LPAREN, pos, 1)
elif ch == ")": elif ch == ")":
token = (RPAREN, pos, 1) token = (FineGrammar.RPAREN, pos, 1)
elif ch == ",": elif ch == ",":
token = (COMMA, pos, 1) token = (FineGrammar.COMMA, pos, 1)
elif ch == "!": elif ch == "!":
if src[pos : pos + 2] == "!=": if src[pos : pos + 2] == "!=":
token = (BANGEQUAL, pos, 2) token = (FineGrammar.BANGEQUAL, pos, 2)
else: else:
token = (BANG, pos, 1) token = (FineGrammar.BANG, pos, 1)
elif ch == "<": elif ch == "<":
if src[pos : pos + 2] == "<=": if src[pos : pos + 2] == "<=":
token = (LESSEQUAL, pos, 2) token = (FineGrammar.LESSEQUAL, pos, 2)
else: else:
token = (LESS, pos, 1) token = (FineGrammar.LESS, pos, 1)
elif ch == ">": elif ch == ">":
if src[pos : pos + 2] == ">=": if src[pos : pos + 2] == ">=":
token = (GREATEREQUAL, pos, 2) token = (FineGrammar.GREATEREQUAL, pos, 2)
else: else:
token = (GREATER, pos, 1) token = (FineGrammar.GREATER, pos, 1)
elif ch == "+": elif ch == "+":
token = (PLUS, pos, 1) token = (FineGrammar.PLUS, pos, 1)
elif ch == "*": elif ch == "*":
token = (STAR, pos, 1) token = (FineGrammar.STAR, pos, 1)
elif ch == "/": elif ch == "/":
if src[pos : pos + 2] == "//": if src[pos : pos + 2] == "//":
@ -470,16 +483,16 @@ def tokenize(src: str):
pos = pos + 1 pos = pos + 1
continue continue
token = (SLASH, pos, 1) token = (FineGrammar.SLASH, pos, 1)
elif ch == ".": elif ch == ".":
token = (DOT, pos, 1) token = (FineGrammar.DOT, pos, 1)
elif ch == "[": elif ch == "[":
token = (LSQUARE, pos, 1) token = (FineGrammar.LSQUARE, pos, 1)
elif ch == "]": elif ch == "]":
token = (RSQUARE, pos, 1) token = (FineGrammar.RSQUARE, pos, 1)
elif ch == '"' or ch == "'": elif ch == '"' or ch == "'":
end = pos + 1 end = pos + 1
@ -490,12 +503,12 @@ def tokenize(src: str):
if end == len(src): if end == len(src):
raise Exception(f"Unterminated string constant at {pos}") raise Exception(f"Unterminated string constant at {pos}")
end += 1 end += 1
token = (STRING, pos, end - pos) token = (FineGrammar.STRING, pos, end - pos)
else: else:
number_match = NUMBER_RE.match(src, pos) number_match = NUMBER_RE.match(src, pos)
if number_match: if number_match:
token = (NUMBER, pos, number_match.end() - pos) token = (FineGrammar.NUMBER, pos, number_match.end() - pos)
else: else:
id_match = IDENTIFIER_RE.match(src, pos) id_match = IDENTIFIER_RE.match(src, pos)
if id_match: if id_match:
@ -504,7 +517,7 @@ def tokenize(src: str):
if keyword: if keyword:
token = (keyword, pos, len(fragment)) token = (keyword, pos, len(fragment))
else: else:
token = (IDENTIFIER, pos, len(fragment)) token = (FineGrammar.IDENTIFIER, pos, len(fragment))
if token is None: if token is None:
raise Exception("Token error") raise Exception("Token error")
@ -512,9 +525,6 @@ def tokenize(src: str):
pos += token[2] pos += token[2]
import bisect
class FineTokens: class FineTokens:
def __init__(self, src: str): def __init__(self, src: str):
self.src = src self.src = src
@ -546,4 +556,20 @@ class FineTokens:
if __name__ == "__main__": if __name__ == "__main__":
FineGrammar().build_table() grammar = FineGrammar()
grammar.build_table()
class LexTest(Grammar):
@rule
def foo(self):
return self.IS
start = foo
IS = Terminal("is")
AS = Terminal("as")
IDENTIFIER = Terminal("[a-z]+", regex=True)
# IDENTIFIER = Terminal("[A-Za-z_][A-Za-z0-9_]*", regex=True)
lexer = compile_lexer(LexTest())
dump_lexer_table(lexer)

View file

@ -21,19 +21,20 @@ To get started, create a grammar that derives from the `Grammar` class. Create
one method per nonterminal, decorated with the `rule` decorator. Here's an one method per nonterminal, decorated with the `rule` decorator. Here's an
example: example:
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal('id')
class SimpleGrammar(Grammar): class SimpleGrammar(Grammar):
@rule @rule
def expression(self): def expression(self):
return seq(self.expression, PLUS, self.term) | self.term return seq(self.expression, self.PLUS, self.term) | self.term
@rule @rule
def term(self): def term(self):
return seq(LPAREN, self.expression, RPAREN) | ID return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal('id')
## Using grammars ## Using grammars
@ -1605,10 +1606,14 @@ class Rule:
class Terminal(Rule): class Terminal(Rule):
"""A token, or terminal symbol in the grammar.""" """A token, or terminal symbol in the grammar."""
value: str value: str | None
pattern: str
regex: bool
def __init__(self, value): def __init__(self, pattern, name=None, regex=False):
self.value = sys.intern(value) self.value = name
self.pattern = pattern
self.regex = regex
def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]: def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]:
# We are just ourselves when flattened. # We are just ourselves when flattened.
@ -1766,19 +1771,20 @@ class Grammar:
Here's an example of a simple grammar: Here's an example of a simple grammar:
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal('id')
class SimpleGrammar(Grammar): class SimpleGrammar(Grammar):
@rule @rule
def expression(self): def expression(self):
return seq(self.expression, PLUS, self.term) | self.term return seq(self.expression, self.PLUS, self.term) | self.term
@rule @rule
def term(self): def term(self):
return seq(LPAREN, self.expression, RPAREN) | ID return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal('id')
Not very exciting, perhaps, but it's something. Not very exciting, perhaps, but it's something.
""" """
@ -1786,6 +1792,7 @@ class Grammar:
_precedence: dict[str, typing.Tuple[Assoc, int]] _precedence: dict[str, typing.Tuple[Assoc, int]]
_start: str _start: str
_generator: type[GenerateLR0] _generator: type[GenerateLR0]
_terminals: list[Terminal]
def __init__( def __init__(
self, self,
@ -1809,6 +1816,14 @@ class Grammar:
generator = getattr(self, "generator", GenerateLALR) generator = getattr(self, "generator", GenerateLALR)
assert generator is not None assert generator is not None
# Fixup terminal names with the name of the member that declared it.
terminals = []
for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)):
if t.value is None:
t.value = n
terminals.append(t)
# Fix up the precedence table.
precedence_table = {} precedence_table = {}
for prec, (associativity, symbols) in enumerate(precedence): for prec, (associativity, symbols) in enumerate(precedence):
for symbol in symbols: for symbol in symbols:
@ -1824,6 +1839,11 @@ class Grammar:
self._precedence = precedence_table self._precedence = precedence_table
self._start = start self._start = start
self._generator = generator self._generator = generator
self._terminals = terminals
@property
def terminals(self) -> list[Terminal]:
return self._terminals
def generate_nonterminal_dict( def generate_nonterminal_dict(
self, start: str | None = None self, start: str | None = None
@ -1911,3 +1931,149 @@ class Grammar:
gen = generator(start, desugared, precedence=self._precedence, transparents=transparents) gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
table = gen.gen_table() table = gen.gen_table()
return table return table
###############################################################################
# Lexer support
###############################################################################
# For machine-generated lexers
@dataclasses.dataclass(frozen=True, slots=True)
class Span:
lower: int # inclusive
upper: int # exclusive
@classmethod
def from_str(cls, c: str) -> "Span":
return Span(lower=ord(c), upper=ord(c) + 1)
def intersects(self, other: "Span") -> bool:
return self.lower < other.upper and self.upper > other.lower
def split(self, other: "Span") -> tuple["Span|None", "Span", "Span|None"]:
assert self.intersects(other)
first = min(self.lower, other.lower)
second = max(self.lower, other.lower)
third = min(self.upper, other.upper)
fourth = max(self.upper, other.upper)
low = Span(first, second) if first != second else None
mid = Span(second, third)
hi = Span(third, fourth) if third != fourth else None
return (low, mid, hi)
def __str__(self) -> str:
if self.upper - self.lower == 1:
return str(self.lower)
lower = str(self.lower)
upper = str(self.upper)
return f"[{lower}-{upper})"
def __lt__(self, other: "Span") -> bool:
return self.lower < other.lower
ET = typing.TypeVar("ET")
class EdgeList[ET]:
"""A list of edge transitions, keyed by *span*. A given span can have
multiple targets, because this supports NFAs."""
_edges: list[tuple[Span, list[ET]]]
def __init__(self):
self._edges = []
def __iter__(self) -> typing.Iterator[tuple[Span, list[ET]]]:
return iter(self._edges)
def __repr__(self) -> str:
return f"EdgeList[{','.join(str(s[0]) + '->' + repr(s[1]) for s in self._edges)}]"
def add_edge(self, c: Span, s: ET):
"""Add an edge for the given span to the list. If there are already
spans that overlap this one, split and generating multiple distinct
edges.
"""
# print(f" Adding {c}->{s} to {self}...")
# Look to see where we would put this span based solely on a
# sort of lower bounds.
point = bisect.bisect_left(self._edges, c, key=lambda x: x[0])
# If this is not the first span in the list then we might
# overlap with the span to our left....
if point > 0:
left_point = point - 1
left_span, left_targets = self._edges[left_point]
if c.intersects(left_span):
# ...if we intersect with the span to our left then we
# must split the span to our left with regards to our
# span. Then we have three target spans:
#
# - The lo one, which just has the targets from the old
# left span. (This may be empty if we overlap the
# left one completely on the left side.)
#
# - The mid one, which has both the targets from the
# old left and the new target.
#
# - The hi one, which if it exists only has our target.
# If it exists it basically replaces the current span
# for our future processing. (If not, then our span
# is completely subsumed into the left span and we
# can stop.)
#
del self._edges[left_point]
lo, mid, hi = c.split(left_span)
# print(f" <- {c} splits {left_span} -> {lo}, {mid}, {hi} @{left_point}")
self._edges.insert(left_point, (mid, left_targets + [s]))
if lo is not None:
self._edges.insert(left_point, (lo, left_targets))
if hi is None or not hi.intersects(c):
# Yup, completely subsumed.
# print(f" result: {self} (left out)")
return
# Continue processing with `c` as the hi split from the
# left. If the left and right spans abut each other then
# `c` will be subsumed in our right span.
c = hi
# If point is not at the very end of the list then it might
# overlap the span to our right...
if point < len(self._edges):
right_span, right_targets = self._edges[point]
if c.intersects(right_span):
# ...this is similar to the left case, above, except the
# lower bound has the targets that our only ours, etc.
del self._edges[point]
lo, mid, hi = c.split(right_span)
# print(f" -> {c} splits {right_span} -> {lo}, {mid}, {hi} @{point}")
if hi is not None:
self._edges.insert(point, (hi, right_targets))
self._edges.insert(point, (mid, right_targets + [s]))
if lo is None or not lo.intersects(c):
# Our span is completely subsumed on the lower side
# of the range; there is no lower side that just has
# our targets. Bail now.
# print(f" result: {self} (right out)")
return
# Continue processing with `c` as the lo split, since
# that's the one that has only the specified state as the
# target.
c = lo
# If we made it here then either we have a point that does not
# intersect at all, or it only partially intersects on either the
# left or right. Either way, we have ensured that:
#
# - c doesn't intersect with left or right (any more)
# - point is where it should go
self._edges.insert(point, (c, [s]))
# print(f" result: {self} (done)")

View file

@ -38,25 +38,27 @@ def _tree(treeform) -> runtime.Tree | runtime.TokenValue:
def test_lr0_lr0(): def test_lr0_lr0():
"""An LR0 grammar should work with an LR0 generator.""" """An LR0 grammar should work with an LR0 generator."""
PLUS = Terminal("+") class G(Grammar):
LPAREN = Terminal("(")
RPAREN = Terminal(")")
IDENTIFIER = Terminal("id")
class LR0Grammar(Grammar):
start = "E" start = "E"
generator = parser.GenerateLR0 generator = parser.GenerateLR0
@rule @rule
def E(self): def E(self):
return seq(self.E, PLUS, self.T) | self.T return seq(self.E, self.PLUS, self.T) | self.T
@rule @rule
def T(self): def T(self):
return seq(LPAREN, self.E, RPAREN) | IDENTIFIER return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER
table = LR0Grammar().build_table() PLUS = Terminal("+", name="+")
tree, errors = runtime.Parser(table).parse(Tokens(IDENTIFIER, PLUS, LPAREN, IDENTIFIER, RPAREN)) LPAREN = Terminal("(", name="(")
RPAREN = Terminal(")", name=")")
IDENTIFIER = Terminal("id", name="id")
table = G().build_table()
tree, errors = runtime.Parser(table).parse(
Tokens(G.IDENTIFIER, G.PLUS, G.LPAREN, G.IDENTIFIER, G.RPAREN)
)
assert errors == [] assert errors == []
assert tree == _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")"))) assert tree == _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")")))
@ -65,114 +67,114 @@ def test_lr0_lr0():
def test_lr0_shift_reduce(): def test_lr0_shift_reduce():
"""This one should not work in LR0- it has a shift/reduce conflict, but works in SLR1.""" """This one should not work in LR0- it has a shift/reduce conflict, but works in SLR1."""
PLUS = Terminal("+") class G(Grammar):
LPAREN = Terminal("(")
RPAREN = Terminal(")")
LSQUARE = Terminal("[")
RSQUARE = Terminal("]")
IDENTIFIER = Terminal("id")
class TestGrammar(Grammar):
start = "E" start = "E"
generator = parser.GenerateLR0 generator = parser.GenerateLR0
@rule @rule
def E(self): def E(self):
return seq(self.E, PLUS, self.T) | self.T return seq(self.E, self.PLUS, self.T) | self.T
@rule @rule
def T(self): def T(self):
return ( return (
seq(LPAREN, self.E, RPAREN) | IDENTIFIER | seq(IDENTIFIER, LSQUARE, self.E, RSQUARE) seq(self.LPAREN, self.E, self.RPAREN)
| self.IDENTIFIER
| seq(self.IDENTIFIER, self.LSQUARE, self.E, self.RSQUARE)
) )
with pytest.raises(parser.AmbiguityError): PLUS = Terminal("+")
TestGrammar().build_table() LPAREN = Terminal("(")
RPAREN = Terminal(")")
LSQUARE = Terminal("[")
RSQUARE = Terminal("]")
IDENTIFIER = Terminal("id")
TestGrammar().build_table(generator=parser.GenerateSLR1) with pytest.raises(parser.AmbiguityError):
G().build_table()
G().build_table(generator=parser.GenerateSLR1)
def test_lr0_reduce_reduce(): def test_lr0_reduce_reduce():
"""This one should not work, it has a reduce-reduce conflict.""" """This one should not work, it has a reduce-reduce conflict."""
PLUS = Terminal("+") class G(Grammar):
EQUAL = Terminal("=")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
IDENTIFIER = Terminal("id")
class TestGrammar(Grammar):
start = "E" start = "E"
generator = parser.GenerateLR0 generator = parser.GenerateLR0
@rule @rule
def E(self): def E(self):
return seq(self.E, PLUS, self.T) | self.T | seq(self.V, EQUAL, self.E) return seq(self.E, self.PLUS, self.T) | self.T | seq(self.V, self.EQUAL, self.E)
@rule @rule
def T(self): def T(self):
return seq(LPAREN, self.E, RPAREN) | IDENTIFIER return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER
@rule @rule
def V(self): def V(self):
return IDENTIFIER return self.IDENTIFIER
PLUS = Terminal("+")
EQUAL = Terminal("=")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
IDENTIFIER = Terminal("id")
with pytest.raises(parser.AmbiguityError): with pytest.raises(parser.AmbiguityError):
TestGrammar().build_table() G().build_table()
def test_lr0_empty(): def test_lr0_empty():
"""LR0 can't handle empty productions because it doesn't know when to reduce.""" """LR0 can't handle empty productions because it doesn't know when to reduce."""
BOOP = Terminal("boop")
BEEP = Terminal("beep")
class TestGrammar(Grammar): class G(Grammar):
start = "E" start = "E"
generator = parser.GenerateLR0 generator = parser.GenerateLR0
@rule @rule
def E(self): def E(self):
return seq(self.F, BOOP) return seq(self.F, self.BOOP)
@rule @rule
def F(self): def F(self):
return BEEP | parser.Nothing return self.BEEP | parser.Nothing
BOOP = Terminal("boop")
BEEP = Terminal("beep")
with pytest.raises(parser.AmbiguityError): with pytest.raises(parser.AmbiguityError):
TestGrammar().build_table() G().build_table()
def test_grammar_aho_ullman_1(): def test_grammar_aho_ullman_1():
EQUAL = Terminal("=") class G(Grammar):
STAR = Terminal("*")
ID = Terminal("id")
class TestGrammar(Grammar):
start = "S" start = "S"
generator = parser.GenerateSLR1 generator = parser.GenerateSLR1
@rule @rule
def S(self): def S(self):
return seq(self.L, EQUAL, self.R) | self.R return seq(self.L, self.EQUAL, self.R) | self.R
@rule @rule
def L(self): def L(self):
return seq(STAR, self.R) | ID return seq(self.STAR, self.R) | self.ID
@rule @rule
def R(self): def R(self):
return self.L return self.L
with pytest.raises(parser.AmbiguityError): EQUAL = Terminal("=")
TestGrammar().build_table() STAR = Terminal("*")
ID = Terminal("id")
TestGrammar().build_table(generator=parser.GenerateLR1) with pytest.raises(parser.AmbiguityError):
G().build_table()
G().build_table(generator=parser.GenerateLR1)
def test_grammar_aho_ullman_2(): def test_grammar_aho_ullman_2():
A = Terminal("a")
B = Terminal("b")
class TestGrammar(Grammar): class TestGrammar(Grammar):
start = "S" start = "S"
generator = parser.GenerateSLR1 generator = parser.GenerateSLR1
@ -183,7 +185,10 @@ def test_grammar_aho_ullman_2():
@rule @rule
def X(self): def X(self):
return seq(A, self.X) | B return seq(self.A, self.X) | self.B
A = Terminal("a")
B = Terminal("b")
TestGrammar().build_table() TestGrammar().build_table()
TestGrammar().build_table(generator=parser.GenerateLR1) TestGrammar().build_table(generator=parser.GenerateLR1)
@ -191,11 +196,6 @@ def test_grammar_aho_ullman_2():
def test_fun_lalr(): def test_fun_lalr():
PLUS = Terminal("+")
INT = Terminal("int")
ID = Terminal("id")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
class TestGrammar(Grammar): class TestGrammar(Grammar):
start = "S" start = "S"
@ -207,15 +207,21 @@ def test_fun_lalr():
@rule @rule
def E(self): def E(self):
return self.F | seq(self.E, PLUS, self.F) return self.F | seq(self.E, self.PLUS, self.F)
@rule @rule
def F(self): def F(self):
return self.V | INT | seq(LPAREN, self.E, RPAREN) return self.V | self.INT | seq(self.LPAREN, self.E, self.RPAREN)
@rule @rule
def V(self): def V(self):
return ID return self.ID
PLUS = Terminal("+")
INT = Terminal("int")
ID = Terminal("id")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
TestGrammar().build_table() TestGrammar().build_table()
@ -234,14 +240,14 @@ def test_conflicting_names():
to understand. to understand.
""" """
IDENTIFIER = Terminal("Identifier")
class TestGrammar(Grammar): class TestGrammar(Grammar):
start = "Identifier" start = "IDENTIFIER"
@rule("Identifier") @rule("IDENTIFIER")
def identifier(self): def identifier(self):
return IDENTIFIER return self.IDENTIFIER
IDENTIFIER = Terminal("Identifier")
with pytest.raises(ValueError): with pytest.raises(ValueError):
TestGrammar().build_table() TestGrammar().build_table()

452
tests/test_lexer.py Normal file
View file

@ -0,0 +1,452 @@
from parser import Span
# LexerTable = list[tuple[Terminal | None, list[tuple[Span, int]]]]
# def compile_lexer(x: Grammar) -> LexerTable:
# class State:
# """An NFA state. Each state can be the accept state, with one or more
# Terminals as the result."""
# accept: list[Terminal]
# epsilons: list["State"]
# _edges: EdgeList["State"]
# def __init__(self):
# self.accept = []
# self.epsilons = []
# self._edges = EdgeList()
# def __repr__(self):
# return f"State{id(self)}"
# def edges(self) -> typing.Iterable[tuple[Span, list["State"]]]:
# return self._edges
# def add_edge(self, c: Span, s: "State") -> "State":
# self._edges.add_edge(c, s)
# return s
# def dump_graph(self, name="nfa.dot"):
# with open(name, "w", encoding="utf8") as f:
# f.write("digraph G {\n")
# stack: list[State] = [self]
# visited = set()
# while len(stack) > 0:
# state = stack.pop()
# if state in visited:
# continue
# visited.add(state)
# label = ", ".join([t.value for t in state.accept if t.value is not None])
# f.write(f' {id(state)} [label="{label}"];\n')
# for target in state.epsilons:
# stack.append(target)
# f.write(f' {id(state)} -> {id(target)} [label="\u03B5"];\n')
# for span, targets in state.edges():
# label = str(span).replace('"', '\\"')
# for target in targets:
# stack.append(target)
# f.write(f' {id(state)} -> {id(target)} [label="{label}"];\n')
# f.write("}\n")
# @dataclasses.dataclass
# class RegexNode:
# def to_nfa(self, start: State) -> State:
# del start
# raise NotImplementedError()
# def __str__(self) -> str:
# raise NotImplementedError()
# @dataclasses.dataclass
# class RegexLiteral(RegexNode):
# values: list[tuple[str, str]]
# def to_nfa(self, start: State) -> State:
# end = State()
# for s, e in self.values:
# start.add_edge(Span(ord(s), ord(e)), end)
# return end
# def __str__(self) -> str:
# if len(self.values) == 1:
# start, end = self.values[0]
# if start == end:
# return start
# ranges = []
# for start, end in self.values:
# if start == end:
# ranges.append(start)
# else:
# ranges.append(f"{start}-{end}")
# return "![{}]".format("".join(ranges))
# @dataclasses.dataclass
# class RegexPlus(RegexNode):
# child: RegexNode
# def to_nfa(self, start: State) -> State:
# end = self.child.to_nfa(start)
# end.epsilons.append(start)
# return end
# def __str__(self) -> str:
# return f"({self.child})+"
# @dataclasses.dataclass
# class RegexStar(RegexNode):
# child: RegexNode
# def to_nfa(self, start: State) -> State:
# end = self.child.to_nfa(start)
# end.epsilons.append(start)
# start.epsilons.append(end)
# return end
# def __str__(self) -> str:
# return f"({self.child})*"
# @dataclasses.dataclass
# class RegexQuestion(RegexNode):
# child: RegexNode
# def to_nfa(self, start: State) -> State:
# end = self.child.to_nfa(start)
# start.epsilons.append(end)
# return end
# def __str__(self) -> str:
# return f"({self.child})?"
# @dataclasses.dataclass
# class RegexSequence(RegexNode):
# left: RegexNode
# right: RegexNode
# def to_nfa(self, start: State) -> State:
# mid = self.left.to_nfa(start)
# return self.right.to_nfa(mid)
# def __str__(self) -> str:
# return f"{self.left}{self.right}"
# @dataclasses.dataclass
# class RegexAlternation(RegexNode):
# left: RegexNode
# right: RegexNode
# def to_nfa(self, start: State) -> State:
# left_start = State()
# start.epsilons.append(left_start)
# left_end = self.left.to_nfa(left_start)
# right_start = State()
# start.epsilons.append(right_start)
# right_end = self.right.to_nfa(right_start)
# end = State()
# left_end.epsilons.append(end)
# right_end.epsilons.append(end)
# return end
# def __str__(self) -> str:
# return f"(({self.left})||({self.right}))"
# class RegexParser:
# # TODO: HANDLE ALTERNATION AND PRECEDENCE (CONCAT HAS HIGHEST PRECEDENCE)
# PREFIX: dict[str, typing.Callable[[str], RegexNode]]
# POSTFIX: dict[str, typing.Callable[[RegexNode, int], RegexNode]]
# BINDING: dict[str, tuple[int, int]]
# index: int
# pattern: str
# def __init__(self, pattern: str):
# self.PREFIX = {
# "(": self.parse_group,
# "[": self.parse_set,
# }
# self.POSTFIX = {
# "+": self.parse_plus,
# "*": self.parse_star,
# "?": self.parse_question,
# "|": self.parse_alternation,
# }
# self.BINDING = {
# "|": (1, 1),
# "+": (2, 2),
# "*": (2, 2),
# "?": (2, 2),
# ")": (-1, -1), # Always stop parsing on )
# }
# self.index = 0
# self.pattern = pattern
# def consume(self) -> str:
# if self.index >= len(self.pattern):
# raise ValueError(f"Unable to parse regular expression '{self.pattern}'")
# result = self.pattern[self.index]
# self.index += 1
# return result
# def peek(self) -> str | None:
# if self.index >= len(self.pattern):
# return None
# return self.pattern[self.index]
# def eof(self) -> bool:
# return self.index >= len(self.pattern)
# def expect(self, ch: str):
# actual = self.consume()
# if ch != actual:
# raise ValueError(f"Expected '{ch}'")
# def parse_regex(self, minimum_binding=0) -> RegexNode:
# ch = self.consume()
# parser = self.PREFIX.get(ch, self.parse_single)
# node = parser(ch)
# while not self.eof():
# ch = self.peek()
# assert ch is not None
# lp, rp = self.BINDING.get(ch, (minimum_binding, minimum_binding))
# if lp < minimum_binding:
# break
# parser = self.POSTFIX.get(ch, self.parse_concat)
# node = parser(node, rp)
# return node
# def parse_single(self, ch: str) -> RegexNode:
# return RegexLiteral(values=[(ch, ch)])
# def parse_group(self, ch: str) -> RegexNode:
# del ch
# node = self.parse_regex()
# self.expect(")")
# return node
# def parse_set(self, ch: str) -> RegexNode:
# del ch
# # TODO: INVERSION?
# ranges = []
# while self.peek() not in (None, "]"):
# start = self.consume()
# if self.peek() == "-":
# self.consume()
# end = self.consume()
# else:
# end = start
# ranges.append((start, end))
# self.expect("]")
# return RegexLiteral(values=ranges)
# def parse_alternation(self, node: RegexNode, rp: int) -> RegexNode:
# return RegexAlternation(left=node, right=self.parse_regex(rp))
# def parse_plus(self, left: RegexNode, rp: int) -> RegexNode:
# del rp
# self.expect("+")
# return RegexPlus(child=left)
# def parse_star(self, left: RegexNode, rp: int) -> RegexNode:
# del rp
# self.expect("*")
# return RegexStar(child=left)
# def parse_question(self, left: RegexNode, rp: int) -> RegexNode:
# del rp
# self.expect("?")
# return RegexQuestion(child=left)
# def parse_concat(self, left: RegexNode, rp: int) -> RegexNode:
# return RegexSequence(left, self.parse_regex(rp))
# class SuperState:
# states: frozenset[State]
# index: int
# def __init__(self, states: typing.Iterable[State]):
# # Close over the given states, including every state that is
# # reachable by epsilon-transition.
# stack = list(states)
# result = set()
# while len(stack) > 0:
# st = stack.pop()
# if st in result:
# continue
# result.add(st)
# stack.extend(st.epsilons)
# self.states = frozenset(result)
# self.index = -1
# def __eq__(self, other):
# if not isinstance(other, SuperState):
# return False
# return self.states == other.states
# def __hash__(self) -> int:
# return hash(self.states)
# def edges(self) -> list[tuple[Span, "SuperState"]]:
# working: EdgeList[list[State]] = EdgeList()
# for st in self.states:
# for span, targets in st.edges():
# working.add_edge(span, targets)
# # EdgeList maps span to list[list[State]] which we want to flatten.
# result = []
# for span, stateses in working:
# s: list[State] = []
# for states in stateses:
# s.extend(states)
# result.append((span, SuperState(s)))
# return result
# def accept_terminal(self) -> Terminal | None:
# accept = None
# for st in self.states:
# for ac in st.accept:
# if accept is None:
# accept = ac
# elif accept.value != ac.value:
# if accept.regex and not ac.regex:
# accept = ac
# elif ac.regex and not accept.regex:
# pass
# else:
# raise ValueError(
# f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')"
# )
# return accept
# # Parse the terminals all together into a big NFA rooted at `NFA`.
# NFA = State()
# for token in x.terminals:
# start = State()
# NFA.epsilons.append(start)
# if token.regex:
# node = RegexParser(token.pattern).parse_regex()
# print(f" Parsed {token.pattern} to {node}")
# ending = node.to_nfa(start)
# else:
# ending = start
# for c in token.pattern:
# ending = ending.add_edge(Span.from_str(c), State())
# ending.accept.append(token)
# NFA.dump_graph()
# # Convert the NFA into a DFA in the most straightforward way (by tracking
# # sets of state closures, called SuperStates.)
# DFA: dict[SuperState, list[tuple[Span, SuperState]]] = {}
# stack = [SuperState([NFA])]
# while len(stack) > 0:
# ss = stack.pop()
# if ss in DFA:
# continue
# edges = ss.edges()
# DFA[ss] = edges
# for _, target in edges:
# stack.append(target)
# for i, k in enumerate(DFA):
# k.index = i
# return [
# (
# ss.accept_terminal(),
# [(k, v.index) for k, v in edges],
# )
# for ss, edges in DFA.items()
# ]
# def dump_lexer_table(table: LexerTable):
# with open("lexer.dot", "w", encoding="utf-8") as f:
# f.write("digraph G {\n")
# for index, (accept, edges) in enumerate(table):
# label = accept.value if accept is not None else ""
# f.write(f' {index} [label="{label}"];\n')
# for span, target in edges:
# label = str(span).replace('"', '\\"')
# f.write(f' {index} -> {target} [label="{label}"];\n')
# pass
# f.write("}\n")
# def generic_tokenize(src: str, table: LexerTable):
# pos = 0
# state = 0
# start = 0
# last_accept = None
# last_accept_pos = 0
# while pos < len(src):
# accept, edges = table[state]
# if accept is not None:
# last_accept = accept
# last_accept_pos = pos + 1
# char = ord(src[pos])
# # Find the index of the span where the upper value is the tightest
# # bound on the character.
# index = bisect.bisect_left(edges, char, key=lambda x: x[0].upper)
# # If the character is greater than or equal to the lower bound we
# # found then we have a hit, otherwise no.
# state = edges[index][1] if index < len(edges) and char >= edges[index][0].lower else None
# if state is None:
# if last_accept is None:
# raise Exception(f"Token error at {pos}")
# yield (last_accept, start, last_accept_pos - start)
# last_accept = None
# pos = last_accept_pos
# start = pos
# state = 0
# else:
# pos += 1
def test_span_intersection():
    """Overlapping spans must report intersection, symmetrically."""
    # Each case pairs (1, 3) with a span that overlaps it in a different
    # way: overhanging the right, touching the right edge, touching the
    # left edge, overhanging the left, and fully containing it.
    overlapping = [
        ((1, 3), (2, 4)),
        ((1, 3), (2, 3)),
        ((1, 3), (1, 2)),
        ((1, 3), (0, 2)),
        ((1, 3), (0, 4)),
    ]
    for first_bounds, second_bounds in overlapping:
        first, second = Span(*first_bounds), Span(*second_bounds)
        # Intersection must not depend on argument order.
        assert first.intersects(second)
        assert second.intersects(first)