Compare commits
2 commits
f6bc2ccea8
...
72052645d6
| Author | SHA1 | Date | |
|---|---|---|---|
| 72052645d6 | |||
| 58c3004702 |
7 changed files with 1334 additions and 271 deletions
396
grammar.py
396
grammar.py
|
|
@ -2,57 +2,17 @@
|
|||
import re
|
||||
import typing
|
||||
|
||||
import parser
|
||||
from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule
|
||||
|
||||
ARROW = Terminal("Arrow")
|
||||
AS = Terminal("As")
|
||||
BAR = Terminal("Bar")
|
||||
CLASS = Terminal("Class")
|
||||
COLON = Terminal("Colon")
|
||||
ELSE = Terminal("Else")
|
||||
FOR = Terminal("For")
|
||||
FUN = Terminal("Fun")
|
||||
IDENTIFIER = Terminal("Identifier")
|
||||
IF = Terminal("If")
|
||||
IMPORT = Terminal("Import")
|
||||
IN = Terminal("In")
|
||||
LCURLY = Terminal("LeftBrace")
|
||||
LET = Terminal("Let")
|
||||
RCURLY = Terminal("RightBrace")
|
||||
RETURN = Terminal("Return")
|
||||
SEMICOLON = Terminal("Semicolon")
|
||||
STRING = Terminal("String")
|
||||
WHILE = Terminal("While")
|
||||
EQUAL = Terminal("Equal")
|
||||
LPAREN = Terminal("LeftParen")
|
||||
RPAREN = Terminal("RightParen")
|
||||
COMMA = Terminal("Comma")
|
||||
SELF = Terminal("Selff")
|
||||
OR = Terminal("Or")
|
||||
IS = Terminal("Is")
|
||||
AND = Terminal("And")
|
||||
EQUALEQUAL = Terminal("EqualEqual")
|
||||
BANGEQUAL = Terminal("BangEqual")
|
||||
LESS = Terminal("Less")
|
||||
GREATER = Terminal("Greater")
|
||||
LESSEQUAL = Terminal("LessEqual")
|
||||
GREATEREQUAL = Terminal("GreaterEqual")
|
||||
PLUS = Terminal("Plus")
|
||||
MINUS = Terminal("Minus")
|
||||
STAR = Terminal("Star")
|
||||
SLASH = Terminal("Slash")
|
||||
NUMBER = Terminal("Number")
|
||||
TRUE = Terminal("True")
|
||||
FALSE = Terminal("False")
|
||||
BANG = Terminal("Bang")
|
||||
DOT = Terminal("Dot")
|
||||
MATCH = Terminal("Match")
|
||||
EXPORT = Terminal("Export")
|
||||
UNDERSCORE = Terminal("Underscore")
|
||||
NEW = Terminal("New")
|
||||
LSQUARE = Terminal("LeftBracket")
|
||||
RSQUARE = Terminal("RightBracket")
|
||||
from parser import (
|
||||
Assoc,
|
||||
Grammar,
|
||||
Nothing,
|
||||
rule,
|
||||
seq,
|
||||
Rule,
|
||||
Terminal,
|
||||
Re,
|
||||
)
|
||||
from parser.parser import compile_lexer, dump_lexer_table
|
||||
|
||||
|
||||
class FineGrammar(Grammar):
|
||||
|
|
@ -62,17 +22,17 @@ class FineGrammar(Grammar):
|
|||
def __init__(self):
|
||||
super().__init__(
|
||||
precedence=[
|
||||
(Assoc.RIGHT, [EQUAL]),
|
||||
(Assoc.LEFT, [OR]),
|
||||
(Assoc.LEFT, [IS]),
|
||||
(Assoc.LEFT, [AND]),
|
||||
(Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]),
|
||||
(Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]),
|
||||
(Assoc.LEFT, [PLUS, MINUS]),
|
||||
(Assoc.LEFT, [STAR, SLASH]),
|
||||
(Assoc.RIGHT, [self.EQUAL]),
|
||||
(Assoc.LEFT, [self.OR]),
|
||||
(Assoc.LEFT, [self.IS]),
|
||||
(Assoc.LEFT, [self.AND]),
|
||||
(Assoc.LEFT, [self.EQUALEQUAL, self.BANGEQUAL]),
|
||||
(Assoc.LEFT, [self.LESS, self.GREATER, self.GREATEREQUAL, self.LESSEQUAL]),
|
||||
(Assoc.LEFT, [self.PLUS, self.MINUS]),
|
||||
(Assoc.LEFT, [self.STAR, self.SLASH]),
|
||||
(Assoc.LEFT, [self.primary_expression]),
|
||||
(Assoc.LEFT, [LPAREN]),
|
||||
(Assoc.LEFT, [DOT]),
|
||||
(Assoc.LEFT, [self.LPAREN]),
|
||||
(Assoc.LEFT, [self.DOT]),
|
||||
#
|
||||
# If there's a confusion about whether to make an IF
|
||||
# statement or an expression, prefer the statement.
|
||||
|
|
@ -97,15 +57,15 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule
|
||||
def import_statement(self) -> Rule:
|
||||
return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON)
|
||||
return seq(self.IMPORT, self.STRING, self.AS, self.IDENTIFIER, self.SEMICOLON)
|
||||
|
||||
@rule("ClassDeclaration")
|
||||
def class_declaration(self) -> Rule:
|
||||
return seq(CLASS, IDENTIFIER, self._class_body)
|
||||
return seq(self.CLASS, self.IDENTIFIER, self._class_body)
|
||||
|
||||
@rule
|
||||
def _class_body(self) -> Rule:
|
||||
return seq(LCURLY, RCURLY) | seq(LCURLY, self._class_members, RCURLY)
|
||||
return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._class_members, self.RCURLY)
|
||||
|
||||
@rule
|
||||
def _class_members(self) -> Rule:
|
||||
|
|
@ -117,7 +77,7 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule("FieldDecl")
|
||||
def field_declaration(self) -> Rule:
|
||||
return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON)
|
||||
return seq(self.IDENTIFIER, self.COLON, self.type_expression, self.SEMICOLON)
|
||||
|
||||
# Types
|
||||
@rule("TypeExpression")
|
||||
|
|
@ -126,60 +86,65 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule("AlternateType")
|
||||
def alternate_type(self) -> Rule:
|
||||
return seq(self.type_expression, OR, self.type_identifier)
|
||||
return seq(self.type_expression, self.OR, self.type_identifier)
|
||||
|
||||
@rule("TypeIdentifier")
|
||||
def type_identifier(self) -> Rule:
|
||||
return IDENTIFIER
|
||||
return self.IDENTIFIER
|
||||
|
||||
@rule
|
||||
def export_statement(self) -> Rule:
|
||||
return (
|
||||
seq(EXPORT, self.class_declaration)
|
||||
| seq(EXPORT, self.function_declaration)
|
||||
| seq(EXPORT, self.let_statement)
|
||||
| seq(EXPORT, self.export_list, SEMICOLON)
|
||||
seq(self.EXPORT, self.class_declaration)
|
||||
| seq(self.EXPORT, self.function_declaration)
|
||||
| seq(self.EXPORT, self.let_statement)
|
||||
| seq(self.EXPORT, self.export_list, self.SEMICOLON)
|
||||
)
|
||||
|
||||
@rule
|
||||
def export_list(self) -> Rule:
|
||||
return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list)
|
||||
return Nothing | self.IDENTIFIER | seq(self.IDENTIFIER, self.COMMA, self.export_list)
|
||||
|
||||
# Functions
|
||||
@rule("FunctionDecl")
|
||||
def function_declaration(self) -> Rule:
|
||||
return seq(FUN, IDENTIFIER, self.function_parameters, self.block) | seq(
|
||||
FUN, IDENTIFIER, self.function_parameters, ARROW, self.type_expression, self.block
|
||||
return seq(self.FUN, self.IDENTIFIER, self.function_parameters, self.block) | seq(
|
||||
self.FUN,
|
||||
self.IDENTIFIER,
|
||||
self.function_parameters,
|
||||
self.ARROW,
|
||||
self.type_expression,
|
||||
self.block,
|
||||
)
|
||||
|
||||
@rule("ParamList")
|
||||
def function_parameters(self) -> Rule:
|
||||
return (
|
||||
seq(LPAREN, RPAREN)
|
||||
| seq(LPAREN, self._first_parameter, RPAREN)
|
||||
| seq(LPAREN, self._first_parameter, COMMA, self._parameter_list, RPAREN)
|
||||
seq(self.LPAREN, self.RPAREN)
|
||||
| seq(self.LPAREN, self._first_parameter, self.RPAREN)
|
||||
| seq(self.LPAREN, self._first_parameter, self.COMMA, self._parameter_list, self.RPAREN)
|
||||
)
|
||||
|
||||
@rule
|
||||
def _first_parameter(self) -> Rule:
|
||||
return SELF | self.parameter
|
||||
return self.SELF | self.parameter
|
||||
|
||||
@rule
|
||||
def _parameter_list(self) -> Rule:
|
||||
return Nothing | self.parameter | seq(self.parameter, COMMA, self._parameter_list)
|
||||
return Nothing | self.parameter | seq(self.parameter, self.COMMA, self._parameter_list)
|
||||
|
||||
@rule("Parameter")
|
||||
def parameter(self) -> Rule:
|
||||
return seq(IDENTIFIER, COLON, self.type_expression)
|
||||
return seq(self.IDENTIFIER, self.COLON, self.type_expression)
|
||||
|
||||
# Block
|
||||
@rule("Block")
|
||||
def block(self) -> Rule:
|
||||
return (
|
||||
seq(LCURLY, RCURLY)
|
||||
| seq(LCURLY, self.expression, RCURLY)
|
||||
| seq(LCURLY, self._statement_list, RCURLY)
|
||||
| seq(LCURLY, self._statement_list, self.expression, RCURLY)
|
||||
seq(self.LCURLY, self.RCURLY)
|
||||
| seq(self.LCURLY, self.expression, self.RCURLY)
|
||||
| seq(self.LCURLY, self._statement_list, self.RCURLY)
|
||||
| seq(self.LCURLY, self._statement_list, self.expression, self.RCURLY)
|
||||
)
|
||||
|
||||
@rule
|
||||
|
|
@ -200,19 +165,19 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule("LetStatement")
|
||||
def let_statement(self) -> Rule:
|
||||
return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON)
|
||||
return seq(self.LET, self.IDENTIFIER, self.EQUAL, self.expression, self.SEMICOLON)
|
||||
|
||||
@rule("ReturnStatement")
|
||||
def return_statement(self) -> Rule:
|
||||
return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON)
|
||||
return seq(self.RETURN, self.expression, self.SEMICOLON) | seq(self.RETURN, self.SEMICOLON)
|
||||
|
||||
@rule("ForStatement")
|
||||
def for_statement(self) -> Rule:
|
||||
return seq(FOR, self.iterator_variable, IN, self.expression, self.block)
|
||||
return seq(self.FOR, self.iterator_variable, self.IN, self.expression, self.block)
|
||||
|
||||
@rule("IteratorVariable")
|
||||
def iterator_variable(self) -> Rule:
|
||||
return IDENTIFIER
|
||||
return self.IDENTIFIER
|
||||
|
||||
@rule("IfStatement")
|
||||
def if_statement(self) -> Rule:
|
||||
|
|
@ -220,11 +185,11 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule
|
||||
def while_statement(self) -> Rule:
|
||||
return seq(WHILE, self.expression, self.block)
|
||||
return seq(self.WHILE, self.expression, self.block)
|
||||
|
||||
@rule
|
||||
def expression_statement(self) -> Rule:
|
||||
return seq(self.expression, SEMICOLON)
|
||||
return seq(self.expression, self.SEMICOLON)
|
||||
|
||||
# Expressions
|
||||
@rule(transparent=True)
|
||||
|
|
@ -234,91 +199,93 @@ class FineGrammar(Grammar):
|
|||
@rule("BinaryExpression")
|
||||
def binary_expression(self) -> Rule:
|
||||
return (
|
||||
seq(self.expression, EQUAL, self.expression)
|
||||
| seq(self.expression, OR, self.expression)
|
||||
| seq(self.expression, AND, self.expression)
|
||||
| seq(self.expression, EQUALEQUAL, self.expression)
|
||||
| seq(self.expression, BANGEQUAL, self.expression)
|
||||
| seq(self.expression, LESS, self.expression)
|
||||
| seq(self.expression, LESSEQUAL, self.expression)
|
||||
| seq(self.expression, GREATER, self.expression)
|
||||
| seq(self.expression, GREATEREQUAL, self.expression)
|
||||
| seq(self.expression, PLUS, self.expression)
|
||||
| seq(self.expression, MINUS, self.expression)
|
||||
| seq(self.expression, STAR, self.expression)
|
||||
| seq(self.expression, SLASH, self.expression)
|
||||
seq(self.expression, self.EQUAL, self.expression)
|
||||
| seq(self.expression, self.OR, self.expression)
|
||||
| seq(self.expression, self.AND, self.expression)
|
||||
| seq(self.expression, self.EQUALEQUAL, self.expression)
|
||||
| seq(self.expression, self.BANGEQUAL, self.expression)
|
||||
| seq(self.expression, self.LESS, self.expression)
|
||||
| seq(self.expression, self.LESSEQUAL, self.expression)
|
||||
| seq(self.expression, self.GREATER, self.expression)
|
||||
| seq(self.expression, self.GREATEREQUAL, self.expression)
|
||||
| seq(self.expression, self.PLUS, self.expression)
|
||||
| seq(self.expression, self.MINUS, self.expression)
|
||||
| seq(self.expression, self.STAR, self.expression)
|
||||
| seq(self.expression, self.SLASH, self.expression)
|
||||
)
|
||||
|
||||
@rule("IsExpression")
|
||||
def is_expression(self) -> Rule:
|
||||
return seq(self.expression, IS, self.pattern)
|
||||
return seq(self.expression, self.IS, self.pattern)
|
||||
|
||||
@rule
|
||||
def primary_expression(self) -> Rule:
|
||||
return (
|
||||
self.identifier_expression
|
||||
| self.literal_expression
|
||||
| SELF
|
||||
| seq(BANG, self.primary_expression)
|
||||
| seq(MINUS, self.primary_expression)
|
||||
| self.SELF
|
||||
| seq(self.BANG, self.primary_expression)
|
||||
| seq(self.MINUS, self.primary_expression)
|
||||
| self.block
|
||||
| self.conditional_expression
|
||||
| self.list_constructor_expression
|
||||
| self.object_constructor_expression
|
||||
| self.match_expression
|
||||
| seq(self.primary_expression, LPAREN, RPAREN)
|
||||
| seq(self.primary_expression, LPAREN, self._expression_list, RPAREN)
|
||||
| seq(self.primary_expression, DOT, IDENTIFIER)
|
||||
| seq(LPAREN, self.expression, RPAREN)
|
||||
| seq(self.primary_expression, self.LPAREN, self.RPAREN)
|
||||
| seq(self.primary_expression, self.LPAREN, self._expression_list, self.RPAREN)
|
||||
| seq(self.primary_expression, self.DOT, self.IDENTIFIER)
|
||||
| seq(self.LPAREN, self.expression, self.RPAREN)
|
||||
)
|
||||
|
||||
@rule("IdentifierExpression")
|
||||
def identifier_expression(self):
|
||||
return IDENTIFIER
|
||||
return self.IDENTIFIER
|
||||
|
||||
@rule("Literal")
|
||||
def literal_expression(self):
|
||||
return NUMBER | STRING | TRUE | FALSE
|
||||
return self.NUMBER | self.STRING | self.TRUE | self.FALSE
|
||||
|
||||
@rule("ConditionalExpression")
|
||||
def conditional_expression(self) -> Rule:
|
||||
return (
|
||||
seq(IF, self.expression, self.block)
|
||||
| seq(IF, self.expression, self.block, ELSE, self.conditional_expression)
|
||||
| seq(IF, self.expression, self.block, ELSE, self.block)
|
||||
seq(self.IF, self.expression, self.block)
|
||||
| seq(self.IF, self.expression, self.block, self.ELSE, self.conditional_expression)
|
||||
| seq(self.IF, self.expression, self.block, self.ELSE, self.block)
|
||||
)
|
||||
|
||||
@rule
|
||||
def list_constructor_expression(self) -> Rule:
|
||||
return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self._expression_list, RSQUARE)
|
||||
return seq(self.LSQUARE, self.RSQUARE) | seq(
|
||||
self.LSQUARE, self._expression_list, self.RSQUARE
|
||||
)
|
||||
|
||||
@rule
|
||||
def _expression_list(self) -> Rule:
|
||||
return (
|
||||
self.expression
|
||||
| seq(self.expression, COMMA)
|
||||
| seq(self.expression, COMMA, self._expression_list)
|
||||
| seq(self.expression, self.COMMA)
|
||||
| seq(self.expression, self.COMMA, self._expression_list)
|
||||
)
|
||||
|
||||
@rule
|
||||
def match_expression(self) -> Rule:
|
||||
return seq(MATCH, self.expression, self.match_body)
|
||||
return seq(self.MATCH, self.expression, self.match_body)
|
||||
|
||||
@rule("MatchBody")
|
||||
def match_body(self) -> Rule:
|
||||
return seq(LCURLY, RCURLY) | seq(LCURLY, self._match_arms, RCURLY)
|
||||
return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._match_arms, self.RCURLY)
|
||||
|
||||
@rule
|
||||
def _match_arms(self) -> Rule:
|
||||
return (
|
||||
self.match_arm
|
||||
| seq(self.match_arm, COMMA)
|
||||
| seq(self.match_arm, COMMA, self._match_arms)
|
||||
| seq(self.match_arm, self.COMMA)
|
||||
| seq(self.match_arm, self.COMMA, self._match_arms)
|
||||
)
|
||||
|
||||
@rule("MatchArm")
|
||||
def match_arm(self) -> Rule:
|
||||
return seq(self.pattern, ARROW, self.expression)
|
||||
return seq(self.pattern, self.ARROW, self.expression)
|
||||
|
||||
@rule("Pattern")
|
||||
def pattern(self) -> Rule:
|
||||
|
|
@ -330,7 +297,7 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule
|
||||
def _pattern_predicate(self) -> Rule:
|
||||
return seq(AND, self.expression)
|
||||
return seq(self.AND, self.expression)
|
||||
|
||||
@rule
|
||||
def _pattern_core(self) -> Rule:
|
||||
|
|
@ -338,60 +305,120 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule("WildcardPattern")
|
||||
def wildcard_pattern(self) -> Rule:
|
||||
return UNDERSCORE
|
||||
return self.UNDERSCORE
|
||||
|
||||
@rule("VariableBinding")
|
||||
def variable_binding(self) -> Rule:
|
||||
return seq(IDENTIFIER, COLON)
|
||||
return seq(self.IDENTIFIER, self.COLON)
|
||||
|
||||
@rule
|
||||
def object_constructor_expression(self) -> Rule:
|
||||
return seq(NEW, self.type_identifier, self.field_list)
|
||||
return seq(self.NEW, self.type_identifier, self.field_list)
|
||||
|
||||
@rule
|
||||
def field_list(self) -> Rule:
|
||||
return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY)
|
||||
return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self.field_values, self.RCURLY)
|
||||
|
||||
@rule
|
||||
def field_values(self) -> Rule:
|
||||
return (
|
||||
self.field_value
|
||||
| seq(self.field_value, COMMA)
|
||||
| seq(self.field_value, COMMA, self.field_values)
|
||||
| seq(self.field_value, self.COMMA)
|
||||
| seq(self.field_value, self.COMMA, self.field_values)
|
||||
)
|
||||
|
||||
@rule
|
||||
def field_value(self) -> Rule:
|
||||
return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression)
|
||||
return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression)
|
||||
|
||||
BLANK = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
|
||||
|
||||
ARROW = Terminal("->")
|
||||
AS = Terminal("as")
|
||||
BAR = Terminal("bar")
|
||||
CLASS = Terminal("class")
|
||||
COLON = Terminal("colon")
|
||||
COMMENT = Terminal("comment")
|
||||
ELSE = Terminal("else")
|
||||
FOR = Terminal("for")
|
||||
FUN = Terminal("fun")
|
||||
IDENTIFIER = Terminal(
|
||||
Re.seq(
|
||||
Re.set(("a", "z"), ("A", "Z"), "_"),
|
||||
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
|
||||
)
|
||||
)
|
||||
IF = Terminal("if")
|
||||
IMPORT = Terminal("import")
|
||||
IN = Terminal("in")
|
||||
LCURLY = Terminal("{")
|
||||
LET = Terminal("Let")
|
||||
RCURLY = Terminal("}")
|
||||
RETURN = Terminal("return")
|
||||
SEMICOLON = Terminal(";")
|
||||
STRING = Terminal('""') # TODO
|
||||
WHILE = Terminal("while")
|
||||
EQUAL = Terminal("=")
|
||||
LPAREN = Terminal("(")
|
||||
RPAREN = Terminal(")")
|
||||
COMMA = Terminal(",")
|
||||
SELF = Terminal("self", name="SELFF")
|
||||
OR = Terminal("or")
|
||||
IS = Terminal("is")
|
||||
AND = Terminal("and")
|
||||
EQUALEQUAL = Terminal("==")
|
||||
BANGEQUAL = Terminal("!=")
|
||||
LESS = Terminal("<")
|
||||
GREATER = Terminal(">")
|
||||
LESSEQUAL = Terminal("<=")
|
||||
GREATEREQUAL = Terminal(">=")
|
||||
PLUS = Terminal("+")
|
||||
MINUS = Terminal("-")
|
||||
STAR = Terminal("*")
|
||||
SLASH = Terminal("/")
|
||||
NUMBER = Terminal(Re.set(("0", "9")).plus())
|
||||
TRUE = Terminal("true")
|
||||
FALSE = Terminal("false")
|
||||
BANG = Terminal("!")
|
||||
DOT = Terminal(".")
|
||||
MATCH = Terminal("match")
|
||||
EXPORT = Terminal("export")
|
||||
UNDERSCORE = Terminal("_")
|
||||
NEW = Terminal("new")
|
||||
LSQUARE = Terminal("[")
|
||||
RSQUARE = Terminal("]")
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# DORKY LEXER
|
||||
# -----------------------------------------------------------------------------
|
||||
import bisect
|
||||
|
||||
|
||||
NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?")
|
||||
IDENTIFIER_RE = re.compile("[_A-Za-z][_A-Za-z0-9]*")
|
||||
KEYWORD_TABLE = {
|
||||
"_": UNDERSCORE,
|
||||
"and": AND,
|
||||
"as": AS,
|
||||
"class": CLASS,
|
||||
"else": ELSE,
|
||||
"export": EXPORT,
|
||||
"false": FALSE,
|
||||
"for": FOR,
|
||||
"fun": FUN,
|
||||
"if": IF,
|
||||
"import": IMPORT,
|
||||
"in": IN,
|
||||
"is": IS,
|
||||
"let": LET,
|
||||
"match": MATCH,
|
||||
"new": NEW,
|
||||
"or": OR,
|
||||
"return": RETURN,
|
||||
"self": SELF,
|
||||
"true": TRUE,
|
||||
"while": WHILE,
|
||||
"_": FineGrammar.UNDERSCORE,
|
||||
"and": FineGrammar.AND,
|
||||
"as": FineGrammar.AS,
|
||||
"class": FineGrammar.CLASS,
|
||||
"else": FineGrammar.ELSE,
|
||||
"export": FineGrammar.EXPORT,
|
||||
"false": FineGrammar.FALSE,
|
||||
"for": FineGrammar.FOR,
|
||||
"fun": FineGrammar.FUN,
|
||||
"if": FineGrammar.IF,
|
||||
"import": FineGrammar.IMPORT,
|
||||
"in": FineGrammar.IN,
|
||||
"is": FineGrammar.IS,
|
||||
"let": FineGrammar.LET,
|
||||
"match": FineGrammar.MATCH,
|
||||
"new": FineGrammar.NEW,
|
||||
"or": FineGrammar.OR,
|
||||
"return": FineGrammar.RETURN,
|
||||
"self": FineGrammar.SELF,
|
||||
"true": FineGrammar.TRUE,
|
||||
"while": FineGrammar.WHILE,
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -406,63 +433,63 @@ def tokenize(src: str):
|
|||
token = None
|
||||
if ch == "-":
|
||||
if src[pos : pos + 2] == "->":
|
||||
token = (ARROW, pos, 2)
|
||||
token = (FineGrammar.ARROW, pos, 2)
|
||||
else:
|
||||
token = (MINUS, pos, 1)
|
||||
token = (FineGrammar.MINUS, pos, 1)
|
||||
|
||||
elif ch == "|":
|
||||
token = (BAR, pos, 1)
|
||||
token = (FineGrammar.BAR, pos, 1)
|
||||
|
||||
elif ch == ":":
|
||||
token = (COLON, pos, 1)
|
||||
token = (FineGrammar.COLON, pos, 1)
|
||||
|
||||
elif ch == "{":
|
||||
token = (LCURLY, pos, 1)
|
||||
token = (FineGrammar.LCURLY, pos, 1)
|
||||
|
||||
elif ch == "}":
|
||||
token = (RCURLY, pos, 1)
|
||||
token = (FineGrammar.RCURLY, pos, 1)
|
||||
|
||||
elif ch == ";":
|
||||
token = (SEMICOLON, pos, 1)
|
||||
token = (FineGrammar.SEMICOLON, pos, 1)
|
||||
|
||||
elif ch == "=":
|
||||
if src[pos : pos + 2] == "==":
|
||||
token = (EQUALEQUAL, pos, 2)
|
||||
token = (FineGrammar.EQUALEQUAL, pos, 2)
|
||||
else:
|
||||
token = (EQUAL, pos, 1)
|
||||
token = (FineGrammar.EQUAL, pos, 1)
|
||||
|
||||
elif ch == "(":
|
||||
token = (LPAREN, pos, 1)
|
||||
token = (FineGrammar.LPAREN, pos, 1)
|
||||
|
||||
elif ch == ")":
|
||||
token = (RPAREN, pos, 1)
|
||||
token = (FineGrammar.RPAREN, pos, 1)
|
||||
|
||||
elif ch == ",":
|
||||
token = (COMMA, pos, 1)
|
||||
token = (FineGrammar.COMMA, pos, 1)
|
||||
|
||||
elif ch == "!":
|
||||
if src[pos : pos + 2] == "!=":
|
||||
token = (BANGEQUAL, pos, 2)
|
||||
token = (FineGrammar.BANGEQUAL, pos, 2)
|
||||
else:
|
||||
token = (BANG, pos, 1)
|
||||
token = (FineGrammar.BANG, pos, 1)
|
||||
|
||||
elif ch == "<":
|
||||
if src[pos : pos + 2] == "<=":
|
||||
token = (LESSEQUAL, pos, 2)
|
||||
token = (FineGrammar.LESSEQUAL, pos, 2)
|
||||
else:
|
||||
token = (LESS, pos, 1)
|
||||
token = (FineGrammar.LESS, pos, 1)
|
||||
|
||||
elif ch == ">":
|
||||
if src[pos : pos + 2] == ">=":
|
||||
token = (GREATEREQUAL, pos, 2)
|
||||
token = (FineGrammar.GREATEREQUAL, pos, 2)
|
||||
else:
|
||||
token = (GREATER, pos, 1)
|
||||
token = (FineGrammar.GREATER, pos, 1)
|
||||
|
||||
elif ch == "+":
|
||||
token = (PLUS, pos, 1)
|
||||
token = (FineGrammar.PLUS, pos, 1)
|
||||
|
||||
elif ch == "*":
|
||||
token = (STAR, pos, 1)
|
||||
token = (FineGrammar.STAR, pos, 1)
|
||||
|
||||
elif ch == "/":
|
||||
if src[pos : pos + 2] == "//":
|
||||
|
|
@ -470,16 +497,16 @@ def tokenize(src: str):
|
|||
pos = pos + 1
|
||||
continue
|
||||
|
||||
token = (SLASH, pos, 1)
|
||||
token = (FineGrammar.SLASH, pos, 1)
|
||||
|
||||
elif ch == ".":
|
||||
token = (DOT, pos, 1)
|
||||
token = (FineGrammar.DOT, pos, 1)
|
||||
|
||||
elif ch == "[":
|
||||
token = (LSQUARE, pos, 1)
|
||||
token = (FineGrammar.LSQUARE, pos, 1)
|
||||
|
||||
elif ch == "]":
|
||||
token = (RSQUARE, pos, 1)
|
||||
token = (FineGrammar.RSQUARE, pos, 1)
|
||||
|
||||
elif ch == '"' or ch == "'":
|
||||
end = pos + 1
|
||||
|
|
@ -490,12 +517,12 @@ def tokenize(src: str):
|
|||
if end == len(src):
|
||||
raise Exception(f"Unterminated string constant at {pos}")
|
||||
end += 1
|
||||
token = (STRING, pos, end - pos)
|
||||
token = (FineGrammar.STRING, pos, end - pos)
|
||||
|
||||
else:
|
||||
number_match = NUMBER_RE.match(src, pos)
|
||||
if number_match:
|
||||
token = (NUMBER, pos, number_match.end() - pos)
|
||||
token = (FineGrammar.NUMBER, pos, number_match.end() - pos)
|
||||
else:
|
||||
id_match = IDENTIFIER_RE.match(src, pos)
|
||||
if id_match:
|
||||
|
|
@ -504,7 +531,7 @@ def tokenize(src: str):
|
|||
if keyword:
|
||||
token = (keyword, pos, len(fragment))
|
||||
else:
|
||||
token = (IDENTIFIER, pos, len(fragment))
|
||||
token = (FineGrammar.IDENTIFIER, pos, len(fragment))
|
||||
|
||||
if token is None:
|
||||
raise Exception("Token error")
|
||||
|
|
@ -512,9 +539,6 @@ def tokenize(src: str):
|
|||
pos += token[2]
|
||||
|
||||
|
||||
import bisect
|
||||
|
||||
|
||||
class FineTokens:
|
||||
def __init__(self, src: str):
|
||||
self.src = src
|
||||
|
|
@ -546,4 +570,8 @@ class FineTokens:
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
FineGrammar().build_table()
|
||||
grammar = FineGrammar()
|
||||
grammar.build_table()
|
||||
|
||||
lexer = compile_lexer(grammar)
|
||||
dump_lexer_table(lexer)
|
||||
|
|
|
|||
578
parser/parser.py
578
parser/parser.py
|
|
@ -21,19 +21,20 @@ To get started, create a grammar that derives from the `Grammar` class. Create
|
|||
one method per nonterminal, decorated with the `rule` decorator. Here's an
|
||||
example:
|
||||
|
||||
PLUS = Terminal('+')
|
||||
LPAREN = Terminal('(')
|
||||
RPAREN = Terminal(')')
|
||||
ID = Terminal('id')
|
||||
|
||||
class SimpleGrammar(Grammar):
|
||||
@rule
|
||||
def expression(self):
|
||||
return seq(self.expression, PLUS, self.term) | self.term
|
||||
return seq(self.expression, self.PLUS, self.term) | self.term
|
||||
|
||||
@rule
|
||||
def term(self):
|
||||
return seq(LPAREN, self.expression, RPAREN) | ID
|
||||
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
|
||||
|
||||
PLUS = Terminal('+')
|
||||
LPAREN = Terminal('(')
|
||||
RPAREN = Terminal(')')
|
||||
ID = Terminal('id')
|
||||
|
||||
|
||||
## Using grammars
|
||||
|
|
@ -130,13 +131,13 @@ May 2024
|
|||
"""
|
||||
|
||||
import abc
|
||||
import bisect
|
||||
import collections
|
||||
import dataclasses
|
||||
import enum
|
||||
import functools
|
||||
import inspect
|
||||
import json
|
||||
import sys
|
||||
import typing
|
||||
|
||||
|
||||
|
|
@ -1605,15 +1606,20 @@ class Rule:
|
|||
class Terminal(Rule):
|
||||
"""A token, or terminal symbol in the grammar."""
|
||||
|
||||
value: str
|
||||
value: str | None
|
||||
pattern: "str | Re"
|
||||
|
||||
def __init__(self, value):
|
||||
self.value = sys.intern(value)
|
||||
def __init__(self, pattern, name=None):
|
||||
self.value = name
|
||||
self.pattern = pattern
|
||||
|
||||
def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]:
|
||||
# We are just ourselves when flattened.
|
||||
yield [self]
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return self.value or "???"
|
||||
|
||||
|
||||
class NonTerminal(Rule):
|
||||
"""A non-terminal, or a production, in the grammar.
|
||||
|
|
@ -1766,19 +1772,20 @@ class Grammar:
|
|||
|
||||
Here's an example of a simple grammar:
|
||||
|
||||
PLUS = Terminal('+')
|
||||
LPAREN = Terminal('(')
|
||||
RPAREN = Terminal(')')
|
||||
ID = Terminal('id')
|
||||
|
||||
class SimpleGrammar(Grammar):
|
||||
@rule
|
||||
def expression(self):
|
||||
return seq(self.expression, PLUS, self.term) | self.term
|
||||
return seq(self.expression, self.PLUS, self.term) | self.term
|
||||
|
||||
@rule
|
||||
def term(self):
|
||||
return seq(LPAREN, self.expression, RPAREN) | ID
|
||||
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
|
||||
|
||||
PLUS = Terminal('+')
|
||||
LPAREN = Terminal('(')
|
||||
RPAREN = Terminal(')')
|
||||
ID = Terminal('id')
|
||||
|
||||
|
||||
Not very exciting, perhaps, but it's something.
|
||||
"""
|
||||
|
|
@ -1786,6 +1793,7 @@ class Grammar:
|
|||
_precedence: dict[str, typing.Tuple[Assoc, int]]
|
||||
_start: str
|
||||
_generator: type[GenerateLR0]
|
||||
_terminals: list[Terminal]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
|
@ -1809,6 +1817,14 @@ class Grammar:
|
|||
generator = getattr(self, "generator", GenerateLALR)
|
||||
assert generator is not None
|
||||
|
||||
# Fixup terminal names with the name of the member that declared it.
|
||||
terminals = []
|
||||
for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)):
|
||||
if t.value is None:
|
||||
t.value = n
|
||||
terminals.append(t)
|
||||
|
||||
# Fix up the precedence table.
|
||||
precedence_table = {}
|
||||
for prec, (associativity, symbols) in enumerate(precedence):
|
||||
for symbol in symbols:
|
||||
|
|
@ -1824,6 +1840,11 @@ class Grammar:
|
|||
self._precedence = precedence_table
|
||||
self._start = start
|
||||
self._generator = generator
|
||||
self._terminals = terminals
|
||||
|
||||
@property
|
||||
def terminals(self) -> list[Terminal]:
|
||||
return self._terminals
|
||||
|
||||
def generate_nonterminal_dict(
|
||||
self, start: str | None = None
|
||||
|
|
@ -1911,3 +1932,526 @@ class Grammar:
|
|||
gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
|
||||
table = gen.gen_table()
|
||||
return table
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Lexer support
|
||||
###############################################################################
|
||||
# For machine-generated lexers
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True, slots=True)
|
||||
class Span:
|
||||
lower: int # inclusive
|
||||
upper: int # exclusive
|
||||
|
||||
@classmethod
|
||||
def from_str(cls, lower: str, upper: str | None = None) -> "Span":
|
||||
lo = ord(lower)
|
||||
if upper is None:
|
||||
hi = lo + 1
|
||||
else:
|
||||
hi = ord(upper) + 1
|
||||
|
||||
return Span(lower=lo, upper=hi)
|
||||
|
||||
def __len__(self) -> int:
|
||||
return self.upper - self.lower
|
||||
|
||||
def intersects(self, other: "Span") -> bool:
|
||||
"""Determine if this span intersects the other span."""
|
||||
return self.lower < other.upper and self.upper > other.lower
|
||||
|
||||
def split(self, other: "Span") -> tuple["Span|None", "Span|None", "Span|None"]:
|
||||
"""Split two possibly-intersecting spans into three regions: a low
|
||||
region, which covers just the lower part of the union, a mid region,
|
||||
which covers the intersection, and a hi region, which covers just the
|
||||
upper part of the union.
|
||||
|
||||
Together, low and high cover the union of the two spans. Mid covers
|
||||
the intersection. The implication is that if both spans are identical
|
||||
then the low and high regions will both be None and mid will be equal
|
||||
to both.
|
||||
|
||||
Graphically, given two spans A and B:
|
||||
|
||||
[ B )
|
||||
[ A )
|
||||
[ lo )[ mid )[ hi )
|
||||
|
||||
If the lower bounds align then the `lo` region is empty:
|
||||
|
||||
[ B )
|
||||
[ A )
|
||||
[ mid )[ hi )
|
||||
|
||||
If the upper bounds align then the `hi` region is empty:
|
||||
|
||||
[ B )
|
||||
[ A )
|
||||
[ lo )[ mid )
|
||||
|
||||
If both bounds align then both are empty:
|
||||
|
||||
[ B )
|
||||
[ A )
|
||||
[ mid )
|
||||
|
||||
split is reflexive: it doesn't matter which order you split things in,
|
||||
you will always get the same output spans, in the same order.
|
||||
"""
|
||||
if not self.intersects(other):
|
||||
if self.lower < other.lower:
|
||||
return (self, None, other)
|
||||
else:
|
||||
return (other, None, self)
|
||||
|
||||
first = min(self.lower, other.lower)
|
||||
second = max(self.lower, other.lower)
|
||||
third = min(self.upper, other.upper)
|
||||
fourth = max(self.upper, other.upper)
|
||||
|
||||
low = Span(first, second) if first != second else None
|
||||
mid = Span(second, third)
|
||||
hi = Span(third, fourth) if third != fourth else None
|
||||
|
||||
return (low, mid, hi)
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"[{self.lower}-{self.upper})"
|
||||
|
||||
|
||||
ET = typing.TypeVar("ET")
|
||||
|
||||
|
||||
class EdgeList[ET]:
    """A list of edge transitions, keyed by *span*.

    Invariant: the stored spans are pairwise disjoint and sorted by bound,
    so `add_edge` can locate overlaps by bisecting on upper bounds. When an
    inserted span overlaps existing ones, the spans are split so that each
    resulting span carries the combined targets of everything covering it.
    """

    # Sorted, disjoint (span, targets) pairs.
    _edges: list[tuple[Span, list[ET]]]

    def __init__(self):
        self._edges = []

    def __iter__(self) -> typing.Iterator[tuple[Span, list[ET]]]:
        """Iterate (span, targets) pairs in increasing span order."""
        return iter(self._edges)

    def __repr__(self) -> str:
        return f"EdgeList[{','.join(str(s[0]) + '->' + repr(s[1]) for s in self._edges)}]"

    def add_edge(self, c: Span, s: ET):
        """Add an edge for the given span to the list. If there are already
        spans that overlap this one, split and generating multiple distinct
        edges.
        """
        our_targets = [s]

        # Look to see where we would put this span based solely on a sort of
        # lower bounds: find the lowest upper bound that is greater than the
        # lower bound of the incoming span.
        point = bisect.bisect_right(self._edges, c.lower, key=lambda x: x[0].upper)

        # We might need to run this in multiple iterations because we keep
        # splitting against the *lowest* matching span.
        next_span: Span | None = c
        while next_span is not None:
            c = next_span
            next_span = None

            # print(f"  incoming: {self} @ {point} <- {c}->[{s}]")

            # Check to see if we've run off the end of the list of spans.
            if point == len(self._edges):
                self._edges.insert(point, (c, [s]))
                # print(f"  trivial end: {self}")
                return

            # Nope, pull out the span to the right of us.
            right_span, right_targets = self._edges[point]

            # Because we intersect at least a little bit we know that we need to
            # split and keep processing.
            del self._edges[point]
            lo, mid, hi = c.split(right_span)  # Remember the semantics
            # print(f"  -> {c} splits {right_span} -> {lo}, {mid}, {hi} @{point}")

            # We do this from lo to hi, lo first.
            if lo is not None:
                # NOTE: lo will never intersect both no matter what.
                if lo.intersects(right_span):
                    assert not lo.intersects(c)
                    targets = right_targets
                else:
                    assert lo.intersects(c)
                    targets = our_targets

                self._edges.insert(point, (lo, targets))
                point += 1  # Adjust the insertion point, important for us to keep running.

            if mid is not None:
                # If mid exists it is known to intersect with both so we can just
                # do it.
                self._edges.insert(point, (mid, right_targets + our_targets))
                point += 1  # Adjust the insertion point, important for us to keep running.

            if hi is not None:
                # NOTE: Just like lo, hi will never intersect both no matter what.
                if hi.intersects(right_span):
                    # If hi intersects the right span then we're done, no
                    # need to keep running.
                    assert not hi.intersects(c)
                    self._edges.insert(point, (hi, right_targets))

                else:
                    # BUT! If hi intersects the incoming span then what we
                    # need to do is to replace the incoming span with hi
                    # (having chopped off the lower part of the incoming
                    # span) and continue to execute with only the upper part
                    # of the incoming span.
                    #
                    # Why? Because the upper part of the incoming span might
                    # intersect *more* spans, in which case we need to keep
                    # splitting and merging targets.
                    assert hi.intersects(c)
                    next_span = hi

            # print(f"  result: {self}")
|
||||
|
||||
|
||||
class NFAState:
    """An NFA state. Each state can be the accept state, with one or more
    Terminals as the result."""

    # Terminals recognized if the automaton stops in this state.
    accept: list[Terminal]
    # States reachable without consuming any input.
    epsilons: list["NFAState"]
    # Character-span transitions to successor states.
    _edges: EdgeList["NFAState"]

    def __init__(self):
        self.accept = []
        self.epsilons = []
        self._edges = EdgeList()

    def __repr__(self):
        return f"State{id(self)}"

    def edges(self) -> typing.Iterable[tuple[Span, list["NFAState"]]]:
        """Iterate (span, successor-states) pairs in span order."""
        return self._edges

    def add_edge(self, c: Span, s: "NFAState") -> "NFAState":
        """Add a transition on span `c` to state `s`; returns `s` so that
        chains of edges can be built fluently."""
        self._edges.add_edge(c, s)
        return s

    def dump_graph(self, name="nfa.dot"):
        """Write the subgraph reachable from this state to *name* in
        Graphviz DOT format (debugging aid)."""
        with open(name, "w", encoding="utf8") as f:
            f.write("digraph G {\n")

            # Depth-first walk over reachable states, deduplicated.
            stack: list[NFAState] = [self]
            visited = set()
            while len(stack) > 0:
                state = stack.pop()
                if state in visited:
                    continue
                visited.add(state)

                label = ", ".join([t.value for t in state.accept if t.value is not None])
                f.write(f' {id(state)} [label="{label}"];\n')
                for target in state.epsilons:
                    stack.append(target)
                    # \u03B5 is the Greek epsilon, for epsilon-transitions.
                    f.write(f' {id(state)} -> {id(target)} [label="\u03B5"];\n')

                for span, targets in state.edges():
                    label = str(span).replace('"', '\\"')
                    for target in targets:
                        stack.append(target)
                        f.write(f' {id(state)} -> {id(target)} [label="{label}"];\n')

            f.write("}\n")
|
||||
|
||||
|
||||
@dataclasses.dataclass
class Re:
    """Base class for regular-expression syntax nodes.

    Subclasses implement `to_nfa`, which threads an NFA fragment onto
    `start` and returns the fragment's final state.
    """

    def to_nfa(self, start: NFAState) -> NFAState:
        """Build this node's NFA fragment from `start`; return its end state."""
        del start
        raise NotImplementedError()

    def __str__(self) -> str:
        raise NotImplementedError()

    @classmethod
    def seq(cls, *values: "Re") -> "Re":
        """Fold *values* into a left-associated sequence node."""
        result = values[0]
        for index in range(1, len(values)):
            result = RegexSequence(result, values[index])
        return result

    @classmethod
    def literal(cls, value: str) -> "Re":
        """A node matching the exact string *value*."""
        return cls.seq(*(RegexLiteral.from_ranges(ch) for ch in value))

    @classmethod
    def set(cls, *args: str | tuple[str, str]) -> "Re":
        """A character class: single characters and/or (lo, hi) ranges."""
        return RegexLiteral.from_ranges(*args)

    def plus(self) -> "Re":
        """One or more repetitions of this node."""
        return RegexPlus(self)

    def star(self) -> "Re":
        """Zero or more repetitions of this node."""
        return RegexStar(self)

    def question(self) -> "Re":
        """Zero or one occurrence of this node."""
        return RegexQuestion(self)

    def __or__(self, value: "Re", /) -> "Re":
        """`a | b` matches either alternative."""
        return RegexAlternation(self, value)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class RegexLiteral(Re):
    """A character class: matches any single character in `values`."""

    values: list[Span]

    @classmethod
    def from_ranges(cls, *args: str | tuple[str, str]) -> "RegexLiteral":
        """Build from single characters and/or (lo, hi) character pairs."""
        spans = [
            Span.from_str(a) if isinstance(a, str) else Span.from_str(a[0], a[1])
            for a in args
        ]
        return RegexLiteral(spans)

    def to_nfa(self, start: NFAState) -> NFAState:
        # All spans transition to one shared end state.
        target = NFAState()
        for sp in self.values:
            start.add_edge(sp, target)
        return target

    def __str__(self) -> str:
        # A lone single-character span prints as just that character.
        if len(self.values) == 1 and len(self.values[0]) == 1:
            return chr(self.values[0].lower)

        parts = []
        for sp in self.values:
            first = chr(sp.lower)
            last = chr(sp.upper - 1)  # Spans are half-open.
            parts.append(first if first == last else f"{first}-{last}")
        return "[{}]".format("".join(parts))
|
||||
|
||||
|
||||
@dataclasses.dataclass
class RegexPlus(Re):
    """One or more repetitions of `child`."""

    child: Re

    def to_nfa(self, start: NFAState) -> NFAState:
        tail = self.child.to_nfa(start)
        # Loop back for the "or more" repetitions.
        tail.epsilons.append(start)
        return tail

    def __str__(self) -> str:
        return "({})+".format(self.child)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class RegexStar(Re):
    """Zero or more repetitions of `child`."""

    child: Re

    def to_nfa(self, start: NFAState) -> NFAState:
        tail = self.child.to_nfa(start)
        # Loop back to repeat, and skip forward for zero occurrences.
        tail.epsilons.append(start)
        start.epsilons.append(tail)
        return tail

    def __str__(self) -> str:
        return "({})*".format(self.child)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class RegexQuestion(Re):
    """Zero or one occurrence of `child`."""

    child: Re

    def to_nfa(self, start: NFAState) -> NFAState:
        tail = self.child.to_nfa(start)
        # Epsilon from start to end allows skipping the child entirely.
        start.epsilons.append(tail)
        return tail

    def __str__(self) -> str:
        return "({})?".format(self.child)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class RegexSequence(Re):
    """`left` followed immediately by `right`."""

    left: Re
    right: Re

    def to_nfa(self, start: NFAState) -> NFAState:
        # Chain the fragments: right begins where left ends.
        return self.right.to_nfa(self.left.to_nfa(start))

    def __str__(self) -> str:
        return "{}{}".format(self.left, self.right)
|
||||
|
||||
|
||||
@dataclasses.dataclass
class RegexAlternation(Re):
    """Matches either `left` or `right`."""

    left: Re
    right: Re

    def to_nfa(self, start: NFAState) -> NFAState:
        # Fan out from `start` into a fresh branch per alternative...
        branch_ends = []
        for node in (self.left, self.right):
            branch_start = NFAState()
            start.epsilons.append(branch_start)
            branch_ends.append(node.to_nfa(branch_start))

        # ...then join both branch tails at a common end state.
        join = NFAState()
        for tail in branch_ends:
            tail.epsilons.append(join)
        return join

    def __str__(self) -> str:
        return f"(({self.left})||({self.right}))"
|
||||
|
||||
|
||||
LexerTable = list[tuple[Terminal | None, list[tuple[Span, int]]]]
|
||||
|
||||
|
||||
class NFASuperState:
    """A DFA state: the epsilon-closure of a set of NFA states.

    Hashable and comparable by its (frozen) state set, so it can key the
    DFA dictionary during subset construction.
    """

    states: frozenset[NFAState]

    def __init__(self, states: typing.Iterable[NFAState]):
        # Close over the given states, including every state that is
        # reachable by epsilon-transition.
        stack = list(states)
        result = set()
        while len(stack) > 0:
            st = stack.pop()
            if st in result:
                continue
            result.add(st)
            stack.extend(st.epsilons)

        self.states = frozenset(result)

    def __eq__(self, other):
        if not isinstance(other, NFASuperState):
            return False
        return self.states == other.states

    def __hash__(self) -> int:
        return hash(self.states)

    def edges(self) -> list[tuple[Span, "NFASuperState"]]:
        """Compute outgoing transitions for this super-state.

        Merges the edges of every member state through an EdgeList (which
        splits overlapping spans), then flattens the targets into successor
        super-states. Result is sorted by span and spans are disjoint.
        """
        working: EdgeList[list[NFAState]] = EdgeList()
        for st in self.states:
            for span, targets in st.edges():
                working.add_edge(span, targets)

        # EdgeList maps span to list[list[State]] which we want to flatten.
        last_upper = None
        result = []
        for span, stateses in working:
            if last_upper is not None:
                assert last_upper <= span.lower
            last_upper = span.upper

            s: list[NFAState] = []
            for states in stateses:
                s.extend(states)

            result.append((span, NFASuperState(s)))

        # Sanity check: spans must come out sorted and non-overlapping.
        if len(result) > 0:
            for i in range(0, len(result) - 1):
                span = result[i][0]
                next_span = result[i + 1][0]
                assert span.upper <= next_span.lower

        # TODO: Merge spans that are adjacent and go to the same state.

        return result

    def accept_terminal(self) -> Terminal | None:
        """Pick the terminal this state accepts, if any.

        On a conflict between two different terminals, one whose pattern is
        a literal string beats one whose pattern is a regex; a conflict
        between two terminals of the same kind raises ValueError.
        """
        accept = None
        for st in self.states:
            for ac in st.accept:
                if accept is None:
                    accept = ac
                elif accept.value != ac.value:
                    accept_regex = isinstance(accept.pattern, Re)
                    ac_regex = isinstance(ac.pattern, Re)

                    if accept_regex and not ac_regex:
                        # Literal-string terminal wins over the regex one.
                        accept = ac
                    elif ac_regex and not accept_regex:
                        pass
                    else:
                        raise ValueError(
                            f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')"
                        )

        return accept
|
||||
|
||||
|
||||
def compile_lexer(x: Grammar) -> LexerTable:
    """Compile the terminals of grammar *x* into a DFA lexer table.

    Builds one big NFA with an epsilon branch per terminal, then performs
    classic subset construction (tracking epsilon-closures as
    NFASuperState) and numbers the resulting DFA states.

    Returns a LexerTable: row i holds the terminal accepted in state i (or
    None) and that state's sorted (span, target-state-index) edges.
    """
    # Parse the terminals all together into a big NFA rooted at `NFA`.
    NFA = NFAState()
    for terminal in x.terminals:
        start = NFAState()
        NFA.epsilons.append(start)

        pattern = terminal.pattern
        if isinstance(pattern, Re):
            ending = pattern.to_nfa(start)
        else:
            # Plain-string patterns become a linear chain of one-character
            # edges.
            ending = start
            for c in pattern:
                ending = ending.add_edge(Span.from_str(c), NFAState())

        ending.accept.append(terminal)

    # NOTE: removed a leftover debug call (`NFA.dump_graph()`) that wrote
    # nfa.dot to the working directory on every compile; call
    # NFA.dump_graph() manually when debugging.

    # Convert the NFA into a DFA in the most straightforward way (by tracking
    # sets of state closures, called SuperStates.)
    DFA: dict[NFASuperState, tuple[int, list[tuple[Span, NFASuperState]]]] = {}

    stack = [NFASuperState([NFA])]
    while stack:
        ss = stack.pop()
        if ss in DFA:
            continue

        edges = ss.edges()

        DFA[ss] = (len(DFA), edges)
        for _, target in edges:
            stack.append(target)

    # Replace each super-state with its assigned index to flatten the DFA
    # into a table.
    return [
        (
            ss.accept_terminal(),
            [(k, DFA[v][0]) for k, v in edges],
        )
        for ss, (_, edges) in DFA.items()
    ]
|
||||
|
||||
|
||||
def dump_lexer_table(table: LexerTable):
    """Write *table* to "lexer.dot" in Graphviz DOT format (debugging aid).

    Each DFA state becomes a node labeled with the terminal it accepts (if
    any); each (span, target) edge becomes a labeled arc.
    """
    with open("lexer.dot", "w", encoding="utf-8") as f:
        f.write("digraph G {\n")
        for index, (accept, edges) in enumerate(table):
            label = accept.value if accept is not None else ""
            f.write(f' {index} [label="{label}"];\n')
            for span, target in edges:
                # Escape quotes so the span label stays valid DOT.
                label = str(span).replace('"', '\\"')
                f.write(f' {index} -> {target} [label="{label}"];\n')
            # (Removed a stray dead `pass` statement here.)
        f.write("}\n")
|
||||
|
|
|
|||
|
|
@ -430,3 +430,58 @@ class Parser:
|
|||
error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")
|
||||
|
||||
return (result, error_strings)
|
||||
|
||||
|
||||
def generic_tokenize(
|
||||
src: str, table: parser.LexerTable
|
||||
) -> typing.Iterable[tuple[parser.Terminal, int, int]]:
|
||||
pos = 0
|
||||
state = 0
|
||||
start = 0
|
||||
last_accept = None
|
||||
last_accept_pos = 0
|
||||
|
||||
print(f"LEXING: {src} ({len(src)})")
|
||||
|
||||
while pos < len(src):
|
||||
while state is not None:
|
||||
accept, edges = table[state]
|
||||
if accept is not None:
|
||||
last_accept = accept
|
||||
last_accept_pos = pos
|
||||
|
||||
print(f" @ {pos} state: {state} ({accept})")
|
||||
if pos >= len(src):
|
||||
break
|
||||
|
||||
char = ord(src[pos])
|
||||
print(f" -> char: {char} ({repr(src[pos])})")
|
||||
|
||||
# Find the index of the span where the upper value is the tightest
|
||||
# bound on the character.
|
||||
state = None
|
||||
index = bisect.bisect_right(edges, char, key=lambda x: x[0].upper)
|
||||
print(f" -> {index}")
|
||||
if index < len(edges):
|
||||
span, target = edges[index]
|
||||
print(f" -> {span}, {target}")
|
||||
if char >= span.lower:
|
||||
print(f" -> target: {target}")
|
||||
state = target
|
||||
pos += 1
|
||||
|
||||
else:
|
||||
print(f" Nope (outside range)")
|
||||
else:
|
||||
print(f" Nope (at end)")
|
||||
|
||||
if last_accept is None:
|
||||
raise Exception(f"Token error at {pos}")
|
||||
|
||||
yield (last_accept, start, last_accept_pos - start)
|
||||
|
||||
print(f" Yield: {last_accept}, reset to {last_accept_pos}")
|
||||
last_accept = None
|
||||
pos = last_accept_pos
|
||||
start = pos
|
||||
state = 0
|
||||
|
|
|
|||
51
pdm.lock
generated
51
pdm.lock
generated
|
|
@ -3,9 +3,26 @@
|
|||
|
||||
[metadata]
|
||||
groups = ["default", "dev"]
|
||||
strategy = ["cross_platform", "inherit_metadata"]
|
||||
lock_version = "4.4.1"
|
||||
content_hash = "sha256:143b06c001132ba589a47b2b3a498dd54f4840d95d216c794068089fcea48d4d"
|
||||
strategy = ["inherit_metadata"]
|
||||
lock_version = "4.5.0"
|
||||
content_hash = "sha256:c4fec06f95402db1e9843df4a8a4a275273c6ec4f41f192f30d8a92ee52d15ea"
|
||||
|
||||
[[metadata.targets]]
|
||||
requires_python = ">=3.12"
|
||||
|
||||
[[package]]
|
||||
name = "attrs"
|
||||
version = "24.2.0"
|
||||
requires_python = ">=3.7"
|
||||
summary = "Classes Without Boilerplate"
|
||||
groups = ["dev"]
|
||||
dependencies = [
|
||||
"importlib-metadata; python_version < \"3.8\"",
|
||||
]
|
||||
files = [
|
||||
{file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"},
|
||||
{file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "colorama"
|
||||
|
|
@ -19,6 +36,22 @@ files = [
|
|||
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hypothesis"
|
||||
version = "6.111.1"
|
||||
requires_python = ">=3.8"
|
||||
summary = "A library for property-based testing"
|
||||
groups = ["dev"]
|
||||
dependencies = [
|
||||
"attrs>=22.2.0",
|
||||
"exceptiongroup>=1.0.0; python_version < \"3.11\"",
|
||||
"sortedcontainers<3.0.0,>=2.1.0",
|
||||
]
|
||||
files = [
|
||||
{file = "hypothesis-6.111.1-py3-none-any.whl", hash = "sha256:9422adbac4b2104f6cf92dc6604b5c9df975efc08ffc7145ecc39bc617243835"},
|
||||
{file = "hypothesis-6.111.1.tar.gz", hash = "sha256:6ab6185a858fa692bf125c0d0a936134edc318bee01c05e407c71c9ead0b61c5"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iniconfig"
|
||||
version = "2.0.0"
|
||||
|
|
@ -60,11 +93,23 @@ summary = "pytest: simple powerful testing with Python"
|
|||
groups = ["dev"]
|
||||
dependencies = [
|
||||
"colorama; sys_platform == \"win32\"",
|
||||
"exceptiongroup>=1.0.0rc8; python_version < \"3.11\"",
|
||||
"iniconfig",
|
||||
"packaging",
|
||||
"pluggy<2.0,>=1.5",
|
||||
"tomli>=1; python_version < \"3.11\"",
|
||||
]
|
||||
files = [
|
||||
{file = "pytest-8.2.2-py3-none-any.whl", hash = "sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343"},
|
||||
{file = "pytest-8.2.2.tar.gz", hash = "sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sortedcontainers"
|
||||
version = "2.4.0"
|
||||
summary = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set"
|
||||
groups = ["dev"]
|
||||
files = [
|
||||
{file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"},
|
||||
{file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"},
|
||||
]
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ distribution = true
|
|||
[tool.pdm.dev-dependencies]
|
||||
dev = [
|
||||
"pytest>=8.2.2",
|
||||
"hypothesis>=6.111.1",
|
||||
]
|
||||
|
||||
[tool.pyright]
|
||||
|
|
|
|||
|
|
@ -38,25 +38,27 @@ def _tree(treeform) -> runtime.Tree | runtime.TokenValue:
|
|||
def test_lr0_lr0():
|
||||
"""An LR0 grammar should work with an LR0 generator."""
|
||||
|
||||
PLUS = Terminal("+")
|
||||
LPAREN = Terminal("(")
|
||||
RPAREN = Terminal(")")
|
||||
IDENTIFIER = Terminal("id")
|
||||
|
||||
class LR0Grammar(Grammar):
|
||||
class G(Grammar):
|
||||
start = "E"
|
||||
generator = parser.GenerateLR0
|
||||
|
||||
@rule
|
||||
def E(self):
|
||||
return seq(self.E, PLUS, self.T) | self.T
|
||||
return seq(self.E, self.PLUS, self.T) | self.T
|
||||
|
||||
@rule
|
||||
def T(self):
|
||||
return seq(LPAREN, self.E, RPAREN) | IDENTIFIER
|
||||
return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER
|
||||
|
||||
table = LR0Grammar().build_table()
|
||||
tree, errors = runtime.Parser(table).parse(Tokens(IDENTIFIER, PLUS, LPAREN, IDENTIFIER, RPAREN))
|
||||
PLUS = Terminal("+", name="+")
|
||||
LPAREN = Terminal("(", name="(")
|
||||
RPAREN = Terminal(")", name=")")
|
||||
IDENTIFIER = Terminal("id", name="id")
|
||||
|
||||
table = G().build_table()
|
||||
tree, errors = runtime.Parser(table).parse(
|
||||
Tokens(G.IDENTIFIER, G.PLUS, G.LPAREN, G.IDENTIFIER, G.RPAREN)
|
||||
)
|
||||
|
||||
assert errors == []
|
||||
assert tree == _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")")))
|
||||
|
|
@ -65,114 +67,114 @@ def test_lr0_lr0():
|
|||
def test_lr0_shift_reduce():
|
||||
"""This one should not work in LR0- it has a shift/reduce conflict, but works in SLR1."""
|
||||
|
||||
PLUS = Terminal("+")
|
||||
LPAREN = Terminal("(")
|
||||
RPAREN = Terminal(")")
|
||||
LSQUARE = Terminal("[")
|
||||
RSQUARE = Terminal("]")
|
||||
IDENTIFIER = Terminal("id")
|
||||
|
||||
class TestGrammar(Grammar):
|
||||
class G(Grammar):
|
||||
start = "E"
|
||||
generator = parser.GenerateLR0
|
||||
|
||||
@rule
|
||||
def E(self):
|
||||
return seq(self.E, PLUS, self.T) | self.T
|
||||
return seq(self.E, self.PLUS, self.T) | self.T
|
||||
|
||||
@rule
|
||||
def T(self):
|
||||
return (
|
||||
seq(LPAREN, self.E, RPAREN) | IDENTIFIER | seq(IDENTIFIER, LSQUARE, self.E, RSQUARE)
|
||||
seq(self.LPAREN, self.E, self.RPAREN)
|
||||
| self.IDENTIFIER
|
||||
| seq(self.IDENTIFIER, self.LSQUARE, self.E, self.RSQUARE)
|
||||
)
|
||||
|
||||
with pytest.raises(parser.AmbiguityError):
|
||||
TestGrammar().build_table()
|
||||
PLUS = Terminal("+")
|
||||
LPAREN = Terminal("(")
|
||||
RPAREN = Terminal(")")
|
||||
LSQUARE = Terminal("[")
|
||||
RSQUARE = Terminal("]")
|
||||
IDENTIFIER = Terminal("id")
|
||||
|
||||
TestGrammar().build_table(generator=parser.GenerateSLR1)
|
||||
with pytest.raises(parser.AmbiguityError):
|
||||
G().build_table()
|
||||
|
||||
G().build_table(generator=parser.GenerateSLR1)
|
||||
|
||||
|
||||
def test_lr0_reduce_reduce():
|
||||
"""This one should not work, it has a reduce-reduce conflict."""
|
||||
|
||||
PLUS = Terminal("+")
|
||||
EQUAL = Terminal("=")
|
||||
LPAREN = Terminal("(")
|
||||
RPAREN = Terminal(")")
|
||||
IDENTIFIER = Terminal("id")
|
||||
|
||||
class TestGrammar(Grammar):
|
||||
class G(Grammar):
|
||||
start = "E"
|
||||
generator = parser.GenerateLR0
|
||||
|
||||
@rule
|
||||
def E(self):
|
||||
return seq(self.E, PLUS, self.T) | self.T | seq(self.V, EQUAL, self.E)
|
||||
return seq(self.E, self.PLUS, self.T) | self.T | seq(self.V, self.EQUAL, self.E)
|
||||
|
||||
@rule
|
||||
def T(self):
|
||||
return seq(LPAREN, self.E, RPAREN) | IDENTIFIER
|
||||
return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER
|
||||
|
||||
@rule
|
||||
def V(self):
|
||||
return IDENTIFIER
|
||||
return self.IDENTIFIER
|
||||
|
||||
PLUS = Terminal("+")
|
||||
EQUAL = Terminal("=")
|
||||
LPAREN = Terminal("(")
|
||||
RPAREN = Terminal(")")
|
||||
IDENTIFIER = Terminal("id")
|
||||
|
||||
with pytest.raises(parser.AmbiguityError):
|
||||
TestGrammar().build_table()
|
||||
G().build_table()
|
||||
|
||||
|
||||
def test_lr0_empty():
|
||||
"""LR0 can't handle empty productions because it doesn't know when to reduce."""
|
||||
BOOP = Terminal("boop")
|
||||
BEEP = Terminal("beep")
|
||||
|
||||
class TestGrammar(Grammar):
|
||||
class G(Grammar):
|
||||
start = "E"
|
||||
generator = parser.GenerateLR0
|
||||
|
||||
@rule
|
||||
def E(self):
|
||||
return seq(self.F, BOOP)
|
||||
return seq(self.F, self.BOOP)
|
||||
|
||||
@rule
|
||||
def F(self):
|
||||
return BEEP | parser.Nothing
|
||||
return self.BEEP | parser.Nothing
|
||||
|
||||
BOOP = Terminal("boop")
|
||||
BEEP = Terminal("beep")
|
||||
|
||||
with pytest.raises(parser.AmbiguityError):
|
||||
TestGrammar().build_table()
|
||||
G().build_table()
|
||||
|
||||
|
||||
def test_grammar_aho_ullman_1():
|
||||
EQUAL = Terminal("=")
|
||||
STAR = Terminal("*")
|
||||
ID = Terminal("id")
|
||||
|
||||
class TestGrammar(Grammar):
|
||||
class G(Grammar):
|
||||
start = "S"
|
||||
generator = parser.GenerateSLR1
|
||||
|
||||
@rule
|
||||
def S(self):
|
||||
return seq(self.L, EQUAL, self.R) | self.R
|
||||
return seq(self.L, self.EQUAL, self.R) | self.R
|
||||
|
||||
@rule
|
||||
def L(self):
|
||||
return seq(STAR, self.R) | ID
|
||||
return seq(self.STAR, self.R) | self.ID
|
||||
|
||||
@rule
|
||||
def R(self):
|
||||
return self.L
|
||||
|
||||
with pytest.raises(parser.AmbiguityError):
|
||||
TestGrammar().build_table()
|
||||
EQUAL = Terminal("=")
|
||||
STAR = Terminal("*")
|
||||
ID = Terminal("id")
|
||||
|
||||
TestGrammar().build_table(generator=parser.GenerateLR1)
|
||||
with pytest.raises(parser.AmbiguityError):
|
||||
G().build_table()
|
||||
|
||||
G().build_table(generator=parser.GenerateLR1)
|
||||
|
||||
|
||||
def test_grammar_aho_ullman_2():
|
||||
A = Terminal("a")
|
||||
B = Terminal("b")
|
||||
|
||||
class TestGrammar(Grammar):
|
||||
start = "S"
|
||||
generator = parser.GenerateSLR1
|
||||
|
|
@ -183,7 +185,10 @@ def test_grammar_aho_ullman_2():
|
|||
|
||||
@rule
|
||||
def X(self):
|
||||
return seq(A, self.X) | B
|
||||
return seq(self.A, self.X) | self.B
|
||||
|
||||
A = Terminal("a")
|
||||
B = Terminal("b")
|
||||
|
||||
TestGrammar().build_table()
|
||||
TestGrammar().build_table(generator=parser.GenerateLR1)
|
||||
|
|
@ -191,11 +196,6 @@ def test_grammar_aho_ullman_2():
|
|||
|
||||
|
||||
def test_fun_lalr():
|
||||
PLUS = Terminal("+")
|
||||
INT = Terminal("int")
|
||||
ID = Terminal("id")
|
||||
LPAREN = Terminal("(")
|
||||
RPAREN = Terminal(")")
|
||||
|
||||
class TestGrammar(Grammar):
|
||||
start = "S"
|
||||
|
|
@ -207,15 +207,21 @@ def test_fun_lalr():
|
|||
|
||||
@rule
|
||||
def E(self):
|
||||
return self.F | seq(self.E, PLUS, self.F)
|
||||
return self.F | seq(self.E, self.PLUS, self.F)
|
||||
|
||||
@rule
|
||||
def F(self):
|
||||
return self.V | INT | seq(LPAREN, self.E, RPAREN)
|
||||
return self.V | self.INT | seq(self.LPAREN, self.E, self.RPAREN)
|
||||
|
||||
@rule
|
||||
def V(self):
|
||||
return ID
|
||||
return self.ID
|
||||
|
||||
PLUS = Terminal("+")
|
||||
INT = Terminal("int")
|
||||
ID = Terminal("id")
|
||||
LPAREN = Terminal("(")
|
||||
RPAREN = Terminal(")")
|
||||
|
||||
TestGrammar().build_table()
|
||||
|
||||
|
|
@ -234,14 +240,14 @@ def test_conflicting_names():
|
|||
to understand.
|
||||
"""
|
||||
|
||||
IDENTIFIER = Terminal("Identifier")
|
||||
|
||||
class TestGrammar(Grammar):
|
||||
start = "Identifier"
|
||||
start = "IDENTIFIER"
|
||||
|
||||
@rule("Identifier")
|
||||
@rule("IDENTIFIER")
|
||||
def identifier(self):
|
||||
return IDENTIFIER
|
||||
return self.IDENTIFIER
|
||||
|
||||
IDENTIFIER = Terminal("Identifier")
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
TestGrammar().build_table()
|
||||
|
|
|
|||
384
tests/test_lexer.py
Normal file
384
tests/test_lexer.py
Normal file
|
|
@ -0,0 +1,384 @@
|
|||
import collections
|
||||
|
||||
from hypothesis import assume, example, given
|
||||
from hypothesis.strategies import integers, lists, tuples
|
||||
|
||||
import pytest
|
||||
|
||||
from parser import (
|
||||
EdgeList,
|
||||
Span,
|
||||
Grammar,
|
||||
rule,
|
||||
Terminal,
|
||||
compile_lexer,
|
||||
dump_lexer_table,
|
||||
Re,
|
||||
)
|
||||
|
||||
from parser.runtime import generic_tokenize
|
||||
|
||||
|
||||
def test_span_intersection():
    """Overlapping spans report intersection symmetrically."""
    cases = [
        ((1, 3), (2, 4)),
        ((1, 3), (2, 3)),
        ((1, 3), (1, 2)),
        ((1, 3), (0, 2)),
        ((1, 3), (0, 4)),
    ]

    for first, second in cases:
        a, b = Span(*first), Span(*second)
        assert a.intersects(b)
        assert b.intersects(a)
|
||||
|
||||
|
||||
def test_span_no_intersection():
    """Disjoint spans report no intersection, in either order."""
    cases = [
        ((1, 2), (3, 4)),
    ]

    for first, second in cases:
        a, b = Span(*first), Span(*second)
        assert not a.intersects(b)
        assert not b.intersects(a)
|
||||
|
||||
|
||||
def test_span_split():
    """split() produces (lo, mid, hi) and is symmetric in its arguments."""
    # (left, right, expected (lo, mid, hi))
    cases = [
        (Span(1, 4), Span(2, 3), (Span(1, 2), Span(2, 3), Span(3, 4))),
        (Span(1, 4), Span(1, 2), (None, Span(1, 2), Span(2, 4))),
        (Span(1, 4), Span(3, 4), (Span(1, 3), Span(3, 4), None)),
        (Span(1, 4), Span(1, 4), (None, Span(1, 4), None)),
    ]

    for left, right, expected in cases:
        assert left.split(right) == expected
        # Order of the operands must not matter.
        assert right.split(left) == expected
|
||||
|
||||
|
||||
@given(integers(), integers())
def test_equal_span_mid_only(x, y):
    """Splitting spans against themselves results in an empty lo and hi bound."""
    assume(x < y)  # Spans need a strictly smaller lower bound.
    span = Span(x, y)
    lo, mid, hi = span.split(span)
    assert lo is None
    assert hi is None
    assert mid == span
|
||||
|
||||
|
||||
# Hypothesis strategy: three distinct integers in ascending order, used as
# endpoints when constructing aligned/nested spans.
three_distinct_points = lists(
    integers(),
    min_size=3,
    max_size=3,
    unique=True,
).map(sorted)
|
||||
|
||||
|
||||
@given(three_distinct_points)
def test_span_low_align_lo_none(vals):
    """Splitting spans with aligned lower bounds results in an empty lo bound."""
    # x    y    z
    # [ a  )
    # [ b       )
    x, y, z = vals

    a = Span(x, y)
    b = Span(x, z)
    lo, _, _ = a.split(b)

    assert lo is None
|
||||
|
||||
|
||||
@given(three_distinct_points)
def test_span_high_align_hi_none(vals):
    """Splitting spans with aligned upper bounds results in an empty hi bound."""
    # x    y    z
    #      [ a  )
    # [ b       )
    x, y, z = vals

    a = Span(y, z)
    b = Span(x, z)
    _, _, hi = a.split(b)

    assert hi is None
|
||||
|
||||
|
||||
# Hypothesis strategy: four distinct integers in ascending order, used as
# endpoints when constructing partially-overlapping spans.
four_distinct_points = lists(
    integers(),
    min_size=4,
    max_size=4,
    unique=True,
).map(sorted)
|
||||
|
||||
|
||||
@given(four_distinct_points)
def test_span_split_overlapping_lo_left(vals):
    """Splitting two overlapping spans results in lo overlapping left."""
    # a    b    c    d
    # [ left    )
    #      [ right   )
    a, b, c, d = vals

    left = Span(a, c)
    right = Span(b, d)

    lo, _, _ = left.split(right)
    assert lo is not None
    assert lo.intersects(left)
|
||||
|
||||
|
||||
@given(four_distinct_points)
def test_span_split_overlapping_lo_not_right(vals):
    """Splitting two overlapping spans results in lo NOT overlapping right."""
    # a    b    c    d
    # [ left    )
    #      [ right   )
    a, b, c, d = vals

    left = Span(a, c)
    right = Span(b, d)

    lo, _, _ = left.split(right)
    assert lo is not None
    assert not lo.intersects(right)
|
||||
|
||||
|
||||
@given(four_distinct_points)
def test_span_split_overlapping_mid_left(vals):
    """Splitting two overlapping spans results in mid overlapping left."""
    # a    b    c    d
    # [ left    )
    #      [ right   )
    a, b, c, d = vals

    left = Span(a, c)
    right = Span(b, d)

    _, mid, _ = left.split(right)
    assert mid is not None
    assert mid.intersects(left)
|
||||
|
||||
|
||||
@given(four_distinct_points)
def test_span_split_overlapping_mid_right(vals):
    """Splitting two overlapping spans results in mid overlapping right."""
    # a    b    c    d
    # [ left    )
    #      [ right   )
    a, b, c, d = vals

    left = Span(a, c)
    right = Span(b, d)

    _, mid, _ = left.split(right)
    assert mid is not None
    assert mid.intersects(right)
|
||||
|
||||
|
||||
@given(four_distinct_points)
def test_span_split_overlapping_hi_right(vals):
    """Splitting two overlapping spans results in hi overlapping right."""
    # a    b    c    d
    # [ left    )
    #      [ right   )
    a, b, c, d = vals

    left = Span(a, c)
    right = Span(b, d)

    _, _, hi = left.split(right)
    assert hi is not None
    assert hi.intersects(right)
|
||||
|
||||
|
||||
@given(four_distinct_points)
def test_span_split_overlapping_hi_not_left(vals):
    """Splitting two overlapping spans results in hi NOT overlapping left."""
    # a    b    c    d
    # [ left    )
    #      [ right   )
    a, b, c, d = vals

    left = Span(a, c)
    right = Span(b, d)

    _, _, hi = left.split(right)
    assert hi is not None
    assert not hi.intersects(left)
|
||||
|
||||
|
||||
@given(four_distinct_points)
def test_span_split_embedded(vals):
    """Splitting two spans where one overlaps the other."""
    # a    b    c    d
    # [ outer        )
    #      [in )
    a, b, c, d = vals

    outer = Span(a, d)
    inner = Span(b, c)

    lo, mid, hi = outer.split(inner)

    assert lo is not None
    assert mid is not None
    assert hi is not None

    # Only the middle region touches the inner span.
    assert lo.intersects(outer)
    assert not lo.intersects(inner)

    assert mid.intersects(outer)
    assert mid.intersects(inner)

    assert hi.intersects(outer)
    assert not hi.intersects(inner)
|
||||
|
||||
|
||||
def test_edge_list_single():
    """A lone edge is stored unchanged."""
    edge_list: EdgeList[str] = EdgeList()
    edge_list.add_edge(Span(1, 4), "A")

    assert list(edge_list) == [(Span(1, 4), ["A"])]
|
||||
|
||||
|
||||
def test_edge_list_fully_enclosed():
    """An edge nested inside another splits the outer edge into three segments."""
    edge_list: EdgeList[str] = EdgeList()
    edge_list.add_edge(Span(1, 4), "A")
    edge_list.add_edge(Span(2, 3), "B")

    expected = [
        (Span(1, 2), ["A"]),
        (Span(2, 3), ["A", "B"]),
        (Span(3, 4), ["A"]),
    ]
    assert list(edge_list) == expected
|
||||
|
||||
|
||||
def test_edge_list_overlap():
    """Partially overlapping edges are split at their crossing points."""
    edge_list: EdgeList[str] = EdgeList()
    edge_list.add_edge(Span(1, 4), "A")
    edge_list.add_edge(Span(2, 5), "B")

    expected = [
        (Span(1, 2), ["A"]),
        (Span(2, 4), ["A", "B"]),
        (Span(4, 5), ["B"]),
    ]
    assert list(edge_list) == expected
|
||||
|
||||
|
||||
def test_edge_list_no_overlap():
    """Disjoint edges remain separate and untouched."""
    edge_list: EdgeList[str] = EdgeList()
    edge_list.add_edge(Span(1, 4), "A")
    edge_list.add_edge(Span(5, 8), "B")

    expected = [
        (Span(1, 4), ["A"]),
        (Span(5, 8), ["B"]),
    ]
    assert list(edge_list) == expected
|
||||
|
||||
|
||||
def test_edge_list_no_overlap_ordered():
    """Disjoint edges added out of order still iterate sorted by position."""
    edge_list: EdgeList[str] = EdgeList()
    # Insert the later span first to exercise the sorting behavior.
    edge_list.add_edge(Span(5, 8), "B")
    edge_list.add_edge(Span(1, 4), "A")

    expected = [
        (Span(1, 4), ["A"]),
        (Span(5, 8), ["B"]),
    ]
    assert list(edge_list) == expected
|
||||
|
||||
|
||||
def test_edge_list_overlap_span():
    """An edge bridging two existing edges splits against both of them."""
    edge_list: EdgeList[str] = EdgeList()
    edge_list.add_edge(Span(1, 3), "A")
    edge_list.add_edge(Span(4, 6), "B")
    edge_list.add_edge(Span(2, 5), "C")

    expected = [
        (Span(1, 2), ["A"]),
        (Span(2, 3), ["A", "C"]),
        (Span(3, 4), ["C"]),
        (Span(4, 5), ["B", "C"]),
        (Span(5, 6), ["B"]),
    ]
    assert list(edge_list) == expected
|
||||
|
||||
|
||||
def test_edge_list_overlap_span_big():
    """One edge covering several smaller edges splits around each of them."""
    edge_list: EdgeList[str] = EdgeList()
    edge_list.add_edge(Span(2, 3), "A")
    edge_list.add_edge(Span(4, 5), "B")
    edge_list.add_edge(Span(6, 7), "C")
    # "D" spans everything above plus the gaps between them.
    edge_list.add_edge(Span(1, 8), "D")

    expected = [
        (Span(1, 2), ["D"]),
        (Span(2, 3), ["A", "D"]),
        (Span(3, 4), ["D"]),
        (Span(4, 5), ["B", "D"]),
        (Span(5, 6), ["D"]),
        (Span(6, 7), ["C", "D"]),
        (Span(7, 8), ["D"]),
    ]
    assert list(edge_list) == expected
|
||||
|
||||
|
||||
@given(lists(lists(integers(), min_size=2, max_size=2, unique=True), min_size=1))
@example(points=[[0, 1], [1, 2]])
def test_edge_list_always_sorted(points: list[list[int]]):
    """Edges added in any order always iterate sorted and non-overlapping.

    Each two-element list of distinct integers is normalized into a Span
    (lower endpoint first) and added with a unique label; the property is
    that iterating the EdgeList yields spans whose upper bound never
    exceeds the next span's lower bound.

    Note: the annotation is ``list[list[int]]`` — the hypothesis strategy
    generates inner *lists* (as does the ``@example``), not tuples; the
    previous ``list[tuple[int, int]]`` annotation was incorrect.
    """
    # OK this is weird but stick with me.
    el: EdgeList[str] = EdgeList()
    for i, (a, b) in enumerate(points):
        # The strategy does not order the endpoints, so normalize here.
        lower = min(a, b)
        upper = max(a, b)

        span = Span(lower, upper)

        el.add_edge(span, str(i))

    last_upper = None
    for span, _ in el:
        if last_upper is not None:
            assert last_upper <= span.lower, "Edges from list are not sorted"
        last_upper = span.upper
|
||||
|
||||
|
||||
def test_lexer_compile():
    """Compile the lexer for a tiny grammar and tokenize a short input."""

    class LexTest(Grammar):
        @rule
        def foo(self):
            return self.IS

        start = foo

        IS = Terminal("is")
        AS = Terminal("as")
        IDENTIFIER = Terminal(
            Re.seq(
                Re.set(("a", "z"), ("A", "Z"), "_"),
                Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
            )
        )
        BLANKS = Terminal(Re.set("\r", "\n", "\t", " ").plus())

    table = compile_lexer(LexTest())
    dump_lexer_table(table)

    # Keywords only match whole words: "ass" must lex as an identifier,
    # while the standalone "is" lexes as the keyword.
    produced = list(generic_tokenize("xy is ass", table))
    assert produced == [
        (LexTest.IDENTIFIER, 0, 2),
        (LexTest.BLANKS, 2, 1),
        (LexTest.IS, 3, 2),
        (LexTest.BLANKS, 5, 1),
        (LexTest.IDENTIFIER, 6, 3),
    ]
|
||||
Loading…
Add table
Add a link
Reference in a new issue