Move terminals into grammar definition
Starting to work on machine-generated lexers too
This commit is contained in:
parent
f6bc2ccea8
commit
58c3004702
4 changed files with 917 additions and 267 deletions
394
grammar.py
394
grammar.py
|
|
@ -2,57 +2,7 @@
|
|||
import re
|
||||
import typing
|
||||
|
||||
import parser
|
||||
from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule
|
||||
|
||||
ARROW = Terminal("Arrow")
|
||||
AS = Terminal("As")
|
||||
BAR = Terminal("Bar")
|
||||
CLASS = Terminal("Class")
|
||||
COLON = Terminal("Colon")
|
||||
ELSE = Terminal("Else")
|
||||
FOR = Terminal("For")
|
||||
FUN = Terminal("Fun")
|
||||
IDENTIFIER = Terminal("Identifier")
|
||||
IF = Terminal("If")
|
||||
IMPORT = Terminal("Import")
|
||||
IN = Terminal("In")
|
||||
LCURLY = Terminal("LeftBrace")
|
||||
LET = Terminal("Let")
|
||||
RCURLY = Terminal("RightBrace")
|
||||
RETURN = Terminal("Return")
|
||||
SEMICOLON = Terminal("Semicolon")
|
||||
STRING = Terminal("String")
|
||||
WHILE = Terminal("While")
|
||||
EQUAL = Terminal("Equal")
|
||||
LPAREN = Terminal("LeftParen")
|
||||
RPAREN = Terminal("RightParen")
|
||||
COMMA = Terminal("Comma")
|
||||
SELF = Terminal("Selff")
|
||||
OR = Terminal("Or")
|
||||
IS = Terminal("Is")
|
||||
AND = Terminal("And")
|
||||
EQUALEQUAL = Terminal("EqualEqual")
|
||||
BANGEQUAL = Terminal("BangEqual")
|
||||
LESS = Terminal("Less")
|
||||
GREATER = Terminal("Greater")
|
||||
LESSEQUAL = Terminal("LessEqual")
|
||||
GREATEREQUAL = Terminal("GreaterEqual")
|
||||
PLUS = Terminal("Plus")
|
||||
MINUS = Terminal("Minus")
|
||||
STAR = Terminal("Star")
|
||||
SLASH = Terminal("Slash")
|
||||
NUMBER = Terminal("Number")
|
||||
TRUE = Terminal("True")
|
||||
FALSE = Terminal("False")
|
||||
BANG = Terminal("Bang")
|
||||
DOT = Terminal("Dot")
|
||||
MATCH = Terminal("Match")
|
||||
EXPORT = Terminal("Export")
|
||||
UNDERSCORE = Terminal("Underscore")
|
||||
NEW = Terminal("New")
|
||||
LSQUARE = Terminal("LeftBracket")
|
||||
RSQUARE = Terminal("RightBracket")
|
||||
from parser import Assoc, Grammar, Nothing, rule, seq, Rule, Terminal
|
||||
|
||||
|
||||
class FineGrammar(Grammar):
|
||||
|
|
@ -62,17 +12,17 @@ class FineGrammar(Grammar):
|
|||
def __init__(self):
|
||||
super().__init__(
|
||||
precedence=[
|
||||
(Assoc.RIGHT, [EQUAL]),
|
||||
(Assoc.LEFT, [OR]),
|
||||
(Assoc.LEFT, [IS]),
|
||||
(Assoc.LEFT, [AND]),
|
||||
(Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]),
|
||||
(Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]),
|
||||
(Assoc.LEFT, [PLUS, MINUS]),
|
||||
(Assoc.LEFT, [STAR, SLASH]),
|
||||
(Assoc.RIGHT, [self.EQUAL]),
|
||||
(Assoc.LEFT, [self.OR]),
|
||||
(Assoc.LEFT, [self.IS]),
|
||||
(Assoc.LEFT, [self.AND]),
|
||||
(Assoc.LEFT, [self.EQUALEQUAL, self.BANGEQUAL]),
|
||||
(Assoc.LEFT, [self.LESS, self.GREATER, self.GREATEREQUAL, self.LESSEQUAL]),
|
||||
(Assoc.LEFT, [self.PLUS, self.MINUS]),
|
||||
(Assoc.LEFT, [self.STAR, self.SLASH]),
|
||||
(Assoc.LEFT, [self.primary_expression]),
|
||||
(Assoc.LEFT, [LPAREN]),
|
||||
(Assoc.LEFT, [DOT]),
|
||||
(Assoc.LEFT, [self.LPAREN]),
|
||||
(Assoc.LEFT, [self.DOT]),
|
||||
#
|
||||
# If there's a confusion about whether to make an IF
|
||||
# statement or an expression, prefer the statement.
|
||||
|
|
@ -97,15 +47,15 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule
|
||||
def import_statement(self) -> Rule:
|
||||
return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON)
|
||||
return seq(self.IMPORT, self.STRING, self.AS, self.IDENTIFIER, self.SEMICOLON)
|
||||
|
||||
@rule("ClassDeclaration")
|
||||
def class_declaration(self) -> Rule:
|
||||
return seq(CLASS, IDENTIFIER, self._class_body)
|
||||
return seq(self.CLASS, self.IDENTIFIER, self._class_body)
|
||||
|
||||
@rule
|
||||
def _class_body(self) -> Rule:
|
||||
return seq(LCURLY, RCURLY) | seq(LCURLY, self._class_members, RCURLY)
|
||||
return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._class_members, self.RCURLY)
|
||||
|
||||
@rule
|
||||
def _class_members(self) -> Rule:
|
||||
|
|
@ -117,7 +67,7 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule("FieldDecl")
|
||||
def field_declaration(self) -> Rule:
|
||||
return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON)
|
||||
return seq(self.IDENTIFIER, self.COLON, self.type_expression, self.SEMICOLON)
|
||||
|
||||
# Types
|
||||
@rule("TypeExpression")
|
||||
|
|
@ -126,60 +76,65 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule("AlternateType")
|
||||
def alternate_type(self) -> Rule:
|
||||
return seq(self.type_expression, OR, self.type_identifier)
|
||||
return seq(self.type_expression, self.OR, self.type_identifier)
|
||||
|
||||
@rule("TypeIdentifier")
|
||||
def type_identifier(self) -> Rule:
|
||||
return IDENTIFIER
|
||||
return self.IDENTIFIER
|
||||
|
||||
@rule
|
||||
def export_statement(self) -> Rule:
|
||||
return (
|
||||
seq(EXPORT, self.class_declaration)
|
||||
| seq(EXPORT, self.function_declaration)
|
||||
| seq(EXPORT, self.let_statement)
|
||||
| seq(EXPORT, self.export_list, SEMICOLON)
|
||||
seq(self.EXPORT, self.class_declaration)
|
||||
| seq(self.EXPORT, self.function_declaration)
|
||||
| seq(self.EXPORT, self.let_statement)
|
||||
| seq(self.EXPORT, self.export_list, self.SEMICOLON)
|
||||
)
|
||||
|
||||
@rule
|
||||
def export_list(self) -> Rule:
|
||||
return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list)
|
||||
return Nothing | self.IDENTIFIER | seq(self.IDENTIFIER, self.COMMA, self.export_list)
|
||||
|
||||
# Functions
|
||||
@rule("FunctionDecl")
|
||||
def function_declaration(self) -> Rule:
|
||||
return seq(FUN, IDENTIFIER, self.function_parameters, self.block) | seq(
|
||||
FUN, IDENTIFIER, self.function_parameters, ARROW, self.type_expression, self.block
|
||||
return seq(self.FUN, self.IDENTIFIER, self.function_parameters, self.block) | seq(
|
||||
self.FUN,
|
||||
self.IDENTIFIER,
|
||||
self.function_parameters,
|
||||
self.ARROW,
|
||||
self.type_expression,
|
||||
self.block,
|
||||
)
|
||||
|
||||
@rule("ParamList")
|
||||
def function_parameters(self) -> Rule:
|
||||
return (
|
||||
seq(LPAREN, RPAREN)
|
||||
| seq(LPAREN, self._first_parameter, RPAREN)
|
||||
| seq(LPAREN, self._first_parameter, COMMA, self._parameter_list, RPAREN)
|
||||
seq(self.LPAREN, self.RPAREN)
|
||||
| seq(self.LPAREN, self._first_parameter, self.RPAREN)
|
||||
| seq(self.LPAREN, self._first_parameter, self.COMMA, self._parameter_list, self.RPAREN)
|
||||
)
|
||||
|
||||
@rule
|
||||
def _first_parameter(self) -> Rule:
|
||||
return SELF | self.parameter
|
||||
return self.SELF | self.parameter
|
||||
|
||||
@rule
|
||||
def _parameter_list(self) -> Rule:
|
||||
return Nothing | self.parameter | seq(self.parameter, COMMA, self._parameter_list)
|
||||
return Nothing | self.parameter | seq(self.parameter, self.COMMA, self._parameter_list)
|
||||
|
||||
@rule("Parameter")
|
||||
def parameter(self) -> Rule:
|
||||
return seq(IDENTIFIER, COLON, self.type_expression)
|
||||
return seq(self.IDENTIFIER, self.COLON, self.type_expression)
|
||||
|
||||
# Block
|
||||
@rule("Block")
|
||||
def block(self) -> Rule:
|
||||
return (
|
||||
seq(LCURLY, RCURLY)
|
||||
| seq(LCURLY, self.expression, RCURLY)
|
||||
| seq(LCURLY, self._statement_list, RCURLY)
|
||||
| seq(LCURLY, self._statement_list, self.expression, RCURLY)
|
||||
seq(self.LCURLY, self.RCURLY)
|
||||
| seq(self.LCURLY, self.expression, self.RCURLY)
|
||||
| seq(self.LCURLY, self._statement_list, self.RCURLY)
|
||||
| seq(self.LCURLY, self._statement_list, self.expression, self.RCURLY)
|
||||
)
|
||||
|
||||
@rule
|
||||
|
|
@ -200,19 +155,19 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule("LetStatement")
|
||||
def let_statement(self) -> Rule:
|
||||
return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON)
|
||||
return seq(self.LET, self.IDENTIFIER, self.EQUAL, self.expression, self.SEMICOLON)
|
||||
|
||||
@rule("ReturnStatement")
|
||||
def return_statement(self) -> Rule:
|
||||
return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON)
|
||||
return seq(self.RETURN, self.expression, self.SEMICOLON) | seq(self.RETURN, self.SEMICOLON)
|
||||
|
||||
@rule("ForStatement")
|
||||
def for_statement(self) -> Rule:
|
||||
return seq(FOR, self.iterator_variable, IN, self.expression, self.block)
|
||||
return seq(self.FOR, self.iterator_variable, self.IN, self.expression, self.block)
|
||||
|
||||
@rule("IteratorVariable")
|
||||
def iterator_variable(self) -> Rule:
|
||||
return IDENTIFIER
|
||||
return self.IDENTIFIER
|
||||
|
||||
@rule("IfStatement")
|
||||
def if_statement(self) -> Rule:
|
||||
|
|
@ -220,11 +175,11 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule
|
||||
def while_statement(self) -> Rule:
|
||||
return seq(WHILE, self.expression, self.block)
|
||||
return seq(self.WHILE, self.expression, self.block)
|
||||
|
||||
@rule
|
||||
def expression_statement(self) -> Rule:
|
||||
return seq(self.expression, SEMICOLON)
|
||||
return seq(self.expression, self.SEMICOLON)
|
||||
|
||||
# Expressions
|
||||
@rule(transparent=True)
|
||||
|
|
@ -234,91 +189,93 @@ class FineGrammar(Grammar):
|
|||
@rule("BinaryExpression")
|
||||
def binary_expression(self) -> Rule:
|
||||
return (
|
||||
seq(self.expression, EQUAL, self.expression)
|
||||
| seq(self.expression, OR, self.expression)
|
||||
| seq(self.expression, AND, self.expression)
|
||||
| seq(self.expression, EQUALEQUAL, self.expression)
|
||||
| seq(self.expression, BANGEQUAL, self.expression)
|
||||
| seq(self.expression, LESS, self.expression)
|
||||
| seq(self.expression, LESSEQUAL, self.expression)
|
||||
| seq(self.expression, GREATER, self.expression)
|
||||
| seq(self.expression, GREATEREQUAL, self.expression)
|
||||
| seq(self.expression, PLUS, self.expression)
|
||||
| seq(self.expression, MINUS, self.expression)
|
||||
| seq(self.expression, STAR, self.expression)
|
||||
| seq(self.expression, SLASH, self.expression)
|
||||
seq(self.expression, self.EQUAL, self.expression)
|
||||
| seq(self.expression, self.OR, self.expression)
|
||||
| seq(self.expression, self.AND, self.expression)
|
||||
| seq(self.expression, self.EQUALEQUAL, self.expression)
|
||||
| seq(self.expression, self.BANGEQUAL, self.expression)
|
||||
| seq(self.expression, self.LESS, self.expression)
|
||||
| seq(self.expression, self.LESSEQUAL, self.expression)
|
||||
| seq(self.expression, self.GREATER, self.expression)
|
||||
| seq(self.expression, self.GREATEREQUAL, self.expression)
|
||||
| seq(self.expression, self.PLUS, self.expression)
|
||||
| seq(self.expression, self.MINUS, self.expression)
|
||||
| seq(self.expression, self.STAR, self.expression)
|
||||
| seq(self.expression, self.SLASH, self.expression)
|
||||
)
|
||||
|
||||
@rule("IsExpression")
|
||||
def is_expression(self) -> Rule:
|
||||
return seq(self.expression, IS, self.pattern)
|
||||
return seq(self.expression, self.IS, self.pattern)
|
||||
|
||||
@rule
|
||||
def primary_expression(self) -> Rule:
|
||||
return (
|
||||
self.identifier_expression
|
||||
| self.literal_expression
|
||||
| SELF
|
||||
| seq(BANG, self.primary_expression)
|
||||
| seq(MINUS, self.primary_expression)
|
||||
| self.SELF
|
||||
| seq(self.BANG, self.primary_expression)
|
||||
| seq(self.MINUS, self.primary_expression)
|
||||
| self.block
|
||||
| self.conditional_expression
|
||||
| self.list_constructor_expression
|
||||
| self.object_constructor_expression
|
||||
| self.match_expression
|
||||
| seq(self.primary_expression, LPAREN, RPAREN)
|
||||
| seq(self.primary_expression, LPAREN, self._expression_list, RPAREN)
|
||||
| seq(self.primary_expression, DOT, IDENTIFIER)
|
||||
| seq(LPAREN, self.expression, RPAREN)
|
||||
| seq(self.primary_expression, self.LPAREN, self.RPAREN)
|
||||
| seq(self.primary_expression, self.LPAREN, self._expression_list, self.RPAREN)
|
||||
| seq(self.primary_expression, self.DOT, self.IDENTIFIER)
|
||||
| seq(self.LPAREN, self.expression, self.RPAREN)
|
||||
)
|
||||
|
||||
@rule("IdentifierExpression")
|
||||
def identifier_expression(self):
|
||||
return IDENTIFIER
|
||||
return self.IDENTIFIER
|
||||
|
||||
@rule("Literal")
|
||||
def literal_expression(self):
|
||||
return NUMBER | STRING | TRUE | FALSE
|
||||
return self.NUMBER | self.STRING | self.TRUE | self.FALSE
|
||||
|
||||
@rule("ConditionalExpression")
|
||||
def conditional_expression(self) -> Rule:
|
||||
return (
|
||||
seq(IF, self.expression, self.block)
|
||||
| seq(IF, self.expression, self.block, ELSE, self.conditional_expression)
|
||||
| seq(IF, self.expression, self.block, ELSE, self.block)
|
||||
seq(self.IF, self.expression, self.block)
|
||||
| seq(self.IF, self.expression, self.block, self.ELSE, self.conditional_expression)
|
||||
| seq(self.IF, self.expression, self.block, self.ELSE, self.block)
|
||||
)
|
||||
|
||||
@rule
|
||||
def list_constructor_expression(self) -> Rule:
|
||||
return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self._expression_list, RSQUARE)
|
||||
return seq(self.LSQUARE, self.RSQUARE) | seq(
|
||||
self.LSQUARE, self._expression_list, self.RSQUARE
|
||||
)
|
||||
|
||||
@rule
|
||||
def _expression_list(self) -> Rule:
|
||||
return (
|
||||
self.expression
|
||||
| seq(self.expression, COMMA)
|
||||
| seq(self.expression, COMMA, self._expression_list)
|
||||
| seq(self.expression, self.COMMA)
|
||||
| seq(self.expression, self.COMMA, self._expression_list)
|
||||
)
|
||||
|
||||
@rule
|
||||
def match_expression(self) -> Rule:
|
||||
return seq(MATCH, self.expression, self.match_body)
|
||||
return seq(self.MATCH, self.expression, self.match_body)
|
||||
|
||||
@rule("MatchBody")
|
||||
def match_body(self) -> Rule:
|
||||
return seq(LCURLY, RCURLY) | seq(LCURLY, self._match_arms, RCURLY)
|
||||
return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._match_arms, self.RCURLY)
|
||||
|
||||
@rule
|
||||
def _match_arms(self) -> Rule:
|
||||
return (
|
||||
self.match_arm
|
||||
| seq(self.match_arm, COMMA)
|
||||
| seq(self.match_arm, COMMA, self._match_arms)
|
||||
| seq(self.match_arm, self.COMMA)
|
||||
| seq(self.match_arm, self.COMMA, self._match_arms)
|
||||
)
|
||||
|
||||
@rule("MatchArm")
|
||||
def match_arm(self) -> Rule:
|
||||
return seq(self.pattern, ARROW, self.expression)
|
||||
return seq(self.pattern, self.ARROW, self.expression)
|
||||
|
||||
@rule("Pattern")
|
||||
def pattern(self) -> Rule:
|
||||
|
|
@ -330,7 +287,7 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule
|
||||
def _pattern_predicate(self) -> Rule:
|
||||
return seq(AND, self.expression)
|
||||
return seq(self.AND, self.expression)
|
||||
|
||||
@rule
|
||||
def _pattern_core(self) -> Rule:
|
||||
|
|
@ -338,60 +295,116 @@ class FineGrammar(Grammar):
|
|||
|
||||
@rule("WildcardPattern")
|
||||
def wildcard_pattern(self) -> Rule:
|
||||
return UNDERSCORE
|
||||
return self.UNDERSCORE
|
||||
|
||||
@rule("VariableBinding")
|
||||
def variable_binding(self) -> Rule:
|
||||
return seq(IDENTIFIER, COLON)
|
||||
return seq(self.IDENTIFIER, self.COLON)
|
||||
|
||||
@rule
|
||||
def object_constructor_expression(self) -> Rule:
|
||||
return seq(NEW, self.type_identifier, self.field_list)
|
||||
return seq(self.NEW, self.type_identifier, self.field_list)
|
||||
|
||||
@rule
|
||||
def field_list(self) -> Rule:
|
||||
return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY)
|
||||
return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self.field_values, self.RCURLY)
|
||||
|
||||
@rule
|
||||
def field_values(self) -> Rule:
|
||||
return (
|
||||
self.field_value
|
||||
| seq(self.field_value, COMMA)
|
||||
| seq(self.field_value, COMMA, self.field_values)
|
||||
| seq(self.field_value, self.COMMA)
|
||||
| seq(self.field_value, self.COMMA, self.field_values)
|
||||
)
|
||||
|
||||
@rule
|
||||
def field_value(self) -> Rule:
|
||||
return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression)
|
||||
return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression)
|
||||
|
||||
# Terminal declarations for the grammar. The first argument of `Terminal`
# is the pattern to match (a literal string unless `regex=True`); the
# terminal's name is taken from the declaring attribute unless an explicit
# `name=` is supplied.
BLANK = Terminal("[ \t\r\n]+", regex=True)

ARROW = Terminal("->")
AS = Terminal("as")
# Fixed: the pattern was the old token name ("bar"), not the lexeme.
BAR = Terminal("|")
CLASS = Terminal("class")
# Fixed: the pattern was the old token name ("colon"), not the lexeme.
COLON = Terminal(":")
# NOTE(review): "comment" looks like a placeholder rather than a real
# comment pattern -- confirm the intended pattern (e.g. `//` to end of line).
COMMENT = Terminal("comment")
ELSE = Terminal("else")
FOR = Terminal("for")
FUN = Terminal("fun")
IDENTIFIER = Terminal("[A-Za-z_][A-Za-z0-9_]*", regex=True)
IF = Terminal("if")
IMPORT = Terminal("import")
IN = Terminal("in")
LCURLY = Terminal("{")
# Fixed: keywords are lower-case lexemes; this one was still "Let".
LET = Terminal("let")
RCURLY = Terminal("}")
RETURN = Terminal("return")
SEMICOLON = Terminal(";")
# NOTE(review): as a regex this only matches the empty string literal `""`;
# presumably a placeholder until string patterns are supported -- confirm.
STRING = Terminal('""', regex=True)
WHILE = Terminal("while")
EQUAL = Terminal("=")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
COMMA = Terminal(",")
# Explicitly named so the terminal keeps the name "SELFF" rather than "SELF".
SELF = Terminal("self", name="SELFF")
OR = Terminal("or")
IS = Terminal("is")
AND = Terminal("and")
EQUALEQUAL = Terminal("==")
BANGEQUAL = Terminal("!=")
LESS = Terminal("<")
GREATER = Terminal(">")
LESSEQUAL = Terminal("<=")
GREATEREQUAL = Terminal(">=")
PLUS = Terminal("+")
MINUS = Terminal("-")
STAR = Terminal("*")
SLASH = Terminal("/")
NUMBER = Terminal("[0-9]+", regex=True)
TRUE = Terminal("true")
FALSE = Terminal("false")
BANG = Terminal("!")
DOT = Terminal(".")
MATCH = Terminal("match")
EXPORT = Terminal("export")
UNDERSCORE = Terminal("_")
NEW = Terminal("new")
LSQUARE = Terminal("[")
RSQUARE = Terminal("]")
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# DORKY LEXER
|
||||
# -----------------------------------------------------------------------------
|
||||
import bisect
|
||||
import dataclasses
|
||||
|
||||
|
||||
NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?")
|
||||
IDENTIFIER_RE = re.compile("[_A-Za-z][_A-Za-z0-9]*")
|
||||
KEYWORD_TABLE = {
|
||||
"_": UNDERSCORE,
|
||||
"and": AND,
|
||||
"as": AS,
|
||||
"class": CLASS,
|
||||
"else": ELSE,
|
||||
"export": EXPORT,
|
||||
"false": FALSE,
|
||||
"for": FOR,
|
||||
"fun": FUN,
|
||||
"if": IF,
|
||||
"import": IMPORT,
|
||||
"in": IN,
|
||||
"is": IS,
|
||||
"let": LET,
|
||||
"match": MATCH,
|
||||
"new": NEW,
|
||||
"or": OR,
|
||||
"return": RETURN,
|
||||
"self": SELF,
|
||||
"true": TRUE,
|
||||
"while": WHILE,
|
||||
"_": FineGrammar.UNDERSCORE,
|
||||
"and": FineGrammar.AND,
|
||||
"as": FineGrammar.AS,
|
||||
"class": FineGrammar.CLASS,
|
||||
"else": FineGrammar.ELSE,
|
||||
"export": FineGrammar.EXPORT,
|
||||
"false": FineGrammar.FALSE,
|
||||
"for": FineGrammar.FOR,
|
||||
"fun": FineGrammar.FUN,
|
||||
"if": FineGrammar.IF,
|
||||
"import": FineGrammar.IMPORT,
|
||||
"in": FineGrammar.IN,
|
||||
"is": FineGrammar.IS,
|
||||
"let": FineGrammar.LET,
|
||||
"match": FineGrammar.MATCH,
|
||||
"new": FineGrammar.NEW,
|
||||
"or": FineGrammar.OR,
|
||||
"return": FineGrammar.RETURN,
|
||||
"self": FineGrammar.SELF,
|
||||
"true": FineGrammar.TRUE,
|
||||
"while": FineGrammar.WHILE,
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -406,63 +419,63 @@ def tokenize(src: str):
|
|||
token = None
|
||||
if ch == "-":
|
||||
if src[pos : pos + 2] == "->":
|
||||
token = (ARROW, pos, 2)
|
||||
token = (FineGrammar.ARROW, pos, 2)
|
||||
else:
|
||||
token = (MINUS, pos, 1)
|
||||
token = (FineGrammar.MINUS, pos, 1)
|
||||
|
||||
elif ch == "|":
|
||||
token = (BAR, pos, 1)
|
||||
token = (FineGrammar.BAR, pos, 1)
|
||||
|
||||
elif ch == ":":
|
||||
token = (COLON, pos, 1)
|
||||
token = (FineGrammar.COLON, pos, 1)
|
||||
|
||||
elif ch == "{":
|
||||
token = (LCURLY, pos, 1)
|
||||
token = (FineGrammar.LCURLY, pos, 1)
|
||||
|
||||
elif ch == "}":
|
||||
token = (RCURLY, pos, 1)
|
||||
token = (FineGrammar.RCURLY, pos, 1)
|
||||
|
||||
elif ch == ";":
|
||||
token = (SEMICOLON, pos, 1)
|
||||
token = (FineGrammar.SEMICOLON, pos, 1)
|
||||
|
||||
elif ch == "=":
|
||||
if src[pos : pos + 2] == "==":
|
||||
token = (EQUALEQUAL, pos, 2)
|
||||
token = (FineGrammar.EQUALEQUAL, pos, 2)
|
||||
else:
|
||||
token = (EQUAL, pos, 1)
|
||||
token = (FineGrammar.EQUAL, pos, 1)
|
||||
|
||||
elif ch == "(":
|
||||
token = (LPAREN, pos, 1)
|
||||
token = (FineGrammar.LPAREN, pos, 1)
|
||||
|
||||
elif ch == ")":
|
||||
token = (RPAREN, pos, 1)
|
||||
token = (FineGrammar.RPAREN, pos, 1)
|
||||
|
||||
elif ch == ",":
|
||||
token = (COMMA, pos, 1)
|
||||
token = (FineGrammar.COMMA, pos, 1)
|
||||
|
||||
elif ch == "!":
|
||||
if src[pos : pos + 2] == "!=":
|
||||
token = (BANGEQUAL, pos, 2)
|
||||
token = (FineGrammar.BANGEQUAL, pos, 2)
|
||||
else:
|
||||
token = (BANG, pos, 1)
|
||||
token = (FineGrammar.BANG, pos, 1)
|
||||
|
||||
elif ch == "<":
|
||||
if src[pos : pos + 2] == "<=":
|
||||
token = (LESSEQUAL, pos, 2)
|
||||
token = (FineGrammar.LESSEQUAL, pos, 2)
|
||||
else:
|
||||
token = (LESS, pos, 1)
|
||||
token = (FineGrammar.LESS, pos, 1)
|
||||
|
||||
elif ch == ">":
|
||||
if src[pos : pos + 2] == ">=":
|
||||
token = (GREATEREQUAL, pos, 2)
|
||||
token = (FineGrammar.GREATEREQUAL, pos, 2)
|
||||
else:
|
||||
token = (GREATER, pos, 1)
|
||||
token = (FineGrammar.GREATER, pos, 1)
|
||||
|
||||
elif ch == "+":
|
||||
token = (PLUS, pos, 1)
|
||||
token = (FineGrammar.PLUS, pos, 1)
|
||||
|
||||
elif ch == "*":
|
||||
token = (STAR, pos, 1)
|
||||
token = (FineGrammar.STAR, pos, 1)
|
||||
|
||||
elif ch == "/":
|
||||
if src[pos : pos + 2] == "//":
|
||||
|
|
@ -470,16 +483,16 @@ def tokenize(src: str):
|
|||
pos = pos + 1
|
||||
continue
|
||||
|
||||
token = (SLASH, pos, 1)
|
||||
token = (FineGrammar.SLASH, pos, 1)
|
||||
|
||||
elif ch == ".":
|
||||
token = (DOT, pos, 1)
|
||||
token = (FineGrammar.DOT, pos, 1)
|
||||
|
||||
elif ch == "[":
|
||||
token = (LSQUARE, pos, 1)
|
||||
token = (FineGrammar.LSQUARE, pos, 1)
|
||||
|
||||
elif ch == "]":
|
||||
token = (RSQUARE, pos, 1)
|
||||
token = (FineGrammar.RSQUARE, pos, 1)
|
||||
|
||||
elif ch == '"' or ch == "'":
|
||||
end = pos + 1
|
||||
|
|
@ -490,12 +503,12 @@ def tokenize(src: str):
|
|||
if end == len(src):
|
||||
raise Exception(f"Unterminated string constant at {pos}")
|
||||
end += 1
|
||||
token = (STRING, pos, end - pos)
|
||||
token = (FineGrammar.STRING, pos, end - pos)
|
||||
|
||||
else:
|
||||
number_match = NUMBER_RE.match(src, pos)
|
||||
if number_match:
|
||||
token = (NUMBER, pos, number_match.end() - pos)
|
||||
token = (FineGrammar.NUMBER, pos, number_match.end() - pos)
|
||||
else:
|
||||
id_match = IDENTIFIER_RE.match(src, pos)
|
||||
if id_match:
|
||||
|
|
@ -504,7 +517,7 @@ def tokenize(src: str):
|
|||
if keyword:
|
||||
token = (keyword, pos, len(fragment))
|
||||
else:
|
||||
token = (IDENTIFIER, pos, len(fragment))
|
||||
token = (FineGrammar.IDENTIFIER, pos, len(fragment))
|
||||
|
||||
if token is None:
|
||||
raise Exception("Token error")
|
||||
|
|
@ -512,9 +525,6 @@ def tokenize(src: str):
|
|||
pos += token[2]
|
||||
|
||||
|
||||
import bisect
|
||||
|
||||
|
||||
class FineTokens:
|
||||
def __init__(self, src: str):
|
||||
self.src = src
|
||||
|
|
@ -546,4 +556,20 @@ class FineTokens:
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
FineGrammar().build_table()
|
||||
grammar = FineGrammar()
|
||||
grammar.build_table()
|
||||
|
||||
class LexTest(Grammar):
|
||||
@rule
|
||||
def foo(self):
|
||||
return self.IS
|
||||
|
||||
start = foo
|
||||
|
||||
IS = Terminal("is")
|
||||
AS = Terminal("as")
|
||||
IDENTIFIER = Terminal("[a-z]+", regex=True)
|
||||
# IDENTIFIER = Terminal("[A-Za-z_][A-Za-z0-9_]*", regex=True)
|
||||
|
||||
lexer = compile_lexer(LexTest())
|
||||
dump_lexer_table(lexer)
|
||||
|
|
|
|||
198
parser/parser.py
198
parser/parser.py
|
|
@ -21,19 +21,20 @@ To get started, create a grammar that derives from the `Grammar` class. Create
|
|||
one method per nonterminal, decorated with the `rule` decorator. Here's an
|
||||
example:
|
||||
|
||||
PLUS = Terminal('+')
|
||||
LPAREN = Terminal('(')
|
||||
RPAREN = Terminal(')')
|
||||
ID = Terminal('id')
|
||||
|
||||
class SimpleGrammar(Grammar):
|
||||
@rule
|
||||
def expression(self):
|
||||
return seq(self.expression, PLUS, self.term) | self.term
|
||||
return seq(self.expression, self.PLUS, self.term) | self.term
|
||||
|
||||
@rule
|
||||
def term(self):
|
||||
return seq(LPAREN, self.expression, RPAREN) | ID
|
||||
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
|
||||
|
||||
PLUS = Terminal('+')
|
||||
LPAREN = Terminal('(')
|
||||
RPAREN = Terminal(')')
|
||||
ID = Terminal('id')
|
||||
|
||||
|
||||
## Using grammars
|
||||
|
|
@ -1605,10 +1606,14 @@ class Rule:
|
|||
class Terminal(Rule):
|
||||
"""A token, or terminal symbol in the grammar."""
|
||||
|
||||
value: str
|
||||
value: str | None
|
||||
pattern: str
|
||||
regex: bool
|
||||
|
||||
def __init__(self, value):
|
||||
self.value = sys.intern(value)
|
||||
def __init__(self, pattern, name=None, regex=False):
|
||||
self.value = name
|
||||
self.pattern = pattern
|
||||
self.regex = regex
|
||||
|
||||
def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]:
|
||||
# We are just ourselves when flattened.
|
||||
|
|
@ -1766,19 +1771,20 @@ class Grammar:
|
|||
|
||||
Here's an example of a simple grammar:
|
||||
|
||||
PLUS = Terminal('+')
|
||||
LPAREN = Terminal('(')
|
||||
RPAREN = Terminal(')')
|
||||
ID = Terminal('id')
|
||||
|
||||
class SimpleGrammar(Grammar):
|
||||
@rule
|
||||
def expression(self):
|
||||
return seq(self.expression, PLUS, self.term) | self.term
|
||||
return seq(self.expression, self.PLUS, self.term) | self.term
|
||||
|
||||
@rule
|
||||
def term(self):
|
||||
return seq(LPAREN, self.expression, RPAREN) | ID
|
||||
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
|
||||
|
||||
PLUS = Terminal('+')
|
||||
LPAREN = Terminal('(')
|
||||
RPAREN = Terminal(')')
|
||||
ID = Terminal('id')
|
||||
|
||||
|
||||
Not very exciting, perhaps, but it's something.
|
||||
"""
|
||||
|
|
@ -1786,6 +1792,7 @@ class Grammar:
|
|||
_precedence: dict[str, typing.Tuple[Assoc, int]]
|
||||
_start: str
|
||||
_generator: type[GenerateLR0]
|
||||
_terminals: list[Terminal]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
|
@ -1809,6 +1816,14 @@ class Grammar:
|
|||
generator = getattr(self, "generator", GenerateLALR)
|
||||
assert generator is not None
|
||||
|
||||
# Fixup terminal names with the name of the member that declared it.
|
||||
terminals = []
|
||||
for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)):
|
||||
if t.value is None:
|
||||
t.value = n
|
||||
terminals.append(t)
|
||||
|
||||
# Fix up the precedence table.
|
||||
precedence_table = {}
|
||||
for prec, (associativity, symbols) in enumerate(precedence):
|
||||
for symbol in symbols:
|
||||
|
|
@ -1824,6 +1839,11 @@ class Grammar:
|
|||
self._precedence = precedence_table
|
||||
self._start = start
|
||||
self._generator = generator
|
||||
self._terminals = terminals
|
||||
|
||||
@property
|
||||
def terminals(self) -> list[Terminal]:
|
||||
return self._terminals
|
||||
|
||||
def generate_nonterminal_dict(
|
||||
self, start: str | None = None
|
||||
|
|
@ -1911,3 +1931,149 @@ class Grammar:
|
|||
gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
|
||||
table = gen.gen_table()
|
||||
return table
|
||||
|
||||
|
||||
###############################################################################
|
||||
# Lexer support
|
||||
###############################################################################
|
||||
# For machine-generated lexers
|
||||
|
||||
|
||||
@dataclasses.dataclass(frozen=True, slots=True)
|
||||
class Span:
|
||||
lower: int # inclusive
|
||||
upper: int # exclusive
|
||||
|
||||
@classmethod
|
||||
def from_str(cls, c: str) -> "Span":
|
||||
return Span(lower=ord(c), upper=ord(c) + 1)
|
||||
|
||||
def intersects(self, other: "Span") -> bool:
|
||||
return self.lower < other.upper and self.upper > other.lower
|
||||
|
||||
def split(self, other: "Span") -> tuple["Span|None", "Span", "Span|None"]:
|
||||
assert self.intersects(other)
|
||||
|
||||
first = min(self.lower, other.lower)
|
||||
second = max(self.lower, other.lower)
|
||||
third = min(self.upper, other.upper)
|
||||
fourth = max(self.upper, other.upper)
|
||||
|
||||
low = Span(first, second) if first != second else None
|
||||
mid = Span(second, third)
|
||||
hi = Span(third, fourth) if third != fourth else None
|
||||
|
||||
return (low, mid, hi)
|
||||
|
||||
def __str__(self) -> str:
|
||||
if self.upper - self.lower == 1:
|
||||
return str(self.lower)
|
||||
|
||||
lower = str(self.lower)
|
||||
upper = str(self.upper)
|
||||
return f"[{lower}-{upper})"
|
||||
|
||||
def __lt__(self, other: "Span") -> bool:
|
||||
return self.lower < other.lower
|
||||
|
||||
|
||||
ET = typing.TypeVar("ET")
|
||||
|
||||
|
||||
class EdgeList[ET]:
|
||||
"""A list of edge transitions, keyed by *span*. A given span can have
|
||||
multiple targets, because this supports NFAs."""
|
||||
|
||||
_edges: list[tuple[Span, list[ET]]]
|
||||
|
||||
def __init__(self):
|
||||
self._edges = []
|
||||
|
||||
def __iter__(self) -> typing.Iterator[tuple[Span, list[ET]]]:
|
||||
return iter(self._edges)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"EdgeList[{','.join(str(s[0]) + '->' + repr(s[1]) for s in self._edges)}]"
|
||||
|
||||
def add_edge(self, c: Span, s: ET):
|
||||
"""Add an edge for the given span to the list. If there are already
|
||||
spans that overlap this one, split and generating multiple distinct
|
||||
edges.
|
||||
"""
|
||||
# print(f" Adding {c}->{s} to {self}...")
|
||||
# Look to see where we would put this span based solely on a
|
||||
# sort of lower bounds.
|
||||
point = bisect.bisect_left(self._edges, c, key=lambda x: x[0])
|
||||
|
||||
# If this is not the first span in the list then we might
|
||||
# overlap with the span to our left....
|
||||
if point > 0:
|
||||
left_point = point - 1
|
||||
left_span, left_targets = self._edges[left_point]
|
||||
if c.intersects(left_span):
|
||||
# ...if we intersect with the span to our left then we
|
||||
# must split the span to our left with regards to our
|
||||
# span. Then we have three target spans:
|
||||
#
|
||||
# - The lo one, which just has the targets from the old
|
||||
# left span. (This may be empty if we overlap the
|
||||
# left one completely on the left side.)
|
||||
#
|
||||
# - The mid one, which has both the targets from the
|
||||
# old left and the new target.
|
||||
#
|
||||
# - The hi one, which if it exists only has our target.
|
||||
# If it exists it basically replaces the current span
|
||||
# for our future processing. (If not, then our span
|
||||
# is completely subsumed into the left span and we
|
||||
# can stop.)
|
||||
#
|
||||
del self._edges[left_point]
|
||||
lo, mid, hi = c.split(left_span)
|
||||
# print(f" <- {c} splits {left_span} -> {lo}, {mid}, {hi} @{left_point}")
|
||||
self._edges.insert(left_point, (mid, left_targets + [s]))
|
||||
if lo is not None:
|
||||
self._edges.insert(left_point, (lo, left_targets))
|
||||
if hi is None or not hi.intersects(c):
|
||||
# Yup, completely subsumed.
|
||||
# print(f" result: {self} (left out)")
|
||||
return
|
||||
|
||||
# Continue processing with `c` as the hi split from the
|
||||
# left. If the left and right spans abut each other then
|
||||
# `c` will be subsumed in our right span.
|
||||
c = hi
|
||||
|
||||
# If point is not at the very end of the list then it might
|
||||
# overlap the span to our right...
|
||||
if point < len(self._edges):
|
||||
right_span, right_targets = self._edges[point]
|
||||
if c.intersects(right_span):
|
||||
# ...this is similar to the left case, above, except the
|
||||
# lower bound has the targets that our only ours, etc.
|
||||
del self._edges[point]
|
||||
lo, mid, hi = c.split(right_span)
|
||||
# print(f" -> {c} splits {right_span} -> {lo}, {mid}, {hi} @{point}")
|
||||
if hi is not None:
|
||||
self._edges.insert(point, (hi, right_targets))
|
||||
self._edges.insert(point, (mid, right_targets + [s]))
|
||||
if lo is None or not lo.intersects(c):
|
||||
# Our span is completely subsumed on the lower side
|
||||
# of the range; there is no lower side that just has
|
||||
# our targets. Bail now.
|
||||
# print(f" result: {self} (right out)")
|
||||
return
|
||||
|
||||
# Continue processing with `c` as the lo split, since
|
||||
# that's the one that has only the specified state as the
|
||||
# target.
|
||||
c = lo
|
||||
|
||||
# If we made it here then either we have a point that does not
|
||||
# intersect at all, or it only partially intersects on either the
|
||||
# left or right. Either way, we have ensured that:
|
||||
#
|
||||
# - c doesn't intersect with left or right (any more)
|
||||
# - point is where it should go
|
||||
self._edges.insert(point, (c, [s]))
|
||||
# print(f" result: {self} (done)")
|
||||
|
|
|
|||
|
|
@ -38,25 +38,27 @@ def _tree(treeform) -> runtime.Tree | runtime.TokenValue:
|
|||
def test_lr0_lr0():
|
||||
"""An LR0 grammar should work with an LR0 generator."""
|
||||
|
||||
PLUS = Terminal("+")
|
||||
LPAREN = Terminal("(")
|
||||
RPAREN = Terminal(")")
|
||||
IDENTIFIER = Terminal("id")
|
||||
|
||||
class LR0Grammar(Grammar):
|
||||
class G(Grammar):
|
||||
start = "E"
|
||||
generator = parser.GenerateLR0
|
||||
|
||||
@rule
|
||||
def E(self):
|
||||
return seq(self.E, PLUS, self.T) | self.T
|
||||
return seq(self.E, self.PLUS, self.T) | self.T
|
||||
|
||||
@rule
|
||||
def T(self):
|
||||
return seq(LPAREN, self.E, RPAREN) | IDENTIFIER
|
||||
return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER
|
||||
|
||||
table = LR0Grammar().build_table()
|
||||
tree, errors = runtime.Parser(table).parse(Tokens(IDENTIFIER, PLUS, LPAREN, IDENTIFIER, RPAREN))
|
||||
PLUS = Terminal("+", name="+")
|
||||
LPAREN = Terminal("(", name="(")
|
||||
RPAREN = Terminal(")", name=")")
|
||||
IDENTIFIER = Terminal("id", name="id")
|
||||
|
||||
table = G().build_table()
|
||||
tree, errors = runtime.Parser(table).parse(
|
||||
Tokens(G.IDENTIFIER, G.PLUS, G.LPAREN, G.IDENTIFIER, G.RPAREN)
|
||||
)
|
||||
|
||||
assert errors == []
|
||||
assert tree == _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")")))
|
||||
|
|
@ -65,114 +67,114 @@ def test_lr0_lr0():
|
|||
def test_lr0_shift_reduce():
|
||||
"""This one should not work in LR0- it has a shift/reduce conflict, but works in SLR1."""
|
||||
|
||||
PLUS = Terminal("+")
|
||||
LPAREN = Terminal("(")
|
||||
RPAREN = Terminal(")")
|
||||
LSQUARE = Terminal("[")
|
||||
RSQUARE = Terminal("]")
|
||||
IDENTIFIER = Terminal("id")
|
||||
|
||||
class TestGrammar(Grammar):
|
||||
class G(Grammar):
|
||||
start = "E"
|
||||
generator = parser.GenerateLR0
|
||||
|
||||
@rule
|
||||
def E(self):
|
||||
return seq(self.E, PLUS, self.T) | self.T
|
||||
return seq(self.E, self.PLUS, self.T) | self.T
|
||||
|
||||
@rule
|
||||
def T(self):
|
||||
return (
|
||||
seq(LPAREN, self.E, RPAREN) | IDENTIFIER | seq(IDENTIFIER, LSQUARE, self.E, RSQUARE)
|
||||
seq(self.LPAREN, self.E, self.RPAREN)
|
||||
| self.IDENTIFIER
|
||||
| seq(self.IDENTIFIER, self.LSQUARE, self.E, self.RSQUARE)
|
||||
)
|
||||
|
||||
with pytest.raises(parser.AmbiguityError):
|
||||
TestGrammar().build_table()
|
||||
PLUS = Terminal("+")
|
||||
LPAREN = Terminal("(")
|
||||
RPAREN = Terminal(")")
|
||||
LSQUARE = Terminal("[")
|
||||
RSQUARE = Terminal("]")
|
||||
IDENTIFIER = Terminal("id")
|
||||
|
||||
TestGrammar().build_table(generator=parser.GenerateSLR1)
|
||||
with pytest.raises(parser.AmbiguityError):
|
||||
G().build_table()
|
||||
|
||||
G().build_table(generator=parser.GenerateSLR1)
|
||||
|
||||
|
||||
def test_lr0_reduce_reduce():
|
||||
"""This one should not work, it has a reduce-reduce conflict."""
|
||||
|
||||
PLUS = Terminal("+")
|
||||
EQUAL = Terminal("=")
|
||||
LPAREN = Terminal("(")
|
||||
RPAREN = Terminal(")")
|
||||
IDENTIFIER = Terminal("id")
|
||||
|
||||
class TestGrammar(Grammar):
|
||||
class G(Grammar):
|
||||
start = "E"
|
||||
generator = parser.GenerateLR0
|
||||
|
||||
@rule
|
||||
def E(self):
|
||||
return seq(self.E, PLUS, self.T) | self.T | seq(self.V, EQUAL, self.E)
|
||||
return seq(self.E, self.PLUS, self.T) | self.T | seq(self.V, self.EQUAL, self.E)
|
||||
|
||||
@rule
|
||||
def T(self):
|
||||
return seq(LPAREN, self.E, RPAREN) | IDENTIFIER
|
||||
return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER
|
||||
|
||||
@rule
|
||||
def V(self):
|
||||
return IDENTIFIER
|
||||
return self.IDENTIFIER
|
||||
|
||||
PLUS = Terminal("+")
|
||||
EQUAL = Terminal("=")
|
||||
LPAREN = Terminal("(")
|
||||
RPAREN = Terminal(")")
|
||||
IDENTIFIER = Terminal("id")
|
||||
|
||||
with pytest.raises(parser.AmbiguityError):
|
||||
TestGrammar().build_table()
|
||||
G().build_table()
|
||||
|
||||
|
||||
def test_lr0_empty():
|
||||
"""LR0 can't handle empty productions because it doesn't know when to reduce."""
|
||||
BOOP = Terminal("boop")
|
||||
BEEP = Terminal("beep")
|
||||
|
||||
class TestGrammar(Grammar):
|
||||
class G(Grammar):
|
||||
start = "E"
|
||||
generator = parser.GenerateLR0
|
||||
|
||||
@rule
|
||||
def E(self):
|
||||
return seq(self.F, BOOP)
|
||||
return seq(self.F, self.BOOP)
|
||||
|
||||
@rule
|
||||
def F(self):
|
||||
return BEEP | parser.Nothing
|
||||
return self.BEEP | parser.Nothing
|
||||
|
||||
BOOP = Terminal("boop")
|
||||
BEEP = Terminal("beep")
|
||||
|
||||
with pytest.raises(parser.AmbiguityError):
|
||||
TestGrammar().build_table()
|
||||
G().build_table()
|
||||
|
||||
|
||||
def test_grammar_aho_ullman_1():
|
||||
EQUAL = Terminal("=")
|
||||
STAR = Terminal("*")
|
||||
ID = Terminal("id")
|
||||
|
||||
class TestGrammar(Grammar):
|
||||
class G(Grammar):
|
||||
start = "S"
|
||||
generator = parser.GenerateSLR1
|
||||
|
||||
@rule
|
||||
def S(self):
|
||||
return seq(self.L, EQUAL, self.R) | self.R
|
||||
return seq(self.L, self.EQUAL, self.R) | self.R
|
||||
|
||||
@rule
|
||||
def L(self):
|
||||
return seq(STAR, self.R) | ID
|
||||
return seq(self.STAR, self.R) | self.ID
|
||||
|
||||
@rule
|
||||
def R(self):
|
||||
return self.L
|
||||
|
||||
with pytest.raises(parser.AmbiguityError):
|
||||
TestGrammar().build_table()
|
||||
EQUAL = Terminal("=")
|
||||
STAR = Terminal("*")
|
||||
ID = Terminal("id")
|
||||
|
||||
TestGrammar().build_table(generator=parser.GenerateLR1)
|
||||
with pytest.raises(parser.AmbiguityError):
|
||||
G().build_table()
|
||||
|
||||
G().build_table(generator=parser.GenerateLR1)
|
||||
|
||||
|
||||
def test_grammar_aho_ullman_2():
|
||||
A = Terminal("a")
|
||||
B = Terminal("b")
|
||||
|
||||
class TestGrammar(Grammar):
|
||||
start = "S"
|
||||
generator = parser.GenerateSLR1
|
||||
|
|
@ -183,7 +185,10 @@ def test_grammar_aho_ullman_2():
|
|||
|
||||
@rule
|
||||
def X(self):
|
||||
return seq(A, self.X) | B
|
||||
return seq(self.A, self.X) | self.B
|
||||
|
||||
A = Terminal("a")
|
||||
B = Terminal("b")
|
||||
|
||||
TestGrammar().build_table()
|
||||
TestGrammar().build_table(generator=parser.GenerateLR1)
|
||||
|
|
@ -191,11 +196,6 @@ def test_grammar_aho_ullman_2():
|
|||
|
||||
|
||||
def test_fun_lalr():
|
||||
PLUS = Terminal("+")
|
||||
INT = Terminal("int")
|
||||
ID = Terminal("id")
|
||||
LPAREN = Terminal("(")
|
||||
RPAREN = Terminal(")")
|
||||
|
||||
class TestGrammar(Grammar):
|
||||
start = "S"
|
||||
|
|
@ -207,15 +207,21 @@ def test_fun_lalr():
|
|||
|
||||
@rule
|
||||
def E(self):
|
||||
return self.F | seq(self.E, PLUS, self.F)
|
||||
return self.F | seq(self.E, self.PLUS, self.F)
|
||||
|
||||
@rule
|
||||
def F(self):
|
||||
return self.V | INT | seq(LPAREN, self.E, RPAREN)
|
||||
return self.V | self.INT | seq(self.LPAREN, self.E, self.RPAREN)
|
||||
|
||||
@rule
|
||||
def V(self):
|
||||
return ID
|
||||
return self.ID
|
||||
|
||||
PLUS = Terminal("+")
|
||||
INT = Terminal("int")
|
||||
ID = Terminal("id")
|
||||
LPAREN = Terminal("(")
|
||||
RPAREN = Terminal(")")
|
||||
|
||||
TestGrammar().build_table()
|
||||
|
||||
|
|
@ -234,14 +240,14 @@ def test_conflicting_names():
|
|||
to understand.
|
||||
"""
|
||||
|
||||
IDENTIFIER = Terminal("Identifier")
|
||||
|
||||
class TestGrammar(Grammar):
|
||||
start = "Identifier"
|
||||
start = "IDENTIFIER"
|
||||
|
||||
@rule("Identifier")
|
||||
@rule("IDENTIFIER")
|
||||
def identifier(self):
|
||||
return IDENTIFIER
|
||||
return self.IDENTIFIER
|
||||
|
||||
IDENTIFIER = Terminal("Identifier")
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
TestGrammar().build_table()
|
||||
|
|
|
|||
452
tests/test_lexer.py
Normal file
452
tests/test_lexer.py
Normal file
|
|
@ -0,0 +1,452 @@
|
|||
from parser import Span
|
||||
|
||||
# LexerTable = list[tuple[Terminal | None, list[tuple[Span, int]]]]
|
||||
|
||||
|
||||
# def compile_lexer(x: Grammar) -> LexerTable:
|
||||
|
||||
# class State:
|
||||
# """An NFA state. Each state can be the accept state, with one or more
|
||||
# Terminals as the result."""
|
||||
|
||||
# accept: list[Terminal]
|
||||
# epsilons: list["State"]
|
||||
# _edges: EdgeList["State"]
|
||||
|
||||
# def __init__(self):
|
||||
# self.accept = []
|
||||
# self.epsilons = []
|
||||
# self._edges = EdgeList()
|
||||
|
||||
# def __repr__(self):
|
||||
# return f"State{id(self)}"
|
||||
|
||||
# def edges(self) -> typing.Iterable[tuple[Span, list["State"]]]:
|
||||
# return self._edges
|
||||
|
||||
# def add_edge(self, c: Span, s: "State") -> "State":
|
||||
# self._edges.add_edge(c, s)
|
||||
# return s
|
||||
|
||||
# def dump_graph(self, name="nfa.dot"):
|
||||
# with open(name, "w", encoding="utf8") as f:
|
||||
# f.write("digraph G {\n")
|
||||
|
||||
# stack: list[State] = [self]
|
||||
# visited = set()
|
||||
# while len(stack) > 0:
|
||||
# state = stack.pop()
|
||||
# if state in visited:
|
||||
# continue
|
||||
# visited.add(state)
|
||||
|
||||
# label = ", ".join([t.value for t in state.accept if t.value is not None])
|
||||
# f.write(f' {id(state)} [label="{label}"];\n')
|
||||
# for target in state.epsilons:
|
||||
# stack.append(target)
|
||||
# f.write(f' {id(state)} -> {id(target)} [label="\u03B5"];\n')
|
||||
|
||||
# for span, targets in state.edges():
|
||||
# label = str(span).replace('"', '\\"')
|
||||
# for target in targets:
|
||||
# stack.append(target)
|
||||
# f.write(f' {id(state)} -> {id(target)} [label="{label}"];\n')
|
||||
|
||||
# f.write("}\n")
|
||||
|
||||
# @dataclasses.dataclass
|
||||
# class RegexNode:
|
||||
# def to_nfa(self, start: State) -> State:
|
||||
# del start
|
||||
# raise NotImplementedError()
|
||||
|
||||
# def __str__(self) -> str:
|
||||
# raise NotImplementedError()
|
||||
|
||||
# @dataclasses.dataclass
|
||||
# class RegexLiteral(RegexNode):
|
||||
# values: list[tuple[str, str]]
|
||||
|
||||
# def to_nfa(self, start: State) -> State:
|
||||
# end = State()
|
||||
# for s, e in self.values:
|
||||
# start.add_edge(Span(ord(s), ord(e)), end)
|
||||
# return end
|
||||
|
||||
# def __str__(self) -> str:
|
||||
# if len(self.values) == 1:
|
||||
# start, end = self.values[0]
|
||||
# if start == end:
|
||||
# return start
|
||||
|
||||
# ranges = []
|
||||
# for start, end in self.values:
|
||||
# if start == end:
|
||||
# ranges.append(start)
|
||||
# else:
|
||||
# ranges.append(f"{start}-{end}")
|
||||
# return "![{}]".format("".join(ranges))
|
||||
|
||||
# @dataclasses.dataclass
|
||||
# class RegexPlus(RegexNode):
|
||||
# child: RegexNode
|
||||
|
||||
# def to_nfa(self, start: State) -> State:
|
||||
# end = self.child.to_nfa(start)
|
||||
# end.epsilons.append(start)
|
||||
# return end
|
||||
|
||||
# def __str__(self) -> str:
|
||||
# return f"({self.child})+"
|
||||
|
||||
# @dataclasses.dataclass
|
||||
# class RegexStar(RegexNode):
|
||||
# child: RegexNode
|
||||
|
||||
# def to_nfa(self, start: State) -> State:
|
||||
# end = self.child.to_nfa(start)
|
||||
# end.epsilons.append(start)
|
||||
# start.epsilons.append(end)
|
||||
# return end
|
||||
|
||||
# def __str__(self) -> str:
|
||||
# return f"({self.child})*"
|
||||
|
||||
# @dataclasses.dataclass
|
||||
# class RegexQuestion(RegexNode):
|
||||
# child: RegexNode
|
||||
|
||||
# def to_nfa(self, start: State) -> State:
|
||||
# end = self.child.to_nfa(start)
|
||||
# start.epsilons.append(end)
|
||||
# return end
|
||||
|
||||
# def __str__(self) -> str:
|
||||
# return f"({self.child})?"
|
||||
|
||||
# @dataclasses.dataclass
|
||||
# class RegexSequence(RegexNode):
|
||||
# left: RegexNode
|
||||
# right: RegexNode
|
||||
|
||||
# def to_nfa(self, start: State) -> State:
|
||||
# mid = self.left.to_nfa(start)
|
||||
# return self.right.to_nfa(mid)
|
||||
|
||||
# def __str__(self) -> str:
|
||||
# return f"{self.left}{self.right}"
|
||||
|
||||
# @dataclasses.dataclass
|
||||
# class RegexAlternation(RegexNode):
|
||||
# left: RegexNode
|
||||
# right: RegexNode
|
||||
|
||||
# def to_nfa(self, start: State) -> State:
|
||||
# left_start = State()
|
||||
# start.epsilons.append(left_start)
|
||||
# left_end = self.left.to_nfa(left_start)
|
||||
|
||||
# right_start = State()
|
||||
# start.epsilons.append(right_start)
|
||||
# right_end = self.right.to_nfa(right_start)
|
||||
|
||||
# end = State()
|
||||
# left_end.epsilons.append(end)
|
||||
# right_end.epsilons.append(end)
|
||||
|
||||
# return end
|
||||
|
||||
# def __str__(self) -> str:
|
||||
# return f"(({self.left})||({self.right}))"
|
||||
|
||||
# class RegexParser:
|
||||
# # TODO: HANDLE ALTERNATION AND PRECEDENCE (CONCAT HAS HIGHEST PRECEDENCE)
|
||||
# PREFIX: dict[str, typing.Callable[[str], RegexNode]]
|
||||
# POSTFIX: dict[str, typing.Callable[[RegexNode, int], RegexNode]]
|
||||
# BINDING: dict[str, tuple[int, int]]
|
||||
|
||||
# index: int
|
||||
# pattern: str
|
||||
|
||||
# def __init__(self, pattern: str):
|
||||
# self.PREFIX = {
|
||||
# "(": self.parse_group,
|
||||
# "[": self.parse_set,
|
||||
# }
|
||||
# self.POSTFIX = {
|
||||
# "+": self.parse_plus,
|
||||
# "*": self.parse_star,
|
||||
# "?": self.parse_question,
|
||||
# "|": self.parse_alternation,
|
||||
# }
|
||||
|
||||
# self.BINDING = {
|
||||
# "|": (1, 1),
|
||||
# "+": (2, 2),
|
||||
# "*": (2, 2),
|
||||
# "?": (2, 2),
|
||||
# ")": (-1, -1), # Always stop parsing on )
|
||||
# }
|
||||
|
||||
# self.index = 0
|
||||
# self.pattern = pattern
|
||||
|
||||
# def consume(self) -> str:
|
||||
# if self.index >= len(self.pattern):
|
||||
# raise ValueError(f"Unable to parse regular expression '{self.pattern}'")
|
||||
# result = self.pattern[self.index]
|
||||
# self.index += 1
|
||||
# return result
|
||||
|
||||
# def peek(self) -> str | None:
|
||||
# if self.index >= len(self.pattern):
|
||||
# return None
|
||||
# return self.pattern[self.index]
|
||||
|
||||
# def eof(self) -> bool:
|
||||
# return self.index >= len(self.pattern)
|
||||
|
||||
# def expect(self, ch: str):
|
||||
# actual = self.consume()
|
||||
# if ch != actual:
|
||||
# raise ValueError(f"Expected '{ch}'")
|
||||
|
||||
# def parse_regex(self, minimum_binding=0) -> RegexNode:
|
||||
# ch = self.consume()
|
||||
# parser = self.PREFIX.get(ch, self.parse_single)
|
||||
# node = parser(ch)
|
||||
|
||||
# while not self.eof():
|
||||
# ch = self.peek()
|
||||
# assert ch is not None
|
||||
|
||||
# lp, rp = self.BINDING.get(ch, (minimum_binding, minimum_binding))
|
||||
# if lp < minimum_binding:
|
||||
# break
|
||||
|
||||
# parser = self.POSTFIX.get(ch, self.parse_concat)
|
||||
# node = parser(node, rp)
|
||||
|
||||
# return node
|
||||
|
||||
# def parse_single(self, ch: str) -> RegexNode:
|
||||
# return RegexLiteral(values=[(ch, ch)])
|
||||
|
||||
# def parse_group(self, ch: str) -> RegexNode:
|
||||
# del ch
|
||||
|
||||
# node = self.parse_regex()
|
||||
# self.expect(")")
|
||||
# return node
|
||||
|
||||
# def parse_set(self, ch: str) -> RegexNode:
|
||||
# del ch
|
||||
|
||||
# # TODO: INVERSION?
|
||||
# ranges = []
|
||||
# while self.peek() not in (None, "]"):
|
||||
# start = self.consume()
|
||||
# if self.peek() == "-":
|
||||
# self.consume()
|
||||
# end = self.consume()
|
||||
# else:
|
||||
# end = start
|
||||
# ranges.append((start, end))
|
||||
|
||||
# self.expect("]")
|
||||
# return RegexLiteral(values=ranges)
|
||||
|
||||
# def parse_alternation(self, node: RegexNode, rp: int) -> RegexNode:
|
||||
# return RegexAlternation(left=node, right=self.parse_regex(rp))
|
||||
|
||||
# def parse_plus(self, left: RegexNode, rp: int) -> RegexNode:
|
||||
# del rp
|
||||
# self.expect("+")
|
||||
# return RegexPlus(child=left)
|
||||
|
||||
# def parse_star(self, left: RegexNode, rp: int) -> RegexNode:
|
||||
# del rp
|
||||
# self.expect("*")
|
||||
# return RegexStar(child=left)
|
||||
|
||||
# def parse_question(self, left: RegexNode, rp: int) -> RegexNode:
|
||||
# del rp
|
||||
# self.expect("?")
|
||||
# return RegexQuestion(child=left)
|
||||
|
||||
# def parse_concat(self, left: RegexNode, rp: int) -> RegexNode:
|
||||
# return RegexSequence(left, self.parse_regex(rp))
|
||||
|
||||
# class SuperState:
|
||||
# states: frozenset[State]
|
||||
# index: int
|
||||
|
||||
# def __init__(self, states: typing.Iterable[State]):
|
||||
# # Close over the given states, including every state that is
|
||||
# # reachable by epsilon-transition.
|
||||
# stack = list(states)
|
||||
# result = set()
|
||||
# while len(stack) > 0:
|
||||
# st = stack.pop()
|
||||
# if st in result:
|
||||
# continue
|
||||
# result.add(st)
|
||||
# stack.extend(st.epsilons)
|
||||
|
||||
# self.states = frozenset(result)
|
||||
# self.index = -1
|
||||
|
||||
# def __eq__(self, other):
|
||||
# if not isinstance(other, SuperState):
|
||||
# return False
|
||||
# return self.states == other.states
|
||||
|
||||
# def __hash__(self) -> int:
|
||||
# return hash(self.states)
|
||||
|
||||
# def edges(self) -> list[tuple[Span, "SuperState"]]:
|
||||
# working: EdgeList[list[State]] = EdgeList()
|
||||
# for st in self.states:
|
||||
# for span, targets in st.edges():
|
||||
# working.add_edge(span, targets)
|
||||
|
||||
# # EdgeList maps span to list[list[State]] which we want to flatten.
|
||||
# result = []
|
||||
# for span, stateses in working:
|
||||
# s: list[State] = []
|
||||
# for states in stateses:
|
||||
# s.extend(states)
|
||||
|
||||
# result.append((span, SuperState(s)))
|
||||
|
||||
# return result
|
||||
|
||||
# def accept_terminal(self) -> Terminal | None:
|
||||
# accept = None
|
||||
# for st in self.states:
|
||||
# for ac in st.accept:
|
||||
# if accept is None:
|
||||
# accept = ac
|
||||
# elif accept.value != ac.value:
|
||||
# if accept.regex and not ac.regex:
|
||||
# accept = ac
|
||||
# elif ac.regex and not accept.regex:
|
||||
# pass
|
||||
# else:
|
||||
# raise ValueError(
|
||||
# f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')"
|
||||
# )
|
||||
|
||||
# return accept
|
||||
|
||||
# # Parse the terminals all together into a big NFA rooted at `NFA`.
|
||||
# NFA = State()
|
||||
# for token in x.terminals:
|
||||
# start = State()
|
||||
# NFA.epsilons.append(start)
|
||||
|
||||
# if token.regex:
|
||||
# node = RegexParser(token.pattern).parse_regex()
|
||||
# print(f" Parsed {token.pattern} to {node}")
|
||||
# ending = node.to_nfa(start)
|
||||
|
||||
# else:
|
||||
# ending = start
|
||||
# for c in token.pattern:
|
||||
# ending = ending.add_edge(Span.from_str(c), State())
|
||||
|
||||
# ending.accept.append(token)
|
||||
|
||||
# NFA.dump_graph()
|
||||
|
||||
# # Convert the NFA into a DFA in the most straightforward way (by tracking
|
||||
# # sets of state closures, called SuperStates.)
|
||||
# DFA: dict[SuperState, list[tuple[Span, SuperState]]] = {}
|
||||
# stack = [SuperState([NFA])]
|
||||
# while len(stack) > 0:
|
||||
# ss = stack.pop()
|
||||
# if ss in DFA:
|
||||
# continue
|
||||
|
||||
# edges = ss.edges()
|
||||
|
||||
# DFA[ss] = edges
|
||||
# for _, target in edges:
|
||||
# stack.append(target)
|
||||
|
||||
# for i, k in enumerate(DFA):
|
||||
# k.index = i
|
||||
|
||||
# return [
|
||||
# (
|
||||
# ss.accept_terminal(),
|
||||
# [(k, v.index) for k, v in edges],
|
||||
# )
|
||||
# for ss, edges in DFA.items()
|
||||
# ]
|
||||
|
||||
|
||||
# def dump_lexer_table(table: LexerTable):
|
||||
# with open("lexer.dot", "w", encoding="utf-8") as f:
|
||||
# f.write("digraph G {\n")
|
||||
# for index, (accept, edges) in enumerate(table):
|
||||
# label = accept.value if accept is not None else ""
|
||||
# f.write(f' {index} [label="{label}"];\n')
|
||||
# for span, target in edges:
|
||||
# label = str(span).replace('"', '\\"')
|
||||
# f.write(f' {index} -> {target} [label="{label}"];\n')
|
||||
|
||||
# pass
|
||||
# f.write("}\n")
|
||||
|
||||
|
||||
# def generic_tokenize(src: str, table: LexerTable):
|
||||
# pos = 0
|
||||
# state = 0
|
||||
# start = 0
|
||||
# last_accept = None
|
||||
# last_accept_pos = 0
|
||||
|
||||
# while pos < len(src):
|
||||
# accept, edges = table[state]
|
||||
# if accept is not None:
|
||||
# last_accept = accept
|
||||
# last_accept_pos = pos + 1
|
||||
|
||||
# char = ord(src[pos])
|
||||
|
||||
# # Find the index of the span where the upper value is the tightest
|
||||
# # bound on the character.
|
||||
# index = bisect.bisect_left(edges, char, key=lambda x: x[0].upper)
|
||||
# # If the character is greater than or equal to the lower bound we
|
||||
# # found then we have a hit, otherwise no.
|
||||
# state = edges[index][1] if index < len(edges) and char >= edges[index][0].lower else None
|
||||
# if state is None:
|
||||
# if last_accept is None:
|
||||
# raise Exception(f"Token error at {pos}")
|
||||
|
||||
# yield (last_accept, start, last_accept_pos - start)
|
||||
|
||||
# last_accept = None
|
||||
# pos = last_accept_pos
|
||||
# start = pos
|
||||
# state = 0
|
||||
|
||||
# else:
|
||||
# pos += 1
|
||||
|
||||
|
||||
def test_span_intersection():
|
||||
pairs = [
|
||||
((1, 3), (2, 4)),
|
||||
((1, 3), (2, 3)),
|
||||
((1, 3), (1, 2)),
|
||||
((1, 3), (0, 2)),
|
||||
((1, 3), (0, 4)),
|
||||
]
|
||||
|
||||
for a, b in pairs:
|
||||
left = Span(*a)
|
||||
right = Span(*b)
|
||||
assert left.intersects(right)
|
||||
assert right.intersects(left)
|
||||
Loading…
Add table
Add a link
Reference in a new issue