Compare commits

...

2 commits

SHA1 Message Date
72052645d6 Generated lexers actually kinda work
But regular expressions are underpowered and verbose
2024-08-23 15:32:35 -07:00
58c3004702 Move terminals into grammar definition
Starting to work on machine-generated lexers too
2024-08-23 07:24:30 -07:00
7 changed files with 1334 additions and 271 deletions


@ -2,57 +2,17 @@
import re
import typing
import parser
from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule
ARROW = Terminal("Arrow")
AS = Terminal("As")
BAR = Terminal("Bar")
CLASS = Terminal("Class")
COLON = Terminal("Colon")
ELSE = Terminal("Else")
FOR = Terminal("For")
FUN = Terminal("Fun")
IDENTIFIER = Terminal("Identifier")
IF = Terminal("If")
IMPORT = Terminal("Import")
IN = Terminal("In")
LCURLY = Terminal("LeftBrace")
LET = Terminal("Let")
RCURLY = Terminal("RightBrace")
RETURN = Terminal("Return")
SEMICOLON = Terminal("Semicolon")
STRING = Terminal("String")
WHILE = Terminal("While")
EQUAL = Terminal("Equal")
LPAREN = Terminal("LeftParen")
RPAREN = Terminal("RightParen")
COMMA = Terminal("Comma")
SELF = Terminal("Selff")
OR = Terminal("Or")
IS = Terminal("Is")
AND = Terminal("And")
EQUALEQUAL = Terminal("EqualEqual")
BANGEQUAL = Terminal("BangEqual")
LESS = Terminal("Less")
GREATER = Terminal("Greater")
LESSEQUAL = Terminal("LessEqual")
GREATEREQUAL = Terminal("GreaterEqual")
PLUS = Terminal("Plus")
MINUS = Terminal("Minus")
STAR = Terminal("Star")
SLASH = Terminal("Slash")
NUMBER = Terminal("Number")
TRUE = Terminal("True")
FALSE = Terminal("False")
BANG = Terminal("Bang")
DOT = Terminal("Dot")
MATCH = Terminal("Match")
EXPORT = Terminal("Export")
UNDERSCORE = Terminal("Underscore")
NEW = Terminal("New")
LSQUARE = Terminal("LeftBracket")
RSQUARE = Terminal("RightBracket")
from parser import (
Assoc,
Grammar,
Nothing,
rule,
seq,
Rule,
Terminal,
Re,
)
from parser.parser import compile_lexer, dump_lexer_table
class FineGrammar(Grammar):
@ -62,17 +22,17 @@ class FineGrammar(Grammar):
def __init__(self):
super().__init__(
precedence=[
(Assoc.RIGHT, [EQUAL]),
(Assoc.LEFT, [OR]),
(Assoc.LEFT, [IS]),
(Assoc.LEFT, [AND]),
(Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]),
(Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]),
(Assoc.LEFT, [PLUS, MINUS]),
(Assoc.LEFT, [STAR, SLASH]),
(Assoc.RIGHT, [self.EQUAL]),
(Assoc.LEFT, [self.OR]),
(Assoc.LEFT, [self.IS]),
(Assoc.LEFT, [self.AND]),
(Assoc.LEFT, [self.EQUALEQUAL, self.BANGEQUAL]),
(Assoc.LEFT, [self.LESS, self.GREATER, self.GREATEREQUAL, self.LESSEQUAL]),
(Assoc.LEFT, [self.PLUS, self.MINUS]),
(Assoc.LEFT, [self.STAR, self.SLASH]),
(Assoc.LEFT, [self.primary_expression]),
(Assoc.LEFT, [LPAREN]),
(Assoc.LEFT, [DOT]),
(Assoc.LEFT, [self.LPAREN]),
(Assoc.LEFT, [self.DOT]),
#
# If there's confusion about whether to make an IF
# statement or an expression, prefer the statement.
@ -97,15 +57,15 @@ class FineGrammar(Grammar):
@rule
def import_statement(self) -> Rule:
return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON)
return seq(self.IMPORT, self.STRING, self.AS, self.IDENTIFIER, self.SEMICOLON)
@rule("ClassDeclaration")
def class_declaration(self) -> Rule:
return seq(CLASS, IDENTIFIER, self._class_body)
return seq(self.CLASS, self.IDENTIFIER, self._class_body)
@rule
def _class_body(self) -> Rule:
return seq(LCURLY, RCURLY) | seq(LCURLY, self._class_members, RCURLY)
return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._class_members, self.RCURLY)
@rule
def _class_members(self) -> Rule:
@ -117,7 +77,7 @@ class FineGrammar(Grammar):
@rule("FieldDecl")
def field_declaration(self) -> Rule:
return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON)
return seq(self.IDENTIFIER, self.COLON, self.type_expression, self.SEMICOLON)
# Types
@rule("TypeExpression")
@ -126,60 +86,65 @@ class FineGrammar(Grammar):
@rule("AlternateType")
def alternate_type(self) -> Rule:
return seq(self.type_expression, OR, self.type_identifier)
return seq(self.type_expression, self.OR, self.type_identifier)
@rule("TypeIdentifier")
def type_identifier(self) -> Rule:
return IDENTIFIER
return self.IDENTIFIER
@rule
def export_statement(self) -> Rule:
return (
seq(EXPORT, self.class_declaration)
| seq(EXPORT, self.function_declaration)
| seq(EXPORT, self.let_statement)
| seq(EXPORT, self.export_list, SEMICOLON)
seq(self.EXPORT, self.class_declaration)
| seq(self.EXPORT, self.function_declaration)
| seq(self.EXPORT, self.let_statement)
| seq(self.EXPORT, self.export_list, self.SEMICOLON)
)
@rule
def export_list(self) -> Rule:
return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list)
return Nothing | self.IDENTIFIER | seq(self.IDENTIFIER, self.COMMA, self.export_list)
# Functions
@rule("FunctionDecl")
def function_declaration(self) -> Rule:
return seq(FUN, IDENTIFIER, self.function_parameters, self.block) | seq(
FUN, IDENTIFIER, self.function_parameters, ARROW, self.type_expression, self.block
return seq(self.FUN, self.IDENTIFIER, self.function_parameters, self.block) | seq(
self.FUN,
self.IDENTIFIER,
self.function_parameters,
self.ARROW,
self.type_expression,
self.block,
)
@rule("ParamList")
def function_parameters(self) -> Rule:
return (
seq(LPAREN, RPAREN)
| seq(LPAREN, self._first_parameter, RPAREN)
| seq(LPAREN, self._first_parameter, COMMA, self._parameter_list, RPAREN)
seq(self.LPAREN, self.RPAREN)
| seq(self.LPAREN, self._first_parameter, self.RPAREN)
| seq(self.LPAREN, self._first_parameter, self.COMMA, self._parameter_list, self.RPAREN)
)
@rule
def _first_parameter(self) -> Rule:
return SELF | self.parameter
return self.SELF | self.parameter
@rule
def _parameter_list(self) -> Rule:
return Nothing | self.parameter | seq(self.parameter, COMMA, self._parameter_list)
return Nothing | self.parameter | seq(self.parameter, self.COMMA, self._parameter_list)
@rule("Parameter")
def parameter(self) -> Rule:
return seq(IDENTIFIER, COLON, self.type_expression)
return seq(self.IDENTIFIER, self.COLON, self.type_expression)
# Block
@rule("Block")
def block(self) -> Rule:
return (
seq(LCURLY, RCURLY)
| seq(LCURLY, self.expression, RCURLY)
| seq(LCURLY, self._statement_list, RCURLY)
| seq(LCURLY, self._statement_list, self.expression, RCURLY)
seq(self.LCURLY, self.RCURLY)
| seq(self.LCURLY, self.expression, self.RCURLY)
| seq(self.LCURLY, self._statement_list, self.RCURLY)
| seq(self.LCURLY, self._statement_list, self.expression, self.RCURLY)
)
@rule
@ -200,19 +165,19 @@ class FineGrammar(Grammar):
@rule("LetStatement")
def let_statement(self) -> Rule:
return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON)
return seq(self.LET, self.IDENTIFIER, self.EQUAL, self.expression, self.SEMICOLON)
@rule("ReturnStatement")
def return_statement(self) -> Rule:
return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON)
return seq(self.RETURN, self.expression, self.SEMICOLON) | seq(self.RETURN, self.SEMICOLON)
@rule("ForStatement")
def for_statement(self) -> Rule:
return seq(FOR, self.iterator_variable, IN, self.expression, self.block)
return seq(self.FOR, self.iterator_variable, self.IN, self.expression, self.block)
@rule("IteratorVariable")
def iterator_variable(self) -> Rule:
return IDENTIFIER
return self.IDENTIFIER
@rule("IfStatement")
def if_statement(self) -> Rule:
@ -220,11 +185,11 @@ class FineGrammar(Grammar):
@rule
def while_statement(self) -> Rule:
return seq(WHILE, self.expression, self.block)
return seq(self.WHILE, self.expression, self.block)
@rule
def expression_statement(self) -> Rule:
return seq(self.expression, SEMICOLON)
return seq(self.expression, self.SEMICOLON)
# Expressions
@rule(transparent=True)
@ -234,91 +199,93 @@ class FineGrammar(Grammar):
@rule("BinaryExpression")
def binary_expression(self) -> Rule:
return (
seq(self.expression, EQUAL, self.expression)
| seq(self.expression, OR, self.expression)
| seq(self.expression, AND, self.expression)
| seq(self.expression, EQUALEQUAL, self.expression)
| seq(self.expression, BANGEQUAL, self.expression)
| seq(self.expression, LESS, self.expression)
| seq(self.expression, LESSEQUAL, self.expression)
| seq(self.expression, GREATER, self.expression)
| seq(self.expression, GREATEREQUAL, self.expression)
| seq(self.expression, PLUS, self.expression)
| seq(self.expression, MINUS, self.expression)
| seq(self.expression, STAR, self.expression)
| seq(self.expression, SLASH, self.expression)
seq(self.expression, self.EQUAL, self.expression)
| seq(self.expression, self.OR, self.expression)
| seq(self.expression, self.AND, self.expression)
| seq(self.expression, self.EQUALEQUAL, self.expression)
| seq(self.expression, self.BANGEQUAL, self.expression)
| seq(self.expression, self.LESS, self.expression)
| seq(self.expression, self.LESSEQUAL, self.expression)
| seq(self.expression, self.GREATER, self.expression)
| seq(self.expression, self.GREATEREQUAL, self.expression)
| seq(self.expression, self.PLUS, self.expression)
| seq(self.expression, self.MINUS, self.expression)
| seq(self.expression, self.STAR, self.expression)
| seq(self.expression, self.SLASH, self.expression)
)
@rule("IsExpression")
def is_expression(self) -> Rule:
return seq(self.expression, IS, self.pattern)
return seq(self.expression, self.IS, self.pattern)
@rule
def primary_expression(self) -> Rule:
return (
self.identifier_expression
| self.literal_expression
| SELF
| seq(BANG, self.primary_expression)
| seq(MINUS, self.primary_expression)
| self.SELF
| seq(self.BANG, self.primary_expression)
| seq(self.MINUS, self.primary_expression)
| self.block
| self.conditional_expression
| self.list_constructor_expression
| self.object_constructor_expression
| self.match_expression
| seq(self.primary_expression, LPAREN, RPAREN)
| seq(self.primary_expression, LPAREN, self._expression_list, RPAREN)
| seq(self.primary_expression, DOT, IDENTIFIER)
| seq(LPAREN, self.expression, RPAREN)
| seq(self.primary_expression, self.LPAREN, self.RPAREN)
| seq(self.primary_expression, self.LPAREN, self._expression_list, self.RPAREN)
| seq(self.primary_expression, self.DOT, self.IDENTIFIER)
| seq(self.LPAREN, self.expression, self.RPAREN)
)
@rule("IdentifierExpression")
def identifier_expression(self):
return IDENTIFIER
return self.IDENTIFIER
@rule("Literal")
def literal_expression(self):
return NUMBER | STRING | TRUE | FALSE
return self.NUMBER | self.STRING | self.TRUE | self.FALSE
@rule("ConditionalExpression")
def conditional_expression(self) -> Rule:
return (
seq(IF, self.expression, self.block)
| seq(IF, self.expression, self.block, ELSE, self.conditional_expression)
| seq(IF, self.expression, self.block, ELSE, self.block)
seq(self.IF, self.expression, self.block)
| seq(self.IF, self.expression, self.block, self.ELSE, self.conditional_expression)
| seq(self.IF, self.expression, self.block, self.ELSE, self.block)
)
@rule
def list_constructor_expression(self) -> Rule:
return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self._expression_list, RSQUARE)
return seq(self.LSQUARE, self.RSQUARE) | seq(
self.LSQUARE, self._expression_list, self.RSQUARE
)
@rule
def _expression_list(self) -> Rule:
return (
self.expression
| seq(self.expression, COMMA)
| seq(self.expression, COMMA, self._expression_list)
| seq(self.expression, self.COMMA)
| seq(self.expression, self.COMMA, self._expression_list)
)
@rule
def match_expression(self) -> Rule:
return seq(MATCH, self.expression, self.match_body)
return seq(self.MATCH, self.expression, self.match_body)
@rule("MatchBody")
def match_body(self) -> Rule:
return seq(LCURLY, RCURLY) | seq(LCURLY, self._match_arms, RCURLY)
return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._match_arms, self.RCURLY)
@rule
def _match_arms(self) -> Rule:
return (
self.match_arm
| seq(self.match_arm, COMMA)
| seq(self.match_arm, COMMA, self._match_arms)
| seq(self.match_arm, self.COMMA)
| seq(self.match_arm, self.COMMA, self._match_arms)
)
@rule("MatchArm")
def match_arm(self) -> Rule:
return seq(self.pattern, ARROW, self.expression)
return seq(self.pattern, self.ARROW, self.expression)
@rule("Pattern")
def pattern(self) -> Rule:
@ -330,7 +297,7 @@ class FineGrammar(Grammar):
@rule
def _pattern_predicate(self) -> Rule:
return seq(AND, self.expression)
return seq(self.AND, self.expression)
@rule
def _pattern_core(self) -> Rule:
@ -338,60 +305,120 @@ class FineGrammar(Grammar):
@rule("WildcardPattern")
def wildcard_pattern(self) -> Rule:
return UNDERSCORE
return self.UNDERSCORE
@rule("VariableBinding")
def variable_binding(self) -> Rule:
return seq(IDENTIFIER, COLON)
return seq(self.IDENTIFIER, self.COLON)
@rule
def object_constructor_expression(self) -> Rule:
return seq(NEW, self.type_identifier, self.field_list)
return seq(self.NEW, self.type_identifier, self.field_list)
@rule
def field_list(self) -> Rule:
return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY)
return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self.field_values, self.RCURLY)
@rule
def field_values(self) -> Rule:
return (
self.field_value
| seq(self.field_value, COMMA)
| seq(self.field_value, COMMA, self.field_values)
| seq(self.field_value, self.COMMA)
| seq(self.field_value, self.COMMA, self.field_values)
)
@rule
def field_value(self) -> Rule:
return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression)
return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression)
BLANK = Terminal(Re.set(" ", "\t", "\r", "\n").plus())
ARROW = Terminal("->")
AS = Terminal("as")
BAR = Terminal("bar")
CLASS = Terminal("class")
COLON = Terminal("colon")
COMMENT = Terminal("comment")
ELSE = Terminal("else")
FOR = Terminal("for")
FUN = Terminal("fun")
IDENTIFIER = Terminal(
Re.seq(
Re.set(("a", "z"), ("A", "Z"), "_"),
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
)
)
IF = Terminal("if")
IMPORT = Terminal("import")
IN = Terminal("in")
LCURLY = Terminal("{")
LET = Terminal("Let")
RCURLY = Terminal("}")
RETURN = Terminal("return")
SEMICOLON = Terminal(";")
STRING = Terminal('""') # TODO
WHILE = Terminal("while")
EQUAL = Terminal("=")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
COMMA = Terminal(",")
SELF = Terminal("self", name="SELFF")
OR = Terminal("or")
IS = Terminal("is")
AND = Terminal("and")
EQUALEQUAL = Terminal("==")
BANGEQUAL = Terminal("!=")
LESS = Terminal("<")
GREATER = Terminal(">")
LESSEQUAL = Terminal("<=")
GREATEREQUAL = Terminal(">=")
PLUS = Terminal("+")
MINUS = Terminal("-")
STAR = Terminal("*")
SLASH = Terminal("/")
NUMBER = Terminal(Re.set(("0", "9")).plus())
TRUE = Terminal("true")
FALSE = Terminal("false")
BANG = Terminal("!")
DOT = Terminal(".")
MATCH = Terminal("match")
EXPORT = Terminal("export")
UNDERSCORE = Terminal("_")
NEW = Terminal("new")
LSQUARE = Terminal("[")
RSQUARE = Terminal("]")
# -----------------------------------------------------------------------------
# DORKY LEXER
# -----------------------------------------------------------------------------
import bisect
NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?")
IDENTIFIER_RE = re.compile("[_A-Za-z][_A-Za-z0-9]*")
KEYWORD_TABLE = {
"_": UNDERSCORE,
"and": AND,
"as": AS,
"class": CLASS,
"else": ELSE,
"export": EXPORT,
"false": FALSE,
"for": FOR,
"fun": FUN,
"if": IF,
"import": IMPORT,
"in": IN,
"is": IS,
"let": LET,
"match": MATCH,
"new": NEW,
"or": OR,
"return": RETURN,
"self": SELF,
"true": TRUE,
"while": WHILE,
"_": FineGrammar.UNDERSCORE,
"and": FineGrammar.AND,
"as": FineGrammar.AS,
"class": FineGrammar.CLASS,
"else": FineGrammar.ELSE,
"export": FineGrammar.EXPORT,
"false": FineGrammar.FALSE,
"for": FineGrammar.FOR,
"fun": FineGrammar.FUN,
"if": FineGrammar.IF,
"import": FineGrammar.IMPORT,
"in": FineGrammar.IN,
"is": FineGrammar.IS,
"let": FineGrammar.LET,
"match": FineGrammar.MATCH,
"new": FineGrammar.NEW,
"or": FineGrammar.OR,
"return": FineGrammar.RETURN,
"self": FineGrammar.SELF,
"true": FineGrammar.TRUE,
"while": FineGrammar.WHILE,
}
@ -406,63 +433,63 @@ def tokenize(src: str):
token = None
if ch == "-":
if src[pos : pos + 2] == "->":
token = (ARROW, pos, 2)
token = (FineGrammar.ARROW, pos, 2)
else:
token = (MINUS, pos, 1)
token = (FineGrammar.MINUS, pos, 1)
elif ch == "|":
token = (BAR, pos, 1)
token = (FineGrammar.BAR, pos, 1)
elif ch == ":":
token = (COLON, pos, 1)
token = (FineGrammar.COLON, pos, 1)
elif ch == "{":
token = (LCURLY, pos, 1)
token = (FineGrammar.LCURLY, pos, 1)
elif ch == "}":
token = (RCURLY, pos, 1)
token = (FineGrammar.RCURLY, pos, 1)
elif ch == ";":
token = (SEMICOLON, pos, 1)
token = (FineGrammar.SEMICOLON, pos, 1)
elif ch == "=":
if src[pos : pos + 2] == "==":
token = (EQUALEQUAL, pos, 2)
token = (FineGrammar.EQUALEQUAL, pos, 2)
else:
token = (EQUAL, pos, 1)
token = (FineGrammar.EQUAL, pos, 1)
elif ch == "(":
token = (LPAREN, pos, 1)
token = (FineGrammar.LPAREN, pos, 1)
elif ch == ")":
token = (RPAREN, pos, 1)
token = (FineGrammar.RPAREN, pos, 1)
elif ch == ",":
token = (COMMA, pos, 1)
token = (FineGrammar.COMMA, pos, 1)
elif ch == "!":
if src[pos : pos + 2] == "!=":
token = (BANGEQUAL, pos, 2)
token = (FineGrammar.BANGEQUAL, pos, 2)
else:
token = (BANG, pos, 1)
token = (FineGrammar.BANG, pos, 1)
elif ch == "<":
if src[pos : pos + 2] == "<=":
token = (LESSEQUAL, pos, 2)
token = (FineGrammar.LESSEQUAL, pos, 2)
else:
token = (LESS, pos, 1)
token = (FineGrammar.LESS, pos, 1)
elif ch == ">":
if src[pos : pos + 2] == ">=":
token = (GREATEREQUAL, pos, 2)
token = (FineGrammar.GREATEREQUAL, pos, 2)
else:
token = (GREATER, pos, 1)
token = (FineGrammar.GREATER, pos, 1)
elif ch == "+":
token = (PLUS, pos, 1)
token = (FineGrammar.PLUS, pos, 1)
elif ch == "*":
token = (STAR, pos, 1)
token = (FineGrammar.STAR, pos, 1)
elif ch == "/":
if src[pos : pos + 2] == "//":
@ -470,16 +497,16 @@ def tokenize(src: str):
pos = pos + 1
continue
token = (SLASH, pos, 1)
token = (FineGrammar.SLASH, pos, 1)
elif ch == ".":
token = (DOT, pos, 1)
token = (FineGrammar.DOT, pos, 1)
elif ch == "[":
token = (LSQUARE, pos, 1)
token = (FineGrammar.LSQUARE, pos, 1)
elif ch == "]":
token = (RSQUARE, pos, 1)
token = (FineGrammar.RSQUARE, pos, 1)
elif ch == '"' or ch == "'":
end = pos + 1
@ -490,12 +517,12 @@ def tokenize(src: str):
if end == len(src):
raise Exception(f"Unterminated string constant at {pos}")
end += 1
token = (STRING, pos, end - pos)
token = (FineGrammar.STRING, pos, end - pos)
else:
number_match = NUMBER_RE.match(src, pos)
if number_match:
token = (NUMBER, pos, number_match.end() - pos)
token = (FineGrammar.NUMBER, pos, number_match.end() - pos)
else:
id_match = IDENTIFIER_RE.match(src, pos)
if id_match:
@ -504,7 +531,7 @@ def tokenize(src: str):
if keyword:
token = (keyword, pos, len(fragment))
else:
token = (IDENTIFIER, pos, len(fragment))
token = (FineGrammar.IDENTIFIER, pos, len(fragment))
if token is None:
raise Exception("Token error")
@ -512,9 +539,6 @@ def tokenize(src: str):
pos += token[2]
import bisect
class FineTokens:
def __init__(self, src: str):
self.src = src
@ -546,4 +570,8 @@ class FineTokens:
if __name__ == "__main__":
FineGrammar().build_table()
grammar = FineGrammar()
grammar.build_table()
lexer = compile_lexer(grammar)
dump_lexer_table(lexer)


@ -21,19 +21,20 @@ To get started, create a grammar that derives from the `Grammar` class. Create
one method per nonterminal, decorated with the `rule` decorator. Here's an
example:
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal('id')
class SimpleGrammar(Grammar):
@rule
def expression(self):
return seq(self.expression, PLUS, self.term) | self.term
return seq(self.expression, self.PLUS, self.term) | self.term
@rule
def term(self):
return seq(LPAREN, self.expression, RPAREN) | ID
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal('id')
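A grammar instance can then be compiled into a parse table; a minimal sketch, calling `build_table` the same way the tests in this change do:

table = SimpleGrammar().build_table()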
## Using grammars
@ -130,13 +131,13 @@ May 2024
"""
import abc
import bisect
import collections
import dataclasses
import enum
import functools
import inspect
import json
import sys
import typing
@ -1605,15 +1606,20 @@ class Rule:
class Terminal(Rule):
"""A token, or terminal symbol in the grammar."""
value: str
value: str | None
pattern: "str | Re"
def __init__(self, value):
self.value = sys.intern(value)
def __init__(self, pattern, name=None):
self.value = name
self.pattern = pattern
def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]:
# We are just ourselves when flattened.
yield [self]
def __repr__(self) -> str:
return self.value or "???"
class NonTerminal(Rule):
"""A non-terminal, or a production, in the grammar.
@ -1766,19 +1772,20 @@ class Grammar:
Here's an example of a simple grammar:
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal('id')
class SimpleGrammar(Grammar):
@rule
def expression(self):
return seq(self.expression, PLUS, self.term) | self.term
return seq(self.expression, self.PLUS, self.term) | self.term
@rule
def term(self):
return seq(LPAREN, self.expression, RPAREN) | ID
return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID
PLUS = Terminal('+')
LPAREN = Terminal('(')
RPAREN = Terminal(')')
ID = Terminal('id')
Not very exciting, perhaps, but it's something.
"""
@ -1786,6 +1793,7 @@ class Grammar:
_precedence: dict[str, typing.Tuple[Assoc, int]]
_start: str
_generator: type[GenerateLR0]
_terminals: list[Terminal]
def __init__(
self,
@ -1809,6 +1817,14 @@ class Grammar:
generator = getattr(self, "generator", GenerateLALR)
assert generator is not None
# Fix up terminal names with the name of the member that declared them.
terminals = []
for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)):
if t.value is None:
t.value = n
terminals.append(t)
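# (For example, a class attribute declared as IF = Terminal("if") starts
# with value None and ends up with value "IF", so dumps and error messages
# can refer to the terminal by the name it was declared under.)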
# Fix up the precedence table.
precedence_table = {}
for prec, (associativity, symbols) in enumerate(precedence):
for symbol in symbols:
@ -1824,6 +1840,11 @@ class Grammar:
self._precedence = precedence_table
self._start = start
self._generator = generator
self._terminals = terminals
@property
def terminals(self) -> list[Terminal]:
return self._terminals
def generate_nonterminal_dict(
self, start: str | None = None
@ -1911,3 +1932,526 @@ class Grammar:
gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
table = gen.gen_table()
return table
###############################################################################
# Lexer support
###############################################################################
# For machine-generated lexers
@dataclasses.dataclass(frozen=True, slots=True)
class Span:
lower: int # inclusive
upper: int # exclusive
@classmethod
def from_str(cls, lower: str, upper: str | None = None) -> "Span":
lo = ord(lower)
if upper is None:
hi = lo + 1
else:
hi = ord(upper) + 1
return Span(lower=lo, upper=hi)
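# For example (illustrative): Span.from_str("a") == Span(97, 98) and
# Span.from_str("a", "z") == Span(97, 123); bounds are [inclusive, exclusive).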
def __len__(self) -> int:
return self.upper - self.lower
def intersects(self, other: "Span") -> bool:
"""Determine if this span intersects the other span."""
return self.lower < other.upper and self.upper > other.lower
def split(self, other: "Span") -> tuple["Span|None", "Span|None", "Span|None"]:
"""Split two possibly-intersecting spans into three regions: a low
region, which covers just the lower part of the union, a mid region,
which covers the intersection, and a hi region, which covers just the
upper part of the union.
Together, low, mid, and hi cover the union of the two spans. Mid covers
the intersection. The implication is that if both spans are identical
then the low and high regions will both be None and mid will be equal
to both.
Graphically, given two spans A and B:
[ B )
[ A )
[ lo )[ mid )[ hi )
If the lower bounds align then the `lo` region is empty:
[ B )
[ A )
[ mid )[ hi )
If the upper bounds align then the `hi` region is empty:
[ B )
[ A )
[ lo )[ mid )
If both bounds align then both are empty:
[ B )
[ A )
[ mid )
split is symmetric: it doesn't matter which order you split things in,
you will always get the same output spans, in the same order.
"""
if not self.intersects(other):
if self.lower < other.lower:
return (self, None, other)
else:
return (other, None, self)
first = min(self.lower, other.lower)
second = max(self.lower, other.lower)
third = min(self.upper, other.upper)
fourth = max(self.upper, other.upper)
low = Span(first, second) if first != second else None
mid = Span(second, third)
hi = Span(third, fourth) if third != fourth else None
return (low, mid, hi)
def __str__(self) -> str:
return f"[{self.lower}-{self.upper})"
ET = typing.TypeVar("ET")
class EdgeList[ET]:
"""A list of edge transitions, keyed by *span*."""
_edges: list[tuple[Span, list[ET]]]
def __init__(self):
self._edges = []
def __iter__(self) -> typing.Iterator[tuple[Span, list[ET]]]:
return iter(self._edges)
def __repr__(self) -> str:
return f"EdgeList[{','.join(str(s[0]) + '->' + repr(s[1]) for s in self._edges)}]"
def add_edge(self, c: Span, s: ET):
"""Add an edge for the given span to the list. If there are already
spans that overlap this one, split and generating multiple distinct
edges.
"""
our_targets = [s]
# Look to see where we would put this span based solely on its lower
# bound: find the first edge whose upper bound is greater than the
# lower bound of the incoming span.
point = bisect.bisect_right(self._edges, c.lower, key=lambda x: x[0].upper)
# We might need to run this in multiple iterations because we keep
# splitting against the *lowest* matching span.
next_span: Span | None = c
while next_span is not None:
c = next_span
next_span = None
# print(f" incoming: {self} @ {point} <- {c}->[{s}]")
# Check to see if we've run off the end of the list of spans.
if point == len(self._edges):
self._edges.insert(point, (c, [s]))
# print(f" trivial end: {self}")
return
# Nope, pull out the span to the right of us.
right_span, right_targets = self._edges[point]
# Because we intersect at least a little bit we know that we need to
# split and keep processing.
del self._edges[point]
lo, mid, hi = c.split(right_span)  # Remember the semantics of split, above.
# print(f" -> {c} splits {right_span} -> {lo}, {mid}, {hi} @{point}")
# We do this from lo to hi, lo first.
if lo is not None:
# NOTE: lo will never intersect both no matter what.
if lo.intersects(right_span):
assert not lo.intersects(c)
targets = right_targets
else:
assert lo.intersects(c)
targets = our_targets
self._edges.insert(point, (lo, targets))
point += 1 # Adjust the insertion point, important for us to keep running.
if mid is not None:
# If mid exists it is known to intersect with both so we can just
# do it.
self._edges.insert(point, (mid, right_targets + our_targets))
point += 1 # Adjust the insertion point, important for us to keep running.
if hi is not None:
# NOTE: Just like lo, hi will never intersect both no matter what.
if hi.intersects(right_span):
# If hi intersects the right span then we're done, no
# need to keep running.
assert not hi.intersects(c)
self._edges.insert(point, (hi, right_targets))
else:
# BUT! If hi intersects the incoming span then what we
# need to do is to replace the incoming span with hi
# (having chopped off the lower part of the incoming
# span) and continue to execute with only the upper part
# of the incoming span.
#
# Why? Because the upper part of the incoming span might
# intersect *more* spans, in which case we need to keep
# splitting and merging targets.
assert hi.intersects(c)
next_span = hi
# print(f" result: {self}")
class NFAState:
"""An NFA state. Each state can be the accept state, with one or more
Terminals as the result."""
accept: list[Terminal]
epsilons: list["NFAState"]
_edges: EdgeList["NFAState"]
def __init__(self):
self.accept = []
self.epsilons = []
self._edges = EdgeList()
def __repr__(self):
return f"State{id(self)}"
def edges(self) -> typing.Iterable[tuple[Span, list["NFAState"]]]:
return self._edges
def add_edge(self, c: Span, s: "NFAState") -> "NFAState":
self._edges.add_edge(c, s)
return s
def dump_graph(self, name="nfa.dot"):
with open(name, "w", encoding="utf8") as f:
f.write("digraph G {\n")
stack: list[NFAState] = [self]
visited = set()
while len(stack) > 0:
state = stack.pop()
if state in visited:
continue
visited.add(state)
label = ", ".join([t.value for t in state.accept if t.value is not None])
f.write(f' {id(state)} [label="{label}"];\n')
for target in state.epsilons:
stack.append(target)
f.write(f' {id(state)} -> {id(target)} [label="\u03B5"];\n')
for span, targets in state.edges():
label = str(span).replace('"', '\\"')
for target in targets:
stack.append(target)
f.write(f' {id(state)} -> {id(target)} [label="{label}"];\n')
f.write("}\n")
@dataclasses.dataclass
class Re:
def to_nfa(self, start: NFAState) -> NFAState:
del start
raise NotImplementedError()
def __str__(self) -> str:
raise NotImplementedError()
@classmethod
def seq(cls, *values: "Re") -> "Re":
result = values[0]
for v in values[1:]:
result = RegexSequence(result, v)
return result
@classmethod
def literal(cls, value: str) -> "Re":
return cls.seq(*[RegexLiteral.from_ranges(c) for c in value])
@classmethod
def set(cls, *args: str | tuple[str, str]) -> "Re":
return RegexLiteral.from_ranges(*args)
def plus(self) -> "Re":
return RegexPlus(self)
def star(self) -> "Re":
return RegexStar(self)
def question(self) -> "Re":
return RegexQuestion(self)
def __or__(self, value: "Re", /) -> "Re":
return RegexAlternation(self, value)
@dataclasses.dataclass
class RegexLiteral(Re):
values: list[Span]
@classmethod
def from_ranges(cls, *args: str | tuple[str, str]) -> "RegexLiteral":
values = []
for a in args:
if isinstance(a, str):
values.append(Span.from_str(a))
else:
values.append(Span.from_str(a[0], a[1]))
return RegexLiteral(values)
def to_nfa(self, start: NFAState) -> NFAState:
end = NFAState()
for span in self.values:
start.add_edge(span, end)
return end
def __str__(self) -> str:
if len(self.values) == 1:
span = self.values[0]
if len(span) == 1:
return chr(span.lower)
ranges = []
for span in self.values:
start = chr(span.lower)
end = chr(span.upper - 1)
if start == end:
ranges.append(start)
else:
ranges.append(f"{start}-{end}")
return "[{}]".format("".join(ranges))
@dataclasses.dataclass
class RegexPlus(Re):
child: Re
def to_nfa(self, start: NFAState) -> NFAState:
end = self.child.to_nfa(start)
end.epsilons.append(start)
return end
def __str__(self) -> str:
return f"({self.child})+"
@dataclasses.dataclass
class RegexStar(Re):
child: Re
def to_nfa(self, start: NFAState) -> NFAState:
end = self.child.to_nfa(start)
end.epsilons.append(start)
start.epsilons.append(end)
return end
def __str__(self) -> str:
return f"({self.child})*"
@dataclasses.dataclass
class RegexQuestion(Re):
child: Re
def to_nfa(self, start: NFAState) -> NFAState:
end = self.child.to_nfa(start)
start.epsilons.append(end)
return end
def __str__(self) -> str:
return f"({self.child})?"
@dataclasses.dataclass
class RegexSequence(Re):
left: Re
right: Re
def to_nfa(self, start: NFAState) -> NFAState:
mid = self.left.to_nfa(start)
return self.right.to_nfa(mid)
def __str__(self) -> str:
return f"{self.left}{self.right}"
@dataclasses.dataclass
class RegexAlternation(Re):
left: Re
right: Re
def to_nfa(self, start: NFAState) -> NFAState:
left_start = NFAState()
start.epsilons.append(left_start)
left_end = self.left.to_nfa(left_start)
right_start = NFAState()
start.epsilons.append(right_start)
right_end = self.right.to_nfa(right_start)
end = NFAState()
left_end.epsilons.append(end)
right_end.epsilons.append(end)
return end
def __str__(self) -> str:
return f"(({self.left})||({self.right}))"
LexerTable = list[tuple[Terminal | None, list[tuple[Span, int]]]]
class NFASuperState:
states: frozenset[NFAState]
def __init__(self, states: typing.Iterable[NFAState]):
# Close over the given states, including every state that is
# reachable by epsilon-transition.
stack = list(states)
result = set()
while len(stack) > 0:
st = stack.pop()
if st in result:
continue
result.add(st)
stack.extend(st.epsilons)
self.states = frozenset(result)
def __eq__(self, other):
if not isinstance(other, NFASuperState):
return False
return self.states == other.states
def __hash__(self) -> int:
return hash(self.states)
def edges(self) -> list[tuple[Span, "NFASuperState"]]:
working: EdgeList[list[NFAState]] = EdgeList()
for st in self.states:
for span, targets in st.edges():
working.add_edge(span, targets)
# EdgeList maps span to list[list[State]] which we want to flatten.
last_upper = None
result = []
for span, stateses in working:
if last_upper is not None:
assert last_upper <= span.lower
last_upper = span.upper
s: list[NFAState] = []
for states in stateses:
s.extend(states)
result.append((span, NFASuperState(s)))
if len(result) > 0:
for i in range(0, len(result) - 1):
span = result[i][0]
next_span = result[i + 1][0]
assert span.upper <= next_span.lower
# TODO: Merge spans that are adjacent and go to the same state.
return result
def accept_terminal(self) -> Terminal | None:
accept = None
for st in self.states:
for ac in st.accept:
if accept is None:
accept = ac
elif accept.value != ac.value:
accept_regex = isinstance(accept.pattern, Re)
ac_regex = isinstance(ac.pattern, Re)
if accept_regex and not ac_regex:
accept = ac
elif ac_regex and not accept_regex:
pass
else:
raise ValueError(
f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')"
)
return accept
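# Note the tie-break above: a fixed-string pattern beats an Re pattern, so a
# keyword like "is" wins over the IDENTIFIER regex when both states accept
# (see tests/test_lexer.py below).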
def compile_lexer(x: Grammar) -> LexerTable:
# Compile the terminals all together into a big NFA rooted at `NFA`.
NFA = NFAState()
for terminal in x.terminals:
start = NFAState()
NFA.epsilons.append(start)
pattern = terminal.pattern
if isinstance(pattern, Re):
ending = pattern.to_nfa(start)
else:
ending = start
for c in pattern:
ending = ending.add_edge(Span.from_str(c), NFAState())
ending.accept.append(terminal)
NFA.dump_graph()
# Convert the NFA into a DFA in the most straightforward way (by tracking
# sets of state closures, called SuperStates).
DFA: dict[NFASuperState, tuple[int, list[tuple[Span, NFASuperState]]]] = {}
stack = [NFASuperState([NFA])]
while len(stack) > 0:
ss = stack.pop()
if ss in DFA:
continue
edges = ss.edges()
DFA[ss] = (len(DFA), edges)
for _, target in edges:
stack.append(target)
return [
(
ss.accept_terminal(),
[(k, DFA[v][0]) for k, v in edges],
)
for ss, (_, edges) in DFA.items()
]
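# The table is indexed by DFA state number: entry 0 is the start state (the
# first SuperState assigned an index), and each entry pairs the state's
# accepting Terminal (or None) with its sorted span-to-state transitions.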
def dump_lexer_table(table: LexerTable):
with open("lexer.dot", "w", encoding="utf-8") as f:
f.write("digraph G {\n")
for index, (accept, edges) in enumerate(table):
label = accept.value if accept is not None else ""
f.write(f' {index} [label="{label}"];\n')
for span, target in edges:
label = str(span).replace('"', '\\"')
f.write(f' {index} -> {target} [label="{label}"];\n')
f.write("}\n")


@ -430,3 +430,58 @@ class Parser:
error_strings.append(f"{line_index}:{column_index}: {parse_error.message}")
return (result, error_strings)
def generic_tokenize(
src: str, table: parser.LexerTable
) -> typing.Iterable[tuple[parser.Terminal, int, int]]:
pos = 0
state = 0
start = 0
last_accept = None
last_accept_pos = 0
print(f"LEXING: {src} ({len(src)})")
while pos < len(src):
while state is not None:
accept, edges = table[state]
if accept is not None:
last_accept = accept
last_accept_pos = pos
print(f" @ {pos} state: {state} ({accept})")
if pos >= len(src):
break
char = ord(src[pos])
print(f" -> char: {char} ({repr(src[pos])})")
# Find the index of the span where the upper value is the tightest
# bound on the character.
state = None
index = bisect.bisect_right(edges, char, key=lambda x: x[0].upper)
print(f" -> {index}")
if index < len(edges):
span, target = edges[index]
print(f" -> {span}, {target}")
if char >= span.lower:
print(f" -> target: {target}")
state = target
pos += 1
else:
print(f" Nope (outside range)")
else:
print(f" Nope (at end)")
if last_accept is None:
raise Exception(f"Token error at {pos}")
yield (last_accept, start, last_accept_pos - start)
print(f" Yield: {last_accept}, reset to {last_accept_pos}")
last_accept = None
pos = last_accept_pos
start = pos
state = 0
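# Usage sketch (this mirrors tests/test_lexer.py; MyGrammar stands in for any
# Grammar subclass with Terminal class attributes):
#   table = compile_lexer(MyGrammar())
#   for terminal, start, length in generic_tokenize(src, table):
#       print(terminal, start, length)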

pdm.lock (generated)

@ -3,9 +3,26 @@
[metadata]
groups = ["default", "dev"]
strategy = ["cross_platform", "inherit_metadata"]
lock_version = "4.4.1"
content_hash = "sha256:143b06c001132ba589a47b2b3a498dd54f4840d95d216c794068089fcea48d4d"
strategy = ["inherit_metadata"]
lock_version = "4.5.0"
content_hash = "sha256:c4fec06f95402db1e9843df4a8a4a275273c6ec4f41f192f30d8a92ee52d15ea"
[[metadata.targets]]
requires_python = ">=3.12"
[[package]]
name = "attrs"
version = "24.2.0"
requires_python = ">=3.7"
summary = "Classes Without Boilerplate"
groups = ["dev"]
dependencies = [
"importlib-metadata; python_version < \"3.8\"",
]
files = [
{file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"},
{file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"},
]
[[package]]
name = "colorama"
@ -19,6 +36,22 @@ files = [
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
[[package]]
name = "hypothesis"
version = "6.111.1"
requires_python = ">=3.8"
summary = "A library for property-based testing"
groups = ["dev"]
dependencies = [
"attrs>=22.2.0",
"exceptiongroup>=1.0.0; python_version < \"3.11\"",
"sortedcontainers<3.0.0,>=2.1.0",
]
files = [
{file = "hypothesis-6.111.1-py3-none-any.whl", hash = "sha256:9422adbac4b2104f6cf92dc6604b5c9df975efc08ffc7145ecc39bc617243835"},
{file = "hypothesis-6.111.1.tar.gz", hash = "sha256:6ab6185a858fa692bf125c0d0a936134edc318bee01c05e407c71c9ead0b61c5"},
]
[[package]]
name = "iniconfig"
version = "2.0.0"
@ -60,11 +93,23 @@ summary = "pytest: simple powerful testing with Python"
groups = ["dev"]
dependencies = [
"colorama; sys_platform == \"win32\"",
"exceptiongroup>=1.0.0rc8; python_version < \"3.11\"",
"iniconfig",
"packaging",
"pluggy<2.0,>=1.5",
"tomli>=1; python_version < \"3.11\"",
]
files = [
{file = "pytest-8.2.2-py3-none-any.whl", hash = "sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343"},
{file = "pytest-8.2.2.tar.gz", hash = "sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977"},
]
[[package]]
name = "sortedcontainers"
version = "2.4.0"
summary = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set"
groups = ["dev"]
files = [
{file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"},
{file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"},
]


@ -22,6 +22,7 @@ distribution = true
[tool.pdm.dev-dependencies]
dev = [
"pytest>=8.2.2",
"hypothesis>=6.111.1",
]
[tool.pyright]


@ -38,25 +38,27 @@ def _tree(treeform) -> runtime.Tree | runtime.TokenValue:
def test_lr0_lr0():
"""An LR0 grammar should work with an LR0 generator."""
PLUS = Terminal("+")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
IDENTIFIER = Terminal("id")
class LR0Grammar(Grammar):
class G(Grammar):
start = "E"
generator = parser.GenerateLR0
@rule
def E(self):
return seq(self.E, PLUS, self.T) | self.T
return seq(self.E, self.PLUS, self.T) | self.T
@rule
def T(self):
return seq(LPAREN, self.E, RPAREN) | IDENTIFIER
return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER
table = LR0Grammar().build_table()
tree, errors = runtime.Parser(table).parse(Tokens(IDENTIFIER, PLUS, LPAREN, IDENTIFIER, RPAREN))
PLUS = Terminal("+", name="+")
LPAREN = Terminal("(", name="(")
RPAREN = Terminal(")", name=")")
IDENTIFIER = Terminal("id", name="id")
table = G().build_table()
tree, errors = runtime.Parser(table).parse(
Tokens(G.IDENTIFIER, G.PLUS, G.LPAREN, G.IDENTIFIER, G.RPAREN)
)
assert errors == []
assert tree == _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")")))
@ -65,114 +67,114 @@ def test_lr0_lr0():
def test_lr0_shift_reduce():
"""This one should not work in LR0- it has a shift/reduce conflict, but works in SLR1."""
PLUS = Terminal("+")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
LSQUARE = Terminal("[")
RSQUARE = Terminal("]")
IDENTIFIER = Terminal("id")
class TestGrammar(Grammar):
class G(Grammar):
start = "E"
generator = parser.GenerateLR0
@rule
def E(self):
return seq(self.E, PLUS, self.T) | self.T
return seq(self.E, self.PLUS, self.T) | self.T
@rule
def T(self):
return (
seq(LPAREN, self.E, RPAREN) | IDENTIFIER | seq(IDENTIFIER, LSQUARE, self.E, RSQUARE)
seq(self.LPAREN, self.E, self.RPAREN)
| self.IDENTIFIER
| seq(self.IDENTIFIER, self.LSQUARE, self.E, self.RSQUARE)
)
with pytest.raises(parser.AmbiguityError):
TestGrammar().build_table()
PLUS = Terminal("+")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
LSQUARE = Terminal("[")
RSQUARE = Terminal("]")
IDENTIFIER = Terminal("id")
TestGrammar().build_table(generator=parser.GenerateSLR1)
with pytest.raises(parser.AmbiguityError):
G().build_table()
G().build_table(generator=parser.GenerateSLR1)
def test_lr0_reduce_reduce():
"""This one should not work, it has a reduce-reduce conflict."""
PLUS = Terminal("+")
EQUAL = Terminal("=")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
IDENTIFIER = Terminal("id")
class TestGrammar(Grammar):
class G(Grammar):
start = "E"
generator = parser.GenerateLR0
@rule
def E(self):
return seq(self.E, PLUS, self.T) | self.T | seq(self.V, EQUAL, self.E)
return seq(self.E, self.PLUS, self.T) | self.T | seq(self.V, self.EQUAL, self.E)
@rule
def T(self):
return seq(LPAREN, self.E, RPAREN) | IDENTIFIER
return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER
@rule
def V(self):
return IDENTIFIER
return self.IDENTIFIER
PLUS = Terminal("+")
EQUAL = Terminal("=")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
IDENTIFIER = Terminal("id")
with pytest.raises(parser.AmbiguityError):
TestGrammar().build_table()
G().build_table()
def test_lr0_empty():
"""LR0 can't handle empty productions because it doesn't know when to reduce."""
BOOP = Terminal("boop")
BEEP = Terminal("beep")
class TestGrammar(Grammar):
class G(Grammar):
start = "E"
generator = parser.GenerateLR0
@rule
def E(self):
return seq(self.F, BOOP)
return seq(self.F, self.BOOP)
@rule
def F(self):
return BEEP | parser.Nothing
return self.BEEP | parser.Nothing
BOOP = Terminal("boop")
BEEP = Terminal("beep")
with pytest.raises(parser.AmbiguityError):
TestGrammar().build_table()
G().build_table()
def test_grammar_aho_ullman_1():
EQUAL = Terminal("=")
STAR = Terminal("*")
ID = Terminal("id")
class TestGrammar(Grammar):
class G(Grammar):
start = "S"
generator = parser.GenerateSLR1
@rule
def S(self):
return seq(self.L, EQUAL, self.R) | self.R
return seq(self.L, self.EQUAL, self.R) | self.R
@rule
def L(self):
return seq(STAR, self.R) | ID
return seq(self.STAR, self.R) | self.ID
@rule
def R(self):
return self.L
with pytest.raises(parser.AmbiguityError):
TestGrammar().build_table()
EQUAL = Terminal("=")
STAR = Terminal("*")
ID = Terminal("id")
TestGrammar().build_table(generator=parser.GenerateLR1)
with pytest.raises(parser.AmbiguityError):
G().build_table()
G().build_table(generator=parser.GenerateLR1)
def test_grammar_aho_ullman_2():
A = Terminal("a")
B = Terminal("b")
class TestGrammar(Grammar):
start = "S"
generator = parser.GenerateSLR1
@ -183,7 +185,10 @@ def test_grammar_aho_ullman_2():
@rule
def X(self):
return seq(A, self.X) | B
return seq(self.A, self.X) | self.B
A = Terminal("a")
B = Terminal("b")
TestGrammar().build_table()
TestGrammar().build_table(generator=parser.GenerateLR1)
@ -191,11 +196,6 @@ def test_grammar_aho_ullman_2():
def test_fun_lalr():
PLUS = Terminal("+")
INT = Terminal("int")
ID = Terminal("id")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
class TestGrammar(Grammar):
start = "S"
@ -207,15 +207,21 @@ def test_fun_lalr():
@rule
def E(self):
return self.F | seq(self.E, PLUS, self.F)
return self.F | seq(self.E, self.PLUS, self.F)
@rule
def F(self):
return self.V | INT | seq(LPAREN, self.E, RPAREN)
return self.V | self.INT | seq(self.LPAREN, self.E, self.RPAREN)
@rule
def V(self):
return ID
return self.ID
PLUS = Terminal("+")
INT = Terminal("int")
ID = Terminal("id")
LPAREN = Terminal("(")
RPAREN = Terminal(")")
TestGrammar().build_table()
@ -234,14 +240,14 @@ def test_conflicting_names():
to understand.
"""
IDENTIFIER = Terminal("Identifier")
class TestGrammar(Grammar):
start = "Identifier"
start = "IDENTIFIER"
@rule("Identifier")
@rule("IDENTIFIER")
def identifier(self):
return IDENTIFIER
return self.IDENTIFIER
IDENTIFIER = Terminal("Identifier")
with pytest.raises(ValueError):
TestGrammar().build_table()

tests/test_lexer.py (new file)

@ -0,0 +1,384 @@
import collections
from hypothesis import assume, example, given
from hypothesis.strategies import integers, lists, tuples
import pytest
from parser import (
EdgeList,
Span,
Grammar,
rule,
Terminal,
compile_lexer,
dump_lexer_table,
Re,
)
from parser.runtime import generic_tokenize
def test_span_intersection():
pairs = [
((1, 3), (2, 4)),
((1, 3), (2, 3)),
((1, 3), (1, 2)),
((1, 3), (0, 2)),
((1, 3), (0, 4)),
]
for a, b in pairs:
left = Span(*a)
right = Span(*b)
assert left.intersects(right)
assert right.intersects(left)
def test_span_no_intersection():
pairs = [
((1, 2), (3, 4)),
]
for a, b in pairs:
left = Span(*a)
right = Span(*b)
assert not left.intersects(right)
assert not right.intersects(left)
def test_span_split():
TC = collections.namedtuple("TC", ["left", "right", "expected"])
cases = [
TC(
left=Span(1, 4),
right=Span(2, 3),
expected=(Span(1, 2), Span(2, 3), Span(3, 4)),
),
TC(
left=Span(1, 4),
right=Span(1, 2),
expected=(None, Span(1, 2), Span(2, 4)),
),
TC(
left=Span(1, 4),
right=Span(3, 4),
expected=(Span(1, 3), Span(3, 4), None),
),
TC(
left=Span(1, 4),
right=Span(1, 4),
expected=(None, Span(1, 4), None),
),
]
for left, right, expected in cases:
result = left.split(right)
assert result == expected
result = right.split(left)
assert result == expected
@given(integers(), integers())
def test_equal_span_mid_only(x, y):
"""Splitting spans against themselves results in an empty lo and hi bound."""
assume(x < y)
span = Span(x, y)
lo, mid, hi = span.split(span)
assert lo is None
assert hi is None
assert mid == span
three_distinct_points = lists(
integers(),
min_size=3,
max_size=3,
unique=True,
).map(sorted)
@given(three_distinct_points)
def test_span_low_align_lo_none(vals):
"""Splitting spans with aligned lower bounds results in an empty lo bound."""
# x y z
# [ a )
# [ b )
x, y, z = vals
a = Span(x, y)
b = Span(x, z)
lo, _, _ = a.split(b)
assert lo is None
@given(three_distinct_points)
def test_span_high_align_hi_none(vals):
"""Splitting spans with aligned lower bounds results in an empty lo bound."""
# x y z
# [ a )
# [ b )
x, y, z = vals
a = Span(y, z)
b = Span(x, z)
_, _, hi = a.split(b)
assert hi is None
four_distinct_points = lists(
integers(),
min_size=4,
max_size=4,
unique=True,
).map(sorted)
@given(four_distinct_points)
def test_span_split_overlapping_lo_left(vals):
"""Splitting two overlapping spans results in lo overlapping left."""
a, b, c, d = vals
left = Span(a, c)
right = Span(b, d)
lo, _, _ = left.split(right)
assert lo is not None
assert lo.intersects(left)
@given(four_distinct_points)
def test_span_split_overlapping_lo_not_right(vals):
"""Splitting two overlapping spans results in lo NOT overlapping right."""
a, b, c, d = vals
left = Span(a, c)
right = Span(b, d)
lo, _, _ = left.split(right)
assert lo is not None
assert not lo.intersects(right)
@given(four_distinct_points)
def test_span_split_overlapping_mid_left(vals):
"""Splitting two overlapping spans results in mid overlapping left."""
a, b, c, d = vals
left = Span(a, c)
right = Span(b, d)
_, mid, _ = left.split(right)
assert mid is not None
assert mid.intersects(left)
@given(four_distinct_points)
def test_span_split_overlapping_mid_right(vals):
"""Splitting two overlapping spans results in mid overlapping right."""
a, b, c, d = vals
left = Span(a, c)
right = Span(b, d)
_, mid, _ = left.split(right)
assert mid is not None
assert mid.intersects(right)
@given(four_distinct_points)
def test_span_split_overlapping_hi_right(vals):
"""Splitting two overlapping spans results in hi overlapping right."""
a, b, c, d = vals
left = Span(a, c)
right = Span(b, d)
_, _, hi = left.split(right)
assert hi is not None
assert hi.intersects(right)
@given(four_distinct_points)
def test_span_split_overlapping_hi_not_left(vals):
"""Splitting two overlapping spans results in hi NOT overlapping left."""
a, b, c, d = vals
left = Span(a, c)
right = Span(b, d)
_, _, hi = left.split(right)
assert hi is not None
assert not hi.intersects(left)
@given(four_distinct_points)
def test_span_split_embedded(vals):
"""Splitting two spans where one overlaps the other."""
a, b, c, d = vals
outer = Span(a, d)
inner = Span(b, c)
lo, mid, hi = outer.split(inner)
assert lo is not None
assert mid is not None
assert hi is not None
assert lo.intersects(outer)
assert not lo.intersects(inner)
assert mid.intersects(outer)
assert mid.intersects(inner)
assert hi.intersects(outer)
assert not hi.intersects(inner)
def test_edge_list_single():
el: EdgeList[str] = EdgeList()
el.add_edge(Span(1, 4), "A")
edges = list(el)
assert edges == [
(Span(1, 4), ["A"]),
]
def test_edge_list_fully_enclosed():
el: EdgeList[str] = EdgeList()
el.add_edge(Span(1, 4), "A")
el.add_edge(Span(2, 3), "B")
edges = list(el)
assert edges == [
(Span(1, 2), ["A"]),
(Span(2, 3), ["A", "B"]),
(Span(3, 4), ["A"]),
]
def test_edge_list_overlap():
el: EdgeList[str] = EdgeList()
el.add_edge(Span(1, 4), "A")
el.add_edge(Span(2, 5), "B")
edges = list(el)
assert edges == [
(Span(1, 2), ["A"]),
(Span(2, 4), ["A", "B"]),
(Span(4, 5), ["B"]),
]
def test_edge_list_no_overlap():
el: EdgeList[str] = EdgeList()
el.add_edge(Span(1, 4), "A")
el.add_edge(Span(5, 8), "B")
edges = list(el)
assert edges == [
(Span(1, 4), ["A"]),
(Span(5, 8), ["B"]),
]
def test_edge_list_no_overlap_ordered():
el: EdgeList[str] = EdgeList()
el.add_edge(Span(5, 8), "B")
el.add_edge(Span(1, 4), "A")
edges = list(el)
assert edges == [
(Span(1, 4), ["A"]),
(Span(5, 8), ["B"]),
]
def test_edge_list_overlap_span():
el: EdgeList[str] = EdgeList()
el.add_edge(Span(1, 3), "A")
el.add_edge(Span(4, 6), "B")
el.add_edge(Span(2, 5), "C")
edges = list(el)
assert edges == [
(Span(1, 2), ["A"]),
(Span(2, 3), ["A", "C"]),
(Span(3, 4), ["C"]),
(Span(4, 5), ["B", "C"]),
(Span(5, 6), ["B"]),
]
def test_edge_list_overlap_span_big():
el: EdgeList[str] = EdgeList()
el.add_edge(Span(2, 3), "A")
el.add_edge(Span(4, 5), "B")
el.add_edge(Span(6, 7), "C")
el.add_edge(Span(1, 8), "D")
edges = list(el)
assert edges == [
(Span(1, 2), ["D"]),
(Span(2, 3), ["A", "D"]),
(Span(3, 4), ["D"]),
(Span(4, 5), ["B", "D"]),
(Span(5, 6), ["D"]),
(Span(6, 7), ["C", "D"]),
(Span(7, 8), ["D"]),
]
@given(lists(lists(integers(), min_size=2, max_size=2, unique=True), min_size=1))
@example(points=[[0, 1], [1, 2]])
def test_edge_list_always_sorted(points: list[tuple[int, int]]):
# OK this is weird but stick with me.
el: EdgeList[str] = EdgeList()
for i, (a, b) in enumerate(points):
lower = min(a, b)
upper = max(a, b)
span = Span(lower, upper)
el.add_edge(span, str(i))
last_upper = None
for span, _ in el:
if last_upper is not None:
assert last_upper <= span.lower, "Edges from list are not sorted"
last_upper = span.upper
def test_lexer_compile():
class LexTest(Grammar):
@rule
def foo(self):
return self.IS
start = foo
IS = Terminal("is")
AS = Terminal("as")
IDENTIFIER = Terminal(
Re.seq(
Re.set(("a", "z"), ("A", "Z"), "_"),
Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(),
)
)
BLANKS = Terminal(Re.set("\r", "\n", "\t", " ").plus())
lexer = compile_lexer(LexTest())
dump_lexer_table(lexer)
tokens = list(generic_tokenize("xy is ass", lexer))
assert tokens == [
(LexTest.IDENTIFIER, 0, 2),
(LexTest.BLANKS, 2, 1),
(LexTest.IS, 3, 2),
(LexTest.BLANKS, 5, 1),
(LexTest.IDENTIFIER, 6, 3),
]