From 58c3004702d07127db5cc7afc41742119a761a19 Mon Sep 17 00:00:00 2001 From: John Doty Date: Fri, 23 Aug 2024 07:24:30 -0700 Subject: [PATCH 1/2] Move terminals into grammar definition Starting to work on machine-generated lexers too --- grammar.py | 394 +++++++++++++++++++----------------- parser/parser.py | 198 ++++++++++++++++-- tests/test_grammar.py | 140 ++++++------- tests/test_lexer.py | 452 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 917 insertions(+), 267 deletions(-) create mode 100644 tests/test_lexer.py diff --git a/grammar.py b/grammar.py index 38299e9..502c924 100644 --- a/grammar.py +++ b/grammar.py @@ -2,57 +2,7 @@ import re import typing -import parser -from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule - -ARROW = Terminal("Arrow") -AS = Terminal("As") -BAR = Terminal("Bar") -CLASS = Terminal("Class") -COLON = Terminal("Colon") -ELSE = Terminal("Else") -FOR = Terminal("For") -FUN = Terminal("Fun") -IDENTIFIER = Terminal("Identifier") -IF = Terminal("If") -IMPORT = Terminal("Import") -IN = Terminal("In") -LCURLY = Terminal("LeftBrace") -LET = Terminal("Let") -RCURLY = Terminal("RightBrace") -RETURN = Terminal("Return") -SEMICOLON = Terminal("Semicolon") -STRING = Terminal("String") -WHILE = Terminal("While") -EQUAL = Terminal("Equal") -LPAREN = Terminal("LeftParen") -RPAREN = Terminal("RightParen") -COMMA = Terminal("Comma") -SELF = Terminal("Selff") -OR = Terminal("Or") -IS = Terminal("Is") -AND = Terminal("And") -EQUALEQUAL = Terminal("EqualEqual") -BANGEQUAL = Terminal("BangEqual") -LESS = Terminal("Less") -GREATER = Terminal("Greater") -LESSEQUAL = Terminal("LessEqual") -GREATEREQUAL = Terminal("GreaterEqual") -PLUS = Terminal("Plus") -MINUS = Terminal("Minus") -STAR = Terminal("Star") -SLASH = Terminal("Slash") -NUMBER = Terminal("Number") -TRUE = Terminal("True") -FALSE = Terminal("False") -BANG = Terminal("Bang") -DOT = Terminal("Dot") -MATCH = Terminal("Match") -EXPORT = Terminal("Export") -UNDERSCORE = 
Terminal("Underscore") -NEW = Terminal("New") -LSQUARE = Terminal("LeftBracket") -RSQUARE = Terminal("RightBracket") +from parser import Assoc, Grammar, Nothing, rule, seq, Rule, Terminal class FineGrammar(Grammar): @@ -62,17 +12,17 @@ class FineGrammar(Grammar): def __init__(self): super().__init__( precedence=[ - (Assoc.RIGHT, [EQUAL]), - (Assoc.LEFT, [OR]), - (Assoc.LEFT, [IS]), - (Assoc.LEFT, [AND]), - (Assoc.LEFT, [EQUALEQUAL, BANGEQUAL]), - (Assoc.LEFT, [LESS, GREATER, GREATEREQUAL, LESSEQUAL]), - (Assoc.LEFT, [PLUS, MINUS]), - (Assoc.LEFT, [STAR, SLASH]), + (Assoc.RIGHT, [self.EQUAL]), + (Assoc.LEFT, [self.OR]), + (Assoc.LEFT, [self.IS]), + (Assoc.LEFT, [self.AND]), + (Assoc.LEFT, [self.EQUALEQUAL, self.BANGEQUAL]), + (Assoc.LEFT, [self.LESS, self.GREATER, self.GREATEREQUAL, self.LESSEQUAL]), + (Assoc.LEFT, [self.PLUS, self.MINUS]), + (Assoc.LEFT, [self.STAR, self.SLASH]), (Assoc.LEFT, [self.primary_expression]), - (Assoc.LEFT, [LPAREN]), - (Assoc.LEFT, [DOT]), + (Assoc.LEFT, [self.LPAREN]), + (Assoc.LEFT, [self.DOT]), # # If there's a confusion about whether to make an IF # statement or an expression, prefer the statement. 
@@ -97,15 +47,15 @@ class FineGrammar(Grammar): @rule def import_statement(self) -> Rule: - return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON) + return seq(self.IMPORT, self.STRING, self.AS, self.IDENTIFIER, self.SEMICOLON) @rule("ClassDeclaration") def class_declaration(self) -> Rule: - return seq(CLASS, IDENTIFIER, self._class_body) + return seq(self.CLASS, self.IDENTIFIER, self._class_body) @rule def _class_body(self) -> Rule: - return seq(LCURLY, RCURLY) | seq(LCURLY, self._class_members, RCURLY) + return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._class_members, self.RCURLY) @rule def _class_members(self) -> Rule: @@ -117,7 +67,7 @@ class FineGrammar(Grammar): @rule("FieldDecl") def field_declaration(self) -> Rule: - return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON) + return seq(self.IDENTIFIER, self.COLON, self.type_expression, self.SEMICOLON) # Types @rule("TypeExpression") @@ -126,60 +76,65 @@ class FineGrammar(Grammar): @rule("AlternateType") def alternate_type(self) -> Rule: - return seq(self.type_expression, OR, self.type_identifier) + return seq(self.type_expression, self.OR, self.type_identifier) @rule("TypeIdentifier") def type_identifier(self) -> Rule: - return IDENTIFIER + return self.IDENTIFIER @rule def export_statement(self) -> Rule: return ( - seq(EXPORT, self.class_declaration) - | seq(EXPORT, self.function_declaration) - | seq(EXPORT, self.let_statement) - | seq(EXPORT, self.export_list, SEMICOLON) + seq(self.EXPORT, self.class_declaration) + | seq(self.EXPORT, self.function_declaration) + | seq(self.EXPORT, self.let_statement) + | seq(self.EXPORT, self.export_list, self.SEMICOLON) ) @rule def export_list(self) -> Rule: - return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list) + return Nothing | self.IDENTIFIER | seq(self.IDENTIFIER, self.COMMA, self.export_list) # Functions @rule("FunctionDecl") def function_declaration(self) -> Rule: - return seq(FUN, IDENTIFIER, self.function_parameters, self.block) 
| seq( - FUN, IDENTIFIER, self.function_parameters, ARROW, self.type_expression, self.block + return seq(self.FUN, self.IDENTIFIER, self.function_parameters, self.block) | seq( + self.FUN, + self.IDENTIFIER, + self.function_parameters, + self.ARROW, + self.type_expression, + self.block, ) @rule("ParamList") def function_parameters(self) -> Rule: return ( - seq(LPAREN, RPAREN) - | seq(LPAREN, self._first_parameter, RPAREN) - | seq(LPAREN, self._first_parameter, COMMA, self._parameter_list, RPAREN) + seq(self.LPAREN, self.RPAREN) + | seq(self.LPAREN, self._first_parameter, self.RPAREN) + | seq(self.LPAREN, self._first_parameter, self.COMMA, self._parameter_list, self.RPAREN) ) @rule def _first_parameter(self) -> Rule: - return SELF | self.parameter + return self.SELF | self.parameter @rule def _parameter_list(self) -> Rule: - return Nothing | self.parameter | seq(self.parameter, COMMA, self._parameter_list) + return Nothing | self.parameter | seq(self.parameter, self.COMMA, self._parameter_list) @rule("Parameter") def parameter(self) -> Rule: - return seq(IDENTIFIER, COLON, self.type_expression) + return seq(self.IDENTIFIER, self.COLON, self.type_expression) # Block @rule("Block") def block(self) -> Rule: return ( - seq(LCURLY, RCURLY) - | seq(LCURLY, self.expression, RCURLY) - | seq(LCURLY, self._statement_list, RCURLY) - | seq(LCURLY, self._statement_list, self.expression, RCURLY) + seq(self.LCURLY, self.RCURLY) + | seq(self.LCURLY, self.expression, self.RCURLY) + | seq(self.LCURLY, self._statement_list, self.RCURLY) + | seq(self.LCURLY, self._statement_list, self.expression, self.RCURLY) ) @rule @@ -200,19 +155,19 @@ class FineGrammar(Grammar): @rule("LetStatement") def let_statement(self) -> Rule: - return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON) + return seq(self.LET, self.IDENTIFIER, self.EQUAL, self.expression, self.SEMICOLON) @rule("ReturnStatement") def return_statement(self) -> Rule: - return seq(RETURN, self.expression, SEMICOLON) | 
seq(RETURN, SEMICOLON) + return seq(self.RETURN, self.expression, self.SEMICOLON) | seq(self.RETURN, self.SEMICOLON) @rule("ForStatement") def for_statement(self) -> Rule: - return seq(FOR, self.iterator_variable, IN, self.expression, self.block) + return seq(self.FOR, self.iterator_variable, self.IN, self.expression, self.block) @rule("IteratorVariable") def iterator_variable(self) -> Rule: - return IDENTIFIER + return self.IDENTIFIER @rule("IfStatement") def if_statement(self) -> Rule: @@ -220,11 +175,11 @@ class FineGrammar(Grammar): @rule def while_statement(self) -> Rule: - return seq(WHILE, self.expression, self.block) + return seq(self.WHILE, self.expression, self.block) @rule def expression_statement(self) -> Rule: - return seq(self.expression, SEMICOLON) + return seq(self.expression, self.SEMICOLON) # Expressions @rule(transparent=True) @@ -234,91 +189,93 @@ class FineGrammar(Grammar): @rule("BinaryExpression") def binary_expression(self) -> Rule: return ( - seq(self.expression, EQUAL, self.expression) - | seq(self.expression, OR, self.expression) - | seq(self.expression, AND, self.expression) - | seq(self.expression, EQUALEQUAL, self.expression) - | seq(self.expression, BANGEQUAL, self.expression) - | seq(self.expression, LESS, self.expression) - | seq(self.expression, LESSEQUAL, self.expression) - | seq(self.expression, GREATER, self.expression) - | seq(self.expression, GREATEREQUAL, self.expression) - | seq(self.expression, PLUS, self.expression) - | seq(self.expression, MINUS, self.expression) - | seq(self.expression, STAR, self.expression) - | seq(self.expression, SLASH, self.expression) + seq(self.expression, self.EQUAL, self.expression) + | seq(self.expression, self.OR, self.expression) + | seq(self.expression, self.AND, self.expression) + | seq(self.expression, self.EQUALEQUAL, self.expression) + | seq(self.expression, self.BANGEQUAL, self.expression) + | seq(self.expression, self.LESS, self.expression) + | seq(self.expression, self.LESSEQUAL, 
self.expression) + | seq(self.expression, self.GREATER, self.expression) + | seq(self.expression, self.GREATEREQUAL, self.expression) + | seq(self.expression, self.PLUS, self.expression) + | seq(self.expression, self.MINUS, self.expression) + | seq(self.expression, self.STAR, self.expression) + | seq(self.expression, self.SLASH, self.expression) ) @rule("IsExpression") def is_expression(self) -> Rule: - return seq(self.expression, IS, self.pattern) + return seq(self.expression, self.IS, self.pattern) @rule def primary_expression(self) -> Rule: return ( self.identifier_expression | self.literal_expression - | SELF - | seq(BANG, self.primary_expression) - | seq(MINUS, self.primary_expression) + | self.SELF + | seq(self.BANG, self.primary_expression) + | seq(self.MINUS, self.primary_expression) | self.block | self.conditional_expression | self.list_constructor_expression | self.object_constructor_expression | self.match_expression - | seq(self.primary_expression, LPAREN, RPAREN) - | seq(self.primary_expression, LPAREN, self._expression_list, RPAREN) - | seq(self.primary_expression, DOT, IDENTIFIER) - | seq(LPAREN, self.expression, RPAREN) + | seq(self.primary_expression, self.LPAREN, self.RPAREN) + | seq(self.primary_expression, self.LPAREN, self._expression_list, self.RPAREN) + | seq(self.primary_expression, self.DOT, self.IDENTIFIER) + | seq(self.LPAREN, self.expression, self.RPAREN) ) @rule("IdentifierExpression") def identifier_expression(self): - return IDENTIFIER + return self.IDENTIFIER @rule("Literal") def literal_expression(self): - return NUMBER | STRING | TRUE | FALSE + return self.NUMBER | self.STRING | self.TRUE | self.FALSE @rule("ConditionalExpression") def conditional_expression(self) -> Rule: return ( - seq(IF, self.expression, self.block) - | seq(IF, self.expression, self.block, ELSE, self.conditional_expression) - | seq(IF, self.expression, self.block, ELSE, self.block) + seq(self.IF, self.expression, self.block) + | seq(self.IF, self.expression, 
self.block, self.ELSE, self.conditional_expression) + | seq(self.IF, self.expression, self.block, self.ELSE, self.block) ) @rule def list_constructor_expression(self) -> Rule: - return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self._expression_list, RSQUARE) + return seq(self.LSQUARE, self.RSQUARE) | seq( + self.LSQUARE, self._expression_list, self.RSQUARE + ) @rule def _expression_list(self) -> Rule: return ( self.expression - | seq(self.expression, COMMA) - | seq(self.expression, COMMA, self._expression_list) + | seq(self.expression, self.COMMA) + | seq(self.expression, self.COMMA, self._expression_list) ) @rule def match_expression(self) -> Rule: - return seq(MATCH, self.expression, self.match_body) + return seq(self.MATCH, self.expression, self.match_body) @rule("MatchBody") def match_body(self) -> Rule: - return seq(LCURLY, RCURLY) | seq(LCURLY, self._match_arms, RCURLY) + return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self._match_arms, self.RCURLY) @rule def _match_arms(self) -> Rule: return ( self.match_arm - | seq(self.match_arm, COMMA) - | seq(self.match_arm, COMMA, self._match_arms) + | seq(self.match_arm, self.COMMA) + | seq(self.match_arm, self.COMMA, self._match_arms) ) @rule("MatchArm") def match_arm(self) -> Rule: - return seq(self.pattern, ARROW, self.expression) + return seq(self.pattern, self.ARROW, self.expression) @rule("Pattern") def pattern(self) -> Rule: @@ -330,7 +287,7 @@ class FineGrammar(Grammar): @rule def _pattern_predicate(self) -> Rule: - return seq(AND, self.expression) + return seq(self.AND, self.expression) @rule def _pattern_core(self) -> Rule: @@ -338,60 +295,116 @@ class FineGrammar(Grammar): @rule("WildcardPattern") def wildcard_pattern(self) -> Rule: - return UNDERSCORE + return self.UNDERSCORE @rule("VariableBinding") def variable_binding(self) -> Rule: - return seq(IDENTIFIER, COLON) + return seq(self.IDENTIFIER, self.COLON) @rule def object_constructor_expression(self) -> Rule: - return seq(NEW, self.type_identifier, 
self.field_list) + return seq(self.NEW, self.type_identifier, self.field_list) @rule def field_list(self) -> Rule: - return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY) + return seq(self.LCURLY, self.RCURLY) | seq(self.LCURLY, self.field_values, self.RCURLY) @rule def field_values(self) -> Rule: return ( self.field_value - | seq(self.field_value, COMMA) - | seq(self.field_value, COMMA, self.field_values) + | seq(self.field_value, self.COMMA) + | seq(self.field_value, self.COMMA, self.field_values) ) @rule def field_value(self) -> Rule: - return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression) + return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression) + + BLANK = Terminal("[ \t\r\n]+", regex=True) + + ARROW = Terminal("->") + AS = Terminal("as") + BAR = Terminal("bar") + CLASS = Terminal("class") + COLON = Terminal("colon") + COMMENT = Terminal("comment") + ELSE = Terminal("else") + FOR = Terminal("for") + FUN = Terminal("fun") + IDENTIFIER = Terminal("[A-Za-z_][A-Za-z0-9_]*", regex=True) + IF = Terminal("if") + IMPORT = Terminal("import") + IN = Terminal("in") + LCURLY = Terminal("{") + LET = Terminal("Let") + RCURLY = Terminal("}") + RETURN = Terminal("return") + SEMICOLON = Terminal(";") + STRING = Terminal('""', regex=True) + WHILE = Terminal("while") + EQUAL = Terminal("=") + LPAREN = Terminal("(") + RPAREN = Terminal(")") + COMMA = Terminal(",") + SELF = Terminal("self", name="SELFF") + OR = Terminal("or") + IS = Terminal("is") + AND = Terminal("and") + EQUALEQUAL = Terminal("==") + BANGEQUAL = Terminal("!=") + LESS = Terminal("<") + GREATER = Terminal(">") + LESSEQUAL = Terminal("<=") + GREATEREQUAL = Terminal(">=") + PLUS = Terminal("+") + MINUS = Terminal("-") + STAR = Terminal("*") + SLASH = Terminal("/") + NUMBER = Terminal("[0-9]+", regex=True) + TRUE = Terminal("true") + FALSE = Terminal("false") + BANG = Terminal("!") + DOT = Terminal(".") + MATCH = Terminal("match") + EXPORT = Terminal("export") + UNDERSCORE = 
Terminal("_") + NEW = Terminal("new") + LSQUARE = Terminal("[") + RSQUARE = Terminal("]") # ----------------------------------------------------------------------------- # DORKY LEXER # ----------------------------------------------------------------------------- +import bisect +import dataclasses + + NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?") IDENTIFIER_RE = re.compile("[_A-Za-z][_A-Za-z0-9]*") KEYWORD_TABLE = { - "_": UNDERSCORE, - "and": AND, - "as": AS, - "class": CLASS, - "else": ELSE, - "export": EXPORT, - "false": FALSE, - "for": FOR, - "fun": FUN, - "if": IF, - "import": IMPORT, - "in": IN, - "is": IS, - "let": LET, - "match": MATCH, - "new": NEW, - "or": OR, - "return": RETURN, - "self": SELF, - "true": TRUE, - "while": WHILE, + "_": FineGrammar.UNDERSCORE, + "and": FineGrammar.AND, + "as": FineGrammar.AS, + "class": FineGrammar.CLASS, + "else": FineGrammar.ELSE, + "export": FineGrammar.EXPORT, + "false": FineGrammar.FALSE, + "for": FineGrammar.FOR, + "fun": FineGrammar.FUN, + "if": FineGrammar.IF, + "import": FineGrammar.IMPORT, + "in": FineGrammar.IN, + "is": FineGrammar.IS, + "let": FineGrammar.LET, + "match": FineGrammar.MATCH, + "new": FineGrammar.NEW, + "or": FineGrammar.OR, + "return": FineGrammar.RETURN, + "self": FineGrammar.SELF, + "true": FineGrammar.TRUE, + "while": FineGrammar.WHILE, } @@ -406,63 +419,63 @@ def tokenize(src: str): token = None if ch == "-": if src[pos : pos + 2] == "->": - token = (ARROW, pos, 2) + token = (FineGrammar.ARROW, pos, 2) else: - token = (MINUS, pos, 1) + token = (FineGrammar.MINUS, pos, 1) elif ch == "|": - token = (BAR, pos, 1) + token = (FineGrammar.BAR, pos, 1) elif ch == ":": - token = (COLON, pos, 1) + token = (FineGrammar.COLON, pos, 1) elif ch == "{": - token = (LCURLY, pos, 1) + token = (FineGrammar.LCURLY, pos, 1) elif ch == "}": - token = (RCURLY, pos, 1) + token = (FineGrammar.RCURLY, pos, 1) elif ch == ";": - token = (SEMICOLON, pos, 1) + token = (FineGrammar.SEMICOLON, pos, 1) elif 
ch == "=": if src[pos : pos + 2] == "==": - token = (EQUALEQUAL, pos, 2) + token = (FineGrammar.EQUALEQUAL, pos, 2) else: - token = (EQUAL, pos, 1) + token = (FineGrammar.EQUAL, pos, 1) elif ch == "(": - token = (LPAREN, pos, 1) + token = (FineGrammar.LPAREN, pos, 1) elif ch == ")": - token = (RPAREN, pos, 1) + token = (FineGrammar.RPAREN, pos, 1) elif ch == ",": - token = (COMMA, pos, 1) + token = (FineGrammar.COMMA, pos, 1) elif ch == "!": if src[pos : pos + 2] == "!=": - token = (BANGEQUAL, pos, 2) + token = (FineGrammar.BANGEQUAL, pos, 2) else: - token = (BANG, pos, 1) + token = (FineGrammar.BANG, pos, 1) elif ch == "<": if src[pos : pos + 2] == "<=": - token = (LESSEQUAL, pos, 2) + token = (FineGrammar.LESSEQUAL, pos, 2) else: - token = (LESS, pos, 1) + token = (FineGrammar.LESS, pos, 1) elif ch == ">": if src[pos : pos + 2] == ">=": - token = (GREATEREQUAL, pos, 2) + token = (FineGrammar.GREATEREQUAL, pos, 2) else: - token = (GREATER, pos, 1) + token = (FineGrammar.GREATER, pos, 1) elif ch == "+": - token = (PLUS, pos, 1) + token = (FineGrammar.PLUS, pos, 1) elif ch == "*": - token = (STAR, pos, 1) + token = (FineGrammar.STAR, pos, 1) elif ch == "/": if src[pos : pos + 2] == "//": @@ -470,16 +483,16 @@ def tokenize(src: str): pos = pos + 1 continue - token = (SLASH, pos, 1) + token = (FineGrammar.SLASH, pos, 1) elif ch == ".": - token = (DOT, pos, 1) + token = (FineGrammar.DOT, pos, 1) elif ch == "[": - token = (LSQUARE, pos, 1) + token = (FineGrammar.LSQUARE, pos, 1) elif ch == "]": - token = (RSQUARE, pos, 1) + token = (FineGrammar.RSQUARE, pos, 1) elif ch == '"' or ch == "'": end = pos + 1 @@ -490,12 +503,12 @@ def tokenize(src: str): if end == len(src): raise Exception(f"Unterminated string constant at {pos}") end += 1 - token = (STRING, pos, end - pos) + token = (FineGrammar.STRING, pos, end - pos) else: number_match = NUMBER_RE.match(src, pos) if number_match: - token = (NUMBER, pos, number_match.end() - pos) + token = (FineGrammar.NUMBER, pos, 
number_match.end() - pos) else: id_match = IDENTIFIER_RE.match(src, pos) if id_match: @@ -504,7 +517,7 @@ def tokenize(src: str): if keyword: token = (keyword, pos, len(fragment)) else: - token = (IDENTIFIER, pos, len(fragment)) + token = (FineGrammar.IDENTIFIER, pos, len(fragment)) if token is None: raise Exception("Token error") @@ -512,9 +525,6 @@ def tokenize(src: str): pos += token[2] -import bisect - - class FineTokens: def __init__(self, src: str): self.src = src @@ -546,4 +556,20 @@ class FineTokens: if __name__ == "__main__": - FineGrammar().build_table() + grammar = FineGrammar() + grammar.build_table() + + class LexTest(Grammar): + @rule + def foo(self): + return self.IS + + start = foo + + IS = Terminal("is") + AS = Terminal("as") + IDENTIFIER = Terminal("[a-z]+", regex=True) + # IDENTIFIER = Terminal("[A-Za-z_][A-Za-z0-9_]*", regex=True) + + lexer = compile_lexer(LexTest()) + dump_lexer_table(lexer) diff --git a/parser/parser.py b/parser/parser.py index d0cb1fc..4d19e29 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -21,19 +21,20 @@ To get started, create a grammar that derives from the `Grammar` class. Create one method per nonterminal, decorated with the `rule` decorator. 
Here's an example: - PLUS = Terminal('+') - LPAREN = Terminal('(') - RPAREN = Terminal(')') - ID = Terminal('id') class SimpleGrammar(Grammar): @rule def expression(self): - return seq(self.expression, PLUS, self.term) | self.term + return seq(self.expression, self.PLUS, self.term) | self.term @rule def term(self): - return seq(LPAREN, self.expression, RPAREN) | ID + return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID + + PLUS = Terminal('+') + LPAREN = Terminal('(') + RPAREN = Terminal(')') + ID = Terminal('id') ## Using grammars @@ -1605,10 +1606,14 @@ class Rule: class Terminal(Rule): """A token, or terminal symbol in the grammar.""" - value: str + value: str | None + pattern: str + regex: bool - def __init__(self, value): - self.value = sys.intern(value) + def __init__(self, pattern, name=None, regex=False): + self.value = name + self.pattern = pattern + self.regex = regex def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]: # We are just ourselves when flattened. @@ -1766,19 +1771,20 @@ class Grammar: Here's an example of a simple grammar: - PLUS = Terminal('+') - LPAREN = Terminal('(') - RPAREN = Terminal(')') - ID = Terminal('id') - class SimpleGrammar(Grammar): @rule def expression(self): - return seq(self.expression, PLUS, self.term) | self.term + return seq(self.expression, self.PLUS, self.term) | self.term @rule def term(self): - return seq(LPAREN, self.expression, RPAREN) | ID + return seq(self.LPAREN, self.expression, self.RPAREN) | self.ID + + PLUS = Terminal('+') + LPAREN = Terminal('(') + RPAREN = Terminal(')') + ID = Terminal('id') + Not very exciting, perhaps, but it's something. 
""" @@ -1786,6 +1792,7 @@ class Grammar: _precedence: dict[str, typing.Tuple[Assoc, int]] _start: str _generator: type[GenerateLR0] + _terminals: list[Terminal] def __init__( self, @@ -1809,6 +1816,14 @@ class Grammar: generator = getattr(self, "generator", GenerateLALR) assert generator is not None + # Fixup terminal names with the name of the member that declared it. + terminals = [] + for n, t in inspect.getmembers(self, lambda x: isinstance(x, Terminal)): + if t.value is None: + t.value = n + terminals.append(t) + + # Fix up the precedence table. precedence_table = {} for prec, (associativity, symbols) in enumerate(precedence): for symbol in symbols: @@ -1824,6 +1839,11 @@ class Grammar: self._precedence = precedence_table self._start = start self._generator = generator + self._terminals = terminals + + @property + def terminals(self) -> list[Terminal]: + return self._terminals def generate_nonterminal_dict( self, start: str | None = None @@ -1911,3 +1931,149 @@ class Grammar: gen = generator(start, desugared, precedence=self._precedence, transparents=transparents) table = gen.gen_table() return table + + +############################################################################### +# Lexer support +############################################################################### +# For machine-generated lexers + + +@dataclasses.dataclass(frozen=True, slots=True) +class Span: + lower: int # inclusive + upper: int # exclusive + + @classmethod + def from_str(cls, c: str) -> "Span": + return Span(lower=ord(c), upper=ord(c) + 1) + + def intersects(self, other: "Span") -> bool: + return self.lower < other.upper and self.upper > other.lower + + def split(self, other: "Span") -> tuple["Span|None", "Span", "Span|None"]: + assert self.intersects(other) + + first = min(self.lower, other.lower) + second = max(self.lower, other.lower) + third = min(self.upper, other.upper) + fourth = max(self.upper, other.upper) + + low = Span(first, second) if first != second else None + 
mid = Span(second, third) + hi = Span(third, fourth) if third != fourth else None + + return (low, mid, hi) + + def __str__(self) -> str: + if self.upper - self.lower == 1: + return str(self.lower) + + lower = str(self.lower) + upper = str(self.upper) + return f"[{lower}-{upper})" + + def __lt__(self, other: "Span") -> bool: + return self.lower < other.lower + + +ET = typing.TypeVar("ET") + + +class EdgeList[ET]: + """A list of edge transitions, keyed by *span*. A given span can have + multiple targets, because this supports NFAs.""" + + _edges: list[tuple[Span, list[ET]]] + + def __init__(self): + self._edges = [] + + def __iter__(self) -> typing.Iterator[tuple[Span, list[ET]]]: + return iter(self._edges) + + def __repr__(self) -> str: + return f"EdgeList[{','.join(str(s[0]) + '->' + repr(s[1]) for s in self._edges)}]" + + def add_edge(self, c: Span, s: ET): + """Add an edge for the given span to the list. If there are already + spans that overlap this one, split and generating multiple distinct + edges. + """ + # print(f" Adding {c}->{s} to {self}...") + # Look to see where we would put this span based solely on a + # sort of lower bounds. + point = bisect.bisect_left(self._edges, c, key=lambda x: x[0]) + + # If this is not the first span in the list then we might + # overlap with the span to our left.... + if point > 0: + left_point = point - 1 + left_span, left_targets = self._edges[left_point] + if c.intersects(left_span): + # ...if we intersect with the span to our left then we + # must split the span to our left with regards to our + # span. Then we have three target spans: + # + # - The lo one, which just has the targets from the old + # left span. (This may be empty if we overlap the + # left one completely on the left side.) + # + # - The mid one, which has both the targets from the + # old left and the new target. + # + # - The hi one, which if it exists only has our target. 
+ # If it exists it basically replaces the current span + # for our future processing. (If not, then our span + # is completely subsumed into the left span and we + # can stop.) + # + del self._edges[left_point] + lo, mid, hi = c.split(left_span) + # print(f" <- {c} splits {left_span} -> {lo}, {mid}, {hi} @{left_point}") + self._edges.insert(left_point, (mid, left_targets + [s])) + if lo is not None: + self._edges.insert(left_point, (lo, left_targets)) + if hi is None or not hi.intersects(c): + # Yup, completely subsumed. + # print(f" result: {self} (left out)") + return + + # Continue processing with `c` as the hi split from the + # left. If the left and right spans abut each other then + # `c` will be subsumed in our right span. + c = hi + + # If point is not at the very end of the list then it might + # overlap the span to our right... + if point < len(self._edges): + right_span, right_targets = self._edges[point] + if c.intersects(right_span): + # ...this is similar to the left case, above, except the + # lower bound has the targets that our only ours, etc. + del self._edges[point] + lo, mid, hi = c.split(right_span) + # print(f" -> {c} splits {right_span} -> {lo}, {mid}, {hi} @{point}") + if hi is not None: + self._edges.insert(point, (hi, right_targets)) + self._edges.insert(point, (mid, right_targets + [s])) + if lo is None or not lo.intersects(c): + # Our span is completely subsumed on the lower side + # of the range; there is no lower side that just has + # our targets. Bail now. + # print(f" result: {self} (right out)") + return + + # Continue processing with `c` as the lo split, since + # that's the one that has only the specified state as the + # target. + c = lo + + # If we made it here then either we have a point that does not + # intersect at all, or it only partially intersects on either the + # left or right. 
Either way, we have ensured that: + # + # - c doesn't intersect with left or right (any more) + # - point is where it should go + self._edges.insert(point, (c, [s])) + # print(f" result: {self} (done)") diff --git a/tests/test_grammar.py b/tests/test_grammar.py index a320e06..26e5057 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -38,25 +38,27 @@ def _tree(treeform) -> runtime.Tree | runtime.TokenValue: def test_lr0_lr0(): """An LR0 grammar should work with an LR0 generator.""" - PLUS = Terminal("+") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - IDENTIFIER = Terminal("id") - - class LR0Grammar(Grammar): + class G(Grammar): start = "E" generator = parser.GenerateLR0 @rule def E(self): - return seq(self.E, PLUS, self.T) | self.T + return seq(self.E, self.PLUS, self.T) | self.T @rule def T(self): - return seq(LPAREN, self.E, RPAREN) | IDENTIFIER + return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER - table = LR0Grammar().build_table() - tree, errors = runtime.Parser(table).parse(Tokens(IDENTIFIER, PLUS, LPAREN, IDENTIFIER, RPAREN)) + PLUS = Terminal("+", name="+") + LPAREN = Terminal("(", name="(") + RPAREN = Terminal(")", name=")") + IDENTIFIER = Terminal("id", name="id") + + table = G().build_table() + tree, errors = runtime.Parser(table).parse( + Tokens(G.IDENTIFIER, G.PLUS, G.LPAREN, G.IDENTIFIER, G.RPAREN) + ) assert errors == [] assert tree == _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")"))) @@ -65,114 +67,114 @@ def test_lr0_lr0(): def test_lr0_shift_reduce(): """This one should not work in LR0- it has a shift/reduce conflict, but works in SLR1.""" - PLUS = Terminal("+") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - LSQUARE = Terminal("[") - RSQUARE = Terminal("]") - IDENTIFIER = Terminal("id") - - class TestGrammar(Grammar): + class G(Grammar): start = "E" generator = parser.GenerateLR0 @rule def E(self): - return seq(self.E, PLUS, self.T) | self.T + return seq(self.E, self.PLUS, self.T) | 
self.T @rule def T(self): return ( - seq(LPAREN, self.E, RPAREN) | IDENTIFIER | seq(IDENTIFIER, LSQUARE, self.E, RSQUARE) + seq(self.LPAREN, self.E, self.RPAREN) + | self.IDENTIFIER + | seq(self.IDENTIFIER, self.LSQUARE, self.E, self.RSQUARE) ) - with pytest.raises(parser.AmbiguityError): - TestGrammar().build_table() + PLUS = Terminal("+") + LPAREN = Terminal("(") + RPAREN = Terminal(")") + LSQUARE = Terminal("[") + RSQUARE = Terminal("]") + IDENTIFIER = Terminal("id") - TestGrammar().build_table(generator=parser.GenerateSLR1) + with pytest.raises(parser.AmbiguityError): + G().build_table() + + G().build_table(generator=parser.GenerateSLR1) def test_lr0_reduce_reduce(): """This one should not work, it has a reduce-reduce conflict.""" - PLUS = Terminal("+") - EQUAL = Terminal("=") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - IDENTIFIER = Terminal("id") - - class TestGrammar(Grammar): + class G(Grammar): start = "E" generator = parser.GenerateLR0 @rule def E(self): - return seq(self.E, PLUS, self.T) | self.T | seq(self.V, EQUAL, self.E) + return seq(self.E, self.PLUS, self.T) | self.T | seq(self.V, self.EQUAL, self.E) @rule def T(self): - return seq(LPAREN, self.E, RPAREN) | IDENTIFIER + return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER @rule def V(self): - return IDENTIFIER + return self.IDENTIFIER + + PLUS = Terminal("+") + EQUAL = Terminal("=") + LPAREN = Terminal("(") + RPAREN = Terminal(")") + IDENTIFIER = Terminal("id") with pytest.raises(parser.AmbiguityError): - TestGrammar().build_table() + G().build_table() def test_lr0_empty(): """LR0 can't handle empty productions because it doesn't know when to reduce.""" - BOOP = Terminal("boop") - BEEP = Terminal("beep") - class TestGrammar(Grammar): + class G(Grammar): start = "E" generator = parser.GenerateLR0 @rule def E(self): - return seq(self.F, BOOP) + return seq(self.F, self.BOOP) @rule def F(self): - return BEEP | parser.Nothing + return self.BEEP | parser.Nothing + + BOOP = 
Terminal("boop") + BEEP = Terminal("beep") with pytest.raises(parser.AmbiguityError): - TestGrammar().build_table() + G().build_table() def test_grammar_aho_ullman_1(): - EQUAL = Terminal("=") - STAR = Terminal("*") - ID = Terminal("id") - - class TestGrammar(Grammar): + class G(Grammar): start = "S" generator = parser.GenerateSLR1 @rule def S(self): - return seq(self.L, EQUAL, self.R) | self.R + return seq(self.L, self.EQUAL, self.R) | self.R @rule def L(self): - return seq(STAR, self.R) | ID + return seq(self.STAR, self.R) | self.ID @rule def R(self): return self.L - with pytest.raises(parser.AmbiguityError): - TestGrammar().build_table() + EQUAL = Terminal("=") + STAR = Terminal("*") + ID = Terminal("id") - TestGrammar().build_table(generator=parser.GenerateLR1) + with pytest.raises(parser.AmbiguityError): + G().build_table() + + G().build_table(generator=parser.GenerateLR1) def test_grammar_aho_ullman_2(): - A = Terminal("a") - B = Terminal("b") - class TestGrammar(Grammar): start = "S" generator = parser.GenerateSLR1 @@ -183,7 +185,10 @@ def test_grammar_aho_ullman_2(): @rule def X(self): - return seq(A, self.X) | B + return seq(self.A, self.X) | self.B + + A = Terminal("a") + B = Terminal("b") TestGrammar().build_table() TestGrammar().build_table(generator=parser.GenerateLR1) @@ -191,11 +196,6 @@ def test_grammar_aho_ullman_2(): def test_fun_lalr(): - PLUS = Terminal("+") - INT = Terminal("int") - ID = Terminal("id") - LPAREN = Terminal("(") - RPAREN = Terminal(")") class TestGrammar(Grammar): start = "S" @@ -207,15 +207,21 @@ def test_fun_lalr(): @rule def E(self): - return self.F | seq(self.E, PLUS, self.F) + return self.F | seq(self.E, self.PLUS, self.F) @rule def F(self): - return self.V | INT | seq(LPAREN, self.E, RPAREN) + return self.V | self.INT | seq(self.LPAREN, self.E, self.RPAREN) @rule def V(self): - return ID + return self.ID + + PLUS = Terminal("+") + INT = Terminal("int") + ID = Terminal("id") + LPAREN = Terminal("(") + RPAREN = Terminal(")") 
TestGrammar().build_table() @@ -234,14 +240,14 @@ def test_conflicting_names(): to understand. """ - IDENTIFIER = Terminal("Identifier") - class TestGrammar(Grammar): - start = "Identifier" + start = "IDENTIFIER" - @rule("Identifier") + @rule("IDENTIFIER") def identifier(self): - return IDENTIFIER + return self.IDENTIFIER + + IDENTIFIER = Terminal("Identifier") with pytest.raises(ValueError): TestGrammar().build_table() diff --git a/tests/test_lexer.py b/tests/test_lexer.py new file mode 100644 index 0000000..b082889 --- /dev/null +++ b/tests/test_lexer.py @@ -0,0 +1,452 @@ +from parser import Span + +# LexerTable = list[tuple[Terminal | None, list[tuple[Span, int]]]] + + +# def compile_lexer(x: Grammar) -> LexerTable: + +# class State: +# """An NFA state. Each state can be the accept state, with one or more +# Terminals as the result.""" + +# accept: list[Terminal] +# epsilons: list["State"] +# _edges: EdgeList["State"] + +# def __init__(self): +# self.accept = [] +# self.epsilons = [] +# self._edges = EdgeList() + +# def __repr__(self): +# return f"State{id(self)}" + +# def edges(self) -> typing.Iterable[tuple[Span, list["State"]]]: +# return self._edges + +# def add_edge(self, c: Span, s: "State") -> "State": +# self._edges.add_edge(c, s) +# return s + +# def dump_graph(self, name="nfa.dot"): +# with open(name, "w", encoding="utf8") as f: +# f.write("digraph G {\n") + +# stack: list[State] = [self] +# visited = set() +# while len(stack) > 0: +# state = stack.pop() +# if state in visited: +# continue +# visited.add(state) + +# label = ", ".join([t.value for t in state.accept if t.value is not None]) +# f.write(f' {id(state)} [label="{label}"];\n') +# for target in state.epsilons: +# stack.append(target) +# f.write(f' {id(state)} -> {id(target)} [label="\u03B5"];\n') + +# for span, targets in state.edges(): +# label = str(span).replace('"', '\\"') +# for target in targets: +# stack.append(target) +# f.write(f' {id(state)} -> {id(target)} [label="{label}"];\n') + 
+# f.write("}\n") + +# @dataclasses.dataclass +# class RegexNode: +# def to_nfa(self, start: State) -> State: +# del start +# raise NotImplementedError() + +# def __str__(self) -> str: +# raise NotImplementedError() + +# @dataclasses.dataclass +# class RegexLiteral(RegexNode): +# values: list[tuple[str, str]] + +# def to_nfa(self, start: State) -> State: +# end = State() +# for s, e in self.values: +# start.add_edge(Span(ord(s), ord(e)), end) +# return end + +# def __str__(self) -> str: +# if len(self.values) == 1: +# start, end = self.values[0] +# if start == end: +# return start + +# ranges = [] +# for start, end in self.values: +# if start == end: +# ranges.append(start) +# else: +# ranges.append(f"{start}-{end}") +# return "![{}]".format("".join(ranges)) + +# @dataclasses.dataclass +# class RegexPlus(RegexNode): +# child: RegexNode + +# def to_nfa(self, start: State) -> State: +# end = self.child.to_nfa(start) +# end.epsilons.append(start) +# return end + +# def __str__(self) -> str: +# return f"({self.child})+" + +# @dataclasses.dataclass +# class RegexStar(RegexNode): +# child: RegexNode + +# def to_nfa(self, start: State) -> State: +# end = self.child.to_nfa(start) +# end.epsilons.append(start) +# start.epsilons.append(end) +# return end + +# def __str__(self) -> str: +# return f"({self.child})*" + +# @dataclasses.dataclass +# class RegexQuestion(RegexNode): +# child: RegexNode + +# def to_nfa(self, start: State) -> State: +# end = self.child.to_nfa(start) +# start.epsilons.append(end) +# return end + +# def __str__(self) -> str: +# return f"({self.child})?" 
+ +# @dataclasses.dataclass +# class RegexSequence(RegexNode): +# left: RegexNode +# right: RegexNode + +# def to_nfa(self, start: State) -> State: +# mid = self.left.to_nfa(start) +# return self.right.to_nfa(mid) + +# def __str__(self) -> str: +# return f"{self.left}{self.right}" + +# @dataclasses.dataclass +# class RegexAlternation(RegexNode): +# left: RegexNode +# right: RegexNode + +# def to_nfa(self, start: State) -> State: +# left_start = State() +# start.epsilons.append(left_start) +# left_end = self.left.to_nfa(left_start) + +# right_start = State() +# start.epsilons.append(right_start) +# right_end = self.right.to_nfa(right_start) + +# end = State() +# left_end.epsilons.append(end) +# right_end.epsilons.append(end) + +# return end + +# def __str__(self) -> str: +# return f"(({self.left})||({self.right}))" + +# class RegexParser: +# # TODO: HANDLE ALTERNATION AND PRECEDENCE (CONCAT HAS HIGHEST PRECEDENCE) +# PREFIX: dict[str, typing.Callable[[str], RegexNode]] +# POSTFIX: dict[str, typing.Callable[[RegexNode, int], RegexNode]] +# BINDING: dict[str, tuple[int, int]] + +# index: int +# pattern: str + +# def __init__(self, pattern: str): +# self.PREFIX = { +# "(": self.parse_group, +# "[": self.parse_set, +# } +# self.POSTFIX = { +# "+": self.parse_plus, +# "*": self.parse_star, +# "?": self.parse_question, +# "|": self.parse_alternation, +# } + +# self.BINDING = { +# "|": (1, 1), +# "+": (2, 2), +# "*": (2, 2), +# "?": (2, 2), +# ")": (-1, -1), # Always stop parsing on ) +# } + +# self.index = 0 +# self.pattern = pattern + +# def consume(self) -> str: +# if self.index >= len(self.pattern): +# raise ValueError(f"Unable to parse regular expression '{self.pattern}'") +# result = self.pattern[self.index] +# self.index += 1 +# return result + +# def peek(self) -> str | None: +# if self.index >= len(self.pattern): +# return None +# return self.pattern[self.index] + +# def eof(self) -> bool: +# return self.index >= len(self.pattern) + +# def expect(self, ch: str): 
+# actual = self.consume() +# if ch != actual: +# raise ValueError(f"Expected '{ch}'") + +# def parse_regex(self, minimum_binding=0) -> RegexNode: +# ch = self.consume() +# parser = self.PREFIX.get(ch, self.parse_single) +# node = parser(ch) + +# while not self.eof(): +# ch = self.peek() +# assert ch is not None + +# lp, rp = self.BINDING.get(ch, (minimum_binding, minimum_binding)) +# if lp < minimum_binding: +# break + +# parser = self.POSTFIX.get(ch, self.parse_concat) +# node = parser(node, rp) + +# return node + +# def parse_single(self, ch: str) -> RegexNode: +# return RegexLiteral(values=[(ch, ch)]) + +# def parse_group(self, ch: str) -> RegexNode: +# del ch + +# node = self.parse_regex() +# self.expect(")") +# return node + +# def parse_set(self, ch: str) -> RegexNode: +# del ch + +# # TODO: INVERSION? +# ranges = [] +# while self.peek() not in (None, "]"): +# start = self.consume() +# if self.peek() == "-": +# self.consume() +# end = self.consume() +# else: +# end = start +# ranges.append((start, end)) + +# self.expect("]") +# return RegexLiteral(values=ranges) + +# def parse_alternation(self, node: RegexNode, rp: int) -> RegexNode: +# return RegexAlternation(left=node, right=self.parse_regex(rp)) + +# def parse_plus(self, left: RegexNode, rp: int) -> RegexNode: +# del rp +# self.expect("+") +# return RegexPlus(child=left) + +# def parse_star(self, left: RegexNode, rp: int) -> RegexNode: +# del rp +# self.expect("*") +# return RegexStar(child=left) + +# def parse_question(self, left: RegexNode, rp: int) -> RegexNode: +# del rp +# self.expect("?") +# return RegexQuestion(child=left) + +# def parse_concat(self, left: RegexNode, rp: int) -> RegexNode: +# return RegexSequence(left, self.parse_regex(rp)) + +# class SuperState: +# states: frozenset[State] +# index: int + +# def __init__(self, states: typing.Iterable[State]): +# # Close over the given states, including every state that is +# # reachable by epsilon-transition. 
+# stack = list(states) +# result = set() +# while len(stack) > 0: +# st = stack.pop() +# if st in result: +# continue +# result.add(st) +# stack.extend(st.epsilons) + +# self.states = frozenset(result) +# self.index = -1 + +# def __eq__(self, other): +# if not isinstance(other, SuperState): +# return False +# return self.states == other.states + +# def __hash__(self) -> int: +# return hash(self.states) + +# def edges(self) -> list[tuple[Span, "SuperState"]]: +# working: EdgeList[list[State]] = EdgeList() +# for st in self.states: +# for span, targets in st.edges(): +# working.add_edge(span, targets) + +# # EdgeList maps span to list[list[State]] which we want to flatten. +# result = [] +# for span, stateses in working: +# s: list[State] = [] +# for states in stateses: +# s.extend(states) + +# result.append((span, SuperState(s))) + +# return result + +# def accept_terminal(self) -> Terminal | None: +# accept = None +# for st in self.states: +# for ac in st.accept: +# if accept is None: +# accept = ac +# elif accept.value != ac.value: +# if accept.regex and not ac.regex: +# accept = ac +# elif ac.regex and not accept.regex: +# pass +# else: +# raise ValueError( +# f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')" +# ) + +# return accept + +# # Parse the terminals all together into a big NFA rooted at `NFA`. +# NFA = State() +# for token in x.terminals: +# start = State() +# NFA.epsilons.append(start) + +# if token.regex: +# node = RegexParser(token.pattern).parse_regex() +# print(f" Parsed {token.pattern} to {node}") +# ending = node.to_nfa(start) + +# else: +# ending = start +# for c in token.pattern: +# ending = ending.add_edge(Span.from_str(c), State()) + +# ending.accept.append(token) + +# NFA.dump_graph() + +# # Convert the NFA into a DFA in the most straightforward way (by tracking +# # sets of state closures, called SuperStates.) 
+# DFA: dict[SuperState, list[tuple[Span, SuperState]]] = {} +# stack = [SuperState([NFA])] +# while len(stack) > 0: +# ss = stack.pop() +# if ss in DFA: +# continue + +# edges = ss.edges() + +# DFA[ss] = edges +# for _, target in edges: +# stack.append(target) + +# for i, k in enumerate(DFA): +# k.index = i + +# return [ +# ( +# ss.accept_terminal(), +# [(k, v.index) for k, v in edges], +# ) +# for ss, edges in DFA.items() +# ] + + +# def dump_lexer_table(table: LexerTable): +# with open("lexer.dot", "w", encoding="utf-8") as f: +# f.write("digraph G {\n") +# for index, (accept, edges) in enumerate(table): +# label = accept.value if accept is not None else "" +# f.write(f' {index} [label="{label}"];\n') +# for span, target in edges: +# label = str(span).replace('"', '\\"') +# f.write(f' {index} -> {target} [label="{label}"];\n') + +# pass +# f.write("}\n") + + +# def generic_tokenize(src: str, table: LexerTable): +# pos = 0 +# state = 0 +# start = 0 +# last_accept = None +# last_accept_pos = 0 + +# while pos < len(src): +# accept, edges = table[state] +# if accept is not None: +# last_accept = accept +# last_accept_pos = pos + 1 + +# char = ord(src[pos]) + +# # Find the index of the span where the upper value is the tightest +# # bound on the character. +# index = bisect.bisect_left(edges, char, key=lambda x: x[0].upper) +# # If the character is greater than or equal to the lower bound we +# # found then we have a hit, otherwise no. 
+# state = edges[index][1] if index < len(edges) and char >= edges[index][0].lower else None +# if state is None: +# if last_accept is None: +# raise Exception(f"Token error at {pos}") + +# yield (last_accept, start, last_accept_pos - start) + +# last_accept = None +# pos = last_accept_pos +# start = pos +# state = 0 + +# else: +# pos += 1 + + +def test_span_intersection(): + pairs = [ + ((1, 3), (2, 4)), + ((1, 3), (2, 3)), + ((1, 3), (1, 2)), + ((1, 3), (0, 2)), + ((1, 3), (0, 4)), + ] + + for a, b in pairs: + left = Span(*a) + right = Span(*b) + assert left.intersects(right) + assert right.intersects(left) From 72052645d6088c5d70358546f55d4e694ee78913 Mon Sep 17 00:00:00 2001 From: John Doty Date: Fri, 23 Aug 2024 15:32:35 -0700 Subject: [PATCH 2/2] Generated lexers actually kinda work But regular expressions are underpowered and verbose --- grammar.py | 40 +-- parser/parser.py | 558 ++++++++++++++++++++++++++----- parser/runtime.py | 55 +++ pdm.lock | 51 ++- pyproject.toml | 1 + tests/test_lexer.py | 796 ++++++++++++++++++++------------------------ 6 files changed, 957 insertions(+), 544 deletions(-) diff --git a/grammar.py b/grammar.py index 502c924..69f4de8 100644 --- a/grammar.py +++ b/grammar.py @@ -2,7 +2,17 @@ import re import typing -from parser import Assoc, Grammar, Nothing, rule, seq, Rule, Terminal +from parser import ( + Assoc, + Grammar, + Nothing, + rule, + seq, + Rule, + Terminal, + Re, +) +from parser.parser import compile_lexer, dump_lexer_table class FineGrammar(Grammar): @@ -321,7 +331,7 @@ class FineGrammar(Grammar): def field_value(self) -> Rule: return self.IDENTIFIER | seq(self.IDENTIFIER, self.COLON, self.expression) - BLANK = Terminal("[ \t\r\n]+", regex=True) + BLANK = Terminal(Re.set(" ", "\t", "\r", "\n").plus()) ARROW = Terminal("->") AS = Terminal("as") @@ -332,7 +342,12 @@ class FineGrammar(Grammar): ELSE = Terminal("else") FOR = Terminal("for") FUN = Terminal("fun") - IDENTIFIER = Terminal("[A-Za-z_][A-Za-z0-9_]*", regex=True) + 
IDENTIFIER = Terminal( + Re.seq( + Re.set(("a", "z"), ("A", "Z"), "_"), + Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), + ) + ) IF = Terminal("if") IMPORT = Terminal("import") IN = Terminal("in") @@ -341,7 +356,7 @@ class FineGrammar(Grammar): RCURLY = Terminal("}") RETURN = Terminal("return") SEMICOLON = Terminal(";") - STRING = Terminal('""', regex=True) + STRING = Terminal('""') # TODO WHILE = Terminal("while") EQUAL = Terminal("=") LPAREN = Terminal("(") @@ -361,7 +376,7 @@ class FineGrammar(Grammar): MINUS = Terminal("-") STAR = Terminal("*") SLASH = Terminal("/") - NUMBER = Terminal("[0-9]+", regex=True) + NUMBER = Terminal(Re.set(("0", "9")).plus()) TRUE = Terminal("true") FALSE = Terminal("false") BANG = Terminal("!") @@ -378,7 +393,6 @@ class FineGrammar(Grammar): # DORKY LEXER # ----------------------------------------------------------------------------- import bisect -import dataclasses NUMBER_RE = re.compile("[0-9]+(\\.[0-9]*([eE][-+]?[0-9]+)?)?") @@ -559,17 +573,5 @@ if __name__ == "__main__": grammar = FineGrammar() grammar.build_table() - class LexTest(Grammar): - @rule - def foo(self): - return self.IS - - start = foo - - IS = Terminal("is") - AS = Terminal("as") - IDENTIFIER = Terminal("[a-z]+", regex=True) - # IDENTIFIER = Terminal("[A-Za-z_][A-Za-z0-9_]*", regex=True) - - lexer = compile_lexer(LexTest()) + lexer = compile_lexer(grammar) dump_lexer_table(lexer) diff --git a/parser/parser.py b/parser/parser.py index 4d19e29..8a23d4e 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -131,13 +131,13 @@ May 2024 """ import abc +import bisect import collections import dataclasses import enum import functools import inspect import json -import sys import typing @@ -1607,18 +1607,19 @@ class Terminal(Rule): """A token, or terminal symbol in the grammar.""" value: str | None - pattern: str - regex: bool + pattern: "str | Re" - def __init__(self, pattern, name=None, regex=False): + def __init__(self, pattern, name=None): self.value = name 
self.pattern = pattern - self.regex = regex def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]: # We are just ourselves when flattened. yield [self] + def __repr__(self) -> str: + return self.value or "???" + class NonTerminal(Rule): """A non-terminal, or a production, in the grammar. @@ -1945,14 +1946,65 @@ class Span: upper: int # exclusive @classmethod - def from_str(cls, c: str) -> "Span": - return Span(lower=ord(c), upper=ord(c) + 1) + def from_str(cls, lower: str, upper: str | None = None) -> "Span": + lo = ord(lower) + if upper is None: + hi = lo + 1 + else: + hi = ord(upper) + 1 + + return Span(lower=lo, upper=hi) + + def __len__(self) -> int: + return self.upper - self.lower def intersects(self, other: "Span") -> bool: + """Determine if this span intersects the other span.""" return self.lower < other.upper and self.upper > other.lower - def split(self, other: "Span") -> tuple["Span|None", "Span", "Span|None"]: - assert self.intersects(other) + def split(self, other: "Span") -> tuple["Span|None", "Span|None", "Span|None"]: + """Split two possibly-intersecting spans into three regions: a low + region, which covers just the lower part of the union, a mid region, + which covers the intersection, and a hi region, which covers just the + upper part of the union. + + Together, low and high cover the union of the two spans. Mid covers + the intersection. The implication is that if both spans are identical + then the low and high regions will both be None and mid will be equal + to both. 
+ + Graphically, given two spans A and B: + + [ B ) + [ A ) + [ lo )[ mid )[ hi ) + + If the lower bounds align then the `lo` region is empty: + + [ B ) + [ A ) + [ mid )[ hi ) + + If the upper bounds align then the `hi` region is empty: + + [ B ) + [ A ) + [ lo )[ mid ) + + If both bounds align then both are empty: + + [ B ) + [ A ) + [ mid ) + + split is reflexive: it doesn't matter which order you split things in, + you will always get the same output spans, in the same order. + """ + if not self.intersects(other): + if self.lower < other.lower: + return (self, None, other) + else: + return (other, None, self) first = min(self.lower, other.lower) second = max(self.lower, other.lower) @@ -1966,23 +2018,14 @@ class Span: return (low, mid, hi) def __str__(self) -> str: - if self.upper - self.lower == 1: - return str(self.lower) - - lower = str(self.lower) - upper = str(self.upper) - return f"[{lower}-{upper})" - - def __lt__(self, other: "Span") -> bool: - return self.lower < other.lower + return f"[{self.lower}-{self.upper})" ET = typing.TypeVar("ET") class EdgeList[ET]: - """A list of edge transitions, keyed by *span*. A given span can have - multiple targets, because this supports NFAs.""" + """A list of edge transitions, keyed by *span*.""" _edges: list[tuple[Span, list[ET]]] @@ -2000,80 +2043,415 @@ class EdgeList[ET]: spans that overlap this one, split and generating multiple distinct edges. """ - # print(f" Adding {c}->{s} to {self}...") - # Look to see where we would put this span based solely on a - # sort of lower bounds. - point = bisect.bisect_left(self._edges, c, key=lambda x: x[0]) + our_targets = [s] - # If this is not the first span in the list then we might - # overlap with the span to our left.... - if point > 0: - left_point = point - 1 - left_span, left_targets = self._edges[left_point] - if c.intersects(left_span): - # ...if we intersect with the span to our left then we - # must split the span to our left with regards to our - # span. 
Then we have three target spans: - # - # - The lo one, which just has the targets from the old - # left span. (This may be empty if we overlap the - # left one completely on the left side.) - # - # - The mid one, which has both the targets from the - # old left and the new target. - # - # - The hi one, which if it exists only has our target. - # If it exists it basically replaces the current span - # for our future processing. (If not, then our span - # is completely subsumed into the left span and we - # can stop.) - # - del self._edges[left_point] - lo, mid, hi = c.split(left_span) - # print(f" <- {c} splits {left_span} -> {lo}, {mid}, {hi} @{left_point}") - self._edges.insert(left_point, (mid, left_targets + [s])) - if lo is not None: - self._edges.insert(left_point, (lo, left_targets)) - if hi is None or not hi.intersects(c): - # Yup, completely subsumed. - # print(f" result: {self} (left out)") - return + # Look to see where we would put this span based solely on a sort of + # lower bounds: find the lowest upper bound that is greater than the + # lower bound of the incoming span. + point = bisect.bisect_right(self._edges, c.lower, key=lambda x: x[0].upper) - # Continue processing with `c` as the hi split from the - # left. If the left and right spans abut each other then - # `c` will be subsumed in our right span. - c = hi + # We might need to run this in multiple iterations because we keep + # splitting against the *lowest* matching span. + next_span: Span | None = c + while next_span is not None: + c = next_span + next_span = None - # If point is not at the very end of the list then it might - # overlap the span to our right... - if point < len(self._edges): + # print(f" incoming: {self} @ {point} <- {c}->[{s}]") + + # Check to see if we've run off the end of the list of spans. + if point == len(self._edges): + self._edges.insert(point, (c, [s])) + # print(f" trivial end: {self}") + return + + # Nope, pull out the span to the right of us. 
right_span, right_targets = self._edges[point] - if c.intersects(right_span): - # ...this is similar to the left case, above, except the - # lower bound has the targets that our only ours, etc. - del self._edges[point] - lo, mid, hi = c.split(right_span) - # print(f" -> {c} splits {right_span} -> {lo}, {mid}, {hi} @{point}") - if hi is not None: + + # Because we intersect at least a little bit we know that we need to + # split and keep processing. + del self._edges[point] + lo, mid, hi = c.split(right_span) # Remember the semantics + # print(f" -> {c} splits {right_span} -> {lo}, {mid}, {hi} @{point}") + + # We do this from lo to hi, lo first. + if lo is not None: + # NOTE: lo will never intersect both no matter what. + if lo.intersects(right_span): + assert not lo.intersects(c) + targets = right_targets + else: + assert lo.intersects(c) + targets = our_targets + + self._edges.insert(point, (lo, targets)) + point += 1 # Adjust the insertion point, important for us to keep running. + + if mid is not None: + # If mid exists it is known to intersect with both so we can just + # do it. + self._edges.insert(point, (mid, right_targets + our_targets)) + point += 1 # Adjust the insertion point, important for us to keep running. + + if hi is not None: + # NOTE: Just like lo, hi will never intersect both no matter what. + if hi.intersects(right_span): + # If hi intersects the right span then we're done, no + # need to keep running. + assert not hi.intersects(c) self._edges.insert(point, (hi, right_targets)) - self._edges.insert(point, (mid, right_targets + [s])) - if lo is None or not lo.intersects(c): - # Our span is completely subsumed on the lower side - # of the range; there is no lower side that just has - # our targets. Bail now. - # print(f" result: {self} (right out)") - return - # Continue processing with `c` as the lo split, since - # that's the one that has only the specified state as the - # target. - c = lo + else: + # BUT! 
If hi intersects the incoming span then what we + # need to do is to replace the incoming span with hi + # (having chopped off the lower part of the incoming + # span) and continue to execute with only the upper part + # of the incoming span. + # + # Why? Because the upper part of the incoming span might + # intersect *more* spans, in which case we need to keep + # splitting and merging targets. + assert hi.intersects(c) + next_span = hi - # If we made it here then either we have a point that does not - # intersect at all, or it only partially intersects on either the - # left or right. Either way, we have ensured that: - # - # - c doesn't intersect with left or right (any more) - # - point is where it should go - self._edges.insert(point, (c, [s])) - # print(f" result: {self} (done)") + # print(f" result: {self}") + + +class NFAState: + """An NFA state. Each state can be the accept state, with one or more + Terminals as the result.""" + + accept: list[Terminal] + epsilons: list["NFAState"] + _edges: EdgeList["NFAState"] + + def __init__(self): + self.accept = [] + self.epsilons = [] + self._edges = EdgeList() + + def __repr__(self): + return f"State{id(self)}" + + def edges(self) -> typing.Iterable[tuple[Span, list["NFAState"]]]: + return self._edges + + def add_edge(self, c: Span, s: "NFAState") -> "NFAState": + self._edges.add_edge(c, s) + return s + + def dump_graph(self, name="nfa.dot"): + with open(name, "w", encoding="utf8") as f: + f.write("digraph G {\n") + + stack: list[NFAState] = [self] + visited = set() + while len(stack) > 0: + state = stack.pop() + if state in visited: + continue + visited.add(state) + + label = ", ".join([t.value for t in state.accept if t.value is not None]) + f.write(f' {id(state)} [label="{label}"];\n') + for target in state.epsilons: + stack.append(target) + f.write(f' {id(state)} -> {id(target)} [label="\u03B5"];\n') + + for span, targets in state.edges(): + label = str(span).replace('"', '\\"') + for target in targets: + 
stack.append(target) + f.write(f' {id(state)} -> {id(target)} [label="{label}"];\n') + + f.write("}\n") + + +@dataclasses.dataclass +class Re: + def to_nfa(self, start: NFAState) -> NFAState: + del start + raise NotImplementedError() + + def __str__(self) -> str: + raise NotImplementedError() + + @classmethod + def seq(cls, *values: "Re") -> "Re": + result = values[0] + for v in values[1:]: + result = RegexSequence(result, v) + return result + + @classmethod + def literal(cls, value: str) -> "Re": + return cls.seq(*[RegexLiteral.from_ranges(c) for c in value]) + + @classmethod + def set(cls, *args: str | tuple[str, str]) -> "Re": + return RegexLiteral.from_ranges(*args) + + def plus(self) -> "Re": + return RegexPlus(self) + + def star(self) -> "Re": + return RegexStar(self) + + def question(self) -> "Re": + return RegexQuestion(self) + + def __or__(self, value: "Re", /) -> "Re": + return RegexAlternation(self, value) + + +@dataclasses.dataclass +class RegexLiteral(Re): + values: list[Span] + + @classmethod + def from_ranges(cls, *args: str | tuple[str, str]) -> "RegexLiteral": + values = [] + for a in args: + if isinstance(a, str): + values.append(Span.from_str(a)) + else: + values.append(Span.from_str(a[0], a[1])) + + return RegexLiteral(values) + + def to_nfa(self, start: NFAState) -> NFAState: + end = NFAState() + for span in self.values: + start.add_edge(span, end) + return end + + def __str__(self) -> str: + if len(self.values) == 1: + span = self.values[0] + if len(span) == 1: + return chr(span.lower) + + ranges = [] + for span in self.values: + start = chr(span.lower) + end = chr(span.upper - 1) + if start == end: + ranges.append(start) + else: + ranges.append(f"{start}-{end}") + return "[{}]".format("".join(ranges)) + + +@dataclasses.dataclass +class RegexPlus(Re): + child: Re + + def to_nfa(self, start: NFAState) -> NFAState: + end = self.child.to_nfa(start) + end.epsilons.append(start) + return end + + def __str__(self) -> str: + return f"({self.child})+" 
+ + +@dataclasses.dataclass +class RegexStar(Re): + child: Re + + def to_nfa(self, start: NFAState) -> NFAState: + end = self.child.to_nfa(start) + end.epsilons.append(start) + start.epsilons.append(end) + return end + + def __str__(self) -> str: + return f"({self.child})*" + + +@dataclasses.dataclass +class RegexQuestion(Re): + child: Re + + def to_nfa(self, start: NFAState) -> NFAState: + end = self.child.to_nfa(start) + start.epsilons.append(end) + return end + + def __str__(self) -> str: + return f"({self.child})?" + + +@dataclasses.dataclass +class RegexSequence(Re): + left: Re + right: Re + + def to_nfa(self, start: NFAState) -> NFAState: + mid = self.left.to_nfa(start) + return self.right.to_nfa(mid) + + def __str__(self) -> str: + return f"{self.left}{self.right}" + + +@dataclasses.dataclass +class RegexAlternation(Re): + left: Re + right: Re + + def to_nfa(self, start: NFAState) -> NFAState: + left_start = NFAState() + start.epsilons.append(left_start) + left_end = self.left.to_nfa(left_start) + + right_start = NFAState() + start.epsilons.append(right_start) + right_end = self.right.to_nfa(right_start) + + end = NFAState() + left_end.epsilons.append(end) + right_end.epsilons.append(end) + + return end + + def __str__(self) -> str: + return f"(({self.left})||({self.right}))" + + +LexerTable = list[tuple[Terminal | None, list[tuple[Span, int]]]] + + +class NFASuperState: + states: frozenset[NFAState] + + def __init__(self, states: typing.Iterable[NFAState]): + # Close over the given states, including every state that is + # reachable by epsilon-transition. 
+ stack = list(states) + result = set() + while len(stack) > 0: + st = stack.pop() + if st in result: + continue + result.add(st) + stack.extend(st.epsilons) + + self.states = frozenset(result) + + def __eq__(self, other): + if not isinstance(other, NFASuperState): + return False + return self.states == other.states + + def __hash__(self) -> int: + return hash(self.states) + + def edges(self) -> list[tuple[Span, "NFASuperState"]]: + working: EdgeList[list[NFAState]] = EdgeList() + for st in self.states: + for span, targets in st.edges(): + working.add_edge(span, targets) + + # EdgeList maps span to list[list[State]] which we want to flatten. + last_upper = None + result = [] + for span, stateses in working: + if last_upper is not None: + assert last_upper <= span.lower + last_upper = span.upper + + s: list[NFAState] = [] + for states in stateses: + s.extend(states) + + result.append((span, NFASuperState(s))) + + if len(result) > 0: + for i in range(0, len(result) - 1): + span = result[i][0] + next_span = result[i + 1][0] + assert span.upper <= next_span.lower + + # TODO: Merge spans that are adjacent and go to the same state. + + return result + + def accept_terminal(self) -> Terminal | None: + accept = None + for st in self.states: + for ac in st.accept: + if accept is None: + accept = ac + elif accept.value != ac.value: + accept_regex = isinstance(accept.pattern, Re) + ac_regex = isinstance(ac.pattern, Re) + + if accept_regex and not ac_regex: + accept = ac + elif ac_regex and not accept_regex: + pass + else: + raise ValueError( + f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')" + ) + + return accept + + +def compile_lexer(x: Grammar) -> LexerTable: + # Parse the terminals all together into a big NFA rooted at `NFA`. 
+ NFA = NFAState() + for terminal in x.terminals: + start = NFAState() + NFA.epsilons.append(start) + + pattern = terminal.pattern + if isinstance(pattern, Re): + ending = pattern.to_nfa(start) + else: + ending = start + for c in pattern: + ending = ending.add_edge(Span.from_str(c), NFAState()) + + ending.accept.append(terminal) + + NFA.dump_graph() + + # Convert the NFA into a DFA in the most straightforward way (by tracking + # sets of state closures, called SuperStates.) + DFA: dict[NFASuperState, tuple[int, list[tuple[Span, NFASuperState]]]] = {} + + stack = [NFASuperState([NFA])] + while len(stack) > 0: + ss = stack.pop() + if ss in DFA: + continue + + edges = ss.edges() + + DFA[ss] = (len(DFA), edges) + for _, target in edges: + stack.append(target) + + return [ + ( + ss.accept_terminal(), + [(k, DFA[v][0]) for k, v in edges], + ) + for ss, (_, edges) in DFA.items() + ] + + +def dump_lexer_table(table: LexerTable): + with open("lexer.dot", "w", encoding="utf-8") as f: + f.write("digraph G {\n") + for index, (accept, edges) in enumerate(table): + label = accept.value if accept is not None else "" + f.write(f' {index} [label="{label}"];\n') + for span, target in edges: + label = str(span).replace('"', '\\"') + f.write(f' {index} -> {target} [label="{label}"];\n') + + pass + f.write("}\n") diff --git a/parser/runtime.py b/parser/runtime.py index f5be3a4..124bc7b 100644 --- a/parser/runtime.py +++ b/parser/runtime.py @@ -430,3 +430,58 @@ class Parser: error_strings.append(f"{line_index}:{column_index}: {parse_error.message}") return (result, error_strings) + + +def generic_tokenize( + src: str, table: parser.LexerTable +) -> typing.Iterable[tuple[parser.Terminal, int, int]]: + pos = 0 + state = 0 + start = 0 + last_accept = None + last_accept_pos = 0 + + print(f"LEXING: {src} ({len(src)})") + + while pos < len(src): + while state is not None: + accept, edges = table[state] + if accept is not None: + last_accept = accept + last_accept_pos = pos + + print(f" @ 
{pos} state: {state} ({accept})") + if pos >= len(src): + break + + char = ord(src[pos]) + print(f" -> char: {char} ({repr(src[pos])})") + + # Find the index of the span where the upper value is the tightest + # bound on the character. + state = None + index = bisect.bisect_right(edges, char, key=lambda x: x[0].upper) + print(f" -> {index}") + if index < len(edges): + span, target = edges[index] + print(f" -> {span}, {target}") + if char >= span.lower: + print(f" -> target: {target}") + state = target + pos += 1 + + else: + print(f" Nope (outside range)") + else: + print(f" Nope (at end)") + + if last_accept is None: + raise Exception(f"Token error at {pos}") + + yield (last_accept, start, last_accept_pos - start) + + print(f" Yield: {last_accept}, reset to {last_accept_pos}") + last_accept = None + pos = last_accept_pos + start = pos + state = 0 diff --git a/pdm.lock b/pdm.lock index b80bf6d..a937da9 100644 --- a/pdm.lock +++ b/pdm.lock @@ -3,9 +3,26 @@ [metadata] groups = ["default", "dev"] -strategy = ["cross_platform", "inherit_metadata"] -lock_version = "4.4.1" -content_hash = "sha256:143b06c001132ba589a47b2b3a498dd54f4840d95d216c794068089fcea48d4d" +strategy = ["inherit_metadata"] +lock_version = "4.5.0" +content_hash = "sha256:c4fec06f95402db1e9843df4a8a4a275273c6ec4f41f192f30d8a92ee52d15ea" + +[[metadata.targets]] +requires_python = ">=3.12" + +[[package]] +name = "attrs" +version = "24.2.0" +requires_python = ">=3.7" +summary = "Classes Without Boilerplate" +groups = ["dev"] +dependencies = [ + "importlib-metadata; python_version < \"3.8\"", +] +files = [ + {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, + {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, +] [[package]] name = "colorama" @@ -19,6 +36,22 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = 
"sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "hypothesis" +version = "6.111.1" +requires_python = ">=3.8" +summary = "A library for property-based testing" +groups = ["dev"] +dependencies = [ + "attrs>=22.2.0", + "exceptiongroup>=1.0.0; python_version < \"3.11\"", + "sortedcontainers<3.0.0,>=2.1.0", +] +files = [ + {file = "hypothesis-6.111.1-py3-none-any.whl", hash = "sha256:9422adbac4b2104f6cf92dc6604b5c9df975efc08ffc7145ecc39bc617243835"}, + {file = "hypothesis-6.111.1.tar.gz", hash = "sha256:6ab6185a858fa692bf125c0d0a936134edc318bee01c05e407c71c9ead0b61c5"}, +] + [[package]] name = "iniconfig" version = "2.0.0" @@ -60,11 +93,23 @@ summary = "pytest: simple powerful testing with Python" groups = ["dev"] dependencies = [ "colorama; sys_platform == \"win32\"", + "exceptiongroup>=1.0.0rc8; python_version < \"3.11\"", "iniconfig", "packaging", "pluggy<2.0,>=1.5", + "tomli>=1; python_version < \"3.11\"", ] files = [ {file = "pytest-8.2.2-py3-none-any.whl", hash = "sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343"}, {file = "pytest-8.2.2.tar.gz", hash = "sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977"}, ] + +[[package]] +name = "sortedcontainers" +version = "2.4.0" +summary = "Sorted Containers -- Sorted List, Sorted Dict, Sorted Set" +groups = ["dev"] +files = [ + {file = "sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0"}, + {file = "sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88"}, +] diff --git a/pyproject.toml b/pyproject.toml index 1e28adc..c7721e1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ distribution = true [tool.pdm.dev-dependencies] dev = [ "pytest>=8.2.2", + "hypothesis>=6.111.1", ] [tool.pyright] diff --git a/tests/test_lexer.py b/tests/test_lexer.py index b082889..fe442d8 100644 --- 
a/tests/test_lexer.py +++ b/tests/test_lexer.py @@ -1,439 +1,22 @@ -from parser import Span +import collections -# LexerTable = list[tuple[Terminal | None, list[tuple[Span, int]]]] +from hypothesis import assume, example, given +from hypothesis.strategies import integers, lists, tuples +import pytest -# def compile_lexer(x: Grammar) -> LexerTable: +from parser import ( + EdgeList, + Span, + Grammar, + rule, + Terminal, + compile_lexer, + dump_lexer_table, + Re, +) -# class State: -# """An NFA state. Each state can be the accept state, with one or more -# Terminals as the result.""" - -# accept: list[Terminal] -# epsilons: list["State"] -# _edges: EdgeList["State"] - -# def __init__(self): -# self.accept = [] -# self.epsilons = [] -# self._edges = EdgeList() - -# def __repr__(self): -# return f"State{id(self)}" - -# def edges(self) -> typing.Iterable[tuple[Span, list["State"]]]: -# return self._edges - -# def add_edge(self, c: Span, s: "State") -> "State": -# self._edges.add_edge(c, s) -# return s - -# def dump_graph(self, name="nfa.dot"): -# with open(name, "w", encoding="utf8") as f: -# f.write("digraph G {\n") - -# stack: list[State] = [self] -# visited = set() -# while len(stack) > 0: -# state = stack.pop() -# if state in visited: -# continue -# visited.add(state) - -# label = ", ".join([t.value for t in state.accept if t.value is not None]) -# f.write(f' {id(state)} [label="{label}"];\n') -# for target in state.epsilons: -# stack.append(target) -# f.write(f' {id(state)} -> {id(target)} [label="\u03B5"];\n') - -# for span, targets in state.edges(): -# label = str(span).replace('"', '\\"') -# for target in targets: -# stack.append(target) -# f.write(f' {id(state)} -> {id(target)} [label="{label}"];\n') - -# f.write("}\n") - -# @dataclasses.dataclass -# class RegexNode: -# def to_nfa(self, start: State) -> State: -# del start -# raise NotImplementedError() - -# def __str__(self) -> str: -# raise NotImplementedError() - -# @dataclasses.dataclass -# class 
RegexLiteral(RegexNode): -# values: list[tuple[str, str]] - -# def to_nfa(self, start: State) -> State: -# end = State() -# for s, e in self.values: -# start.add_edge(Span(ord(s), ord(e)), end) -# return end - -# def __str__(self) -> str: -# if len(self.values) == 1: -# start, end = self.values[0] -# if start == end: -# return start - -# ranges = [] -# for start, end in self.values: -# if start == end: -# ranges.append(start) -# else: -# ranges.append(f"{start}-{end}") -# return "![{}]".format("".join(ranges)) - -# @dataclasses.dataclass -# class RegexPlus(RegexNode): -# child: RegexNode - -# def to_nfa(self, start: State) -> State: -# end = self.child.to_nfa(start) -# end.epsilons.append(start) -# return end - -# def __str__(self) -> str: -# return f"({self.child})+" - -# @dataclasses.dataclass -# class RegexStar(RegexNode): -# child: RegexNode - -# def to_nfa(self, start: State) -> State: -# end = self.child.to_nfa(start) -# end.epsilons.append(start) -# start.epsilons.append(end) -# return end - -# def __str__(self) -> str: -# return f"({self.child})*" - -# @dataclasses.dataclass -# class RegexQuestion(RegexNode): -# child: RegexNode - -# def to_nfa(self, start: State) -> State: -# end = self.child.to_nfa(start) -# start.epsilons.append(end) -# return end - -# def __str__(self) -> str: -# return f"({self.child})?" 
- -# @dataclasses.dataclass -# class RegexSequence(RegexNode): -# left: RegexNode -# right: RegexNode - -# def to_nfa(self, start: State) -> State: -# mid = self.left.to_nfa(start) -# return self.right.to_nfa(mid) - -# def __str__(self) -> str: -# return f"{self.left}{self.right}" - -# @dataclasses.dataclass -# class RegexAlternation(RegexNode): -# left: RegexNode -# right: RegexNode - -# def to_nfa(self, start: State) -> State: -# left_start = State() -# start.epsilons.append(left_start) -# left_end = self.left.to_nfa(left_start) - -# right_start = State() -# start.epsilons.append(right_start) -# right_end = self.right.to_nfa(right_start) - -# end = State() -# left_end.epsilons.append(end) -# right_end.epsilons.append(end) - -# return end - -# def __str__(self) -> str: -# return f"(({self.left})||({self.right}))" - -# class RegexParser: -# # TODO: HANDLE ALTERNATION AND PRECEDENCE (CONCAT HAS HIGHEST PRECEDENCE) -# PREFIX: dict[str, typing.Callable[[str], RegexNode]] -# POSTFIX: dict[str, typing.Callable[[RegexNode, int], RegexNode]] -# BINDING: dict[str, tuple[int, int]] - -# index: int -# pattern: str - -# def __init__(self, pattern: str): -# self.PREFIX = { -# "(": self.parse_group, -# "[": self.parse_set, -# } -# self.POSTFIX = { -# "+": self.parse_plus, -# "*": self.parse_star, -# "?": self.parse_question, -# "|": self.parse_alternation, -# } - -# self.BINDING = { -# "|": (1, 1), -# "+": (2, 2), -# "*": (2, 2), -# "?": (2, 2), -# ")": (-1, -1), # Always stop parsing on ) -# } - -# self.index = 0 -# self.pattern = pattern - -# def consume(self) -> str: -# if self.index >= len(self.pattern): -# raise ValueError(f"Unable to parse regular expression '{self.pattern}'") -# result = self.pattern[self.index] -# self.index += 1 -# return result - -# def peek(self) -> str | None: -# if self.index >= len(self.pattern): -# return None -# return self.pattern[self.index] - -# def eof(self) -> bool: -# return self.index >= len(self.pattern) - -# def expect(self, ch: str): 
-# actual = self.consume() -# if ch != actual: -# raise ValueError(f"Expected '{ch}'") - -# def parse_regex(self, minimum_binding=0) -> RegexNode: -# ch = self.consume() -# parser = self.PREFIX.get(ch, self.parse_single) -# node = parser(ch) - -# while not self.eof(): -# ch = self.peek() -# assert ch is not None - -# lp, rp = self.BINDING.get(ch, (minimum_binding, minimum_binding)) -# if lp < minimum_binding: -# break - -# parser = self.POSTFIX.get(ch, self.parse_concat) -# node = parser(node, rp) - -# return node - -# def parse_single(self, ch: str) -> RegexNode: -# return RegexLiteral(values=[(ch, ch)]) - -# def parse_group(self, ch: str) -> RegexNode: -# del ch - -# node = self.parse_regex() -# self.expect(")") -# return node - -# def parse_set(self, ch: str) -> RegexNode: -# del ch - -# # TODO: INVERSION? -# ranges = [] -# while self.peek() not in (None, "]"): -# start = self.consume() -# if self.peek() == "-": -# self.consume() -# end = self.consume() -# else: -# end = start -# ranges.append((start, end)) - -# self.expect("]") -# return RegexLiteral(values=ranges) - -# def parse_alternation(self, node: RegexNode, rp: int) -> RegexNode: -# return RegexAlternation(left=node, right=self.parse_regex(rp)) - -# def parse_plus(self, left: RegexNode, rp: int) -> RegexNode: -# del rp -# self.expect("+") -# return RegexPlus(child=left) - -# def parse_star(self, left: RegexNode, rp: int) -> RegexNode: -# del rp -# self.expect("*") -# return RegexStar(child=left) - -# def parse_question(self, left: RegexNode, rp: int) -> RegexNode: -# del rp -# self.expect("?") -# return RegexQuestion(child=left) - -# def parse_concat(self, left: RegexNode, rp: int) -> RegexNode: -# return RegexSequence(left, self.parse_regex(rp)) - -# class SuperState: -# states: frozenset[State] -# index: int - -# def __init__(self, states: typing.Iterable[State]): -# # Close over the given states, including every state that is -# # reachable by epsilon-transition. 
-# stack = list(states) -# result = set() -# while len(stack) > 0: -# st = stack.pop() -# if st in result: -# continue -# result.add(st) -# stack.extend(st.epsilons) - -# self.states = frozenset(result) -# self.index = -1 - -# def __eq__(self, other): -# if not isinstance(other, SuperState): -# return False -# return self.states == other.states - -# def __hash__(self) -> int: -# return hash(self.states) - -# def edges(self) -> list[tuple[Span, "SuperState"]]: -# working: EdgeList[list[State]] = EdgeList() -# for st in self.states: -# for span, targets in st.edges(): -# working.add_edge(span, targets) - -# # EdgeList maps span to list[list[State]] which we want to flatten. -# result = [] -# for span, stateses in working: -# s: list[State] = [] -# for states in stateses: -# s.extend(states) - -# result.append((span, SuperState(s))) - -# return result - -# def accept_terminal(self) -> Terminal | None: -# accept = None -# for st in self.states: -# for ac in st.accept: -# if accept is None: -# accept = ac -# elif accept.value != ac.value: -# if accept.regex and not ac.regex: -# accept = ac -# elif ac.regex and not accept.regex: -# pass -# else: -# raise ValueError( -# f"Lexer is ambiguous: cannot distinguish between {accept.value} ('{accept.pattern}') and {ac.value} ('{ac.pattern}')" -# ) - -# return accept - -# # Parse the terminals all together into a big NFA rooted at `NFA`. -# NFA = State() -# for token in x.terminals: -# start = State() -# NFA.epsilons.append(start) - -# if token.regex: -# node = RegexParser(token.pattern).parse_regex() -# print(f" Parsed {token.pattern} to {node}") -# ending = node.to_nfa(start) - -# else: -# ending = start -# for c in token.pattern: -# ending = ending.add_edge(Span.from_str(c), State()) - -# ending.accept.append(token) - -# NFA.dump_graph() - -# # Convert the NFA into a DFA in the most straightforward way (by tracking -# # sets of state closures, called SuperStates.) 
-# DFA: dict[SuperState, list[tuple[Span, SuperState]]] = {} -# stack = [SuperState([NFA])] -# while len(stack) > 0: -# ss = stack.pop() -# if ss in DFA: -# continue - -# edges = ss.edges() - -# DFA[ss] = edges -# for _, target in edges: -# stack.append(target) - -# for i, k in enumerate(DFA): -# k.index = i - -# return [ -# ( -# ss.accept_terminal(), -# [(k, v.index) for k, v in edges], -# ) -# for ss, edges in DFA.items() -# ] - - -# def dump_lexer_table(table: LexerTable): -# with open("lexer.dot", "w", encoding="utf-8") as f: -# f.write("digraph G {\n") -# for index, (accept, edges) in enumerate(table): -# label = accept.value if accept is not None else "" -# f.write(f' {index} [label="{label}"];\n') -# for span, target in edges: -# label = str(span).replace('"', '\\"') -# f.write(f' {index} -> {target} [label="{label}"];\n') - -# pass -# f.write("}\n") - - -# def generic_tokenize(src: str, table: LexerTable): -# pos = 0 -# state = 0 -# start = 0 -# last_accept = None -# last_accept_pos = 0 - -# while pos < len(src): -# accept, edges = table[state] -# if accept is not None: -# last_accept = accept -# last_accept_pos = pos + 1 - -# char = ord(src[pos]) - -# # Find the index of the span where the upper value is the tightest -# # bound on the character. -# index = bisect.bisect_left(edges, char, key=lambda x: x[0].upper) -# # If the character is greater than or equal to the lower bound we -# # found then we have a hit, otherwise no. 
-# state = edges[index][1] if index < len(edges) and char >= edges[index][0].lower else None -# if state is None: -# if last_accept is None: -# raise Exception(f"Token error at {pos}") - -# yield (last_accept, start, last_accept_pos - start) - -# last_accept = None -# pos = last_accept_pos -# start = pos -# state = 0 - -# else: -# pos += 1 +from parser.runtime import generic_tokenize def test_span_intersection(): @@ -450,3 +33,352 @@ def test_span_intersection(): right = Span(*b) assert left.intersects(right) assert right.intersects(left) + + +def test_span_no_intersection(): + pairs = [ + ((1, 2), (3, 4)), + ] + + for a, b in pairs: + left = Span(*a) + right = Span(*b) + assert not left.intersects(right) + assert not right.intersects(left) + + +def test_span_split(): + TC = collections.namedtuple("TC", ["left", "right", "expected"]) + cases = [ + TC( + left=Span(1, 4), + right=Span(2, 3), + expected=(Span(1, 2), Span(2, 3), Span(3, 4)), + ), + TC( + left=Span(1, 4), + right=Span(1, 2), + expected=(None, Span(1, 2), Span(2, 4)), + ), + TC( + left=Span(1, 4), + right=Span(3, 4), + expected=(Span(1, 3), Span(3, 4), None), + ), + TC( + left=Span(1, 4), + right=Span(1, 4), + expected=(None, Span(1, 4), None), + ), + ] + + for left, right, expected in cases: + result = left.split(right) + assert result == expected + + result = right.split(left) + assert result == expected + + +@given(integers(), integers()) +def test_equal_span_mid_only(x, y): + """Splitting spans against themselves results in an empty lo and hi bound.""" + assume(x < y) + span = Span(x, y) + lo, mid, hi = span.split(span) + assert lo is None + assert hi is None + assert mid == span + + +three_distinct_points = lists( + integers(), + min_size=3, + max_size=3, + unique=True, +).map(sorted) + + +@given(three_distinct_points) +def test_span_low_align_lo_none(vals): + """Splitting spans with aligned lower bounds results in an empty lo bound.""" + # x y z + # [ a ) + # [ b ) + x, y, z = vals + + a = Span(x, 
y) + b = Span(x, z) + lo, _, _ = a.split(b) + + assert lo is None + + +@given(three_distinct_points) +def test_span_high_align_hi_none(vals): + """Splitting spans with aligned upper bounds results in an empty hi bound.""" + # x y z + # [ a ) + # [ b ) + x, y, z = vals + + a = Span(y, z) + b = Span(x, z) + _, _, hi = a.split(b) + + assert hi is None + + +four_distinct_points = lists( + integers(), + min_size=4, + max_size=4, + unique=True, +).map(sorted) + + +@given(four_distinct_points) +def test_span_split_overlapping_lo_left(vals): + """Splitting two overlapping spans results in lo overlapping left.""" + a, b, c, d = vals + + left = Span(a, c) + right = Span(b, d) + + lo, _, _ = left.split(right) + assert lo is not None + assert lo.intersects(left) + + +@given(four_distinct_points) +def test_span_split_overlapping_lo_not_right(vals): + """Splitting two overlapping spans results in lo NOT overlapping right.""" + a, b, c, d = vals + + left = Span(a, c) + right = Span(b, d) + + lo, _, _ = left.split(right) + assert lo is not None + assert not lo.intersects(right) + + +@given(four_distinct_points) +def test_span_split_overlapping_mid_left(vals): + """Splitting two overlapping spans results in mid overlapping left.""" + a, b, c, d = vals + + left = Span(a, c) + right = Span(b, d) + + _, mid, _ = left.split(right) + assert mid is not None + assert mid.intersects(left) + + +@given(four_distinct_points) +def test_span_split_overlapping_mid_right(vals): + """Splitting two overlapping spans results in mid overlapping right.""" + a, b, c, d = vals + + left = Span(a, c) + right = Span(b, d) + + _, mid, _ = left.split(right) + assert mid is not None + assert mid.intersects(right) + + +@given(four_distinct_points) +def test_span_split_overlapping_hi_right(vals): + """Splitting two overlapping spans results in hi overlapping right.""" + a, b, c, d = vals + + left = Span(a, c) + right = Span(b, d) + + _, _, hi = left.split(right) + assert hi is not None + assert
hi.intersects(right) + + +@given(four_distinct_points) +def test_span_split_overlapping_hi_not_left(vals): + """Splitting two overlapping spans results in hi NOT overlapping left.""" + a, b, c, d = vals + + left = Span(a, c) + right = Span(b, d) + + _, _, hi = left.split(right) + assert hi is not None + assert not hi.intersects(left) + + +@given(four_distinct_points) +def test_span_split_embedded(vals): + """Splitting two spans where one fully encloses the other.""" + a, b, c, d = vals + + outer = Span(a, d) + inner = Span(b, c) + + lo, mid, hi = outer.split(inner) + + assert lo is not None + assert mid is not None + assert hi is not None + + assert lo.intersects(outer) + assert not lo.intersects(inner) + + assert mid.intersects(outer) + assert mid.intersects(inner) + + assert hi.intersects(outer) + assert not hi.intersects(inner) + + +def test_edge_list_single(): + el: EdgeList[str] = EdgeList() + el.add_edge(Span(1, 4), "A") + + edges = list(el) + assert edges == [ + (Span(1, 4), ["A"]), + ] + + +def test_edge_list_fully_enclosed(): + el: EdgeList[str] = EdgeList() + el.add_edge(Span(1, 4), "A") + el.add_edge(Span(2, 3), "B") + + edges = list(el) + assert edges == [ + (Span(1, 2), ["A"]), + (Span(2, 3), ["A", "B"]), + (Span(3, 4), ["A"]), + ] + + +def test_edge_list_overlap(): + el: EdgeList[str] = EdgeList() + el.add_edge(Span(1, 4), "A") + el.add_edge(Span(2, 5), "B") + + edges = list(el) + assert edges == [ + (Span(1, 2), ["A"]), + (Span(2, 4), ["A", "B"]), + (Span(4, 5), ["B"]), + ] + + +def test_edge_list_no_overlap(): + el: EdgeList[str] = EdgeList() + el.add_edge(Span(1, 4), "A") + el.add_edge(Span(5, 8), "B") + + edges = list(el) + assert edges == [ + (Span(1, 4), ["A"]), + (Span(5, 8), ["B"]), + ] + + +def test_edge_list_no_overlap_ordered(): + el: EdgeList[str] = EdgeList() + el.add_edge(Span(5, 8), "B") + el.add_edge(Span(1, 4), "A") + + edges = list(el) + assert edges == [ + (Span(1, 4), ["A"]), + (Span(5, 8), ["B"]), + ] + + +def
test_edge_list_overlap_span(): + el: EdgeList[str] = EdgeList() + el.add_edge(Span(1, 3), "A") + el.add_edge(Span(4, 6), "B") + el.add_edge(Span(2, 5), "C") + + edges = list(el) + assert edges == [ + (Span(1, 2), ["A"]), + (Span(2, 3), ["A", "C"]), + (Span(3, 4), ["C"]), + (Span(4, 5), ["B", "C"]), + (Span(5, 6), ["B"]), + ] + + +def test_edge_list_overlap_span_big(): + el: EdgeList[str] = EdgeList() + el.add_edge(Span(2, 3), "A") + el.add_edge(Span(4, 5), "B") + el.add_edge(Span(6, 7), "C") + el.add_edge(Span(1, 8), "D") + + edges = list(el) + assert edges == [ + (Span(1, 2), ["D"]), + (Span(2, 3), ["A", "D"]), + (Span(3, 4), ["D"]), + (Span(4, 5), ["B", "D"]), + (Span(5, 6), ["D"]), + (Span(6, 7), ["C", "D"]), + (Span(7, 8), ["D"]), + ] + + +@given(lists(lists(integers(), min_size=2, max_size=2, unique=True), min_size=1)) +@example(points=[[0, 1], [1, 2]]) +def test_edge_list_always_sorted(points: list[tuple[int, int]]): + # OK this is weird but stick with me. + el: EdgeList[str] = EdgeList() + for i, (a, b) in enumerate(points): + lower = min(a, b) + upper = max(a, b) + + span = Span(lower, upper) + + el.add_edge(span, str(i)) + + last_upper = None + for span, _ in el: + if last_upper is not None: + assert last_upper <= span.lower, "Edges from list are not sorted" + last_upper = span.upper + + +def test_lexer_compile(): + class LexTest(Grammar): + @rule + def foo(self): + return self.IS + + start = foo + + IS = Terminal("is") + AS = Terminal("as") + IDENTIFIER = Terminal( + Re.seq( + Re.set(("a", "z"), ("A", "Z"), "_"), + Re.set(("a", "z"), ("A", "Z"), ("0", "9"), "_").star(), + ) + ) + BLANKS = Terminal(Re.set("\r", "\n", "\t", " ").plus()) + + lexer = compile_lexer(LexTest()) + dump_lexer_table(lexer) + tokens = list(generic_tokenize("xy is ass", lexer)) + assert tokens == [ + (LexTest.IDENTIFIER, 0, 2), + (LexTest.BLANKS, 2, 1), + (LexTest.IS, 3, 2), + (LexTest.BLANKS, 5, 1), + (LexTest.IDENTIFIER, 6, 3), + ]