Chaos: split tables, interactions, Terminal

- Tables are split into `actions` and `gotos` now to make formatting
  nicer
- Token is renamed Terminal
- Likes are now Florps
- Lexer now loaded dynamically (badly)
This commit is contained in:
John Doty 2024-05-30 08:02:47 -07:00
parent 71078f76b4
commit 56d24c5fb9
3 changed files with 342 additions and 286 deletions

View file

@ -1,56 +1,56 @@
# This is an example grammar. # This is an example grammar.
import re import re
from parser import Assoc, Grammar, Nothing, Token, rule, seq from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule
ARROW = Token("Arrow") ARROW = Terminal("Arrow")
AS = Token("As") AS = Terminal("As")
BAR = Token("Bar") BAR = Terminal("Bar")
CLASS = Token("Class") CLASS = Terminal("Class")
COLON = Token("Colon") COLON = Terminal("Colon")
ELSE = Token("Else") ELSE = Terminal("Else")
FOR = Token("For") FOR = Terminal("For")
FUN = Token("Fun") FUN = Terminal("Fun")
IDENTIFIER = Token("Identifier") IDENTIFIER = Terminal("Identifier")
IF = Token("If") IF = Terminal("If")
IMPORT = Token("Import") IMPORT = Terminal("Import")
IN = Token("In") IN = Terminal("In")
LCURLY = Token("LeftBrace") LCURLY = Terminal("LeftBrace")
LET = Token("Let") LET = Terminal("Let")
RCURLY = Token("RightBrace") RCURLY = Terminal("RightBrace")
RETURN = Token("Return") RETURN = Terminal("Return")
SEMICOLON = Token("Semicolon") SEMICOLON = Terminal("Semicolon")
STRING = Token("String") STRING = Terminal("String")
WHILE = Token("While") WHILE = Terminal("While")
EQUAL = Token("Equal") EQUAL = Terminal("Equal")
LPAREN = Token("LeftParen") LPAREN = Terminal("LeftParen")
RPAREN = Token("RightParen") RPAREN = Terminal("RightParen")
COMMA = Token("Comma") COMMA = Terminal("Comma")
SELF = Token("Selff") SELF = Terminal("Selff")
OR = Token("Or") OR = Terminal("Or")
IS = Token("Is") IS = Terminal("Is")
AND = Token("And") AND = Terminal("And")
EQUALEQUAL = Token("EqualEqual") EQUALEQUAL = Terminal("EqualEqual")
BANGEQUAL = Token("BangEqual") BANGEQUAL = Terminal("BangEqual")
LESS = Token("Less") LESS = Terminal("Less")
GREATER = Token("Greater") GREATER = Terminal("Greater")
LESSEQUAL = Token("LessEqual") LESSEQUAL = Terminal("LessEqual")
GREATEREQUAL = Token("GreaterEqual") GREATEREQUAL = Terminal("GreaterEqual")
PLUS = Token("Plus") PLUS = Terminal("Plus")
MINUS = Token("Minus") MINUS = Terminal("Minus")
STAR = Token("Star") STAR = Terminal("Star")
SLASH = Token("Slash") SLASH = Terminal("Slash")
NUMBER = Token("Number") NUMBER = Terminal("Number")
TRUE = Token("True") TRUE = Terminal("True")
FALSE = Token("False") FALSE = Terminal("False")
BANG = Token("Bang") BANG = Terminal("Bang")
DOT = Token("Dot") DOT = Terminal("Dot")
MATCH = Token("Match") MATCH = Terminal("Match")
EXPORT = Token("Export") EXPORT = Terminal("Export")
UNDERSCORE = Token("Underscore") UNDERSCORE = Terminal("Underscore")
NEW = Token("New") NEW = Terminal("New")
LSQUARE = Token("LeftBracket") LSQUARE = Terminal("LeftBracket")
RSQUARE = Token("RightBracket") RSQUARE = Terminal("RightBracket")
class FineGrammar(Grammar): class FineGrammar(Grammar):
@ -77,58 +77,58 @@ class FineGrammar(Grammar):
) )
@rule @rule
def file(self): def file(self) -> Rule:
return self._file_statement_list return self._file_statement_list
@rule @rule
def _file_statement_list(self): def _file_statement_list(self) -> Rule:
return self._file_statement | (self._file_statement_list + self._file_statement) return self._file_statement | (self._file_statement_list + self._file_statement)
@rule @rule
def _file_statement(self): def _file_statement(self) -> Rule:
return ( return (
self.import_statement | self.class_declaration | self.export_statement | self.statement self.import_statement | self.class_declaration | self.export_statement | self._statement
) )
@rule @rule
def import_statement(self): def import_statement(self) -> Rule:
return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON) return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON)
@rule @rule
def class_declaration(self): def class_declaration(self) -> Rule:
return seq(CLASS, IDENTIFIER, self.class_body) return seq(CLASS, IDENTIFIER, self.class_body)
@rule @rule
def class_body(self): def class_body(self) -> Rule:
return seq(LCURLY, RCURLY) | seq(LCURLY, self._class_members, RCURLY) return seq(LCURLY, RCURLY) | seq(LCURLY, self._class_members, RCURLY)
@rule @rule
def _class_members(self): def _class_members(self) -> Rule:
return self._class_member | seq(self._class_members, self._class_member) return self._class_member | seq(self._class_members, self._class_member)
@rule @rule
def _class_member(self): def _class_member(self) -> Rule:
return self.field_declaration | self.function_declaration return self.field_declaration | self.function_declaration
@rule @rule
def field_declaration(self): def field_declaration(self) -> Rule:
return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON) return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON)
# Types # Types
@rule @rule
def type_expression(self): def type_expression(self) -> Rule:
return self.alternate_type | self.type_identifier return self.alternate_type | self.type_identifier
@rule @rule
def alternate_type(self): def alternate_type(self) -> Rule:
return seq(self.type_expression, OR, self.type_identifier) return seq(self.type_expression, OR, self.type_identifier)
@rule @rule
def type_identifier(self): def type_identifier(self) -> Rule:
return IDENTIFIER return IDENTIFIER
@rule @rule
def export_statement(self): def export_statement(self) -> Rule:
return ( return (
seq(EXPORT, self.class_declaration) seq(EXPORT, self.class_declaration)
| seq(EXPORT, self.function_declaration) | seq(EXPORT, self.function_declaration)
@ -137,18 +137,18 @@ class FineGrammar(Grammar):
) )
@rule @rule
def export_list(self): def export_list(self) -> Rule:
return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list) return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list)
# Functions # Functions
@rule @rule
def function_declaration(self): def function_declaration(self) -> Rule:
return seq(FUN, IDENTIFIER, self.function_parameters, self.block) | seq( return seq(FUN, IDENTIFIER, self.function_parameters, self.block) | seq(
FUN, IDENTIFIER, self.function_parameters, ARROW, self.type_expression, self.block FUN, IDENTIFIER, self.function_parameters, ARROW, self.type_expression, self.block
) )
@rule @rule
def function_parameters(self): def function_parameters(self) -> Rule:
return ( return (
seq(LPAREN, RPAREN) seq(LPAREN, RPAREN)
| seq(LPAREN, self.first_parameter, RPAREN) | seq(LPAREN, self.first_parameter, RPAREN)
@ -156,33 +156,33 @@ class FineGrammar(Grammar):
) )
@rule @rule
def first_parameter(self): def first_parameter(self) -> Rule:
return SELF | self.parameter return SELF | self.parameter
@rule @rule
def parameter_list(self): def parameter_list(self) -> Rule:
return Nothing | self.parameter | seq(self.parameter, COMMA, self.parameter_list) return Nothing | self.parameter | seq(self.parameter, COMMA, self.parameter_list)
@rule @rule
def parameter(self): def parameter(self) -> Rule:
return seq(IDENTIFIER, COLON, self.type_expression) return seq(IDENTIFIER, COLON, self.type_expression)
# Block # Block
@rule @rule
def block(self): def block(self) -> Rule:
return ( return (
seq(LCURLY, RCURLY) seq(LCURLY, RCURLY)
| seq(LCURLY, self.expression, RCURLY) | seq(LCURLY, self.expression, RCURLY)
| seq(LCURLY, self.statement_list, RCURLY) | seq(LCURLY, self._statement_list, RCURLY)
| seq(LCURLY, self.statement_list, self.expression, RCURLY) | seq(LCURLY, self._statement_list, self.expression, RCURLY)
) )
@rule @rule
def statement_list(self): def _statement_list(self) -> Rule:
return self.statement | seq(self.statement_list, self.statement) return self._statement | seq(self._statement_list, self._statement)
@rule @rule
def statement(self): def _statement(self) -> Rule:
return ( return (
self.function_declaration self.function_declaration
| self.let_statement | self.let_statement
@ -194,56 +194,56 @@ class FineGrammar(Grammar):
) )
@rule @rule
def let_statement(self): def let_statement(self) -> Rule:
return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON) return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON)
@rule @rule
def return_statement(self): def return_statement(self) -> Rule:
return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON) return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON)
@rule @rule
def for_statement(self): def for_statement(self) -> Rule:
return seq(FOR, self.iterator_variable, IN, self.expression, self.block) return seq(FOR, self.iterator_variable, IN, self.expression, self.block)
@rule @rule
def iterator_variable(self): def iterator_variable(self) -> Rule:
return IDENTIFIER return IDENTIFIER
@rule @rule
def if_statement(self): def if_statement(self) -> Rule:
return self.conditional_expression return self.conditional_expression
@rule @rule
def while_statement(self): def while_statement(self) -> Rule:
return seq(WHILE, self.expression, self.block) return seq(WHILE, self.expression, self.block)
@rule @rule
def expression_statement(self): def expression_statement(self) -> Rule:
return seq(self.expression, SEMICOLON) return seq(self.expression, SEMICOLON)
# Expressions # Expressions
@rule @rule
def expression(self): def expression(self) -> Rule:
return self.assignment_expression return self.assignment_expression
@rule @rule
def assignment_expression(self): def assignment_expression(self) -> Rule:
return seq(self.or_expression, EQUAL, self.assignment_expression) | self.or_expression return seq(self.or_expression, EQUAL, self.assignment_expression) | self.or_expression
@rule @rule
def or_expression(self): def or_expression(self) -> Rule:
return seq(self.or_expression, OR, self.is_expression) | self.is_expression return seq(self.or_expression, OR, self.is_expression) | self.is_expression
@rule @rule
def is_expression(self): def is_expression(self) -> Rule:
return seq(self.is_expression, IS, self.pattern) | self.and_expression return seq(self.is_expression, IS, self.pattern) | self.and_expression
@rule @rule
def and_expression(self): def and_expression(self) -> Rule:
return seq(self.and_expression, AND, self.equality_expression) | self.equality_expression return seq(self.and_expression, AND, self.equality_expression) | self.equality_expression
@rule @rule
def equality_expression(self): def equality_expression(self) -> Rule:
return ( return (
seq(self.equality_expression, EQUALEQUAL, self.relation_expression) seq(self.equality_expression, EQUALEQUAL, self.relation_expression)
| seq(self.equality_expression, BANGEQUAL, self.relation_expression) | seq(self.equality_expression, BANGEQUAL, self.relation_expression)
@ -251,7 +251,7 @@ class FineGrammar(Grammar):
) )
@rule @rule
def relation_expression(self): def relation_expression(self) -> Rule:
return ( return (
seq(self.relation_expression, LESS, self.additive_expression) seq(self.relation_expression, LESS, self.additive_expression)
| seq(self.relation_expression, LESSEQUAL, self.additive_expression) | seq(self.relation_expression, LESSEQUAL, self.additive_expression)
@ -261,7 +261,7 @@ class FineGrammar(Grammar):
) )
@rule @rule
def additive_expression(self): def additive_expression(self) -> Rule:
return ( return (
seq(self.additive_expression, PLUS, self.multiplication_expression) seq(self.additive_expression, PLUS, self.multiplication_expression)
| seq(self.additive_expression, MINUS, self.multiplication_expression) | seq(self.additive_expression, MINUS, self.multiplication_expression)
@ -269,7 +269,7 @@ class FineGrammar(Grammar):
) )
@rule @rule
def multiplication_expression(self): def multiplication_expression(self) -> Rule:
return ( return (
seq(self.multiplication_expression, STAR, self.primary_expression) seq(self.multiplication_expression, STAR, self.primary_expression)
| seq(self.multiplication_expression, SLASH, self.primary_expression) | seq(self.multiplication_expression, SLASH, self.primary_expression)
@ -277,7 +277,7 @@ class FineGrammar(Grammar):
) )
@rule @rule
def primary_expression(self): def primary_expression(self) -> Rule:
return ( return (
IDENTIFIER IDENTIFIER
| SELF | SELF
@ -299,7 +299,7 @@ class FineGrammar(Grammar):
) )
@rule @rule
def conditional_expression(self): def conditional_expression(self) -> Rule:
return ( return (
seq(IF, self.expression, self.block) seq(IF, self.expression, self.block)
| seq(IF, self.expression, self.block, ELSE, self.conditional_expression) | seq(IF, self.expression, self.block, ELSE, self.conditional_expression)
@ -307,11 +307,11 @@ class FineGrammar(Grammar):
) )
@rule @rule
def list_constructor_expression(self): def list_constructor_expression(self) -> Rule:
return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self.expression_list, RSQUARE) return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self.expression_list, RSQUARE)
@rule @rule
def expression_list(self): def expression_list(self) -> Rule:
return ( return (
self.expression self.expression
| seq(self.expression, COMMA) | seq(self.expression, COMMA)
@ -319,15 +319,15 @@ class FineGrammar(Grammar):
) )
@rule @rule
def match_expression(self): def match_expression(self) -> Rule:
return seq(MATCH, self.expression, self.match_body) return seq(MATCH, self.expression, self.match_body)
@rule @rule
def match_body(self): def match_body(self) -> Rule:
return seq(LCURLY, RCURLY) | seq(LCURLY, self.match_arms, RCURLY) return seq(LCURLY, RCURLY) | seq(LCURLY, self.match_arms, RCURLY)
@rule @rule
def match_arms(self): def match_arms(self) -> Rule:
return ( return (
self.match_arm self.match_arm
| seq(self.match_arm, COMMA) | seq(self.match_arm, COMMA)
@ -335,11 +335,11 @@ class FineGrammar(Grammar):
) )
@rule @rule
def match_arm(self): def match_arm(self) -> Rule:
return seq(self.pattern, ARROW, self.expression) return seq(self.pattern, ARROW, self.expression)
@rule @rule
def pattern(self): def pattern(self) -> Rule:
return ( return (
seq(self.variable_binding, self.pattern_core, AND, self.and_expression) seq(self.variable_binding, self.pattern_core, AND, self.and_expression)
| seq(self.variable_binding, self.pattern_core) | seq(self.variable_binding, self.pattern_core)
@ -348,27 +348,27 @@ class FineGrammar(Grammar):
) )
@rule @rule
def pattern_core(self): def pattern_core(self) -> Rule:
return self.type_expression | self.wildcard_pattern return self.type_expression | self.wildcard_pattern
@rule @rule
def wildcard_pattern(self): def wildcard_pattern(self) -> Rule:
return UNDERSCORE return UNDERSCORE
@rule @rule
def variable_binding(self): def variable_binding(self) -> Rule:
return seq(IDENTIFIER, COLON) return seq(IDENTIFIER, COLON)
@rule @rule
def object_constructor_expression(self): def object_constructor_expression(self) -> Rule:
return seq(NEW, self.type_identifier, self.field_list) return seq(NEW, self.type_identifier, self.field_list)
@rule @rule
def field_list(self): def field_list(self) -> Rule:
return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY) return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY)
@rule @rule
def field_values(self): def field_values(self) -> Rule:
return ( return (
self.field_value self.field_value
| seq(self.field_value, COMMA) | seq(self.field_value, COMMA)
@ -376,7 +376,7 @@ class FineGrammar(Grammar):
) )
@rule @rule
def field_value(self): def field_value(self) -> Rule:
return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression) return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression)
@ -533,16 +533,19 @@ import bisect
class FineTokens: class FineTokens:
def __init__(self, src: str): def __init__(self, src: str):
self.src = src self.src = src
self.tokens = list(tokenize(src)) self._tokens = list(tokenize(src))
self.lines = [m.start() for m in re.finditer("\n", src)] self.lines = [m.start() for m in re.finditer("\n", src)]
def tokens(self):
return self._tokens
def dump(self, *, start=None, end=None): def dump(self, *, start=None, end=None):
if start is None: if start is None:
start = 0 start = 0
if end is None: if end is None:
end = len(self.tokens) end = len(self._tokens)
for token in self.tokens[start:end]: for token in self._tokens[start:end]:
(kind, start, length) = token (kind, start, length) = token
line_index = bisect.bisect_left(self.lines, start) line_index = bisect.bisect_left(self.lines, start)
if line_index == 0: if line_index == 0:
@ -553,14 +556,3 @@ class FineTokens:
print( print(
f"{start:04} {kind.value:12} {self.src[start:start+length]} ({line_index}, {column_index})" f"{start:04} {kind.value:12} {self.src[start:start+length]} ({line_index}, {column_index})"
) )
if __name__ == "__main__":
grammar = FineGrammar()
table = grammar.build_table(start="expression")
print(f"{len(table)} states")
average_entries = sum(len(row) for row in table) / len(table)
max_entries = max(len(row) for row in table)
print(f"{average_entries} average, {max_entries} max")

View file

@ -7,11 +7,12 @@ import select
import sys import sys
import termios import termios
import time import time
import traceback
import tty import tty
import types
import typing import typing
from dataclasses import dataclass from dataclasses import dataclass
import grammar
import parser import parser
# from parser import Token, Grammar, rule, seq # from parser import Token, Grammar, rule, seq
@ -47,7 +48,8 @@ def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | N
This is not a *great* parser, it's really just a demo for what you can This is not a *great* parser, it's really just a demo for what you can
do with the table. do with the table.
""" """
input: list[str] = [t.value for (t, _, _) in tokens.tokens] input_tokens = tokens.tokens()
input: list[str] = [t.value for (t, _, _) in input_tokens]
assert "$" not in input assert "$" not in input
input = input + ["$"] input = input + ["$"]
@ -61,7 +63,7 @@ def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | N
current_state = stack[-1][0] current_state = stack[-1][0]
current_token = input[input_index] current_token = input[input_index]
action = table.states[current_state].get(current_token, parser.Error()) action = table.actions[current_state].get(current_token, parser.Error())
if trace: if trace:
trace(stack, input, input_index, action) trace(stack, input, input_index, action)
@ -84,21 +86,21 @@ def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | N
value = Tree(name=name if not transparent else None, children=tuple(children)) value = Tree(name=name if not transparent else None, children=tuple(children))
stack = stack[:-size] stack = stack[:-size]
goto = table.states[stack[-1][0]].get(name, parser.Error()) goto = table.gotos[stack[-1][0]].get(name)
assert isinstance(goto, parser.Goto) assert goto is not None
stack.append((goto.state, value)) stack.append((goto, value))
case parser.Shift(state): case parser.Shift(state):
stack.append((state, current_token)) stack.append((state, current_token))
input_index += 1 input_index += 1
case parser.Error(): case parser.Error():
if input_index >= len(tokens.tokens): if input_index >= len(input_tokens):
message = "Unexpected end of file" message = "Unexpected end of file"
start = tokens.tokens[-1][1] start = input_tokens[-1][1]
else: else:
message = f"Syntax error: unexpected symbol {current_token}" message = f"Syntax error: unexpected symbol {current_token}"
(_, start, _) = tokens.tokens[input_index] (_, start, _) = input_tokens[input_index]
line_index = bisect.bisect_left(tokens.lines, start) line_index = bisect.bisect_left(tokens.lines, start)
if line_index == 0: if line_index == 0:
@ -147,7 +149,7 @@ def CSI(x: bytes) -> bytes:
return ESC(b"[" + x) return ESC(b"[" + x)
CLEAR = CSI(b"2J") CLEAR = CSI(b"H") + CSI(b"J")
def enter_alt_screen(): def enter_alt_screen():
@ -158,15 +160,108 @@ def leave_alt_screen():
sys.stdout.buffer.write(CSI(b"?1049l")) sys.stdout.buffer.write(CSI(b"?1049l"))
class DynamicModule:
file_name: str
member_name: str | None
last_time: float | None
module: types.ModuleType | None
def __init__(self, file_name, member_name):
self.file_name = file_name
self.member_name = member_name
self.last_time = None
self.module = None
self.value = None
def _predicate(self, member) -> bool:
if not inspect.isclass(member):
return False
assert self.module is not None
if member.__module__ != self.module.__name__:
return False
return True
def _transform(self, value):
return value
def get(self):
st = os.stat(self.file_name)
if self.last_time == st.st_mtime:
assert self.value is not None
return self.value
self.value = None
if self.module is None:
mod_name = inspect.getmodulename(self.file_name)
if mod_name is None:
raise Exception(f"{self.file_name} does not seem to be a module")
self.module = importlib.import_module(mod_name)
else:
importlib.reload(self.module)
if self.member_name is None:
classes = inspect.getmembers(self.module, self._predicate)
if len(classes) == 0:
raise Exception(f"No grammars found in {self.file_name}")
if len(classes) > 1:
raise Exception(
f"{len(classes)} grammars found in {self.file_name}: {', '.join(c[0] for c in classes)}"
)
cls = classes[0][1]
else:
cls = getattr(self.module, self.member_name)
if cls is None:
raise Exception(f"Cannot find {self.member_name} in {self.file_name}")
if not self._predicate(cls):
raise Exception(f"{self.member_name} in {self.file_name} is not suitable")
self.value = self._transform(cls)
self.last_time = st.st_mtime
return self.value
class DynamicGrammarModule(DynamicModule):
def __init__(self, file_name, member_name, start_rule, generator):
super().__init__(file_name, member_name)
self.start_rule = start_rule
self.generator = generator
def _predicate(self, member) -> bool:
if not super()._predicate(member):
return False
if getattr(member, "build_table", None):
return True
return False
def _transform(self, value):
return value().build_table(start=self.start_rule, generator=self.generator)
class DynamicLexerModule(DynamicModule):
def _predicate(self, member) -> bool:
if not super()._predicate(member):
return False
if getattr(member, "tokens", None):
return True
return False
class Harness: class Harness:
source: str | None source: str | None
table: parser.ParseTable | None table: parser.ParseTable | None
tree: Tree | None tree: Tree | None
def __init__(self, lexer_func, start_rule, source_path): def __init__(self, start_rule, source_path):
# self.generator = parser.GenerateLR1
self.generator = parser.GenerateLALR
self.lexer_func = lexer_func
self.start_rule = start_rule self.start_rule = start_rule
self.source_path = source_path self.source_path = source_path
@ -176,10 +271,11 @@ class Harness:
self.tree = None self.tree = None
self.errors = None self.errors = None
self.grammar_file_name = "./grammar.py" self.grammar_module = DynamicGrammarModule(
self.last_grammar_time = None "./grammar.py", None, self.start_rule, generator=parser.GenerateLALR
self.grammar_module = None )
self.grammar_name = None
self.lexer_module = DynamicLexerModule("./grammar.py", None)
def run(self): def run(self):
while True: while True:
@ -191,71 +287,19 @@ class Harness:
self.update() self.update()
# def should_reload_grammar(self):
def load_grammar(self) -> parser.ParseTable: def load_grammar(self) -> parser.ParseTable:
st = os.stat(self.grammar_file_name) return self.grammar_module.get()
if self.last_grammar_time == st.st_mtime:
assert self.table is not None
return self.table
self.table = None
if self.grammar_module is None:
mod_name = inspect.getmodulename(self.grammar_file_name)
if mod_name is None:
raise Exception(f"{self.grammar_file_name} does not seem to be a module")
self.grammar_module = importlib.import_module(mod_name)
else:
importlib.reload(self.grammar_module)
def is_grammar(cls):
if not inspect.isclass(cls):
return False
assert self.grammar_module is not None
if cls.__module__ != self.grammar_module.__name__:
return False
if getattr(cls, "build_table", None):
return True
return False
if self.grammar_name is None:
classes = inspect.getmembers(self.grammar_module, is_grammar)
if len(classes) == 0:
raise Exception(f"No grammars found in {self.grammar_file_name}")
if len(classes) > 1:
raise Exception(
f"{len(classes)} grammars found in {self.grammar_file_name}: {', '.join(c[0] for c in classes)}"
)
grammar_func = classes[0][1]
else:
cls = getattr(self.grammar_module, self.grammar_name)
if cls is None:
raise Exception(f"Cannot find {self.grammar_name} in {self.grammar_file_name}")
if not is_grammar(cls):
raise Exception(
f"{self.grammar_name} in {self.grammar_file_name} does not seem to be a grammar"
)
grammar_func = cls
self.table = grammar_func().build_table(start=self.start_rule, generator=self.generator)
self.last_grammar_time = st.st_mtime
assert self.table is not None
return self.table
def update(self): def update(self):
start_time = time.time() start_time = time.time()
try: try:
table = self.load_grammar() table = self.load_grammar()
lexer_func = self.lexer_module.get()
with open(self.source_path, "r", encoding="utf-8") as f: with open(self.source_path, "r", encoding="utf-8") as f:
self.source = f.read() self.source = f.read()
self.tokens = self.lexer_func(self.source) self.tokens = lexer_func(self.source)
lex_time = time.time() lex_time = time.time()
# print(f"{tokens.lines}") # print(f"{tokens.lines}")
@ -268,7 +312,9 @@ class Harness:
except Exception as e: except Exception as e:
self.tree = None self.tree = None
self.errors = [f"Error loading grammar: {e}"] self.errors = ["Error loading grammar:"] + [
" " + l.rstrip() for fl in traceback.format_exception(e) for l in fl.splitlines()
]
parse_elapsed = time.time() - start_time parse_elapsed = time.time() - start_time
table = None table = None
@ -276,7 +322,7 @@ class Harness:
rows, cols = termios.tcgetwinsize(sys.stdout.fileno()) rows, cols = termios.tcgetwinsize(sys.stdout.fileno())
if table is not None: if table is not None:
states = table.states states = table.actions
average_entries = sum(len(row) for row in states) / len(states) average_entries = sum(len(row) for row in states) / len(states)
max_entries = max(len(row) for row in states) max_entries = max(len(row) for row in states)
print( print(
@ -320,7 +366,6 @@ if __name__ == "__main__":
enter_alt_screen() enter_alt_screen()
h = Harness( h = Harness(
lexer_func=grammar.FineTokens,
start_rule="file", start_rule="file",
source_path=source_path, source_path=source_path,
) )

197
parser.py
View file

@ -21,10 +21,10 @@ To get started, create a grammar that derives from the `Grammar` class. Create
one method per nonterminal, decorated with the `rule` decorator. Here's an one method per nonterminal, decorated with the `rule` decorator. Here's an
example: example:
PLUS = Token('+') PLUS = Terminal('+')
LPAREN = Token('(') LPAREN = Terminal('(')
RPAREN = Token(')') RPAREN = Terminal(')')
ID = Token('id') ID = Terminal('id')
class SimpleGrammar(Grammar): class SimpleGrammar(Grammar):
@rule @rule
@ -410,11 +410,6 @@ class Shift(Action):
state: int state: int
@dataclasses.dataclass
class Goto(Action):
state: int
@dataclasses.dataclass @dataclasses.dataclass
class Accept(Action): class Accept(Action):
pass pass
@ -511,8 +506,7 @@ class ErrorCollection:
case Accept(): case Accept():
action_str = "accept the parse" action_str = "accept the parse"
case _: case _:
assert isinstance(action, Goto) raise Exception(f"unknown action type {action}")
raise Exception("Shouldn't conflict on goto ever")
lines.append( lines.append(
f" - We are in the rule `{name}: {rule}` and we should {action_str}" f" - We are in the rule `{name}: {rule}` and we should {action_str}"
@ -525,7 +519,53 @@ class ErrorCollection:
@dataclasses.dataclass @dataclasses.dataclass
class ParseTable: class ParseTable:
states: list[dict[str, Action]] actions: list[dict[str, Action]]
gotos: list[dict[str, int]]
def format(self):
"""Format a parser table so pretty."""
def format_action(actions: dict[str, Action], terminal: str):
action = actions.get(terminal)
match action:
case Accept():
return "accept"
case Shift(state=state):
return f"s{state}"
case Reduce(count=count):
return f"r{count}"
case _:
return ""
def format_goto(gotos: dict[str, int], nt: str):
index = gotos.get(nt)
if index is None:
return ""
else:
return str(index)
terminals = list(sorted({k for row in self.actions for k in row.keys()}))
nonterminals = list(sorted({k for row in self.gotos for k in row.keys()}))
header = " | {terms} | {nts}".format(
terms=" ".join(f"{terminal: <6}" for terminal in terminals),
nts=" ".join(f"{nt: <5}" for nt in nonterminals),
)
lines = [
header,
"-" * len(header),
] + [
"{index: <4} | {actions} | {gotos}".format(
index=i,
actions=" ".join(
"{0: <6}".format(format_action(actions, terminal)) for terminal in terminals
),
gotos=" ".join("{0: <5}".format(format_goto(gotos, nt)) for nt in nonterminals),
)
for i, (actions, gotos) in enumerate(zip(self.actions, self.gotos))
]
return "\n".join(lines)
class TableBuilder(object): class TableBuilder(object):
@ -536,12 +576,14 @@ class TableBuilder(object):
""" """
errors: ErrorCollection errors: ErrorCollection
table: list[dict[str, Action]] actions: list[dict[str, Action]]
gotos: list[dict[str, int]]
alphabet: list[str] alphabet: list[str]
precedence: typing.Tuple[typing.Tuple[Assoc, int], ...] precedence: typing.Tuple[typing.Tuple[Assoc, int], ...]
transparents: set[str] transparents: set[str]
row: None | list[typing.Tuple[None | Action, None | Configuration]] action_row: None | list[typing.Tuple[None | Action, None | Configuration]]
goto_row: None | list[None | int]
def __init__( def __init__(
self, self,
@ -550,11 +592,14 @@ class TableBuilder(object):
transparents: set[str], transparents: set[str],
): ):
self.errors = ErrorCollection() self.errors = ErrorCollection()
self.table = [] self.actions = []
self.gotos = []
self.alphabet = alphabet self.alphabet = alphabet
self.precedence = precedence self.precedence = precedence
self.transparents = transparents self.transparents = transparents
self.row = None self.action_row = None
self.goto_row = None
def flush(self, all_sets: ConfigurationSetInfo) -> ParseTable: def flush(self, all_sets: ConfigurationSetInfo) -> ParseTable:
"""Finish building the table and return it. """Finish building the table and return it.
@ -565,20 +610,31 @@ class TableBuilder(object):
if self.errors.any(): if self.errors.any():
errors = self.errors.format(self.alphabet, all_sets) errors = self.errors.format(self.alphabet, all_sets)
raise ValueError(f"Errors building the table:\n\n{errors}") raise ValueError(f"Errors building the table:\n\n{errors}")
return ParseTable(states=self.table) return ParseTable(actions=self.actions, gotos=self.gotos)
def new_row(self, config_set: ConfigSet): def new_row(self, config_set: ConfigSet):
"""Start a new row, processing the given config set. Call this before """Start a new row, processing the given config set. Call this before
doing anything else. doing anything else.
""" """
self._flush_row() self._flush_row()
self.row = [(None, None) for _ in self.alphabet] self.action_row = [(None, None) for _ in self.alphabet]
self.goto_row = [None for _ in self.alphabet]
self.current_config_set = config_set self.current_config_set = config_set
def _flush_row(self): def _flush_row(self):
if self.row: if self.action_row:
actions = {self.alphabet[k]: v[0] for k, v in enumerate(self.row) if v[0] is not None} actions = {
self.table.append(actions) self.alphabet[sym]: e[0]
for sym, e in enumerate(self.action_row)
if e[0] is not None
}
self.actions.append(actions)
if self.goto_row:
gotos = {self.alphabet[sym]: e for sym, e in enumerate(self.goto_row) if e is not None}
self.gotos.append(gotos)
def set_table_reduce(self, symbol: int, config: Configuration): def set_table_reduce(self, symbol: int, config: Configuration):
"""Mark a reduce of the given configuration for the given symbol in the """Mark a reduce of the given configuration for the given symbol in the
@ -604,7 +660,9 @@ class TableBuilder(object):
def set_table_goto(self, symbol: int, index: int):
    """Set the goto for the given nonterminal symbol in the current row."""
    row = self.goto_row
    assert row is not None
    # NOTE(review): the original marked this assert with "?" — we assume the
    # generator writes each nonterminal's goto at most once per row; confirm
    # that invariant actually holds.
    assert row[symbol] is None
    row[symbol] = index
def _action_precedence(self, symbol: int, action: Action, config: Configuration): def _action_precedence(self, symbol: int, action: Action, config: Configuration):
if isinstance(action, Shift): if isinstance(action, Shift):
@ -620,8 +678,8 @@ class TableBuilder(object):
""" """
assert isinstance(symbol_id, int) assert isinstance(symbol_id, int)
assert self.row is not None assert self.action_row is not None
existing, existing_config = self.row[symbol_id] existing, existing_config = self.action_row[symbol_id]
if existing is not None and existing != action: if existing is not None and existing != action:
assert existing_config is not None assert existing_config is not None
assert config is not None assert config is not None
@ -675,7 +733,7 @@ class TableBuilder(object):
# action, just allow the overwrite with no change. # action, just allow the overwrite with no change.
pass pass
self.row[symbol_id] = (action, config) self.action_row[symbol_id] = (action, config)
class GenerateLR0: class GenerateLR0:
@ -1036,7 +1094,7 @@ def parse(table: ParseTable, input, trace=False):
current_state = stack[-1][0] current_state = stack[-1][0]
current_token = input[input_index] current_token = input[input_index]
action = table.states[current_state].get(current_token, Error()) action = table.actions[current_state].get(current_token, Error())
if trace: if trace:
print( print(
"{stack: <20} {input: <50} {action: <5}".format( "{stack: <20} {input: <50} {action: <5}".format(
@ -1061,9 +1119,9 @@ def parse(table: ParseTable, input, trace=False):
value = (name if not transparent else None, tuple(children)) value = (name if not transparent else None, tuple(children))
stack = stack[:-size] stack = stack[:-size]
goto = table.states[stack[-1][0]].get(name, Error()) goto = table.gotos[stack[-1][0]].get(name)
assert isinstance(goto, Goto) assert goto is not None
stack.append((goto.state, value)) stack.append((goto, value))
case Shift(state): case Shift(state):
stack.append((state, (current_token, ()))) stack.append((state, (current_token, ())))
@ -1554,7 +1612,7 @@ class Rule:
return SequenceRule(self, other) return SequenceRule(self, other)
@abc.abstractmethod @abc.abstractmethod
def flatten(self) -> typing.Generator[list["str | Token"], None, None]: def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]:
"""Convert this potentially nested and branching set of rules into a """Convert this potentially nested and branching set of rules into a
series of nice, flat symbol lists. series of nice, flat symbol lists.
@ -1574,7 +1632,7 @@ class Rule:
raise NotImplementedError() raise NotImplementedError()
class Token(Rule): class Terminal(Rule):
"""A token, or terminal symbol in the grammar.""" """A token, or terminal symbol in the grammar."""
value: str value: str
@ -1582,7 +1640,7 @@ class Token(Rule):
def __init__(self, value):
    # Intern the terminal's name: terminal names are compared constantly
    # during table construction, and interned strings make equality checks
    # (and dict lookups) hit the identity fast path while deduplicating
    # storage.
    self.value = sys.intern(value)
def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]:
    """A terminal is already flat: yield the single production [self]."""
    yield [self]
@ -1616,7 +1674,7 @@ class NonTerminal(Rule):
self.name = name or fn.__name__ self.name = name or fn.__name__
self.transparent = transparent self.transparent = transparent
def generate_body(self, grammar) -> list[list[str | Token]]: def generate_body(self, grammar) -> list[list[str | Terminal]]:
"""Generate the body of the non-terminal. """Generate the body of the non-terminal.
We do this by first calling the associated function in order to get a We do this by first calling the associated function in order to get a
@ -1625,7 +1683,7 @@ class NonTerminal(Rule):
""" """
return [rule for rule in self.fn(grammar).flatten()] return [rule for rule in self.fn(grammar).flatten()]
def flatten(self) -> typing.Generator[list[str | Token], None, None]: def flatten(self) -> typing.Generator[list[str | Terminal], None, None]:
# Although we contain multitudes, when flattened we're being asked in # Although we contain multitudes, when flattened we're being asked in
# the context of some other production. Yield ourselves, and trust that # the context of some other production. Yield ourselves, and trust that
# in time we will be asked to generate our body. # in time we will be asked to generate our body.
@ -1639,7 +1697,7 @@ class AlternativeRule(Rule):
self.left = left self.left = left
self.right = right self.right = right
def flatten(self) -> typing.Generator[list[str | Token], None, None]: def flatten(self) -> typing.Generator[list[str | Terminal], None, None]:
# All the things from the left of the alternative, then all the things # All the things from the left of the alternative, then all the things
# from the right, never intermingled. # from the right, never intermingled.
yield from self.left.flatten() yield from self.left.flatten()
@ -1655,7 +1713,7 @@ class SequenceRule(Rule):
self.first = first self.first = first
self.second = second self.second = second
def flatten(self) -> typing.Generator[list[str | Token], None, None]: def flatten(self) -> typing.Generator[list[str | Terminal], None, None]:
# All the things in the prefix.... # All the things in the prefix....
for first in self.first.flatten(): for first in self.first.flatten():
# ...potentially followed by all the things in the suffix. # ...potentially followed by all the things in the suffix.
@ -1668,7 +1726,7 @@ class NothingRule(Rule):
these, you're probably better off just using the singleton `Nothing`. these, you're probably better off just using the singleton `Nothing`.
""" """
def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]:
    """The empty rule flattens to exactly one production: the empty one."""
    yield []
@ -1720,10 +1778,10 @@ class Grammar:
Here's an example of a simple grammar: Here's an example of a simple grammar:
PLUS = Token('+') PLUS = Terminal('+')
LPAREN = Token('(') LPAREN = Terminal('(')
RPAREN = Token(')') RPAREN = Terminal(')')
ID = Token('id') ID = Terminal('id')
class SimpleGrammar(Grammar): class SimpleGrammar(Grammar):
@rule @rule
@ -1745,7 +1803,7 @@ class Grammar:
precedence_table = {} precedence_table = {}
for prec, (associativity, symbols) in enumerate(precedence): for prec, (associativity, symbols) in enumerate(precedence):
for symbol in symbols: for symbol in symbols:
if isinstance(symbol, Token): if isinstance(symbol, Terminal):
key = symbol.value key = symbol.value
elif isinstance(symbol, NonTerminal): elif isinstance(symbol, NonTerminal):
key = symbol.name key = symbol.name
@ -1758,7 +1816,7 @@ class Grammar:
def generate_nonterminal_dict( def generate_nonterminal_dict(
self, start: str self, start: str
) -> typing.Tuple[dict[str, list[list[str | Token]]], set[str]]: ) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]:
"""Convert the rules into a dictionary of productions. """Convert the rules into a dictionary of productions.
Our table generators work on a very flat set of productions. This is the Our table generators work on a very flat set of productions. This is the
@ -1785,7 +1843,7 @@ class Grammar:
body = rule.generate_body(self) body = rule.generate_body(self)
for clause in body: for clause in body:
for symbol in clause: for symbol in clause:
if not isinstance(symbol, Token): if not isinstance(symbol, Terminal):
assert isinstance(symbol, str) assert isinstance(symbol, str)
nonterminal = nonterminals.get(symbol) nonterminal = nonterminals.get(symbol)
if nonterminal is None: if nonterminal is None:
@ -1811,7 +1869,7 @@ class Grammar:
for clause in clauses: for clause in clauses:
new_clause = [] new_clause = []
for symbol in clause: for symbol in clause:
if isinstance(symbol, Token): if isinstance(symbol, Terminal):
new_clause.append(symbol.value) new_clause.append(symbol.value)
else: else:
new_clause.append(symbol) new_clause.append(symbol)
@ -1842,45 +1900,6 @@ def format_node(node):
return "\n".join(lines) return "\n".join(lines)
def format_table(generator, table: "ParseTable"):
    """Format a parser table so pretty.

    Renders one line per state: the action cells for every terminal,
    then the goto cells for every nonterminal, under a shared header.
    """

    def format_action(state, terminal):
        # Cells absent from the row dict are errors and render as blanks.
        action = state.get(terminal, ("error",))
        kind = action[0]
        if kind == "accept":
            return "accept"
        if kind == "shift":
            return f"s{action[1]}"
        if kind == "error":
            return ""
        if kind == "reduce":
            return f"r{action[1]}"

    # generator.terminal / generator.nonterminal are boolean masks parallel
    # to generator.alphabet; pull out the names of each class, sorted.
    terminals = sorted(
        name for name, is_term in zip(generator.alphabet, generator.terminal) if is_term
    )
    nonterminals = sorted(
        name for name, is_nt in zip(generator.alphabet, generator.nonterminal) if is_nt
    )

    header = " | {} | {}".format(
        " ".join(f"{terminal: <6}" for terminal in terminals),
        " ".join(f"{nt: <5}" for nt in nonterminals),
    )
    lines = [header, "-" * len(header)]
    for index, row in enumerate(table.states):
        action_cells = " ".join(
            f"{format_action(row, terminal): <6}" for terminal in terminals
        )
        goto_cells = " ".join(
            f"{row.get(nt, ('error', ''))[1]: <5}" for nt in nonterminals
        )
        lines.append(f"{index: <3} | {action_cells} | {goto_cells}")
    return "\n".join(lines)
############################################################################### ###############################################################################
# Examples # Examples
############################################################################### ###############################################################################
@ -1901,7 +1920,7 @@ def examples():
gen = GenerateLR0("E", grammar_simple) gen = GenerateLR0("E", grammar_simple)
table = gen.gen_table() table = gen.gen_table()
print(format_table(gen, table)) print(table.format())
tree = parse(table, ["id", "+", "(", "id", ")"]) tree = parse(table, ["id", "+", "(", "id", ")"])
print(format_node(tree) + "\n") print(format_node(tree) + "\n")
print() print()
@ -1954,7 +1973,7 @@ def examples():
gen = GenerateSLR1("E", grammar_lr0_shift_reduce) gen = GenerateSLR1("E", grammar_lr0_shift_reduce)
print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}") print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}")
table = gen.gen_table() table = gen.gen_table()
print(format_table(gen, table)) print(table.format())
tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True) tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True)
print(format_node(tree) + "\n") print(format_node(tree) + "\n")
print() print()
@ -1985,7 +2004,7 @@ def examples():
] ]
gen = GenerateLR1("S", grammar_aho_ullman_2) gen = GenerateLR1("S", grammar_aho_ullman_2)
table = gen.gen_table() table = gen.gen_table()
print(format_table(gen, table)) print(table.format())
parse(table, ["b", "a", "a", "b"], trace=True) parse(table, ["b", "a", "a", "b"], trace=True)
print() print()
@ -1993,7 +2012,7 @@ def examples():
print("grammar_aho_ullman_2 (LALR):") print("grammar_aho_ullman_2 (LALR):")
gen = GenerateLALR("S", grammar_aho_ullman_2) gen = GenerateLALR("S", grammar_aho_ullman_2)
table = gen.gen_table() table = gen.gen_table()
print(format_table(gen, table)) print(table.format())
print() print()
# A fun LALAR grammar. # A fun LALAR grammar.
@ -2009,7 +2028,7 @@ def examples():
] ]
gen = GenerateLALR("S", grammar_lalr) gen = GenerateLALR("S", grammar_lalr)
table = gen.gen_table() table = gen.gen_table()
print(format_table(gen, table)) print(table.format())
print() print()