From 56d24c5fb917a0b65ced0b52ff5c8c5cb84c4533 Mon Sep 17 00:00:00 2001 From: John Doty Date: Thu, 30 May 2024 08:02:47 -0700 Subject: [PATCH] Chaos: split tables, interactions, Terminal - Tables are split into `actions` and `goto` now to make formatting nicer - Token is renamed Terminal - Likes are now Florps - Lexer now loaded dynamically (badly) --- grammar.py | 234 ++++++++++++++++++++++++++--------------------------- harness.py | 197 +++++++++++++++++++++++++++----------------- parser.py | 197 ++++++++++++++++++++++++-------------------- 3 files changed, 342 insertions(+), 286 deletions(-) diff --git a/grammar.py b/grammar.py index 64722d4..ad4ad24 100644 --- a/grammar.py +++ b/grammar.py @@ -1,56 +1,56 @@ # This is an example grammar. import re -from parser import Assoc, Grammar, Nothing, Token, rule, seq +from parser import Assoc, Grammar, Nothing, Terminal, rule, seq, Rule -ARROW = Token("Arrow") -AS = Token("As") -BAR = Token("Bar") -CLASS = Token("Class") -COLON = Token("Colon") -ELSE = Token("Else") -FOR = Token("For") -FUN = Token("Fun") -IDENTIFIER = Token("Identifier") -IF = Token("If") -IMPORT = Token("Import") -IN = Token("In") -LCURLY = Token("LeftBrace") -LET = Token("Let") -RCURLY = Token("RightBrace") -RETURN = Token("Return") -SEMICOLON = Token("Semicolon") -STRING = Token("String") -WHILE = Token("While") -EQUAL = Token("Equal") -LPAREN = Token("LeftParen") -RPAREN = Token("RightParen") -COMMA = Token("Comma") -SELF = Token("Selff") -OR = Token("Or") -IS = Token("Is") -AND = Token("And") -EQUALEQUAL = Token("EqualEqual") -BANGEQUAL = Token("BangEqual") -LESS = Token("Less") -GREATER = Token("Greater") -LESSEQUAL = Token("LessEqual") -GREATEREQUAL = Token("GreaterEqual") -PLUS = Token("Plus") -MINUS = Token("Minus") -STAR = Token("Star") -SLASH = Token("Slash") -NUMBER = Token("Number") -TRUE = Token("True") -FALSE = Token("False") -BANG = Token("Bang") -DOT = Token("Dot") -MATCH = Token("Match") -EXPORT = Token("Export") -UNDERSCORE = Token("Underscore") -NEW = Token("New") -LSQUARE = Token("LeftBracket") -RSQUARE = Token("RightBracket") +ARROW = Terminal("Arrow") +AS = Terminal("As") +BAR = Terminal("Bar") +CLASS = Terminal("Class") +COLON = Terminal("Colon") +ELSE = Terminal("Else") +FOR = Terminal("For") +FUN = Terminal("Fun") +IDENTIFIER = Terminal("Identifier") +IF = Terminal("If") +IMPORT = Terminal("Import") +IN = Terminal("In") +LCURLY = Terminal("LeftBrace") +LET = Terminal("Let") +RCURLY = Terminal("RightBrace") +RETURN = Terminal("Return") +SEMICOLON = Terminal("Semicolon") +STRING = Terminal("String") +WHILE = Terminal("While") +EQUAL = Terminal("Equal") +LPAREN = Terminal("LeftParen") +RPAREN = Terminal("RightParen") +COMMA = Terminal("Comma") +SELF = Terminal("Selff") +OR = Terminal("Or") +IS = Terminal("Is") +AND = Terminal("And") +EQUALEQUAL = Terminal("EqualEqual") +BANGEQUAL = Terminal("BangEqual") +LESS = Terminal("Less") +GREATER = Terminal("Greater") +LESSEQUAL = Terminal("LessEqual") +GREATEREQUAL = Terminal("GreaterEqual") +PLUS = Terminal("Plus") +MINUS = Terminal("Minus") +STAR = Terminal("Star") +SLASH = Terminal("Slash") +NUMBER = Terminal("Number") +TRUE = Terminal("True") +FALSE = Terminal("False") +BANG = Terminal("Bang") +DOT = Terminal("Dot") +MATCH = Terminal("Match") +EXPORT = Terminal("Export") +UNDERSCORE = Terminal("Underscore") +NEW = Terminal("New") +LSQUARE = Terminal("LeftBracket") +RSQUARE = Terminal("RightBracket") class FineGrammar(Grammar): @@ -77,58 +77,58 @@ class FineGrammar(Grammar): ) @rule - def file(self): + def file(self) -> Rule: return self._file_statement_list @rule - def _file_statement_list(self): + def _file_statement_list(self) -> Rule: return self._file_statement | (self._file_statement_list + self._file_statement) @rule - def _file_statement(self): + def _file_statement(self) -> Rule: return ( - self.import_statement | self.class_declaration | self.export_statement | self.statement + self.import_statement | self.class_declaration | self.export_statement | self._statement ) @rule - def import_statement(self): + def import_statement(self) -> Rule: return seq(IMPORT, STRING, AS, IDENTIFIER, SEMICOLON) @rule - def class_declaration(self): + def class_declaration(self) -> Rule: return seq(CLASS, IDENTIFIER, self.class_body) @rule - def class_body(self): + def class_body(self) -> Rule: return seq(LCURLY, RCURLY) | seq(LCURLY, self._class_members, RCURLY) @rule - def _class_members(self): + def _class_members(self) -> Rule: return self._class_member | seq(self._class_members, self._class_member) @rule - def _class_member(self): + def _class_member(self) -> Rule: return self.field_declaration | self.function_declaration @rule - def field_declaration(self): + def field_declaration(self) -> Rule: return seq(IDENTIFIER, COLON, self.type_expression, SEMICOLON) # Types @rule - def type_expression(self): + def type_expression(self) -> Rule: return self.alternate_type | self.type_identifier @rule - def alternate_type(self): + def alternate_type(self) -> Rule: return seq(self.type_expression, OR, self.type_identifier) @rule - def type_identifier(self): + def type_identifier(self) -> Rule: return IDENTIFIER @rule - def export_statement(self): + def export_statement(self) -> Rule: return ( seq(EXPORT, self.class_declaration) | seq(EXPORT, self.function_declaration) @@ -137,18 +137,18 @@ class FineGrammar(Grammar): ) @rule - def export_list(self): + def export_list(self) -> Rule: return Nothing | IDENTIFIER | seq(IDENTIFIER, COMMA, self.export_list) # Functions @rule - def function_declaration(self): + def function_declaration(self) -> Rule: return seq(FUN, IDENTIFIER, self.function_parameters, self.block) | seq( FUN, IDENTIFIER, self.function_parameters, ARROW, self.type_expression, self.block ) @rule - def function_parameters(self): + def function_parameters(self) -> Rule: return ( seq(LPAREN, RPAREN) | seq(LPAREN, self.first_parameter, RPAREN) @@ -156,33 +156,33 @@ class FineGrammar(Grammar): ) @rule - def first_parameter(self): + def first_parameter(self) -> Rule: return SELF | self.parameter @rule - def parameter_list(self): + def parameter_list(self) -> Rule: return Nothing | self.parameter | seq(self.parameter, COMMA, self.parameter_list) @rule - def parameter(self): + def parameter(self) -> Rule: return seq(IDENTIFIER, COLON, self.type_expression) # Block @rule - def block(self): + def block(self) -> Rule: return ( seq(LCURLY, RCURLY) | seq(LCURLY, self.expression, RCURLY) - | seq(LCURLY, self.statement_list, RCURLY) - | seq(LCURLY, self.statement_list, self.expression, RCURLY) + | seq(LCURLY, self._statement_list, RCURLY) + | seq(LCURLY, self._statement_list, self.expression, RCURLY) ) @rule - def statement_list(self): - return self.statement | seq(self.statement_list, self.statement) + def _statement_list(self) -> Rule: + return self._statement | seq(self._statement_list, self._statement) @rule - def statement(self): + def _statement(self) -> Rule: return ( self.function_declaration | self.let_statement @@ -194,56 +194,56 @@ class FineGrammar(Grammar): ) @rule - def let_statement(self): + def let_statement(self) -> Rule: return seq(LET, IDENTIFIER, EQUAL, self.expression, SEMICOLON) @rule - def return_statement(self): + def return_statement(self) -> Rule: return seq(RETURN, self.expression, SEMICOLON) | seq(RETURN, SEMICOLON) @rule - def for_statement(self): + def for_statement(self) -> Rule: return seq(FOR, self.iterator_variable, IN, self.expression, self.block) @rule - def iterator_variable(self): + def iterator_variable(self) -> Rule: return IDENTIFIER @rule - def if_statement(self): + def if_statement(self) -> Rule: return self.conditional_expression @rule - def while_statement(self): + def while_statement(self) -> Rule: return seq(WHILE, self.expression, self.block) @rule - def expression_statement(self): + def expression_statement(self) -> Rule: return seq(self.expression, SEMICOLON) # Expressions @rule - def expression(self): + def expression(self) -> Rule: return self.assignment_expression @rule - def assignment_expression(self): + def assignment_expression(self) -> Rule: return seq(self.or_expression, EQUAL, self.assignment_expression) | self.or_expression @rule - def or_expression(self): + def or_expression(self) -> Rule: return seq(self.or_expression, OR, self.is_expression) | self.is_expression @rule - def is_expression(self): + def is_expression(self) -> Rule: return seq(self.is_expression, IS, self.pattern) | self.and_expression @rule - def and_expression(self): + def and_expression(self) -> Rule: return seq(self.and_expression, AND, self.equality_expression) | self.equality_expression @rule - def equality_expression(self): + def equality_expression(self) -> Rule: return ( seq(self.equality_expression, EQUALEQUAL, self.relation_expression) | seq(self.equality_expression, BANGEQUAL, self.relation_expression) @@ -251,7 +251,7 @@ class FineGrammar(Grammar): ) @rule - def relation_expression(self): + def relation_expression(self) -> Rule: return ( seq(self.relation_expression, LESS, self.additive_expression) | seq(self.relation_expression, LESSEQUAL, self.additive_expression) @@ -261,7 +261,7 @@ class FineGrammar(Grammar): ) @rule - def additive_expression(self): + def additive_expression(self) -> Rule: return ( seq(self.additive_expression, PLUS, self.multiplication_expression) | seq(self.additive_expression, MINUS, self.multiplication_expression) @@ -269,7 +269,7 @@ class FineGrammar(Grammar): ) @rule - def multiplication_expression(self): + def multiplication_expression(self) -> Rule: return ( seq(self.multiplication_expression, STAR, self.primary_expression) | seq(self.multiplication_expression, SLASH, self.primary_expression) @@ -277,7 +277,7 @@ class FineGrammar(Grammar): ) @rule - def primary_expression(self): + def primary_expression(self) -> Rule: return ( IDENTIFIER | SELF @@ -299,7 +299,7 @@ class FineGrammar(Grammar): ) @rule - def conditional_expression(self): + def conditional_expression(self) -> Rule: return ( seq(IF, self.expression, self.block) | seq(IF, self.expression, self.block, ELSE, self.conditional_expression) @@ -307,11 +307,11 @@ class FineGrammar(Grammar): ) @rule - def list_constructor_expression(self): + def list_constructor_expression(self) -> Rule: return seq(LSQUARE, RSQUARE) | seq(LSQUARE, self.expression_list, RSQUARE) @rule - def expression_list(self): + def expression_list(self) -> Rule: return ( self.expression | seq(self.expression, COMMA) @@ -319,15 +319,15 @@ class FineGrammar(Grammar): ) @rule - def match_expression(self): + def match_expression(self) -> Rule: return seq(MATCH, self.expression, self.match_body) @rule - def match_body(self): + def match_body(self) -> Rule: return seq(LCURLY, RCURLY) | seq(LCURLY, self.match_arms, RCURLY) @rule - def match_arms(self): + def match_arms(self) -> Rule: return ( self.match_arm | seq(self.match_arm, COMMA) @@ -335,11 +335,11 @@ class FineGrammar(Grammar): ) @rule - def match_arm(self): + def match_arm(self) -> Rule: return seq(self.pattern, ARROW, self.expression) @rule - def pattern(self): + def pattern(self) -> Rule: return ( seq(self.variable_binding, self.pattern_core, AND, self.and_expression) | seq(self.variable_binding, self.pattern_core) @@ -348,27 +348,27 @@ class FineGrammar(Grammar): ) @rule - def pattern_core(self): + def pattern_core(self) -> Rule: return self.type_expression | self.wildcard_pattern @rule - def wildcard_pattern(self): + def wildcard_pattern(self) -> Rule: return UNDERSCORE @rule - def variable_binding(self): + def variable_binding(self) -> Rule: return seq(IDENTIFIER, COLON) @rule - def object_constructor_expression(self): + def object_constructor_expression(self) -> Rule: return seq(NEW, self.type_identifier, self.field_list) @rule - def field_list(self): + def field_list(self) -> Rule: return seq(LCURLY, RCURLY) | seq(LCURLY, self.field_values, RCURLY) @rule - def field_values(self): + def field_values(self) -> Rule: return ( self.field_value | seq(self.field_value, COMMA) @@ -376,7 +376,7 @@ class FineGrammar(Grammar): ) @rule - def field_value(self): + def field_value(self) -> Rule: return IDENTIFIER | seq(IDENTIFIER, COLON, self.expression) @@ -533,16 +533,19 @@ import bisect class FineTokens: def __init__(self, src: str): self.src = src - self.tokens = list(tokenize(src)) + self._tokens = list(tokenize(src)) self.lines = [m.start() for m in re.finditer("\n", src)] + def tokens(self): + return self._tokens + def dump(self, *, start=None, end=None): if start is None: start = 0 if end is None: - end = len(self.tokens) + end = len(self._tokens) - for token in self.tokens[start:end]: + for token in self._tokens[start:end]: (kind, start, length) = token line_index = bisect.bisect_left(self.lines, start) if line_index == 0: @@ -553,14 +556,3 @@ class FineTokens: print( f"{start:04} {kind.value:12} {self.src[start:start+length]} ({line_index}, {column_index})" ) - - -if __name__ == "__main__": - grammar = FineGrammar() - table = grammar.build_table(start="expression") - - print(f"{len(table)} states") - - average_entries = sum(len(row) for row in table) / len(table) - max_entries = max(len(row) for row in table) - print(f"{average_entries} average, {max_entries} max") diff --git a/harness.py b/harness.py index a61d295..8b8350b 100644 --- a/harness.py +++ b/harness.py @@ -7,11 +7,12 @@ import select import sys import termios import time +import traceback import tty +import types import typing from dataclasses import dataclass -import grammar import parser # from parser import Token, Grammar, rule, seq @@ -47,7 +48,8 @@ def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | N This is not a *great* parser, it's really just a demo for what you can do with the table. """ - input: list[str] = [t.value for (t, _, _) in tokens.tokens] + input_tokens = tokens.tokens() + input: list[str] = [t.value for (t, _, _) in input_tokens] assert "$" not in input input = input + ["$"] @@ -61,7 +63,7 @@ def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | N current_state = stack[-1][0] current_token = input[input_index] - action = table.states[current_state].get(current_token, parser.Error()) + action = table.actions[current_state].get(current_token, parser.Error()) if trace: trace(stack, input, input_index, action) @@ -84,21 +86,21 @@ def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | N value = Tree(name=name if not transparent else None, children=tuple(children)) stack = stack[:-size] - goto = table.states[stack[-1][0]].get(name, parser.Error()) - assert isinstance(goto, parser.Goto) - stack.append((goto.state, value)) + goto = table.gotos[stack[-1][0]].get(name) + assert goto is not None + stack.append((goto, value)) case parser.Shift(state): stack.append((state, current_token)) input_index += 1 case parser.Error(): - if input_index >= len(tokens.tokens): + if input_index >= len(input_tokens): message = "Unexpected end of file" - start = tokens.tokens[-1][1] + start = input_tokens[-1][1] else: message = f"Syntax error: unexpected symbol {current_token}" - (_, start, _) = tokens.tokens[input_index] + (_, start, _) = input_tokens[input_index] line_index = bisect.bisect_left(tokens.lines, start) if line_index == 0: @@ -147,7 +149,7 @@ def CSI(x: bytes) -> bytes: return ESC(b"[" + x) -CLEAR = CSI(b"2J") +CLEAR = CSI(b"H") + CSI(b"J") def enter_alt_screen(): @@ -158,15 +160,108 @@ def leave_alt_screen(): sys.stdout.buffer.write(CSI(b"?1049l")) +class DynamicModule: + file_name: str + member_name: str | None + + last_time: float | None + module: types.ModuleType | None + + def __init__(self, file_name, member_name): + self.file_name = file_name + self.member_name = member_name + + self.last_time = None + self.module = None + self.value = None + + def _predicate(self, member) -> bool: + if not inspect.isclass(member): + return False + + assert self.module is not None + if member.__module__ != self.module.__name__: + return False + + return True + + def _transform(self, value): + return value + + def get(self): + st = os.stat(self.file_name) + if self.last_time == st.st_mtime: + assert self.value is not None + return self.value + + self.value = None + + if self.module is None: + mod_name = inspect.getmodulename(self.file_name) + if mod_name is None: + raise Exception(f"{self.file_name} does not seem to be a module") + self.module = importlib.import_module(mod_name) + else: + importlib.reload(self.module) + + if self.member_name is None: + classes = inspect.getmembers(self.module, self._predicate) + if len(classes) == 0: + raise Exception(f"No grammars found in {self.file_name}") + if len(classes) > 1: + raise Exception( + f"{len(classes)} grammars found in {self.file_name}: {', '.join(c[0] for c in classes)}" + ) + cls = classes[0][1] + else: + cls = getattr(self.module, self.member_name) + if cls is None: + raise Exception(f"Cannot find {self.member_name} in {self.file_name}") + if not self._predicate(cls): + raise Exception(f"{self.member_name} in {self.file_name} is not suitable") + + self.value = self._transform(cls) + self.last_time = st.st_mtime + return self.value + + +class DynamicGrammarModule(DynamicModule): + def __init__(self, file_name, member_name, start_rule, generator): + super().__init__(file_name, member_name) + + self.start_rule = start_rule + self.generator = generator + + def _predicate(self, member) -> bool: + if not super()._predicate(member): + return False + + if getattr(member, "build_table", None): + return True + + return False + + def _transform(self, value): + return value().build_table(start=self.start_rule, generator=self.generator) + + +class DynamicLexerModule(DynamicModule): + def _predicate(self, member) -> bool: + if not super()._predicate(member): + return False + + if getattr(member, "tokens", None): + return True + + return False + + class Harness: source: str | None table: parser.ParseTable | None tree: Tree | None - def __init__(self, lexer_func, start_rule, source_path): - # self.generator = parser.GenerateLR1 - self.generator = parser.GenerateLALR - self.lexer_func = lexer_func + def __init__(self, start_rule, source_path): self.start_rule = start_rule self.source_path = source_path @@ -176,10 +271,11 @@ class Harness: self.tree = None self.errors = None - self.grammar_file_name = "./grammar.py" - self.last_grammar_time = None - self.grammar_module = None - self.grammar_name = None + self.grammar_module = DynamicGrammarModule( + "./grammar.py", None, self.start_rule, generator=parser.GenerateLALR + ) + + self.lexer_module = DynamicLexerModule("./grammar.py", None) def run(self): while True: @@ -191,71 +287,19 @@ class Harness: self.update() - # def should_reload_grammar(self): - def load_grammar(self) -> parser.ParseTable: - st = os.stat(self.grammar_file_name) - if self.last_grammar_time == st.st_mtime: - assert self.table is not None - return self.table - - self.table = None - - if self.grammar_module is None: - mod_name = inspect.getmodulename(self.grammar_file_name) - if mod_name is None: - raise Exception(f"{self.grammar_file_name} does not seem to be a module") - self.grammar_module = importlib.import_module(mod_name) - else: - importlib.reload(self.grammar_module) - - def is_grammar(cls): - if not inspect.isclass(cls): - return False - - assert self.grammar_module is not None - if cls.__module__ != self.grammar_module.__name__: - return False - - if getattr(cls, "build_table", None): - return True - - return False - - if self.grammar_name is None: - classes = inspect.getmembers(self.grammar_module, is_grammar) - if len(classes) == 0: - raise Exception(f"No grammars found in {self.grammar_file_name}") - if len(classes) > 1: - raise Exception( - f"{len(classes)} grammars found in {self.grammar_file_name}: {', '.join(c[0] for c in classes)}" - ) - grammar_func = classes[0][1] - else: - cls = getattr(self.grammar_module, self.grammar_name) - if cls is None: - raise Exception(f"Cannot find {self.grammar_name} in {self.grammar_file_name}") - if not is_grammar(cls): - raise Exception( - f"{self.grammar_name} in {self.grammar_file_name} does not seem to be a grammar" - ) - grammar_func = cls - - self.table = grammar_func().build_table(start=self.start_rule, generator=self.generator) - self.last_grammar_time = st.st_mtime - - assert self.table is not None - return self.table + return self.grammar_module.get() def update(self): start_time = time.time() try: table = self.load_grammar() + lexer_func = self.lexer_module.get() with open(self.source_path, "r", encoding="utf-8") as f: self.source = f.read() - self.tokens = self.lexer_func(self.source) + self.tokens = lexer_func(self.source) lex_time = time.time() # print(f"{tokens.lines}") @@ -268,7 +312,9 @@ class Harness: except Exception as e: self.tree = None - self.errors = [f"Error loading grammar: {e}"] + self.errors = ["Error loading grammar:"] + [ + " " + l.rstrip() for fl in traceback.format_exception(e) for l in fl.splitlines() + ] parse_elapsed = time.time() - start_time table = None @@ -276,7 +322,7 @@ class Harness: rows, cols = termios.tcgetwinsize(sys.stdout.fileno()) if table is not None: - states = table.states + states = table.actions average_entries = sum(len(row) for row in states) / len(states) max_entries = max(len(row) for row in states) print( @@ -320,7 +366,6 @@ if __name__ == "__main__": enter_alt_screen() h = Harness( - lexer_func=grammar.FineTokens, start_rule="file", source_path=source_path, ) diff --git a/parser.py b/parser.py index fd4ab24..12dcf67 100644 --- a/parser.py +++ b/parser.py @@ -21,10 +21,10 @@ To get started, create a grammar that derives from the `Grammar` class. Create one method per nonterminal, decorated with the `rule` decorator. Here's an example: - PLUS = Token('+') - LPAREN = Token('(') - RPAREN = Token(')') - ID = Token('id') + PLUS = Terminal('+') + LPAREN = Terminal('(') + RPAREN = Terminal(')') + ID = Terminal('id') class SimpleGrammar(Grammar): @rule @@ -410,11 +410,6 @@ class Shift(Action): state: int -@dataclasses.dataclass -class Goto(Action): - state: int - - @dataclasses.dataclass class Accept(Action): pass @@ -511,8 +506,7 @@ class ErrorCollection: case Accept(): action_str = "accept the parse" case _: - assert isinstance(action, Goto) - raise Exception("Shouldn't conflict on goto ever") + raise Exception(f"unknown action type {action}") lines.append( f" - We are in the rule `{name}: {rule}` and we should {action_str}" @@ -525,7 +519,53 @@ class ErrorCollection: @dataclasses.dataclass class ParseTable: - states: list[dict[str, Action]] + actions: list[dict[str, Action]] + gotos: list[dict[str, int]] + + def format(self): + """Format a parser table so pretty.""" + + def format_action(actions: dict[str, Action], terminal: str): + action = actions.get(terminal) + match action: + case Accept(): + return "accept" + case Shift(state=state): + return f"s{state}" + case Reduce(count=count): + return f"r{count}" + case _: + return "" + + def format_goto(gotos: dict[str, int], nt: str): + index = gotos.get(nt) + if index is None: + return "" + else: + return str(index) + + terminals = list(sorted({k for row in self.actions for k in row.keys()})) + nonterminals = list(sorted({k for row in self.gotos for k in row.keys()})) + + header = " | {terms} | {nts}".format( + terms=" ".join(f"{terminal: <6}" for terminal in terminals), + nts=" ".join(f"{nt: <5}" for nt in nonterminals), + ) + + lines = [ + header, + "-" * len(header), + ] + [ + "{index: <4} | {actions} | {gotos}".format( + index=i, + actions=" ".join( + "{0: <6}".format(format_action(actions, terminal)) for terminal in terminals + ), + gotos=" ".join("{0: <5}".format(format_goto(gotos, nt)) for nt in nonterminals), + ) + for i, (actions, gotos) in enumerate(zip(self.actions, self.gotos)) + ] + return "\n".join(lines) class TableBuilder(object): @@ -536,12 +576,14 @@ class TableBuilder(object): """ errors: ErrorCollection - table: list[dict[str, Action]] + actions: list[dict[str, Action]] + gotos: list[dict[str, int]] alphabet: list[str] precedence: typing.Tuple[typing.Tuple[Assoc, int], ...] transparents: set[str] - row: None | list[typing.Tuple[None | Action, None | Configuration]] + action_row: None | list[typing.Tuple[None | Action, None | Configuration]] + goto_row: None | list[None | int] def __init__( self, @@ -550,11 +592,14 @@ class TableBuilder(object): transparents: set[str], ): self.errors = ErrorCollection() - self.table = [] + self.actions = [] + self.gotos = [] + self.alphabet = alphabet self.precedence = precedence self.transparents = transparents - self.row = None + self.action_row = None + self.goto_row = None def flush(self, all_sets: ConfigurationSetInfo) -> ParseTable: """Finish building the table and return it. @@ -565,20 +610,31 @@ class TableBuilder(object): if self.errors.any(): errors = self.errors.format(self.alphabet, all_sets) raise ValueError(f"Errors building the table:\n\n{errors}") - return ParseTable(states=self.table) + return ParseTable(actions=self.actions, gotos=self.gotos) def new_row(self, config_set: ConfigSet): """Start a new row, processing the given config set. Call this before doing anything else. """ self._flush_row() - self.row = [(None, None) for _ in self.alphabet] + self.action_row = [(None, None) for _ in self.alphabet] + self.goto_row = [None for _ in self.alphabet] self.current_config_set = config_set def _flush_row(self): - if self.row: - actions = {self.alphabet[k]: v[0] for k, v in enumerate(self.row) if v[0] is not None} - self.table.append(actions) + if self.action_row: + actions = { + self.alphabet[sym]: e[0] + for sym, e in enumerate(self.action_row) + if e[0] is not None + } + + self.actions.append(actions) + + if self.goto_row: + gotos = {self.alphabet[sym]: e for sym, e in enumerate(self.goto_row) if e is not None} + + self.gotos.append(gotos) def set_table_reduce(self, symbol: int, config: Configuration): """Mark a reduce of the given configuration for the given symbol in the @@ -604,7 +660,9 @@ class TableBuilder(object): def set_table_goto(self, symbol: int, index: int): """Set the goto for the given nonterminal symbol in the current row.""" - self._set_table_action(symbol, Goto(index), None) + assert self.goto_row is not None + assert self.goto_row[symbol] is None # ? + self.goto_row[symbol] = index def _action_precedence(self, symbol: int, action: Action, config: Configuration): if isinstance(action, Shift): @@ -620,8 +678,8 @@ class TableBuilder(object): """ assert isinstance(symbol_id, int) - assert self.row is not None - existing, existing_config = self.row[symbol_id] + assert self.action_row is not None + existing, existing_config = self.action_row[symbol_id] if existing is not None and existing != action: assert existing_config is not None assert config is not None @@ -675,7 +733,7 @@ class TableBuilder(object): # action, just allow the overwrite with no change. pass - self.row[symbol_id] = (action, config) + self.action_row[symbol_id] = (action, config) class GenerateLR0: @@ -1036,7 +1094,7 @@ def parse(table: ParseTable, input, trace=False): current_state = stack[-1][0] current_token = input[input_index] - action = table.states[current_state].get(current_token, Error()) + action = table.actions[current_state].get(current_token, Error()) if trace: print( "{stack: <20} {input: <50} {action: <5}".format( @@ -1061,9 +1119,9 @@ def parse(table: ParseTable, input, trace=False): value = (name if not transparent else None, tuple(children)) stack = stack[:-size] - goto = table.states[stack[-1][0]].get(name, Error()) - assert isinstance(goto, Goto) - stack.append((goto.state, value)) + goto = table.gotos[stack[-1][0]].get(name) + assert goto is not None + stack.append((goto, value)) case Shift(state): stack.append((state, (current_token, ()))) @@ -1554,7 +1612,7 @@ class Rule: return SequenceRule(self, other) @abc.abstractmethod - def flatten(self) -> typing.Generator[list["str | Token"], None, None]: + def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]: """Convert this potentially nested and branching set of rules into a series of nice, flat symbol lists. @@ -1574,7 +1632,7 @@ class Rule: raise NotImplementedError() -class Token(Rule): +class Terminal(Rule): """A token, or terminal symbol in the grammar.""" value: str @@ -1582,7 +1640,7 @@ class Token(Rule): def __init__(self, value): self.value = sys.intern(value) - def flatten(self) -> typing.Generator[list["str | Token"], None, None]: + def flatten(self) -> typing.Generator[list["str | Terminal"], None, None]: # We are just ourselves when flattened. yield [self] @@ -1616,7 +1674,7 @@ class NonTerminal(Rule): self.name = name or fn.__name__ self.transparent = transparent - def generate_body(self, grammar) -> list[list[str | Token]]: + def generate_body(self, grammar) -> list[list[str | Terminal]]: """Generate the body of the non-terminal. We do this by first calling the associated function in order to get a @@ -1625,7 +1683,7 @@ class NonTerminal(Rule): """ return [rule for rule in self.fn(grammar).flatten()] - def flatten(self) -> typing.Generator[list[str | Token], None, None]: + def flatten(self) -> typing.Generator[list[str | Terminal], None, None]: # Although we contain multitudes, when flattened we're being asked in # the context of some other production. Yield ourselves, and trust that # in time we will be asked to generate our body. @@ -1639,7 +1697,7 @@ class AlternativeRule(Rule): self.left = left self.right = right - def flatten(self) -> typing.Generator[list[str | Token], None, None]: + def flatten(self) -> typing.Generator[list[str | Terminal], None, None]: # All the things from the left of the alternative, then all the things # from the right, never intermingled. yield from self.left.flatten() @@ -1655,7 +1713,7 @@ class SequenceRule(Rule): self.first = first self.second = second - def flatten(self) -> typing.Generator[list[str | Token], None, None]: + def flatten(self) -> typing.Generator[list[str | Terminal], None, None]: # All the things in the prefix.... for first in self.first.flatten(): # ...potentially followed by all the things in the suffix. @@ -1668,7 +1726,7 @@ class NothingRule(Rule): these, you're probably better off just using the singleton `Nothing`. """ - def flatten(self) -> typing.Generator[list[str | Token], None, None]: + def flatten(self) -> typing.Generator[list[str | Terminal], None, None]: # It's quiet in here. yield [] @@ -1720,10 +1778,10 @@ class Grammar: Here's an example of a simple grammar: - PLUS = Token('+') - LPAREN = Token('(') - RPAREN = Token(')') - ID = Token('id') + PLUS = Terminal('+') + LPAREN = Terminal('(') + RPAREN = Terminal(')') + ID = Terminal('id') class SimpleGrammar(Grammar): @rule @@ -1745,7 +1803,7 @@ class Grammar: precedence_table = {} for prec, (associativity, symbols) in enumerate(precedence): for symbol in symbols: - if isinstance(symbol, Token): + if isinstance(symbol, Terminal): key = symbol.value elif isinstance(symbol, NonTerminal): key = symbol.name @@ -1758,7 +1816,7 @@ class Grammar: def generate_nonterminal_dict( self, start: str - ) -> typing.Tuple[dict[str, list[list[str | Token]]], set[str]]: + ) -> typing.Tuple[dict[str, list[list[str | Terminal]]], set[str]]: """Convert the rules into a dictionary of productions. Our table generators work on a very flat set of productions. This is the @@ -1785,7 +1843,7 @@ class Grammar: body = rule.generate_body(self) for clause in body: for symbol in clause: - if not isinstance(symbol, Token): + if not isinstance(symbol, Terminal): assert isinstance(symbol, str) nonterminal = nonterminals.get(symbol) if nonterminal is None: @@ -1811,7 +1869,7 @@ class Grammar: for clause in clauses: new_clause = [] for symbol in clause: - if isinstance(symbol, Token): + if isinstance(symbol, Terminal): new_clause.append(symbol.value) else: new_clause.append(symbol) @@ -1842,45 +1900,6 @@ def format_node(node): return "\n".join(lines) -def format_table(generator, table: ParseTable): - """Format a parser table so pretty.""" - - def format_action(state, terminal): - action = state.get(terminal, ("error",)) - if action[0] == "accept": - return "accept" - elif action[0] == "shift": - return "s" + str(action[1]) - elif action[0] == "error": - return "" - elif action[0] == "reduce": - return "r" + str(action[1]) - - terminals = list(sorted(generator.alphabet[i] for i, v in enumerate(generator.terminal) if v)) - nonterminals = list( - sorted(generator.alphabet[i] for i, v in enumerate(generator.nonterminal) if v) - ) - header = " | {terms} | {nts}".format( - terms=" ".join("{0: <6}".format(terminal) for terminal in terminals), - nts=" ".join("{0: <5}".format(nt) for nt in nonterminals), - ) - - lines = [ - header, - "-" * len(header), - ] + [ - "{index: <3} | {actions} | {gotos}".format( - index=i, - actions=" ".join( - "{0: <6}".format(format_action(row, terminal)) for terminal in terminals - ), - gotos=" ".join("{0: <5}".format(row.get(nt, ("error", ""))[1]) for nt in nonterminals), - ) - for i, row in enumerate(table.states) - ] - return "\n".join(lines) - - ############################################################################### # Examples ############################################################################### @@ -1901,7 +1920,7 @@ def examples(): gen = GenerateLR0("E", grammar_simple) table = gen.gen_table() - print(format_table(gen, table)) + print(table.format()) tree = parse(table, ["id", "+", "(", "id", ")"]) print(format_node(tree) + "\n") print() @@ -1954,7 +1973,7 @@ def examples(): gen = GenerateSLR1("E", grammar_lr0_shift_reduce) print(f"Follow('E'): {str([gen.alphabet[f] for f in gen.gen_follow(gen.symbol_key['E'])])}") table = gen.gen_table() - print(format_table(gen, table)) + print(table.format()) tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"], trace=True) print(format_node(tree) + "\n") print() @@ -1985,7 +2004,7 @@ def examples(): ] gen = GenerateLR1("S", grammar_aho_ullman_2) table = gen.gen_table() - print(format_table(gen, table)) + print(table.format()) parse(table, ["b", "a", "a", "b"], trace=True) print() @@ -1993,7 +2012,7 @@ def examples(): print("grammar_aho_ullman_2 (LALR):") gen = GenerateLALR("S", grammar_aho_ullman_2) table = gen.gen_table() - print(format_table(gen, table)) + print(table.format()) print() # A fun LALAR grammar. @@ -2009,7 +2028,7 @@ def examples(): ] gen = GenerateLALR("S", grammar_lalr) table = gen.gen_table() - print(format_table(gen, table)) + print(table.format()) print()