diff --git a/grammar.py b/grammar.py index 23baf96..e95ccc0 100644 --- a/grammar.py +++ b/grammar.py @@ -78,11 +78,11 @@ class FineGrammar(Grammar): @rule def file(self): - return self._file_statement_list + return self.file_statement_list @rule - def _file_statement_list(self): - return self.file_statement | (self._file_statement_list + self.file_statement) + def file_statement_list(self): + return self.file_statement | (self.file_statement_list + self.file_statement) @rule def file_statement(self): diff --git a/harness.py b/harness.py index 283c0f4..8255a41 100644 --- a/harness.py +++ b/harness.py @@ -1,10 +1,4 @@ import bisect -from dataclasses import dataclass -import enum -import select -import sys -import termios -import tty import typing import grammar @@ -23,13 +17,7 @@ def trace_state(stack, input, input_index, action): ) -@dataclass -class Tree: - name: str | None - children: typing.Tuple["Tree | str", ...] - - -def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | None, list[str]]: +def parse(table, tokens, trace=None): """Parse the input with the generated parsing table and return the concrete syntax tree. @@ -43,7 +31,7 @@ def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | N This is not a *great* parser, it's really just a demo for what you can do with the table. """ - input: list[str] = [t.value for (t, _, _) in tokens.tokens] + input = [t.value for (t, _, _) in tokens.tokens] assert "$" not in input input = input + ["$"] @@ -52,50 +40,38 @@ def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | N # Our stack is a stack of tuples, where the first entry is the state number # and the second entry is the 'value' that was generated when the state was # pushed. - stack: list[typing.Tuple[int, str | Tree | None]] = [(0, None)] + stack: list[typing.Tuple[int, typing.Any]] = [(0, None)] while True: current_state = stack[-1][0] current_token = input[input_index] - action = table.states[current_state].get(current_token, parser.Error()) + action = table[current_state].get(current_token, ("error",)) if trace: trace(stack, input, input_index, action) - match action: - case parser.Accept(): - result = stack[-1][1] - assert isinstance(result, Tree) - return (result, []) + if action[0] == "accept": + return (stack[-1][1], []) - case parser.Reduce(name=name, count=size, transparent=transparent): - children: list[str | Tree] = [] - for _, c in stack[-size:]: - if c is None: - continue - elif isinstance(c, Tree) and c.name is None: - children.extend(c.children) - else: - children.append(c) + elif action[0] == "reduce": + name = action[1] + size = action[2] - value = Tree(name=name if not transparent else None, children=tuple(children)) - stack = stack[:-size] + value = (name, tuple(s[1] for s in stack[-size:])) + stack = stack[:-size] - goto = table.states[stack[-1][0]].get(name, parser.Error()) - assert isinstance(goto, parser.Goto) - stack.append((goto.state, value)) + goto = table[stack[-1][0]].get(name, ("error",)) + assert goto[0] == "goto" # Corrupt table? + stack.append((goto[1], value)) - case parser.Shift(state): - stack.append((state, current_token)) - input_index += 1 - - case parser.Error(): - if input_index >= len(tokens.tokens): - message = "Unexpected end of file" - start = tokens.tokens[-1][1] - else: - message = f"Syntax error: unexpected symbol {current_token}" - (_, start, _) = tokens.tokens[input_index] + elif action[0] == "shift": + stack.append((action[1], (current_token, ()))) + input_index += 1 + elif action[0] == "error": + if input_index >= len(tokens.tokens): + raise ValueError("Unexpected end of file") + else: + (_, start, _) = tokens.tokens[input_index] line_index = bisect.bisect_left(tokens.lines, start) if line_index == 0: col_start = 0 @@ -104,154 +80,54 @@ def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | N column_index = start - col_start line_index += 1 - error = f"{line_index}:{column_index}: {message}" - return (None, [error]) - - case _: - raise ValueError(f"Unknown action type: {action}") + return ( + None, + [ + f"{line_index}:{column_index}: Syntax error: unexpected symbol {current_token}" + ], + ) -# https://en.wikipedia.org/wiki/ANSI_escape_code -# https://gist.github.com/fnky/458719343aabd01cfb17a3a4f7296797 +def harness(lexer_func, grammar_func, start_rule, source_path): + # generator = parser.GenerateLR1 + generator = parser.GenerateLALR + trace = None + # trace = trace_state -class CharColor(enum.IntEnum): - CHAR_COLOR_DEFAULT = 0 - CHAR_COLOR_BLACK = 30 - CHAR_COLOR_RED = enum.auto() - CHAR_COLOR_GREEN = enum.auto() - CHAR_COLOR_YELLOW = enum.auto() - CHAR_COLOR_BLUE = enum.auto() - CHAR_COLOR_MAGENTA = enum.auto() - CHAR_COLOR_CYAN = enum.auto() - CHAR_COLOR_WHITE = enum.auto() # Really light gray - CHAR_COLOR_BRIGHT_BLACK = 90 # Really dark gray - CHAR_COLOR_BRIGHT_RED = enum.auto() - CHAR_COLOR_BRIGHT_GREEN = enum.auto() - CHAR_COLOR_BRIGHT_YELLOW = enum.auto() - CHAR_COLOR_BRIGHT_BLUE = enum.auto() - CHAR_COLOR_BRIGHT_MAGENTA = enum.auto() - CHAR_COLOR_BRIGHT_CYAN = enum.auto() - CHAR_COLOR_BRIGHT_WHITE = enum.auto() + table = grammar_func().build_table(start=start_rule, generator=generator) + print(f"{len(table)} states") + average_entries = sum(len(row) for row in table) / len(table) + max_entries = max(len(row) for row in table) + print(f"{average_entries} average, {max_entries} max") -def ESC(x: bytes) -> bytes: - return b"\033" + x - - -def CSI(x: bytes) -> bytes: - return ESC(b"[" + x) - - -CLEAR = CSI(b"H") + CSI(b"0m") - - -def enter_alt_screen(): - sys.stdout.buffer.write(CSI(b"?1049h")) - - -def leave_alt_screen(): - sys.stdout.buffer.write(CSI(b"?1049l")) - - -class Harness: - source: str | None - table: parser.ParseTable | None - tree: Tree | None - - def __init__(self, lexer_func, grammar_func, start_rule, source_path): - # self.generator = parser.GenerateLR1 - self.generator = parser.GenerateLALR - self.lexer_func = lexer_func - self.grammar_func = grammar_func - self.start_rule = start_rule - self.source_path = source_path - - self.source = None - self.table = None - self.tokens = None - self.tree = None - self.errors = None - - def run(self): - while True: - i, _, _ = select.select([sys.stdin], [], [], 1) - if i: - k = sys.stdin.read(1) - print(f"Key {k}\r") - return - - self.update() - - def update(self): - if self.table is None: - self.table = self.grammar_func().build_table( - start=self.start_rule, generator=self.generator - ) - assert self.table is not None - - if self.tokens is None: - with open(self.source_path, "r", encoding="utf-8") as f: - self.source = f.read() - self.tokens = self.lexer_func(self.source) - + if source_path: + with open(source_path, "r", encoding="utf-8") as f: + src = f.read() + tokens = lexer_func(src) # print(f"{tokens.lines}") # tokens.dump(end=5) - if self.tree is None and self.errors is None: - (tree, errors) = parse(self.table, self.tokens, trace=None) - self.tree = tree - self.errors = errors - - sys.stdout.buffer.write(CLEAR) - rows, cols = termios.tcgetwinsize(sys.stdout.fileno()) - - states = self.table.states - average_entries = sum(len(row) for row in states) / len(states) - max_entries = max(len(row) for row in states) - print(f"{len(states)} states - {average_entries} average, {max_entries} max\r") - - if self.tree is not None: - lines = [] - self.format_node(lines, self.tree) - for line in lines[: rows - 2]: - print(line[:cols] + "\r") - - sys.stdout.flush() - sys.stdout.buffer.flush() - - def format_node(self, lines, node: Tree | str, indent=0): - """Print out an indented concrete syntax tree, from parse().""" - match node: - case Tree(name, children): - lines.append((" " * indent) + (name or "???")) - for child in children: - self.format_node(lines, child, indent + 2) - case _: - lines.append((" " * indent) + str(node)) + (_, errors) = parse(table, tokens, trace=trace) + if len(errors) > 0: + print(f"{len(errors)} errors:") + for error in errors: + print(f" {error}") if __name__ == "__main__": + import sys + source_path = None if len(sys.argv) == 2: source_path = sys.argv[1] - fd = sys.stdin.fileno() - old_settings = termios.tcgetattr(fd) - try: - tty.setraw(fd) - enter_alt_screen() - - h = Harness( - lexer_func=grammar.FineTokens, - grammar_func=grammar.FineGrammar, - start_rule="file", - source_path=source_path, - ) - h.run() - - finally: - leave_alt_screen() - termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) + harness( + lexer_func=grammar.FineTokens, + grammar_func=grammar.FineGrammar, + start_rule="file", + source_path=source_path, + ) # print(parser_faster.format_table(gen, table)) # print() diff --git a/parser.py b/parser.py index fd4ab24..8e7e753 100644 --- a/parser.py +++ b/parser.py @@ -393,45 +393,13 @@ class Assoc(enum.Enum): RIGHT = 2 -@dataclasses.dataclass -class Action: - pass - - -@dataclasses.dataclass -class Reduce(Action): - name: str - count: int - transparent: bool - - -@dataclasses.dataclass -class Shift(Action): - state: int - - -@dataclasses.dataclass -class Goto(Action): - state: int - - -@dataclasses.dataclass -class Accept(Action): - pass - - -@dataclasses.dataclass -class Error(Action): - pass - - class ErrorCollection: """A collection of errors. The errors are grouped by config set and alphabet symbol, so that we can group the error strings appropriately when we format the error. """ - errors: dict[ConfigSet, dict[int, dict[Configuration, Action]]] + errors: dict[ConfigSet, dict[int, dict[Configuration, typing.Tuple]]] def __init__(self): self.errors = {} @@ -445,7 +413,7 @@ class ErrorCollection: config_set: ConfigSet, symbol: int, config: Configuration, - action: Action, + action: typing.Tuple, ): """Add an error to the collection. @@ -502,17 +470,15 @@ class ErrorCollection: if config.next is None: rule += " *" - match action: - case Reduce(name=name, count=count, transparent=transparent): - name_str = name if not transparent else "transparent node" - action_str = f"pop {count} values off the stack and make a {name_str}" - case Shift(): - action_str = "consume the token and keep going" - case Accept(): - action_str = "accept the parse" - case _: - assert isinstance(action, Goto) - raise Exception("Shouldn't conflict on goto ever") + if action[0] == "reduce": + action_str = f"pop {action[2]} values off the stack and make a {action[1]}" + elif action[0] == "shift": + action_str = "consume the token and keep going" + elif action[0] == "accept": + action_str = "accept the parse" + else: + assert action[0] == "goto", f"Unknown action {action[0]}" + raise Exception("Shouldn't conflict on goto ever") lines.append( f" - We are in the rule `{name}: {rule}` and we should {action_str}" @@ -523,11 +489,6 @@ class ErrorCollection: return "\n\n".join(errors) -@dataclasses.dataclass -class ParseTable: - states: list[dict[str, Action]] - - class TableBuilder(object): """A helper object to assemble actions into build parse tables. @@ -536,27 +497,23 @@ class TableBuilder(object): """ errors: ErrorCollection - table: list[dict[str, Action]] + table: list[dict[str, typing.Tuple]] alphabet: list[str] precedence: typing.Tuple[typing.Tuple[Assoc, int], ...] - transparents: set[str] - - row: None | list[typing.Tuple[None | Action, None | Configuration]] + row: None | list[typing.Tuple[None | typing.Tuple, None | Configuration]] def __init__( self, alphabet: list[str], precedence: typing.Tuple[typing.Tuple[Assoc, int], ...], - transparents: set[str], ): self.errors = ErrorCollection() self.table = [] self.alphabet = alphabet self.precedence = precedence - self.transparents = transparents self.row = None - def flush(self, all_sets: ConfigurationSetInfo) -> ParseTable: + def flush(self, all_sets: ConfigurationSetInfo) -> list[dict[str, typing.Tuple]]: """Finish building the table and return it. Raises ValueError if there were any conflicts during construction. @@ -565,7 +522,7 @@ class TableBuilder(object): if self.errors.any(): errors = self.errors.format(self.alphabet, all_sets) raise ValueError(f"Errors building the table:\n\n{errors}") - return ParseTable(states=self.table) + return self.table def new_row(self, config_set: ConfigSet): """Start a new row, processing the given config set. Call this before @@ -584,35 +541,36 @@ class TableBuilder(object): """Mark a reduce of the given configuration for the given symbol in the current row. """ - name = self.alphabet[config.name] - transparent = name in self.transparents - action = Reduce(name, len(config.symbols), transparent) + action = ("reduce", self.alphabet[config.name], len(config.symbols)) self._set_table_action(symbol, action, config) def set_table_accept(self, symbol: int, config: Configuration): """Mark a accept of the given configuration for the given symbol in the current row. """ - self._set_table_action(symbol, Accept(), config) + action = ("accept",) + self._set_table_action(symbol, action, config) def set_table_shift(self, symbol: int, index: int, config: Configuration): """Mark a shift in the current row of the given given symbol to the given index. The configuration here provides debugging informtion for conflicts. """ - self._set_table_action(symbol, Shift(index), config) + action = ("shift", index) + self._set_table_action(symbol, action, config) def set_table_goto(self, symbol: int, index: int): """Set the goto for the given nonterminal symbol in the current row.""" - self._set_table_action(symbol, Goto(index), None) + action = ("goto", index) + self._set_table_action(symbol, action, None) - def _action_precedence(self, symbol: int, action: Action, config: Configuration): - if isinstance(action, Shift): + def _action_precedence(self, symbol: int, action: typing.Tuple, config: Configuration): + if action[0] == "shift": return self.precedence[symbol] else: return self.precedence[config.name] - def _set_table_action(self, symbol_id: int, action: Action, config: Configuration | None): + def _set_table_action(self, symbol_id: int, action: typing.Tuple, config: Configuration | None): """Set the action for 'symbol' in the table row to 'action'. This is destructive; it changes the table. It records an error if @@ -649,17 +607,17 @@ class TableBuilder(object): resolved = False if assoc == Assoc.LEFT: # Prefer reduce over shift - if isinstance(action, Shift) and isinstance(existing, Reduce): + if action[0] == "shift" and existing[0] == "reduce": action = existing resolved = True - elif isinstance(action, Reduce) and isinstance(existing, Shift): + elif action[0] == "reduce" and existing[0] == "shift": resolved = True elif assoc == Assoc.RIGHT: # Prefer shift over reduce - if isinstance(action, Shift) and isinstance(existing, Reduce): + if action[0] == "shift" and existing[0] == "reduce": resolved = True - elif isinstance(action, Reduce) and isinstance(existing, Shift): + elif action[0] == "reduce" and existing[0] == "shift": action = existing resolved = True @@ -678,7 +636,7 @@ class TableBuilder(object): self.row[symbol_id] = (action, config) -class GenerateLR0: +class GenerateLR0(object): """Generate parser tables for an LR0 parser.""" # Internally we use integers as symbols, not strings. Mostly this is fine, @@ -701,10 +659,6 @@ class GenerateLR0: # for a symbol, then its entry in this tuple will be (NONE, 0). precedence: typing.Tuple[typing.Tuple[Assoc, int], ...] - # The set of symbols for which we should reduce "transparently." This doesn't - # affect state generation at all, only the generation of the final table. - transparents: set[str] - # The lookup that maps a particular symbol to an integer. (Only really used # for debugging.) symbol_key: dict[str, int] @@ -721,7 +675,6 @@ class GenerateLR0: start: str, grammar: list[typing.Tuple[str, list[str]]], precedence: None | dict[str, typing.Tuple[Assoc, int]] = None, - transparents: None | set[str] = None, ): """Initialize the parser generator with the specified grammar and start symbol. @@ -824,10 +777,6 @@ class GenerateLR0: precedence = {} self.precedence = tuple(precedence.get(a, (Assoc.NONE, 0)) for a in self.alphabet) - if transparents is None: - transparents = set() - self.transparents = transparents - self.symbol_key = symbol_key self.start_symbol = start_symbol self.end_symbol = end_symbol @@ -954,7 +903,7 @@ class GenerateLR0: del config return [index for index, value in enumerate(self.terminal) if value] - def gen_table(self) -> ParseTable: + def gen_table(self): """Generate the parse table. The parse table is a list of states. The first state in the list is @@ -983,7 +932,7 @@ class GenerateLR0: Anything missing from the row indicates an error. """ config_sets = self.gen_all_sets() - builder = TableBuilder(self.alphabet, self.precedence, self.transparents) + builder = TableBuilder(self.alphabet, self.precedence) for config_set_id, config_set in enumerate(config_sets.sets): builder.new_row(config_set) @@ -1010,7 +959,7 @@ class GenerateLR0: return builder.flush(config_sets) -def parse(table: ParseTable, input, trace=False): +def parse(table, input, trace=False): """Parse the input with the generated parsing table and return the concrete syntax tree. @@ -1036,7 +985,7 @@ def parse(table: ParseTable, input, trace=False): current_state = stack[-1][0] current_token = input[input_index] - action = table.states[current_state].get(current_token, Error()) + action = table[current_state].get(current_token, ("error",)) if trace: print( "{stack: <20} {input: <50} {action: <5}".format( @@ -1046,35 +995,30 @@ def parse(table: ParseTable, input, trace=False): ) ) - match action: - case Accept(): - return stack[-1][1] + if action[0] == "accept": + return stack[-1][1] - case Reduce(name=name, count=size, transparent=transparent): - children = [] - for _, c in stack[-size:]: - if isinstance(c, tuple) and c[0] is None: - children.extend(c[1]) - else: - children.append(c) + elif action[0] == "reduce": + name = action[1] + size = action[2] - value = (name if not transparent else None, tuple(children)) - stack = stack[:-size] + value = (name, tuple(s[1] for s in stack[-size:])) + stack = stack[:-size] - goto = table.states[stack[-1][0]].get(name, Error()) - assert isinstance(goto, Goto) - stack.append((goto.state, value)) + goto = table[stack[-1][0]].get(name, ("error",)) + assert goto[0] == "goto" # Corrupt table? + stack.append((goto[1], value)) - case Shift(state): - stack.append((state, (current_token, ()))) - input_index += 1 + elif action[0] == "shift": + stack.append((action[1], (current_token, ()))) + input_index += 1 - case Error(): - raise ValueError( - "Syntax error: unexpected symbol {sym}".format( - sym=current_token, - ), - ) + elif action[0] == "error": + raise ValueError( + "Syntax error: unexpected symbol {sym}".format( + sym=current_token, + ), + ) ############################################################################### @@ -1595,16 +1539,7 @@ class NonTerminal(Rule): grammar class. """ - fn: typing.Callable[["Grammar"], Rule] - name: str - transparent: bool - - def __init__( - self, - fn: typing.Callable[["Grammar"], Rule], - name: str | None = None, - transparent: bool = False, - ): + def __init__(self, fn: typing.Callable[["Grammar"], Rule], name: str | None = None): """Create a new NonTerminal. `fn` is the function that will yield the `Rule` which is the @@ -1614,7 +1549,6 @@ class NonTerminal(Rule): """ self.fn = fn self.name = name or fn.__name__ - self.transparent = transparent def generate_body(self, grammar) -> list[list[str | Token]]: """Generate the body of the non-terminal. @@ -1704,8 +1638,7 @@ def rule(f: typing.Callable) -> Rule: of the nonterminal, which defaults to the name of the function. """ name = f.__name__ - transparent = name.startswith("_") - return NonTerminal(f, name, transparent) + return NonTerminal(f, name) PrecedenceList = list[typing.Tuple[Assoc, list[Rule]]] @@ -1756,9 +1689,7 @@ class Grammar: self._precedence = precedence_table - def generate_nonterminal_dict( - self, start: str - ) -> typing.Tuple[dict[str, list[list[str | Token]]], set[str]]: + def generate_nonterminal_dict(self, start: str) -> dict[str, list[list[str | Token]]]: """Convert the rules into a dictionary of productions. Our table generators work on a very flat set of productions. This is the @@ -1769,7 +1700,6 @@ class Grammar: """ rules = inspect.getmembers(self, lambda x: isinstance(x, NonTerminal)) nonterminals = {rule.name: rule for _, rule in rules} - transparents = {rule.name for _, rule in rules if rule.transparent} grammar = {} @@ -1794,9 +1724,9 @@ class Grammar: grammar[rule.name] = body - return (grammar, transparents) + return grammar - def desugar(self, start: str) -> typing.Tuple[list[typing.Tuple[str, list[str]]], set[str]]: + def desugar(self, start: str) -> list[typing.Tuple[str, list[str]]]: """Convert the rules into a flat list of productions. Our table generators work from a very flat set of productions. The form @@ -1804,7 +1734,7 @@ class Grammar: generate_nonterminal_dict- less useful to people, probably, but it is the input form needed by the Generator. """ - temp_grammar, transparents = self.generate_nonterminal_dict(start) + temp_grammar = self.generate_nonterminal_dict(start) grammar = [] for rule_name, clauses in temp_grammar.items(): @@ -1818,15 +1748,15 @@ class Grammar: grammar.append((rule_name, new_clause)) - return grammar, transparents + return grammar def build_table(self, start: str, generator=GenerateLALR): """Construct a parse table for this grammar, starting at the named nonterminal rule. """ - desugared, transparents = self.desugar(start) + desugared = self.desugar(start) - gen = generator(start, desugared, precedence=self._precedence, transparents=transparents) + gen = generator(start, desugared, precedence=self._precedence) table = gen.gen_table() return table @@ -1842,7 +1772,7 @@ def format_node(node): return "\n".join(lines) -def format_table(generator, table: ParseTable): +def format_table(generator, table): """Format a parser table so pretty.""" def format_action(state, terminal): @@ -1876,7 +1806,7 @@ def format_table(generator, table: ParseTable): ), gotos=" ".join("{0: <5}".format(row.get(nt, ("error", ""))[1]) for nt in nonterminals), ) - for i, row in enumerate(table.states) + for i, row in enumerate(table) ] return "\n".join(lines)