import bisect from dataclasses import dataclass import enum import select import sys import termios import tty import typing import grammar import parser # from parser import Token, Grammar, rule, seq def trace_state(stack, input, input_index, action): print( "{stack: <20} {input: <50} {action: <5}".format( stack=repr([s[0] for s in stack]), input=repr(input[input_index : input_index + 4]), action=repr(action), ) ) @dataclass class Tree: name: str | None children: typing.Tuple["Tree | str", ...] def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | None, list[str]]: """Parse the input with the generated parsing table and return the concrete syntax tree. The parsing table can be generated by GenerateLR0.gen_table() or by any of the other generators below. The parsing mechanism never changes, only the table generation mechanism. input is a list of tokens. Don't stick an end-of-stream marker, I'll stick one on for you. This is not a *great* parser, it's really just a demo for what you can do with the table. """ input: list[str] = [t.value for (t, _, _) in tokens.tokens] assert "$" not in input input = input + ["$"] input_index = 0 # Our stack is a stack of tuples, where the first entry is the state number # and the second entry is the 'value' that was generated when the state was # pushed. stack: list[typing.Tuple[int, str | Tree | None]] = [(0, None)] while True: current_state = stack[-1][0] current_token = input[input_index] action = table.states[current_state].get(current_token, parser.Error()) if trace: trace(stack, input, input_index, action) match action: case parser.Accept(): result = stack[-1][1] assert isinstance(result, Tree) return (result, []) case parser.Reduce(name=name, count=size, transparent=transparent): children: list[str | Tree] = [] for _, c in stack[-size:]: if c is None: continue elif isinstance(c, Tree) and c.name is None: children.extend(c.children) else: children.append(c) value = Tree(name=name if not transparent else None, children=tuple(children)) stack = stack[:-size] goto = table.states[stack[-1][0]].get(name, parser.Error()) assert isinstance(goto, parser.Goto) stack.append((goto.state, value)) case parser.Shift(state): stack.append((state, current_token)) input_index += 1 case parser.Error(): if input_index >= len(tokens.tokens): message = "Unexpected end of file" start = tokens.tokens[-1][1] else: message = f"Syntax error: unexpected symbol {current_token}" (_, start, _) = tokens.tokens[input_index] line_index = bisect.bisect_left(tokens.lines, start) if line_index == 0: col_start = 0 else: col_start = tokens.lines[line_index - 1] + 1 column_index = start - col_start line_index += 1 error = f"{line_index}:{column_index}: {message}" return (None, [error]) case _: raise ValueError(f"Unknown action type: {action}") # https://en.wikipedia.org/wiki/ANSI_escape_code # https://gist.github.com/fnky/458719343aabd01cfb17a3a4f7296797 class CharColor(enum.IntEnum): CHAR_COLOR_DEFAULT = 0 CHAR_COLOR_BLACK = 30 CHAR_COLOR_RED = enum.auto() CHAR_COLOR_GREEN = enum.auto() CHAR_COLOR_YELLOW = enum.auto() CHAR_COLOR_BLUE = enum.auto() CHAR_COLOR_MAGENTA = enum.auto() CHAR_COLOR_CYAN = enum.auto() CHAR_COLOR_WHITE = enum.auto() # Really light gray CHAR_COLOR_BRIGHT_BLACK = 90 # Really dark gray CHAR_COLOR_BRIGHT_RED = enum.auto() CHAR_COLOR_BRIGHT_GREEN = enum.auto() CHAR_COLOR_BRIGHT_YELLOW = enum.auto() CHAR_COLOR_BRIGHT_BLUE = enum.auto() CHAR_COLOR_BRIGHT_MAGENTA = enum.auto() CHAR_COLOR_BRIGHT_CYAN = enum.auto() CHAR_COLOR_BRIGHT_WHITE = enum.auto() def ESC(x: bytes) -> bytes: return b"\033" + x def CSI(x: bytes) -> bytes: return ESC(b"[" + x) CLEAR = CSI(b"H") + CSI(b"0m") def enter_alt_screen(): sys.stdout.buffer.write(CSI(b"?1049h")) def leave_alt_screen(): sys.stdout.buffer.write(CSI(b"?1049l")) class Harness: source: str | None table: parser.ParseTable | None tree: Tree | None def __init__(self, lexer_func, grammar_func, start_rule, source_path): # self.generator = parser.GenerateLR1 self.generator = parser.GenerateLALR self.lexer_func = lexer_func self.grammar_func = grammar_func self.start_rule = start_rule self.source_path = source_path self.source = None self.table = None self.tokens = None self.tree = None self.errors = None def run(self): while True: i, _, _ = select.select([sys.stdin], [], [], 1) if i: k = sys.stdin.read(1) print(f"Key {k}\r") return self.update() def update(self): if self.table is None: self.table = self.grammar_func().build_table( start=self.start_rule, generator=self.generator ) assert self.table is not None if self.tokens is None: with open(self.source_path, "r", encoding="utf-8") as f: self.source = f.read() self.tokens = self.lexer_func(self.source) # print(f"{tokens.lines}") # tokens.dump(end=5) if self.tree is None and self.errors is None: (tree, errors) = parse(self.table, self.tokens, trace=None) self.tree = tree self.errors = errors sys.stdout.buffer.write(CLEAR) rows, cols = termios.tcgetwinsize(sys.stdout.fileno()) states = self.table.states average_entries = sum(len(row) for row in states) / len(states) max_entries = max(len(row) for row in states) print(f"{len(states)} states - {average_entries} average, {max_entries} max\r") if self.tree is not None: lines = [] self.format_node(lines, self.tree) for line in lines[: rows - 2]: print(line[:cols] + "\r") sys.stdout.flush() sys.stdout.buffer.flush() def format_node(self, lines, node: Tree | str, indent=0): """Print out an indented concrete syntax tree, from parse().""" match node: case Tree(name, children): lines.append((" " * indent) + (name or "???")) for child in children: self.format_node(lines, child, indent + 2) case _: lines.append((" " * indent) + str(node)) if __name__ == "__main__": source_path = None if len(sys.argv) == 2: source_path = sys.argv[1] fd = sys.stdin.fileno() old_settings = termios.tcgetattr(fd) try: tty.setraw(fd) enter_alt_screen() h = Harness( lexer_func=grammar.FineTokens, grammar_func=grammar.FineGrammar, start_rule="file", source_path=source_path, ) h.run() finally: leave_alt_screen() termios.tcsetattr(fd, termios.TCSADRAIN, old_settings) # print(parser_faster.format_table(gen, table)) # print() # tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"])