# lrparsers/harness.py
# (335 lines, 10 KiB, Python)
import bisect
import importlib
import inspect
import enum
import os
import select
import sys
import termios
import time
import tty
import typing
from dataclasses import dataclass
import grammar
import parser
# from parser import Token, Grammar, rule, seq
def trace_state(stack, input, input_index, action):
    """Debug trace hook for parse(): print one aligned line per parser step.

    Shows the state numbers on the stack, a four-token lookahead window of
    the remaining input, and the action about to be taken.
    """
    states = repr([entry[0] for entry in stack])
    lookahead = repr(input[input_index : input_index + 4])
    print(f"{states: <20} {lookahead: <50} {repr(action): <5}")
@dataclass
class Tree:
    """A node in the concrete syntax tree built by parse()."""

    # Name of the grammar rule that produced this node; None marks a
    # "transparent" node whose children are spliced into the parent by parse().
    name: str | None
    # Children in order: nested Trees for rules, plain strings for tokens.
    children: typing.Tuple["Tree | str", ...]
def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | None, list[str]]:
"""Parse the input with the generated parsing table and return the
concrete syntax tree.
The parsing table can be generated by GenerateLR0.gen_table() or by any
of the other generators below. The parsing mechanism never changes, only
the table generation mechanism.
input is a list of tokens. Don't stick an end-of-stream marker, I'll stick
one on for you.
This is not a *great* parser, it's really just a demo for what you can
do with the table.
"""
input: list[str] = [t.value for (t, _, _) in tokens.tokens]
assert "$" not in input
input = input + ["$"]
input_index = 0
# Our stack is a stack of tuples, where the first entry is the state number
# and the second entry is the 'value' that was generated when the state was
# pushed.
stack: list[typing.Tuple[int, str | Tree | None]] = [(0, None)]
while True:
current_state = stack[-1][0]
current_token = input[input_index]
action = table.states[current_state].get(current_token, parser.Error())
if trace:
trace(stack, input, input_index, action)
match action:
case parser.Accept():
result = stack[-1][1]
assert isinstance(result, Tree)
return (result, [])
case parser.Reduce(name=name, count=size, transparent=transparent):
children: list[str | Tree] = []
for _, c in stack[-size:]:
if c is None:
continue
elif isinstance(c, Tree) and c.name is None:
children.extend(c.children)
else:
children.append(c)
value = Tree(name=name if not transparent else None, children=tuple(children))
stack = stack[:-size]
goto = table.states[stack[-1][0]].get(name, parser.Error())
assert isinstance(goto, parser.Goto)
stack.append((goto.state, value))
case parser.Shift(state):
stack.append((state, current_token))
input_index += 1
case parser.Error():
if input_index >= len(tokens.tokens):
message = "Unexpected end of file"
start = tokens.tokens[-1][1]
else:
message = f"Syntax error: unexpected symbol {current_token}"
(_, start, _) = tokens.tokens[input_index]
line_index = bisect.bisect_left(tokens.lines, start)
if line_index == 0:
col_start = 0
else:
col_start = tokens.lines[line_index - 1] + 1
column_index = start - col_start
line_index += 1
error = f"{line_index}:{column_index}: {message}"
return (None, [error])
case _:
raise ValueError(f"Unknown action type: {action}")
# https://en.wikipedia.org/wiki/ANSI_escape_code
# https://gist.github.com/fnky/458719343aabd01cfb17a3a4f7296797
class CharColor(enum.IntEnum):
    """ANSI SGR foreground-color codes: 0 default, 30-37 normal, 90-97 bright."""

    CHAR_COLOR_DEFAULT = 0
    CHAR_COLOR_BLACK = 30
    CHAR_COLOR_RED = 31
    CHAR_COLOR_GREEN = 32
    CHAR_COLOR_YELLOW = 33
    CHAR_COLOR_BLUE = 34
    CHAR_COLOR_MAGENTA = 35
    CHAR_COLOR_CYAN = 36
    CHAR_COLOR_WHITE = 37  # Really light gray
    CHAR_COLOR_BRIGHT_BLACK = 90  # Really dark gray
    CHAR_COLOR_BRIGHT_RED = 91
    CHAR_COLOR_BRIGHT_GREEN = 92
    CHAR_COLOR_BRIGHT_YELLOW = 93
    CHAR_COLOR_BRIGHT_BLUE = 94
    CHAR_COLOR_BRIGHT_MAGENTA = 95
    CHAR_COLOR_BRIGHT_CYAN = 96
    CHAR_COLOR_BRIGHT_WHITE = 97
def ESC(x: bytes) -> bytes:
    """Prefix *x* with the ESC byte (0x1B)."""
    return b"\x1b" + x
def CSI(x: bytes) -> bytes:
    """Build a Control Sequence Introducer escape: ESC '[' followed by *x*."""
    # Equivalent to ESC(b"[" + x) with the ESC byte written out directly.
    return b"\x1b[" + x
# Erase-entire-display sequence; identical to CSI(b"2J").
CLEAR = b"\x1b[2J"
def enter_alt_screen():
    """Switch the terminal to the alternate screen buffer (CSI ?1049h)."""
    sys.stdout.buffer.write(b"\x1b[?1049h")
def leave_alt_screen():
    """Restore the terminal's normal screen buffer (CSI ?1049l)."""
    sys.stdout.buffer.write(b"\x1b[?1049l")
class Harness:
    """Interactive grammar-debugging loop.

    Watches a grammar file and a source file, rebuilds the parse table when
    the grammar changes, re-parses the source once a second, and redraws the
    resulting tree (or errors) on an ANSI terminal until a key is pressed.
    """

    # Text of source_path as of the last update(); None before the first read.
    source: str | None
    # Most recently built parse table; None until load_grammar() succeeds.
    table: parser.ParseTable | None
    # Most recent successful parse result; None on error.
    tree: Tree | None

    def __init__(self, lexer_func, start_rule, source_path):
        # self.generator = parser.GenerateLR1
        self.generator = parser.GenerateLALR
        self.lexer_func = lexer_func
        self.start_rule = start_rule
        self.source_path = source_path
        self.source = None
        self.table = None
        self.tokens = None
        self.tree = None
        self.errors = None
        self.grammar_file_name = "./grammar.py"
        # mtime of the grammar file when the table was last built; used to
        # skip rebuilding when nothing changed.
        self.last_grammar_time = None
        # Imported grammar module, kept so we can importlib.reload() it.
        self.grammar_module = None
        # Explicit grammar class name, or None to auto-detect the single
        # grammar class in the module.
        self.grammar_name = None

    def run(self):
        """Redraw once a second until any key is pressed, then return."""
        while True:
            # Block up to 1s waiting for stdin; on timeout, refresh the view.
            i, _, _ = select.select([sys.stdin], [], [], 1)
            if i:
                k = sys.stdin.read(1)
                print(f"Key {k}\r")
                return
            self.update()

    # def should_reload_grammar(self):

    def load_grammar(self) -> parser.ParseTable:
        """Return the parse table, rebuilding it if the grammar file changed.

        Raises Exception when the grammar module can't be located, contains
        zero or multiple grammar classes, or the named class isn't a grammar.
        """
        st = os.stat(self.grammar_file_name)
        # Fast path: file unmodified since the last successful build.
        if self.last_grammar_time == st.st_mtime:
            assert self.table is not None
            return self.table

        self.table = None
        if self.grammar_module is None:
            mod_name = inspect.getmodulename(self.grammar_file_name)
            if mod_name is None:
                raise Exception(f"{self.grammar_file_name} does not seem to be a module")
            self.grammar_module = importlib.import_module(mod_name)
        else:
            # Already imported once: pick up on-disk edits.
            importlib.reload(self.grammar_module)

        def is_grammar(cls):
            # A grammar is a class defined in the grammar module itself
            # (not re-exported) that exposes a build_table attribute.
            if not inspect.isclass(cls):
                return False
            assert self.grammar_module is not None
            if cls.__module__ != self.grammar_module.__name__:
                return False
            if getattr(cls, "build_table", None):
                return True
            return False

        if self.grammar_name is None:
            # Auto-detect: exactly one grammar class must exist.
            classes = inspect.getmembers(self.grammar_module, is_grammar)
            if len(classes) == 0:
                raise Exception(f"No grammars found in {self.grammar_file_name}")
            if len(classes) > 1:
                raise Exception(
                    f"{len(classes)} grammars found in {self.grammar_file_name}: {', '.join(c[0] for c in classes)}"
                )
            grammar_func = classes[0][1]
        else:
            cls = getattr(self.grammar_module, self.grammar_name)
            if cls is None:
                raise Exception(f"Cannot find {self.grammar_name} in {self.grammar_file_name}")
            if not is_grammar(cls):
                raise Exception(
                    f"{self.grammar_name} in {self.grammar_file_name} does not seem to be a grammar"
                )
            grammar_func = cls

        self.table = grammar_func().build_table(start=self.start_rule, generator=self.generator)
        self.last_grammar_time = st.st_mtime
        assert self.table is not None
        return self.table

    def update(self):
        """Reload grammar and source, re-parse, and redraw the terminal."""
        start_time = time.time()
        try:
            table = self.load_grammar()
            with open(self.source_path, "r", encoding="utf-8") as f:
                self.source = f.read()
            self.tokens = self.lexer_func(self.source)
            lex_time = time.time()
            # print(f"{tokens.lines}")
            # tokens.dump(end=5)
            (tree, errors) = parse(table, self.tokens, trace=None)
            parse_time = time.time()
            self.tree = tree
            self.errors = errors
            parse_elapsed = parse_time - lex_time
        except Exception as e:
            # Any failure (grammar build, file I/O, lexing) is reported
            # in place of the tree rather than crashing the loop.
            self.tree = None
            self.errors = [f"Error loading grammar: {e}"]
            parse_elapsed = time.time() - start_time
            table = None

        sys.stdout.buffer.write(CLEAR)
        rows, cols = termios.tcgetwinsize(sys.stdout.fileno())
        if table is not None:
            states = table.states
            average_entries = sum(len(row) for row in states) / len(states)
            max_entries = max(len(row) for row in states)
            print(
                f"{len(states)} states - {average_entries:.3} average, {max_entries} max - {parse_elapsed:.3}s \r"
            )
        else:
            print("No table\r\n")

        # Terminal is in raw mode, hence the explicit \r and the clamping of
        # output to the window's rows/cols.
        if self.tree is not None:
            lines = []
            self.format_node(lines, self.tree)
            for line in lines[: rows - 2]:
                print(line[:cols] + "\r")
        else:
            for error in self.errors[: rows - 2]:
                print(error[:cols] + "\r")

        sys.stdout.flush()
        sys.stdout.buffer.flush()

    def format_node(self, lines, node: Tree | str, indent=0):
        """Print out an indented concrete syntax tree, from parse()."""
        match node:
            case Tree(name, children):
                # Transparent nodes have name None; show a placeholder.
                lines.append((" " * indent) + (name or "???"))
                for child in children:
                    self.format_node(lines, child, indent + 2)
            case _:
                # Leaf: a token string.
                lines.append((" " * indent) + str(node))
if __name__ == "__main__":
    # Optional single CLI argument: the source file to parse and display.
    path = sys.argv[1] if len(sys.argv) == 2 else None

    stdin_fd = sys.stdin.fileno()
    saved_tty = termios.tcgetattr(stdin_fd)
    try:
        # Raw mode + alternate screen for the interactive display; both are
        # unconditionally undone below so the terminal is left usable.
        tty.setraw(stdin_fd)
        enter_alt_screen()
        harness = Harness(
            lexer_func=grammar.FineTokens,
            start_rule="file",
            source_path=path,
        )
        harness.run()
    finally:
        leave_alt_screen()
        termios.tcsetattr(stdin_fd, termios.TCSADRAIN, saved_tty)
# print(parser_faster.format_table(gen, table))
# print()
# tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"])