lrparsers/harness.py
John Doty fee1c68dea Wrap errors
Maybe people want to see them
2024-05-31 08:31:47 -07:00

474 lines
15 KiB
Python

import argparse
import bisect
import importlib
import inspect
import enum
import os
import select
import sys
import termios
import textwrap
import time
import traceback
import tty
import types
import typing
from dataclasses import dataclass
import parser
# from parser import Token, Grammar, rule, seq
###############################################################################
# Parsing Stuff
###############################################################################
def trace_state(stack, input, input_index, action):
    """Print one row of parser trace output.

    The row has three left-aligned, space-padded columns: the state numbers
    currently on the stack, the next (up to) four input symbols starting at
    `input_index`, and the action about to be taken. Suitable as the `trace`
    callback of parse().
    """
    state_numbers = repr([entry[0] for entry in stack])
    lookahead = repr(input[input_index : input_index + 4])
    action_text = repr(action)
    print(f"{state_numbers: <20} {lookahead: <50} {action_text: <5}")
@dataclass
class TokenValue:
    """A single token as it appears as a leaf of the concrete syntax tree.

    parse() builds these on every shift, with `end` computed as
    `start + length`, so `[start, end)` is a half-open range of offsets.
    """

    # Name of the token's kind (parse() stores the `.value` of the token kind).
    kind: str
    # Offset of the first character of the token.
    start: int
    # Offset one past the last character of the token.
    end: int
@dataclass
class Tree:
name: str | None
start: int
end: int
children: typing.Tuple["Tree | TokenValue", ...]
def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | None, list[str]]:
"""Parse the input with the generated parsing table and return the
concrete syntax tree.
The parsing table can be generated by GenerateLR0.gen_table() or by any
of the other generators below. The parsing mechanism never changes, only
the table generation mechanism.
input is a list of tokens. Don't stick an end-of-stream marker, I'll stick
one on for you.
This is not a *great* parser, it's really just a demo for what you can
do with the table.
"""
input_tokens = tokens.tokens()
input: list[str] = [t.value for (t, _, _) in input_tokens]
assert "$" not in input
input = input + ["$"]
input_index = 0
# Our stack is a stack of tuples, where the first entry is the state number
# and the second entry is the 'value' that was generated when the state was
# pushed.
stack: list[typing.Tuple[int, TokenValue | Tree | None]] = [(0, None)]
while True:
current_state = stack[-1][0]
current_token = input[input_index]
action = table.actions[current_state].get(current_token, parser.Error())
if trace:
trace(stack, input, input_index, action)
match action:
case parser.Accept():
result = stack[-1][1]
assert isinstance(result, Tree)
return (result, [])
case parser.Reduce(name=name, count=size, transparent=transparent):
children: list[TokenValue | Tree] = []
for _, c in stack[-size:]:
if c is None:
continue
elif isinstance(c, Tree) and c.name is None:
children.extend(c.children)
else:
children.append(c)
value = Tree(
name=name if not transparent else None,
start=children[0].start,
end=children[-1].end,
children=tuple(children),
)
stack = stack[:-size]
goto = table.gotos[stack[-1][0]].get(name)
assert goto is not None
stack.append((goto, value))
case parser.Shift(state):
(kind, start, length) = input_tokens[input_index]
tval = TokenValue(kind=kind.value, start=start, end=start + length)
stack.append((state, tval))
input_index += 1
case parser.Error():
if input_index >= len(input_tokens):
message = "Unexpected end of file"
start = input_tokens[-1][1]
else:
message = f"Syntax error: unexpected symbol {current_token}"
(_, start, _) = input_tokens[input_index]
line_index = bisect.bisect_left(tokens.lines, start)
if line_index == 0:
col_start = 0
else:
col_start = tokens.lines[line_index - 1] + 1
column_index = start - col_start
line_index += 1
error = f"{line_index}:{column_index}: {message}"
return (None, [error])
case _:
raise ValueError(f"Unknown action type: {action}")
###############################################################################
# Screen Stuff
###############################################################################
# https://en.wikipedia.org/wiki/ANSI_escape_code
# https://gist.github.com/fnky/458719343aabd01cfb17a3a4f7296797
class CharColor(enum.IntEnum):
    """Foreground colour codes, with each member's numeric value spelled out.

    The normal colours occupy 30-37 and the bright variants 90-97; 0 selects
    the default.
    """

    CHAR_COLOR_DEFAULT = 0

    CHAR_COLOR_BLACK = 30
    CHAR_COLOR_RED = 31
    CHAR_COLOR_GREEN = 32
    CHAR_COLOR_YELLOW = 33
    CHAR_COLOR_BLUE = 34
    CHAR_COLOR_MAGENTA = 35
    CHAR_COLOR_CYAN = 36
    CHAR_COLOR_WHITE = 37  # Really light gray

    CHAR_COLOR_BRIGHT_BLACK = 90  # Really dark gray
    CHAR_COLOR_BRIGHT_RED = 91
    CHAR_COLOR_BRIGHT_GREEN = 92
    CHAR_COLOR_BRIGHT_YELLOW = 93
    CHAR_COLOR_BRIGHT_BLUE = 94
    CHAR_COLOR_BRIGHT_MAGENTA = 95
    CHAR_COLOR_BRIGHT_CYAN = 96
    CHAR_COLOR_BRIGHT_WHITE = 97
def ESC(x: bytes) -> bytes:
    """Return `x` prefixed with the ASCII escape byte (0x1b)."""
    return b"".join((b"\x1b", x))
def CSI(x: bytes) -> bytes:
    """Return the escape sequence made of ESC, "[", and then `x`."""
    payload = b"[" + x
    return ESC(payload)
# Byte sequence written at the start of every render: CSI "H" followed by
# CSI "J" (cursor to the home position, then erase the display from there).
CLEAR = b"".join((CSI(b"H"), CSI(b"J")))
def enter_alt_screen():
    """Write the "?1049h" private-mode sequence (alternate screen) to stdout.

    The sequence goes straight to the binary layer of stdout, so any pending
    text is flushed first to preserve ordering, and the sequence itself is
    flushed afterwards so the switch takes effect immediately rather than
    whenever the buffer next happens to be flushed.
    """
    sys.stdout.flush()
    sys.stdout.buffer.write(CSI(b"?1049h"))
    sys.stdout.buffer.flush()
def leave_alt_screen():
    """Write the "?1049l" private-mode sequence (leave alternate screen).

    The sequence is flushed immediately. Without the flush it would sit in
    the stdout buffer until interpreter shutdown, i.e. until *after* an
    unhandled exception's traceback has been written to stderr, and that
    traceback would vanish together with the alternate screen.
    """
    # Push out any pending text first so it is not reordered after the switch.
    sys.stdout.flush()
    sys.stdout.buffer.write(CSI(b"?1049l"))
    sys.stdout.buffer.flush()
###############################################################################
# Dynamic Modules: Detect and Reload Modules when they Change
###############################################################################
class DynamicModule:
file_name: str
member_name: str | None
last_time: float | None
module: types.ModuleType | None
def __init__(self, file_name, member_name):
self.file_name = file_name
self.member_name = member_name
self.last_time = None
self.module = None
self.value = None
def _predicate(self, member) -> bool:
if not inspect.isclass(member):
return False
assert self.module is not None
if member.__module__ != self.module.__name__:
return False
return True
def _transform(self, value):
return value
def get(self):
st = os.stat(self.file_name)
if self.last_time == st.st_mtime:
assert self.value is not None
return self.value
self.value = None
if self.module is None:
mod_name = inspect.getmodulename(self.file_name)
if mod_name is None:
raise Exception(f"{self.file_name} does not seem to be a module")
self.module = importlib.import_module(mod_name)
else:
importlib.reload(self.module)
if self.member_name is None:
classes = inspect.getmembers(self.module, self._predicate)
if len(classes) == 0:
raise Exception(f"No grammars found in {self.file_name}")
if len(classes) > 1:
raise Exception(
f"{len(classes)} grammars found in {self.file_name}: {', '.join(c[0] for c in classes)}"
)
cls = classes[0][1]
else:
cls = getattr(self.module, self.member_name)
if cls is None:
raise Exception(f"Cannot find {self.member_name} in {self.file_name}")
if not self._predicate(cls):
raise Exception(f"{self.member_name} in {self.file_name} is not suitable")
self.value = self._transform(cls)
self.last_time = st.st_mtime
return self.value
class DynamicGrammarModule(DynamicModule):
    """A DynamicModule that loads a grammar class and turns it into a parse table.

    A class is suitable when the base class accepts it and it has a truthy
    `build_table` attribute. The value returned by `get()` is the result of
    instantiating the class and calling `build_table()` on the instance.
    """

    def __init__(self, file_name, member_name, start_rule, generator):
        super().__init__(file_name, member_name)

        # Both are forwarded verbatim to build_table() in _transform().
        self.generator = generator
        self.start_rule = start_rule

    def _predicate(self, member) -> bool:
        # The attribute is only probed once the base class has accepted `member`.
        return super()._predicate(member) and bool(getattr(member, "build_table", None))

    def _transform(self, value):
        grammar = value()
        return grammar.build_table(start=self.start_rule, generator=self.generator)
class DynamicLexerModule(DynamicModule):
    """A DynamicModule that loads a lexer class.

    A class is suitable when the base class accepts it and it has a truthy
    `tokens` attribute. The class itself is returned by `get()`.
    """

    def _predicate(self, member) -> bool:
        # The attribute is only probed once the base class has accepted `member`.
        return super()._predicate(member) and bool(getattr(member, "tokens", None))
class Harness:
grammar_file: str
grammar_member: str | None
lexer_file: str
lexer_member: str | None
start_rule: str | None
source: str | None
table: parser.ParseTable | None
tree: Tree | None
def __init__(
self, grammar_file, grammar_member, lexer_file, lexer_member, start_rule, source_path
):
self.grammar_file = grammar_file
self.grammar_member = grammar_member
self.lexer_file = lexer_file or grammar_file
self.lexer_member = lexer_member
self.start_rule = start_rule
self.source_path = source_path
self.source = None
self.table = None
self.tokens = None
self.tree = None
self.errors = []
self.state_count = 0
self.average_entries = 0
self.max_entries = 0
self.grammar_module = DynamicGrammarModule(
self.grammar_file, self.grammar_member, self.start_rule, generator=parser.GenerateLALR
)
self.lexer_module = DynamicLexerModule(self.lexer_file, self.lexer_member)
def run(self):
while True:
i, _, _ = select.select([sys.stdin], [], [], 1)
if i:
k = sys.stdin.read(1)
print(f"Key {k}\r")
return
self.update()
self.render()
def load_grammar(self) -> parser.ParseTable:
return self.grammar_module.get()
def update(self):
start_time = time.time()
try:
table = self.load_grammar()
lexer_func = self.lexer_module.get()
with open(self.source_path, "r", encoding="utf-8") as f:
self.source = f.read()
self.tokens = lexer_func(self.source)
lex_time = time.time()
# print(f"{tokens.lines}")
# tokens.dump(end=5)
(tree, errors) = parse(table, self.tokens, trace=None)
parse_time = time.time()
self.tree = tree
self.errors = errors
self.parse_elapsed = parse_time - lex_time
states = table.actions
self.state_count = len(states)
self.average_entries = sum(len(row) for row in states) / len(states)
self.max_entries = max(len(row) for row in states)
except Exception as e:
self.tree = None
self.errors = ["Error loading grammar:"] + [
" " + l.rstrip() for fl in traceback.format_exception(e) for l in fl.splitlines()
]
self.parse_elapsed = time.time() - start_time
self.state_count = 0
self.average_entries = 0
self.max_entries = 0
def render(self):
sys.stdout.buffer.write(CLEAR)
rows, cols = termios.tcgetwinsize(sys.stdout.fileno())
if self.state_count > 0:
print(
f"{self.state_count} states - {self.average_entries:.3} average, {self.max_entries} max - {self.parse_elapsed:.3}s\r"
)
else:
print(f"No table\r")
print(("\u2500" * cols) + "\r")
if self.tree is not None:
lines = []
self.format_node(lines, self.tree)
for line in lines[: rows - 3]:
print(line[:cols] + "\r")
else:
wrapper = textwrap.TextWrapper(width=cols)
lines = [line for error in self.errors for line in wrapper.wrap(error)]
for line in lines[: rows - 3]:
print(line + "\r")
sys.stdout.flush()
sys.stdout.buffer.flush()
def format_node(self, lines, node: Tree | TokenValue, indent=0):
"""Print out an indented concrete syntax tree, from parse()."""
match node:
case Tree(name=name, start=start, end=end, children=children):
lines.append((" " * indent) + f"{name or '???'} [{start}, {end})")
for child in children:
self.format_node(lines, child, indent + 2)
case TokenValue(kind=kind, start=start, end=end):
assert self.source is not None
value = self.source[start:end]
lines.append((" " * indent) + f"{kind}:'{value}' [{start}, {end})")
def main(args: list[str]):
    """Run the interactive harness.

    Parses the command line, puts the terminal attached to stdin into raw
    mode, switches to the alternate screen, and runs a Harness until a key
    is pressed. The terminal is restored on the way out, whether the harness
    returned normally or raised.

    Args:
        args: The full argument vector, including the program name in
            `args[0]` (e.g. `sys.argv`).
    """
    parsed = _build_arg_parser().parse_args(args[1:])

    fd = sys.stdin.fileno()
    old_settings = termios.tcgetattr(fd)
    try:
        tty.setraw(fd)
        enter_alt_screen()

        h = Harness(
            grammar_file=parsed.grammar,
            grammar_member=parsed.grammar_member,
            lexer_file=parsed.lexer,
            lexer_member=parsed.lexer_member,
            start_rule=parsed.start_rule,
            source_path=parsed.source_path,
        )
        h.run()

    finally:
        try:
            leave_alt_screen()
            # Make sure the terminal has really left the alternate screen
            # before a propagating exception prints its traceback; otherwise
            # the traceback is drawn on the alternate screen and lost.
            sys.stdout.flush()
        finally:
            # Restore the terminal settings even if leaving the alternate
            # screen failed, so the shell is never left in raw mode.
            termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)


def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the harness."""
    # Deliberately not called `parser`: that name is the imported parser module.
    arg_parser = argparse.ArgumentParser(
        description="An interactive debugging harness for grammars"
    )
    arg_parser.add_argument("grammar", help="Path to a python file containing the grammar to load")
    arg_parser.add_argument("source_path", help="Path to an input file to parse")
    arg_parser.add_argument(
        "--grammar-member",
        type=str,
        default=None,
        help="The name of the member in the grammar module to load. The default is to search "
        "the module for a class that looks like a Grammar. You should only need to specify "
        "this if you have more than one grammar in your module, or if it's hidden somehow.",
    )
    arg_parser.add_argument(
        "--start-rule",
        type=str,
        default=None,
        help="The name of the production to start parsing with. The default is the one "
        "specified by the grammar.",
    )
    arg_parser.add_argument(
        "--lexer",
        type=str,
        default=None,
        help="Path to a python file containing the lexer to load. The default is to use the "
        "grammar file.",
    )
    arg_parser.add_argument(
        "--lexer-member",
        type=str,
        default=None,
        help="The name of the lexer in the lexer module to load. The default is to search "
        "the module for a class that looks like a lexer. You should only need to specify this "
        "if you have more than one Lexer in the file, or if your lexer is hidden somehow.",
    )
    return arg_parser
# Script entry point: hand the complete argument vector (program name
# included) to main(), which skips the first element itself.
if __name__ == "__main__":
    command_line = sys.argv
    main(command_line)