Compare commits

..

No commits in common. "45a9303a2796f2cf2a61c4a60514deb766775442" and "7c4705714eed7ea62322fe6941dd081e504e46bc" have entirely different histories.

3 changed files with 122 additions and 316 deletions

View file

@ -78,11 +78,11 @@ class FineGrammar(Grammar):
@rule
def file(self):
return self._file_statement_list
return self.file_statement_list
@rule
def _file_statement_list(self):
return self.file_statement | (self._file_statement_list + self.file_statement)
def file_statement_list(self):
return self.file_statement | (self.file_statement_list + self.file_statement)
@rule
def file_statement(self):

View file

@ -1,10 +1,4 @@
import bisect
from dataclasses import dataclass
import enum
import select
import sys
import termios
import tty
import typing
import grammar
@ -23,13 +17,7 @@ def trace_state(stack, input, input_index, action):
)
@dataclass
class Tree:
name: str | None
children: typing.Tuple["Tree | str", ...]
def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | None, list[str]]:
def parse(table, tokens, trace=None):
"""Parse the input with the generated parsing table and return the
concrete syntax tree.
@ -43,7 +31,7 @@ def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | N
This is not a *great* parser, it's really just a demo for what you can
do with the table.
"""
input: list[str] = [t.value for (t, _, _) in tokens.tokens]
input = [t.value for (t, _, _) in tokens.tokens]
assert "$" not in input
input = input + ["$"]
@ -52,50 +40,38 @@ def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | N
# Our stack is a stack of tuples, where the first entry is the state number
# and the second entry is the 'value' that was generated when the state was
# pushed.
stack: list[typing.Tuple[int, str | Tree | None]] = [(0, None)]
stack: list[typing.Tuple[int, typing.Any]] = [(0, None)]
while True:
current_state = stack[-1][0]
current_token = input[input_index]
action = table.states[current_state].get(current_token, parser.Error())
action = table[current_state].get(current_token, ("error",))
if trace:
trace(stack, input, input_index, action)
match action:
case parser.Accept():
result = stack[-1][1]
assert isinstance(result, Tree)
return (result, [])
if action[0] == "accept":
return (stack[-1][1], [])
case parser.Reduce(name=name, count=size, transparent=transparent):
children: list[str | Tree] = []
for _, c in stack[-size:]:
if c is None:
continue
elif isinstance(c, Tree) and c.name is None:
children.extend(c.children)
else:
children.append(c)
elif action[0] == "reduce":
name = action[1]
size = action[2]
value = Tree(name=name if not transparent else None, children=tuple(children))
value = (name, tuple(s[1] for s in stack[-size:]))
stack = stack[:-size]
goto = table.states[stack[-1][0]].get(name, parser.Error())
assert isinstance(goto, parser.Goto)
stack.append((goto.state, value))
goto = table[stack[-1][0]].get(name, ("error",))
assert goto[0] == "goto" # Corrupt table?
stack.append((goto[1], value))
case parser.Shift(state):
stack.append((state, current_token))
elif action[0] == "shift":
stack.append((action[1], (current_token, ())))
input_index += 1
case parser.Error():
elif action[0] == "error":
if input_index >= len(tokens.tokens):
message = "Unexpected end of file"
start = tokens.tokens[-1][1]
raise ValueError("Unexpected end of file")
else:
message = f"Syntax error: unexpected symbol {current_token}"
(_, start, _) = tokens.tokens[input_index]
line_index = bisect.bisect_left(tokens.lines, start)
if line_index == 0:
col_start = 0
@ -104,154 +80,54 @@ def parse(table: parser.ParseTable, tokens, trace=None) -> typing.Tuple[Tree | N
column_index = start - col_start
line_index += 1
error = f"{line_index}:{column_index}: {message}"
return (None, [error])
case _:
raise ValueError(f"Unknown action type: {action}")
# https://en.wikipedia.org/wiki/ANSI_escape_code
# https://gist.github.com/fnky/458719343aabd01cfb17a3a4f7296797
class CharColor(enum.IntEnum):
    """Numeric color codes for terminal text.

    The values are the SGR foreground color parameters: 30-37 for the
    normal colors and 90-97 for the bright variants, plus 0 for the
    default. Only the first member of each run is written out; the
    remaining members use ``enum.auto()``, which continues counting from
    the preceding value.
    """

    CHAR_COLOR_DEFAULT = 0

    # Normal colors: 30 through 37.
    CHAR_COLOR_BLACK = 30
    CHAR_COLOR_RED = enum.auto()
    CHAR_COLOR_GREEN = enum.auto()
    CHAR_COLOR_YELLOW = enum.auto()
    CHAR_COLOR_BLUE = enum.auto()
    CHAR_COLOR_MAGENTA = enum.auto()
    CHAR_COLOR_CYAN = enum.auto()
    CHAR_COLOR_WHITE = enum.auto()  # Really light gray

    # Bright colors: 90 through 97.
    CHAR_COLOR_BRIGHT_BLACK = 90  # Really dark gray
    CHAR_COLOR_BRIGHT_RED = enum.auto()
    CHAR_COLOR_BRIGHT_GREEN = enum.auto()
    CHAR_COLOR_BRIGHT_YELLOW = enum.auto()
    CHAR_COLOR_BRIGHT_BLUE = enum.auto()
    CHAR_COLOR_BRIGHT_MAGENTA = enum.auto()
    CHAR_COLOR_BRIGHT_CYAN = enum.auto()
    CHAR_COLOR_BRIGHT_WHITE = enum.auto()
def ESC(x: bytes) -> bytes:
    """Return `x` prefixed with the ASCII escape byte (0x1B)."""
    return b"\033" + x
def CSI(x: bytes) -> bytes:
    """Return `x` as a Control Sequence Introducer sequence: ESC, "[", then `x`."""
    return ESC(b"[" + x)
# Move the cursor to the home position (CSI H), then reset all text
# attributes (CSI 0m). NOTE: this does not erase the screen contents.
CLEAR = CSI(b"H") + CSI(b"0m")
def enter_alt_screen():
    """Write the sequence that switches the terminal to the alternate screen.

    Emits CSI ``?1049h`` to the raw stdout buffer. Nothing is flushed here.
    """
    sys.stdout.buffer.write(CSI(b"?1049h"))
def leave_alt_screen():
    """Write the sequence that switches the terminal back to the normal screen.

    Emits CSI ``?1049l`` to the raw stdout buffer. Nothing is flushed here.
    """
    sys.stdout.buffer.write(CSI(b"?1049l"))
class Harness:
source: str | None
table: parser.ParseTable | None
tree: Tree | None
def __init__(self, lexer_func, grammar_func, start_rule, source_path):
# self.generator = parser.GenerateLR1
self.generator = parser.GenerateLALR
self.lexer_func = lexer_func
self.grammar_func = grammar_func
self.start_rule = start_rule
self.source_path = source_path
self.source = None
self.table = None
self.tokens = None
self.tree = None
self.errors = None
def run(self):
while True:
i, _, _ = select.select([sys.stdin], [], [], 1)
if i:
k = sys.stdin.read(1)
print(f"Key {k}\r")
return
self.update()
def update(self):
if self.table is None:
self.table = self.grammar_func().build_table(
start=self.start_rule, generator=self.generator
return (
None,
[
f"{line_index}:{column_index}: Syntax error: unexpected symbol {current_token}"
],
)
assert self.table is not None
if self.tokens is None:
with open(self.source_path, "r", encoding="utf-8") as f:
self.source = f.read()
self.tokens = self.lexer_func(self.source)
def harness(lexer_func, grammar_func, start_rule, source_path):
# generator = parser.GenerateLR1
generator = parser.GenerateLALR
trace = None
# trace = trace_state
table = grammar_func().build_table(start=start_rule, generator=generator)
print(f"{len(table)} states")
average_entries = sum(len(row) for row in table) / len(table)
max_entries = max(len(row) for row in table)
print(f"{average_entries} average, {max_entries} max")
if source_path:
with open(source_path, "r", encoding="utf-8") as f:
src = f.read()
tokens = lexer_func(src)
# print(f"{tokens.lines}")
# tokens.dump(end=5)
if self.tree is None and self.errors is None:
(tree, errors) = parse(self.table, self.tokens, trace=None)
self.tree = tree
self.errors = errors
sys.stdout.buffer.write(CLEAR)
rows, cols = termios.tcgetwinsize(sys.stdout.fileno())
states = self.table.states
average_entries = sum(len(row) for row in states) / len(states)
max_entries = max(len(row) for row in states)
print(f"{len(states)} states - {average_entries} average, {max_entries} max\r")
if self.tree is not None:
lines = []
self.format_node(lines, self.tree)
for line in lines[: rows - 2]:
print(line[:cols] + "\r")
sys.stdout.flush()
sys.stdout.buffer.flush()
def format_node(self, lines, node: Tree | str, indent=0):
"""Print out an indented concrete syntax tree, from parse()."""
match node:
case Tree(name, children):
lines.append((" " * indent) + (name or "???"))
for child in children:
self.format_node(lines, child, indent + 2)
case _:
lines.append((" " * indent) + str(node))
(_, errors) = parse(table, tokens, trace=trace)
if len(errors) > 0:
print(f"{len(errors)} errors:")
for error in errors:
print(f" {error}")
if __name__ == "__main__":
import sys
source_path = None
if len(sys.argv) == 2:
source_path = sys.argv[1]
fd = sys.stdin.fileno()
old_settings = termios.tcgetattr(fd)
try:
tty.setraw(fd)
enter_alt_screen()
h = Harness(
harness(
lexer_func=grammar.FineTokens,
grammar_func=grammar.FineGrammar,
start_rule="file",
source_path=source_path,
)
h.run()
finally:
leave_alt_screen()
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
# print(parser_faster.format_table(gen, table))
# print()

176
parser.py
View file

@ -393,45 +393,13 @@ class Assoc(enum.Enum):
RIGHT = 2
@dataclasses.dataclass
class Action:
    """Base class for the entries stored in a parse table row."""


@dataclasses.dataclass
class Reduce(Action):
    """Pop `count` values off the stack and make a node called `name`."""

    # Name of the nonterminal being produced.
    name: str
    # Number of stack entries consumed by the reduction.
    count: int
    # True when the node should be made as a "transparent node".
    transparent: bool


@dataclasses.dataclass
class Shift(Action):
    """Consume the current token and keep going in state `state`."""

    state: int


@dataclasses.dataclass
class Goto(Action):
    """Table entry for a nonterminal symbol, naming the state to go to."""

    state: int


@dataclasses.dataclass
class Accept(Action):
    """Accept the parse."""


@dataclasses.dataclass
class Error(Action):
    """Marker action for an error (no usable table entry)."""
class ErrorCollection:
"""A collection of errors. The errors are grouped by config set and alphabet
symbol, so that we can group the error strings appropriately when we format
the error.
"""
errors: dict[ConfigSet, dict[int, dict[Configuration, Action]]]
errors: dict[ConfigSet, dict[int, dict[Configuration, typing.Tuple]]]
def __init__(self):
self.errors = {}
@ -445,7 +413,7 @@ class ErrorCollection:
config_set: ConfigSet,
symbol: int,
config: Configuration,
action: Action,
action: typing.Tuple,
):
"""Add an error to the collection.
@ -502,16 +470,14 @@ class ErrorCollection:
if config.next is None:
rule += " *"
match action:
case Reduce(name=name, count=count, transparent=transparent):
name_str = name if not transparent else "transparent node"
action_str = f"pop {count} values off the stack and make a {name_str}"
case Shift():
if action[0] == "reduce":
action_str = f"pop {action[2]} values off the stack and make a {action[1]}"
elif action[0] == "shift":
action_str = "consume the token and keep going"
case Accept():
elif action[0] == "accept":
action_str = "accept the parse"
case _:
assert isinstance(action, Goto)
else:
assert action[0] == "goto", f"Unknown action {action[0]}"
raise Exception("Shouldn't conflict on goto ever")
lines.append(
@ -523,11 +489,6 @@ class ErrorCollection:
return "\n\n".join(errors)
@dataclasses.dataclass
class ParseTable:
    """A generated parse table."""

    # One row per parser state, indexed by state number; each row maps a
    # symbol name to the Action stored for it.
    states: list[dict[str, Action]]
class TableBuilder(object):
"""A helper object to assemble actions into build parse tables.
@ -536,27 +497,23 @@ class TableBuilder(object):
"""
errors: ErrorCollection
table: list[dict[str, Action]]
table: list[dict[str, typing.Tuple]]
alphabet: list[str]
precedence: typing.Tuple[typing.Tuple[Assoc, int], ...]
transparents: set[str]
row: None | list[typing.Tuple[None | Action, None | Configuration]]
row: None | list[typing.Tuple[None | typing.Tuple, None | Configuration]]
def __init__(
self,
alphabet: list[str],
precedence: typing.Tuple[typing.Tuple[Assoc, int], ...],
transparents: set[str],
):
self.errors = ErrorCollection()
self.table = []
self.alphabet = alphabet
self.precedence = precedence
self.transparents = transparents
self.row = None
def flush(self, all_sets: ConfigurationSetInfo) -> ParseTable:
def flush(self, all_sets: ConfigurationSetInfo) -> list[dict[str, typing.Tuple]]:
"""Finish building the table and return it.
Raises ValueError if there were any conflicts during construction.
@ -565,7 +522,7 @@ class TableBuilder(object):
if self.errors.any():
errors = self.errors.format(self.alphabet, all_sets)
raise ValueError(f"Errors building the table:\n\n{errors}")
return ParseTable(states=self.table)
return self.table
def new_row(self, config_set: ConfigSet):
"""Start a new row, processing the given config set. Call this before
@ -584,35 +541,36 @@ class TableBuilder(object):
"""Mark a reduce of the given configuration for the given symbol in the
current row.
"""
name = self.alphabet[config.name]
transparent = name in self.transparents
action = Reduce(name, len(config.symbols), transparent)
action = ("reduce", self.alphabet[config.name], len(config.symbols))
self._set_table_action(symbol, action, config)
def set_table_accept(self, symbol: int, config: Configuration):
"""Mark an accept of the given configuration for the given symbol in the
current row.
"""
self._set_table_action(symbol, Accept(), config)
action = ("accept",)
self._set_table_action(symbol, action, config)
def set_table_shift(self, symbol: int, index: int, config: Configuration):
"""Mark a shift in the current row of the given symbol to the
given index. The configuration here provides debugging information for
conflicts.
"""
self._set_table_action(symbol, Shift(index), config)
action = ("shift", index)
self._set_table_action(symbol, action, config)
def set_table_goto(self, symbol: int, index: int):
"""Set the goto for the given nonterminal symbol in the current row."""
self._set_table_action(symbol, Goto(index), None)
action = ("goto", index)
self._set_table_action(symbol, action, None)
def _action_precedence(self, symbol: int, action: Action, config: Configuration):
if isinstance(action, Shift):
def _action_precedence(self, symbol: int, action: typing.Tuple, config: Configuration):
if action[0] == "shift":
return self.precedence[symbol]
else:
return self.precedence[config.name]
def _set_table_action(self, symbol_id: int, action: Action, config: Configuration | None):
def _set_table_action(self, symbol_id: int, action: typing.Tuple, config: Configuration | None):
"""Set the action for 'symbol' in the table row to 'action'.
This is destructive; it changes the table. It records an error if
@ -649,17 +607,17 @@ class TableBuilder(object):
resolved = False
if assoc == Assoc.LEFT:
# Prefer reduce over shift
if isinstance(action, Shift) and isinstance(existing, Reduce):
if action[0] == "shift" and existing[0] == "reduce":
action = existing
resolved = True
elif isinstance(action, Reduce) and isinstance(existing, Shift):
elif action[0] == "reduce" and existing[0] == "shift":
resolved = True
elif assoc == Assoc.RIGHT:
# Prefer shift over reduce
if isinstance(action, Shift) and isinstance(existing, Reduce):
if action[0] == "shift" and existing[0] == "reduce":
resolved = True
elif isinstance(action, Reduce) and isinstance(existing, Shift):
elif action[0] == "reduce" and existing[0] == "shift":
action = existing
resolved = True
@ -678,7 +636,7 @@ class TableBuilder(object):
self.row[symbol_id] = (action, config)
class GenerateLR0:
class GenerateLR0(object):
"""Generate parser tables for an LR0 parser."""
# Internally we use integers as symbols, not strings. Mostly this is fine,
@ -701,10 +659,6 @@ class GenerateLR0:
# for a symbol, then its entry in this tuple will be (NONE, 0).
precedence: typing.Tuple[typing.Tuple[Assoc, int], ...]
# The set of symbols for which we should reduce "transparently." This doesn't
# affect state generation at all, only the generation of the final table.
transparents: set[str]
# The lookup that maps a particular symbol to an integer. (Only really used
# for debugging.)
symbol_key: dict[str, int]
@ -721,7 +675,6 @@ class GenerateLR0:
start: str,
grammar: list[typing.Tuple[str, list[str]]],
precedence: None | dict[str, typing.Tuple[Assoc, int]] = None,
transparents: None | set[str] = None,
):
"""Initialize the parser generator with the specified grammar and
start symbol.
@ -824,10 +777,6 @@ class GenerateLR0:
precedence = {}
self.precedence = tuple(precedence.get(a, (Assoc.NONE, 0)) for a in self.alphabet)
if transparents is None:
transparents = set()
self.transparents = transparents
self.symbol_key = symbol_key
self.start_symbol = start_symbol
self.end_symbol = end_symbol
@ -954,7 +903,7 @@ class GenerateLR0:
del config
return [index for index, value in enumerate(self.terminal) if value]
def gen_table(self) -> ParseTable:
def gen_table(self):
"""Generate the parse table.
The parse table is a list of states. The first state in the list is
@ -983,7 +932,7 @@ class GenerateLR0:
Anything missing from the row indicates an error.
"""
config_sets = self.gen_all_sets()
builder = TableBuilder(self.alphabet, self.precedence, self.transparents)
builder = TableBuilder(self.alphabet, self.precedence)
for config_set_id, config_set in enumerate(config_sets.sets):
builder.new_row(config_set)
@ -1010,7 +959,7 @@ class GenerateLR0:
return builder.flush(config_sets)
def parse(table: ParseTable, input, trace=False):
def parse(table, input, trace=False):
"""Parse the input with the generated parsing table and return the
concrete syntax tree.
@ -1036,7 +985,7 @@ def parse(table: ParseTable, input, trace=False):
current_state = stack[-1][0]
current_token = input[input_index]
action = table.states[current_state].get(current_token, Error())
action = table[current_state].get(current_token, ("error",))
if trace:
print(
"{stack: <20} {input: <50} {action: <5}".format(
@ -1046,30 +995,25 @@ def parse(table: ParseTable, input, trace=False):
)
)
match action:
case Accept():
if action[0] == "accept":
return stack[-1][1]
case Reduce(name=name, count=size, transparent=transparent):
children = []
for _, c in stack[-size:]:
if isinstance(c, tuple) and c[0] is None:
children.extend(c[1])
else:
children.append(c)
elif action[0] == "reduce":
name = action[1]
size = action[2]
value = (name if not transparent else None, tuple(children))
value = (name, tuple(s[1] for s in stack[-size:]))
stack = stack[:-size]
goto = table.states[stack[-1][0]].get(name, Error())
assert isinstance(goto, Goto)
stack.append((goto.state, value))
goto = table[stack[-1][0]].get(name, ("error",))
assert goto[0] == "goto" # Corrupt table?
stack.append((goto[1], value))
case Shift(state):
stack.append((state, (current_token, ())))
elif action[0] == "shift":
stack.append((action[1], (current_token, ())))
input_index += 1
case Error():
elif action[0] == "error":
raise ValueError(
"Syntax error: unexpected symbol {sym}".format(
sym=current_token,
@ -1595,16 +1539,7 @@ class NonTerminal(Rule):
grammar class.
"""
fn: typing.Callable[["Grammar"], Rule]
name: str
transparent: bool
def __init__(
self,
fn: typing.Callable[["Grammar"], Rule],
name: str | None = None,
transparent: bool = False,
):
def __init__(self, fn: typing.Callable[["Grammar"], Rule], name: str | None = None):
"""Create a new NonTerminal.
`fn` is the function that will yield the `Rule` which is the
@ -1614,7 +1549,6 @@ class NonTerminal(Rule):
"""
self.fn = fn
self.name = name or fn.__name__
self.transparent = transparent
def generate_body(self, grammar) -> list[list[str | Token]]:
"""Generate the body of the non-terminal.
@ -1704,8 +1638,7 @@ def rule(f: typing.Callable) -> Rule:
of the nonterminal, which defaults to the name of the function.
"""
name = f.__name__
transparent = name.startswith("_")
return NonTerminal(f, name, transparent)
return NonTerminal(f, name)
PrecedenceList = list[typing.Tuple[Assoc, list[Rule]]]
@ -1756,9 +1689,7 @@ class Grammar:
self._precedence = precedence_table
def generate_nonterminal_dict(
self, start: str
) -> typing.Tuple[dict[str, list[list[str | Token]]], set[str]]:
def generate_nonterminal_dict(self, start: str) -> dict[str, list[list[str | Token]]]:
"""Convert the rules into a dictionary of productions.
Our table generators work on a very flat set of productions. This is the
@ -1769,7 +1700,6 @@ class Grammar:
"""
rules = inspect.getmembers(self, lambda x: isinstance(x, NonTerminal))
nonterminals = {rule.name: rule for _, rule in rules}
transparents = {rule.name for _, rule in rules if rule.transparent}
grammar = {}
@ -1794,9 +1724,9 @@ class Grammar:
grammar[rule.name] = body
return (grammar, transparents)
return grammar
def desugar(self, start: str) -> typing.Tuple[list[typing.Tuple[str, list[str]]], set[str]]:
def desugar(self, start: str) -> list[typing.Tuple[str, list[str]]]:
"""Convert the rules into a flat list of productions.
Our table generators work from a very flat set of productions. The form
@ -1804,7 +1734,7 @@ class Grammar:
generate_nonterminal_dict- less useful to people, probably, but it is
the input form needed by the Generator.
"""
temp_grammar, transparents = self.generate_nonterminal_dict(start)
temp_grammar = self.generate_nonterminal_dict(start)
grammar = []
for rule_name, clauses in temp_grammar.items():
@ -1818,15 +1748,15 @@ class Grammar:
grammar.append((rule_name, new_clause))
return grammar, transparents
return grammar
def build_table(self, start: str, generator=GenerateLALR):
"""Construct a parse table for this grammar, starting at the named
nonterminal rule.
"""
desugared, transparents = self.desugar(start)
desugared = self.desugar(start)
gen = generator(start, desugared, precedence=self._precedence, transparents=transparents)
gen = generator(start, desugared, precedence=self._precedence)
table = gen.gen_table()
return table
@ -1842,7 +1772,7 @@ def format_node(node):
return "\n".join(lines)
def format_table(generator, table: ParseTable):
def format_table(generator, table):
"""Format a parser table so pretty."""
def format_action(state, terminal):
@ -1876,7 +1806,7 @@ def format_table(generator, table: ParseTable):
),
gotos=" ".join("{0: <5}".format(row.get(nt, ("error", ""))[1]) for nt in nonterminals),
)
for i, row in enumerate(table.states)
for i, row in enumerate(table)
]
return "\n".join(lines)