Also, I was somehow not merging states correctly for LALR; this change merges more completely and ends up with 215 states for the fine grammar, which is roughly half of what it used to be.
134 lines
4 KiB
Python
134 lines
4 KiB
Python
import bisect
|
|
import typing
|
|
|
|
import grammar
|
|
import parser
|
|
|
|
# from parser import Token, Grammar, rule, seq
|
|
|
|
|
|
def trace_state(stack, input, input_index, action):
    """Print one line describing the parser's current configuration.

    The line has three left-aligned columns: the state numbers currently on
    the stack, a window of up to four upcoming input symbols starting at
    ``input_index``, and the action about to be taken. Intended to be passed
    as the ``trace`` callback of ``parse``.
    """
    # Only the first entry of each stack item is shown.
    state_numbers = [entry[0] for entry in stack]
    lookahead_window = input[input_index : input_index + 4]

    line = "{stack: <20} {input: <50} {action: <5}".format(
        stack=repr(state_numbers),
        input=repr(lookahead_window),
        action=repr(action),
    )
    print(line)
|
|
|
|
|
|
def parse(table, tokens, trace=None):
    """Parse the input with the generated parsing table and return the
    concrete syntax tree.

    The parsing table can be generated by GenerateLR0.gen_table() or by any
    of the other generators below. The parsing mechanism never changes, only
    the table generation mechanism.

    This is not a *great* parser, it's really just a demo for what you can
    do with the table.

    Args:
        table: Indexable by state number; each entry is a mapping from a
            symbol (terminal or nonterminal name) to an action tuple. The
            action tuples understood here are ``("accept",)``,
            ``("shift", state)``, ``("reduce", name, size)`` and
            ``("goto", state)``. A missing entry is treated as an error.
        tokens: An object with a ``tokens`` attribute (a sequence of
            3-tuples whose first element has a ``.value`` and whose second
            element is the token's start offset; the third element is not
            used here) and a ``lines`` attribute that is searched with
            ``bisect`` to turn a start offset into a line/column
            (presumably the sorted offsets of the line breaks -- confirm
            against the lexer). Don't stick an end-of-stream marker on the
            tokens, I'll stick one on for you.
        trace: Optional callback invoked before every step as
            ``trace(stack, input, input_index, action)``.

    Returns:
        A ``(tree, errors)`` pair. On success ``tree`` is a nested
        ``(name, children)`` tuple and ``errors`` is an empty list. On a
        syntax error ``tree`` is ``None`` and ``errors`` holds one message
        of the form ``"line:column: Syntax error: ..."``.

    Raises:
        ValueError: If the error occurs at the end-of-stream marker, or if
            the table contains an action kind this parser does not know.
    """
    symbols = [t.value for (t, _, _) in tokens.tokens]

    assert "$" not in symbols
    symbols = symbols + ["$"]
    input_index = 0

    # Our stack is a stack of tuples, where the first entry is the state number
    # and the second entry is the 'value' that was generated when the state was
    # pushed.
    stack: list[typing.Tuple[int, typing.Any]] = [(0, None)]
    while True:
        current_state = stack[-1][0]
        current_token = symbols[input_index]

        action = table[current_state].get(current_token, ("error",))
        if trace:
            trace(stack, symbols, input_index, action)

        kind = action[0]
        if kind == "accept":
            return (stack[-1][1], [])

        elif kind == "reduce":
            name = action[1]
            size = action[2]

            # Split at an explicit index rather than with a negative slice:
            # for an empty production (size == 0), stack[-0:] would be the
            # *whole* stack and stack[:-0] would be empty.
            split = len(stack) - size
            value = (name, tuple(s[1] for s in stack[split:]))
            stack = stack[:split]

            goto = table[stack[-1][0]].get(name, ("error",))
            assert goto[0] == "goto"  # Corrupt table?
            stack.append((goto[1], value))

        elif kind == "shift":
            stack.append((action[1], (current_token, ())))
            input_index += 1

        elif kind == "error":
            if input_index >= len(tokens.tokens):
                # The offending symbol is the end-of-stream marker we added,
                # which has no source position to report.
                raise ValueError("Unexpected end of file")

            (_, start, _) = tokens.tokens[input_index]

            # Number of entries in tokens.lines strictly before the token.
            line_index = bisect.bisect_left(tokens.lines, start)
            if line_index == 0:
                col_start = 0
            else:
                col_start = tokens.lines[line_index - 1] + 1
            column_index = start - col_start
            line_index += 1  # Report 1-based line numbers.

            return (
                None,
                [
                    f"{line_index}:{column_index}: Syntax error: unexpected symbol {current_token}"
                ],
            )

        else:
            # Without this an unrecognised action would spin forever, since
            # neither the stack nor the input position would change.
            raise ValueError(f"Corrupt table: unknown action {action!r}")
|
|
|
|
|
|
def harness(lexer_func, grammar_func, start_rule, source_path):
    """Build a parse table, print its size statistics, and optionally parse
    a file with it.

    ``grammar_func()`` is called to obtain a grammar object whose
    ``build_table`` produces the table for ``start_rule``. If ``source_path``
    is truthy the file is read as UTF-8, run through ``lexer_func`` and
    parsed; any errors reported by the parser are printed.
    """
    # Table generator to use; parser.GenerateLR1 is the alternative.
    generator = parser.GenerateLALR

    # Set this to trace_state to print every step the parser takes.
    trace = None

    table = grammar_func().build_table(start=start_rule, generator=generator)
    state_count = len(table)
    print(f"{state_count} states")

    row_sizes = [len(row) for row in table]
    average_entries = sum(row_sizes) / state_count
    max_entries = max(row_sizes)
    print(f"{average_entries} average, {max_entries} max")

    if not source_path:
        return

    with open(source_path, "r", encoding="utf-8") as f:
        src = f.read()

    tokens = lexer_func(src)
    # Debugging aids: print(f"{tokens.lines}") / tokens.dump(end=5)

    (_, errors) = parse(table, tokens, trace=trace)
    if len(errors) == 0:
        return

    print(f"{len(errors)} errors:")
    for message in errors:
        print(f"  {message}")
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # Exactly one command-line argument is treated as the file to parse;
    # with any other argument count only the table statistics are printed.
    source_path = sys.argv[1] if len(sys.argv) == 2 else None

    harness(grammar.FineTokens, grammar.FineGrammar, "file", source_path)
|
|
|
|
# print(parser_faster.format_table(gen, table))
|
|
# print()
|
|
# tree = parse(table, ["id", "+", "(", "id", "[", "id", "]", ")"])
|