Fix LALR. A small change, but very, very slow.

This commit is contained in:
John Doty 2024-05-27 22:31:33 -07:00
parent 0fc04cf11e
commit 8d58c64040
2 changed files with 45 additions and 19 deletions

View file

@ -2,10 +2,22 @@ import bisect
import typing import typing
import grammar import grammar
from parser import Token, Grammar, rule, seq import parser
# from parser import Token, Grammar, rule, seq
def trace_state(stack, input, input_index, action):
    """Print one parser step for debugging.

    Shows the current state stack, a four-token window of the remaining
    input, and the action the parser is about to take, in fixed-width
    columns so successive trace lines align.

    NOTE(review): reconstructed from a rendered diff; `input` shadows the
    builtin but is kept to match the call site in parse().
    """
    stack_states = repr([frame[0] for frame in stack])
    input_window = repr(input[input_index : input_index + 4])
    line = "{stack: <20} {input: <50} {action: <5}".format(
        stack=stack_states,
        input=input_window,
        action=repr(action),
    )
    print(line)
def parse(table, tokens, trace=None):
"""Parse the input with the generated parsing table and return the """Parse the input with the generated parsing table and return the
concrete syntax tree. concrete syntax tree.
@ -35,13 +47,7 @@ def parse(table, tokens, trace=False):
action = table[current_state].get(current_token, ("error",)) action = table[current_state].get(current_token, ("error",))
if trace: if trace:
print( trace(stack, input, input_index, action)
"{stack: <20} {input: <50} {action: <5}".format(
stack=repr([s[0] for s in stack]),
input=repr(input[input_index : input_index + 4]),
action=repr(action),
)
)
if action[0] == "accept": if action[0] == "accept":
return (stack[-1][1], []) return (stack[-1][1], [])
@ -83,7 +89,9 @@ def parse(table, tokens, trace=False):
def harness(lexer_func, grammar_func, start_rule, source_path): def harness(lexer_func, grammar_func, start_rule, source_path):
table = grammar_func().build_table(start=start_rule) # generator = parser.GenerateLR1
generator = parser.GenerateLALR
table = grammar_func().build_table(start=start_rule, generator=generator)
print(f"{len(table)} states") print(f"{len(table)} states")
average_entries = sum(len(row) for row in table) / len(table) average_entries = sum(len(row) for row in table) / len(table)
@ -96,7 +104,7 @@ def harness(lexer_func, grammar_func, start_rule, source_path):
tokens = lexer_func(src) tokens = lexer_func(src)
# print(f"{tokens.lines}") # print(f"{tokens.lines}")
# tokens.dump(end=5) # tokens.dump(end=5)
(_, errors) = parse(table, tokens, trace=True) (_, errors) = parse(table, tokens)
if len(errors) > 0: if len(errors) > 0:
print(f"{len(errors)} errors:") print(f"{len(errors)} errors:")
for error in errors: for error in errors:

View file

@ -257,6 +257,14 @@ class Configuration:
lookahead=(), lookahead=(),
) )
def replace_lookahead(self, lookahead: typing.Tuple[int, ...]):
    """Return a copy of this Configuration with its lookahead replaced.

    The item core (name, symbols, position) is carried over unchanged;
    only the lookahead tuple differs. Used by the LALR generator when
    merging the lookahead sets of configurations that share a core.

    :param lookahead: the new lookahead tuple for the copy.
    :return: a new Configuration; `self` is not mutated.
    """
    return Configuration(
        name=self.name,
        symbols=self.symbols,
        position=self.position,
        lookahead=lookahead,
    )
@property @property
def rest(self): def rest(self):
return self.symbols[(self.position + 1) :] return self.symbols[(self.position + 1) :]
@ -1382,7 +1390,11 @@ class GenerateLALR(GenerateLR1):
use a bunch of improvement, probably.) use a bunch of improvement, probably.)
""" """
def merge_sets(self, config_set_a, config_set_b): def merge_sets(
self,
config_set_a: typing.Tuple[Configuration, ...],
config_set_b: typing.Tuple[Configuration, ...],
):
"""Merge the two config sets, by keeping the item cores but merging """Merge the two config sets, by keeping the item cores but merging
the lookahead sets for each item. the lookahead sets for each item.
""" """
@ -1394,7 +1406,7 @@ class GenerateLALR(GenerateLR1):
new_lookahead = a.lookahead + b.lookahead new_lookahead = a.lookahead + b.lookahead
new_lookahead = tuple(sorted(set(new_lookahead))) new_lookahead = tuple(sorted(set(new_lookahead)))
merged.append(a.clear_lookahead()) merged.append(a.replace_lookahead(new_lookahead))
return tuple(merged) return tuple(merged)
@ -1403,7 +1415,7 @@ class GenerateLALR(GenerateLR1):
b_no_la = tuple(s.clear_lookahead() for s in b) b_no_la = tuple(s.clear_lookahead() for s in b)
return a_no_la == b_no_la return a_no_la == b_no_la
def gen_sets(self, config_set) -> ConfigurationSetInfo: def gen_sets(self, config_set: typing.Tuple[Configuration, ...]) -> ConfigurationSetInfo:
"""Recursively generate all configuration sets starting from the """Recursively generate all configuration sets starting from the
provided set, and merge them with the provided set 'F'. provided set, and merge them with the provided set 'F'.
@ -1414,10 +1426,15 @@ class GenerateLALR(GenerateLR1):
and replace the set in F, returning the modified set. and replace the set in F, returning the modified set.
""" """
F = {} F = {}
seen = set()
successors = [] successors = []
pending = [config_set] pending = [config_set]
while len(pending) > 0: while len(pending) > 0:
config_set = pending.pop() config_set = pending.pop()
if config_set in seen:
continue
seen.add(config_set)
config_set_no_la = tuple(s.clear_lookahead() for s in config_set) config_set_no_la = tuple(s.clear_lookahead() for s in config_set)
existing = F.get(config_set_no_la) existing = F.get(config_set_no_la)
@ -1425,6 +1442,7 @@ class GenerateLALR(GenerateLR1):
F[config_set_no_la] = self.merge_sets(config_set, existing) F[config_set_no_la] = self.merge_sets(config_set, existing)
else: else:
F[config_set_no_la] = config_set F[config_set_no_la] = config_set
for symbol, successor in self.gen_all_successors(config_set): for symbol, successor in self.gen_all_successors(config_set):
successor_no_la = tuple(s.clear_lookahead() for s in successor) successor_no_la = tuple(s.clear_lookahead() for s in successor)
successors.append((config_set_no_la, symbol, successor_no_la)) successors.append((config_set_no_la, symbol, successor_no_la))
@ -1723,7 +1741,7 @@ class Grammar:
return grammar return grammar
def build_table(self, start: str, generator=GenerateLR1): def build_table(self, start: str, generator=GenerateLALR):
"""Construct a parse table for this grammar, starting at the named """Construct a parse table for this grammar, starting at the named
nonterminal rule. nonterminal rule.
""" """