From 8d58c6404080fe4d83a7b81ba7f4dda1a1e4f60b Mon Sep 17 00:00:00 2001 From: John Doty Date: Mon, 27 May 2024 22:31:33 -0700 Subject: [PATCH] Fix LALR. Small, but very very slow. --- harness.py | 30 +++++++++++++++++++----------- parser.py | 34 ++++++++++++++++++++++++++-------- 2 files changed, 45 insertions(+), 19 deletions(-) diff --git a/harness.py b/harness.py index d4d2161..4a1b0a9 100644 --- a/harness.py +++ b/harness.py @@ -2,10 +2,22 @@ import bisect import typing import grammar -from parser import Token, Grammar, rule, seq +import parser + +# from parser import Token, Grammar, rule, seq -def parse(table, tokens, trace=False): +def trace_state(stack, input, input_index, action): + print( + "{stack: <20} {input: <50} {action: <5}".format( + stack=repr([s[0] for s in stack]), + input=repr(input[input_index : input_index + 4]), + action=repr(action), + ) + ) + + +def parse(table, tokens, trace=None): """Parse the input with the generated parsing table and return the concrete syntax tree. @@ -35,13 +47,7 @@ def parse(table, tokens, trace=False): action = table[current_state].get(current_token, ("error",)) if trace: - print( - "{stack: <20} {input: <50} {action: <5}".format( - stack=repr([s[0] for s in stack]), - input=repr(input[input_index : input_index + 4]), - action=repr(action), - ) - ) + trace(stack, input, input_index, action) if action[0] == "accept": return (stack[-1][1], []) @@ -83,7 +89,9 @@ def parse(table, tokens, trace=False): def harness(lexer_func, grammar_func, start_rule, source_path): - table = grammar_func().build_table(start=start_rule) + # generator = parser.GenerateLR1 + generator = parser.GenerateLALR + table = grammar_func().build_table(start=start_rule, generator=generator) print(f"{len(table)} states") average_entries = sum(len(row) for row in table) / len(table) @@ -96,7 +104,7 @@ def harness(lexer_func, grammar_func, start_rule, source_path): tokens = lexer_func(src) # print(f"{tokens.lines}") # tokens.dump(end=5) - (_, errors) = parse(table, tokens, trace=True) + (_, errors) = parse(table, tokens) if len(errors) > 0: print(f"{len(errors)} errors:") for error in errors: diff --git a/parser.py b/parser.py index 838f8c4..6a8c510 100644 --- a/parser.py +++ b/parser.py @@ -257,6 +257,14 @@ class Configuration: lookahead=(), ) + def replace_lookahead(self, lookahead: typing.Tuple[int, ...]): + return Configuration( + name=self.name, + symbols=self.symbols, + position=self.position, + lookahead=lookahead, + ) + @property def rest(self): return self.symbols[(self.position + 1) :] @@ -1382,7 +1390,11 @@ class GenerateLALR(GenerateLR1): use a bunch of improvement, probably.) """ - def merge_sets(self, config_set_a, config_set_b): + def merge_sets( + self, + config_set_a: typing.Tuple[Configuration, ...], + config_set_b: typing.Tuple[Configuration, ...], + ): """Merge the two config sets, by keeping the item cores but merging the lookahead sets for each item. """ @@ -1394,7 +1406,7 @@ class GenerateLALR(GenerateLR1): new_lookahead = a.lookahead + b.lookahead new_lookahead = tuple(sorted(set(new_lookahead))) - merged.append(a.clear_lookahead()) + merged.append(a.replace_lookahead(new_lookahead)) return tuple(merged) @@ -1403,7 +1415,7 @@ class GenerateLALR(GenerateLR1): b_no_la = tuple(s.clear_lookahead() for s in b) return a_no_la == b_no_la - def gen_sets(self, config_set) -> ConfigurationSetInfo: + def gen_sets(self, config_set: typing.Tuple[Configuration, ...]) -> ConfigurationSetInfo: """Recursively generate all configuration sets starting from the provided set, and merge them with the provided set 'F'. @@ -1414,10 +1426,15 @@ class GenerateLALR(GenerateLR1): and replace the set in F, returning the modified set. """ F = {} + seen = set() successors = [] pending = [config_set] while len(pending) > 0: config_set = pending.pop() + if config_set in seen: + continue + seen.add(config_set) + config_set_no_la = tuple(s.clear_lookahead() for s in config_set) existing = F.get(config_set_no_la) @@ -1425,10 +1442,11 @@ class GenerateLALR(GenerateLR1): F[config_set_no_la] = self.merge_sets(config_set, existing) else: F[config_set_no_la] = config_set - for symbol, successor in self.gen_all_successors(config_set): - successor_no_la = tuple(s.clear_lookahead() for s in successor) - successors.append((config_set_no_la, symbol, successor_no_la)) - pending.append(successor) + + for symbol, successor in self.gen_all_successors(config_set): + successor_no_la = tuple(s.clear_lookahead() for s in successor) + successors.append((config_set_no_la, symbol, successor_no_la)) + pending.append(successor) # Register all the actually merged, final config sets. result = ConfigurationSetInfo() @@ -1723,7 +1741,7 @@ class Grammar: return grammar - def build_table(self, start: str, generator=GenerateLR1): + def build_table(self, start: str, generator=GenerateLALR): """Construct a parse table for this grammar, starting at the named nonterminal rule. """