From 8d58c6404080fe4d83a7b81ba7f4dda1a1e4f60b Mon Sep 17 00:00:00 2001
From: John Doty <john@d0ty.me>
Date: Mon, 27 May 2024 22:31:33 -0700
Subject: [PATCH] Fix LALR. Small, but very very slow.

---
 harness.py | 30 +++++++++++++++++++-----------
 parser.py  | 34 ++++++++++++++++++++++++++--------
 2 files changed, 45 insertions(+), 19 deletions(-)

diff --git a/harness.py b/harness.py
index d4d2161..4a1b0a9 100644
--- a/harness.py
+++ b/harness.py
@@ -2,10 +2,22 @@ import bisect
 import typing
 
 import grammar
-from parser import Token, Grammar, rule, seq
+import parser
+
+# from parser import Token, Grammar, rule, seq
 
 
-def parse(table, tokens, trace=False):
+def trace_state(stack, input, input_index, action):
+    print(
+        "{stack: <20}  {input: <50}  {action: <5}".format(
+            stack=repr([s[0] for s in stack]),
+            input=repr(input[input_index : input_index + 4]),
+            action=repr(action),
+        )
+    )
+
+
+def parse(table, tokens, trace=None):
     """Parse the input with the generated parsing table and return the
     concrete syntax tree.
 
@@ -35,13 +47,7 @@ def parse(table, tokens, trace=False):
 
         action = table[current_state].get(current_token, ("error",))
         if trace:
-            print(
-                "{stack: <20}  {input: <50}  {action: <5}".format(
-                    stack=repr([s[0] for s in stack]),
-                    input=repr(input[input_index : input_index + 4]),
-                    action=repr(action),
-                )
-            )
+            trace(stack, input, input_index, action)
 
         if action[0] == "accept":
             return (stack[-1][1], [])
@@ -83,7 +89,9 @@ def parse(table, tokens, trace=False):
 
 
 def harness(lexer_func, grammar_func, start_rule, source_path):
-    table = grammar_func().build_table(start=start_rule)
+    # generator = parser.GenerateLR1
+    generator = parser.GenerateLALR
+    table = grammar_func().build_table(start=start_rule, generator=generator)
     print(f"{len(table)} states")
 
     average_entries = sum(len(row) for row in table) / len(table)
@@ -96,7 +104,7 @@ def harness(lexer_func, grammar_func, start_rule, source_path):
         tokens = lexer_func(src)
         # print(f"{tokens.lines}")
         # tokens.dump(end=5)
-        (_, errors) = parse(table, tokens, trace=True)
+        (_, errors) = parse(table, tokens)
         if len(errors) > 0:
             print(f"{len(errors)} errors:")
             for error in errors:
diff --git a/parser.py b/parser.py
index 838f8c4..6a8c510 100644
--- a/parser.py
+++ b/parser.py
@@ -257,6 +257,14 @@ class Configuration:
             lookahead=(),
         )
 
+    def replace_lookahead(self, lookahead: typing.Tuple[int, ...]):
+        return Configuration(
+            name=self.name,
+            symbols=self.symbols,
+            position=self.position,
+            lookahead=lookahead,
+        )
+
     @property
     def rest(self):
         return self.symbols[(self.position + 1) :]
@@ -1382,7 +1390,11 @@ class GenerateLALR(GenerateLR1):
     use a bunch of improvement, probably.)
     """
 
-    def merge_sets(self, config_set_a, config_set_b):
+    def merge_sets(
+        self,
+        config_set_a: typing.Tuple[Configuration, ...],
+        config_set_b: typing.Tuple[Configuration, ...],
+    ):
         """Merge the two config sets, by keeping the item cores but merging
         the lookahead sets for each item.
         """
@@ -1394,7 +1406,7 @@ class GenerateLALR(GenerateLR1):
 
             new_lookahead = a.lookahead + b.lookahead
             new_lookahead = tuple(sorted(set(new_lookahead)))
-            merged.append(a.clear_lookahead())
+            merged.append(a.replace_lookahead(new_lookahead))
 
         return tuple(merged)
 
@@ -1403,7 +1415,7 @@ class GenerateLALR(GenerateLR1):
         b_no_la = tuple(s.clear_lookahead() for s in b)
         return a_no_la == b_no_la
 
-    def gen_sets(self, config_set) -> ConfigurationSetInfo:
+    def gen_sets(self, config_set: typing.Tuple[Configuration, ...]) -> ConfigurationSetInfo:
         """Recursively generate all configuration sets starting from the
         provided set, and merge them with the provided set 'F'.
 
@@ -1414,10 +1426,15 @@ class GenerateLALR(GenerateLR1):
         and replace the set in F, returning the modified set.
         """
         F = {}
+        seen = set()
         successors = []
         pending = [config_set]
         while len(pending) > 0:
             config_set = pending.pop()
+            if config_set in seen:
+                continue
+            seen.add(config_set)
+
             config_set_no_la = tuple(s.clear_lookahead() for s in config_set)
 
             existing = F.get(config_set_no_la)
@@ -1425,10 +1442,11 @@ class GenerateLALR(GenerateLR1):
                 F[config_set_no_la] = self.merge_sets(config_set, existing)
             else:
                 F[config_set_no_la] = config_set
-                for symbol, successor in self.gen_all_successors(config_set):
-                    successor_no_la = tuple(s.clear_lookahead() for s in successor)
-                    successors.append((config_set_no_la, symbol, successor_no_la))
-                    pending.append(successor)
+
+            for symbol, successor in self.gen_all_successors(config_set):
+                successor_no_la = tuple(s.clear_lookahead() for s in successor)
+                successors.append((config_set_no_la, symbol, successor_no_la))
+                pending.append(successor)
 
         # Register all the actually merged, final config sets.
         result = ConfigurationSetInfo()
@@ -1723,7 +1741,7 @@ class Grammar:
 
         return grammar
 
-    def build_table(self, start: str, generator=GenerateLR1):
+    def build_table(self, start: str, generator=GenerateLALR):
         """Construct a parse table for this grammar, starting at the named
         nonterminal rule.
         """