From 2656a1d328cd7457abe91e250a290fed0166f928 Mon Sep 17 00:00:00 2001 From: John Doty Date: Thu, 10 Oct 2024 07:58:16 -0700 Subject: [PATCH] [parser] Remove bad LALR implementation, start cleanup --- TODO | 2 - grammar.py | 1 - parser/parser.py | 166 +++++++++--------------------------------- tests/test_grammar.py | 2 - 4 files changed, 36 insertions(+), 135 deletions(-) diff --git a/TODO b/TODO index 9f4739c..e69de29 100644 --- a/TODO +++ b/TODO @@ -1,2 +0,0 @@ -- Generate LALR lookaheads directly from LR0 states, not as LR1 + Merge, for speed - (Alternately, implement the Pager/Chen algorithm on LR1 for state merging) \ No newline at end of file diff --git a/grammar.py b/grammar.py index 9c369df..aee5f78 100644 --- a/grammar.py +++ b/grammar.py @@ -24,7 +24,6 @@ from parser import ( class FineGrammar(Grammar): # generator = parser.GenerateLR1 # generator = parser.GeneratePager - # generator = parser.GenerateLALR start = "File" trivia = ["BLANKS", "LINE_BREAK", "COMMENT"] diff --git a/parser/parser.py b/parser/parser.py index 96163ff..c686796 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -1369,7 +1369,7 @@ class GenerateSLR1(GenerateLR0): super().__init__(*args, **kwargs) # We store the firsts not because we need them here, but because LR1 - # and LALR need them. + # and Pager need them. self._firsts = FirstInfo.from_grammar(self.grammar, self.terminal) self._follows = FollowInfo.from_grammar( self.grammar, @@ -1483,111 +1483,6 @@ class GenerateLR1(GenerateSLR1): return self.gen_sets(seeds) -class GenerateLALR(GenerateLR1): - """Generate tables for LALR. - - LALR is smaller than LR(1) but bigger than SLR(1). It works by generating - the LR(1) configuration sets, but merging configuration sets which are - equal in everything but their lookaheads. This works in that it doesn't - generate any shift/reduce conflicts that weren't already in the LR(1) - grammar. It can, however, introduce new reduce/reduce conflicts, because - it does lose information. 
The advantage is that the number of parser - states is much much smaller in LALR than in LR(1). - - If you can get away with generating LALR tables for a grammar than you - should do it. - - (Note that because we use immutable state everywhere this generator does - a lot of copying and allocation. This particular generator could still - use a bunch of improvement, probably.) - """ - - def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo: - """Recursively generate all configuration sets starting from the - provided set. - - The difference between this method and the one in GenerateLR0, where - this comes from, is that we're going to be keeping track of states - that we found that are equivalent in lookahead. - """ - # - # First, do the actual walk. Don't merge yet: just keep track of all - # the config sets that need to be merged. - # - F: dict[CoreSet, list[ConfigSet]] = {} - seen: set[ConfigSet] = set() - closed_cores: dict[CoreSet, CoreSet] = {} - successors: list[typing.Tuple[CoreSet, int, CoreSet]] = [] - - pending = [(ConfigSet(seeds), CoreSet(s.core for s in seeds))] - while len(pending) > 0: - seed_set, seed_core = pending.pop() - if seed_set in seen: - continue - seen.add(seed_set) - - closure = self.gen_closure(seed_set) - closure_core = CoreSet(s.core for s in closure) - closed_cores[seed_core] = closure_core - - existing = F.get(closure_core) - if existing is not None: - existing.append(closure) - else: - F[closure_core] = [closure] - - for symbol, successor in self.gen_all_successors(closure): - successor_seed_core = CoreSet(s.core for s in successor) - successors.append((closure_core, symbol, successor_seed_core)) - pending.append((successor, successor_seed_core)) - - # Now we gathered the sets, merge them all. 
- final_sets: dict[CoreSet, ConfigSet] = {} - for key, config_sets in F.items(): - la_merge: dict[ConfigurationCore, set[int]] = {} - for config_set in config_sets: - for config in config_set: - la_key = config.core - la_set = la_merge.get(la_key) - if la_set is None: - la_merge[la_key] = set(config.lookahead) - else: - la_set.update(config.lookahead) - - final_set = ConfigSet( - Configuration(core=core, lookahead=tuple(sorted(la))) - for core, la in la_merge.items() - ) - final_sets[key] = final_set - - # Register all the actually merged, final config sets. - result = ConfigurationSetInfo() - for config_set in final_sets.values(): - # Because we're building this so late we don't distinguish. - # This is probably a hack, and a sign the tracker should be better. - id, _ = result.register_core(config_set) - result.register_config_closure(id, config_set) - - # Now record all the successors that we found. Of course, the actual - # sets that wound up in the ConfigurationSetInfo don't match anything - # we found during the previous phase. - # - # *Fortunately* we recorded the no-lookahead keys in the successors - # so we can find the final sets, then look them up in the registered - # sets, and actually register the successor. - for config_core, symbol, successor_seed_core in successors: - actual_config_set = final_sets[config_core] - from_index = result.config_set_key[actual_config_set] - - successor_no_la = closed_cores[successor_seed_core] - actual_successor = final_sets[successor_no_la] - to_index = result.config_set_key[actual_successor] - - result.add_successor(from_index, symbol, to_index) - - return result - - # Here we have a slightly different definition of a ConfigurationSet; we keep the # lookaheads outside and use a dictionary to check for containment quickly. # ItemSet is used in the GRM/Pager/Chin algorithm. 
@@ -1673,16 +1568,41 @@ class ItemSet: class GeneratePager(GenerateLR1): - """Pager's algorithm as interpreted through GRMTools""" + """Pager's algorithm. + + I'll be honest, I don't understand this one as well as the pure LR1 + algorithm. It proceeds as LR1, generating successor states, but every + time it makes a new state it searches the states it has already made for + one that is "weakly compatible;" if it finds one it merges the new state + with the old state and marks the old state to be re-visited. + + The implementation here follows from the implementation in + `GRMTools`_. + + As they explain there: + + > The general algorithms that form the basis of what's used in this file + > can be found in: + > + > A Practical General Method for Constructing LR(k) Parsers + > David Pager, Acta Informatica 7, 249--268, 1977 + > + > However Pager's paper is dense, and doesn't name sub-parts of the + > algorithm. We mostly reference the (still incomplete, but less + > incomplete) version of the algorithm found in: + > + > Measuring and extending LR(1) parser generation + > Xin Chen, PhD thesis, University of Hawaii, 2009 + """ def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo: # This function can be seen as a modified version of items() from # Chen's dissertation. # - # (It is also (practically) a converted version from grmtools into - # python, more or less verbatim at this point. I have no idea what's - # going on.) - # firsts = self._firsts + # DOTY: It is also (practically) a converted version from grmtools + # into python, more or less verbatim at this point. I have some + # sense of what is going on, and attempt to elaborate with + # these comments. # closed_states and core_states are both equally sized vectors of # states. 
Core states are smaller, and used for the weakly compatible @@ -1693,34 +1613,20 @@ class GeneratePager(GenerateLR1): core_states: list[ItemSet] = [] edges: list[dict[int, int]] = [] - # Because we GC states later, it's possible that we will end up with - # more states before GC than `StorageT` can hold. We thus do all our - # calculations in this function in terms of `usize`s before - # converting them to `StorageT` later. - # - # DOTY: This comment is useless for us: we don't optimize the storage - # of the state graph so StorageT is useless. - # - # DOTY: This next bit here is basically figuring out the seeds, which - # we have already done. We just need to convert them into an - # itemset. - # + # Convert the incoming seed configurations into item sets. + # TODO: Convert everything to ItemSet natively. state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds}) core_states.append(state0) closed_states.append(None) edges.append({}) - # We maintain two lists of which rules and tokens we've seen; when - # processing a given state there's no point processing a rule or token - # more than once. - # - # DOTY: Our alphabet is in a single range so we just have a single set. + # We maintain a set of which rules and tokens we've seen; when + # processing a given state there's no point processing a rule or + # token more than once. seen: set[int] = set() # new_states is used to separate out iterating over states vs. # mutating it - # - # DOTY: TODO: Do we need this? 
new_states: list[tuple[int, ItemSet]] = [] # cnd_[rule|token]_weaklies represent which states are possible weakly diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 7c5b9f2..af0f1d5 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -89,7 +89,6 @@ def test_all_generators(): parser.GenerateLR0, parser.GeneratePager, parser.GenerateLR1, - parser.GenerateLALR, ] for generator in GENERATORS: table = G().build_table(generator=generator) @@ -234,7 +233,6 @@ def test_grammar_aho_ullman_2(): TestGrammar().build_table() TestGrammar().build_table(generator=parser.GenerateLR1) - TestGrammar().build_table(generator=parser.GenerateLALR) TestGrammar().build_table(generator=parser.GeneratePager)