[parser] Remove bad LALR implementation, start cleanup

This commit is contained in:
John Doty 2024-10-10 07:58:16 -07:00
parent da7ca95a86
commit 2656a1d328
4 changed files with 36 additions and 135 deletions

2
TODO
View file

@ -1,2 +0,0 @@
- Generate LALR lookaheads directly from LR0 states, not as LR1 + Merge, for speed
(Alternately, implement the Pager/Chen algorithm on LR1 for state merging)

View file

@ -24,7 +24,6 @@ from parser import (
class FineGrammar(Grammar): class FineGrammar(Grammar):
# generator = parser.GenerateLR1 # generator = parser.GenerateLR1
# generator = parser.GeneratePager # generator = parser.GeneratePager
# generator = parser.GenerateLALR
start = "File" start = "File"
trivia = ["BLANKS", "LINE_BREAK", "COMMENT"] trivia = ["BLANKS", "LINE_BREAK", "COMMENT"]

View file

@ -1369,7 +1369,7 @@ class GenerateSLR1(GenerateLR0):
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
# We store the firsts not because we need them here, but because LR1 # We store the firsts not because we need them here, but because LR1
# and LALR need them. # and Pager need them.
self._firsts = FirstInfo.from_grammar(self.grammar, self.terminal) self._firsts = FirstInfo.from_grammar(self.grammar, self.terminal)
self._follows = FollowInfo.from_grammar( self._follows = FollowInfo.from_grammar(
self.grammar, self.grammar,
@ -1483,111 +1483,6 @@ class GenerateLR1(GenerateSLR1):
return self.gen_sets(seeds) return self.gen_sets(seeds)
class GenerateLALR(GenerateLR1):
"""Generate tables for LALR.
LALR is smaller than LR(1) but bigger than SLR(1). It works by generating
the LR(1) configuration sets, but merging configuration sets which are
equal in everything but their lookaheads. This works in that it doesn't
generate any shift/reduce conflicts that weren't already in the LR(1)
grammar. It can, however, introduce new reduce/reduce conflicts, because
it does lose information. The advantage is that the number of parser
states is much much smaller in LALR than in LR(1).
If you can get away with generating LALR tables for a grammar than you
should do it.
(Note that because we use immutable state everywhere this generator does
a lot of copying and allocation. This particular generator could still
use a bunch of improvement, probably.)
"""
def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo:
"""Recursively generate all configuration sets starting from the
provided set.
The difference between this method and the one in GenerateLR0, where
this comes from, is that we're going to be keeping track of states
that we found that are equivalent in lookahead.
"""
#
# First, do the actual walk. Don't merge yet: just keep track of all
# the config sets that need to be merged.
#
F: dict[CoreSet, list[ConfigSet]] = {}
seen: set[ConfigSet] = set()
closed_cores: dict[CoreSet, CoreSet] = {}
successors: list[typing.Tuple[CoreSet, int, CoreSet]] = []
pending = [(ConfigSet(seeds), CoreSet(s.core for s in seeds))]
while len(pending) > 0:
seed_set, seed_core = pending.pop()
if seed_set in seen:
continue
seen.add(seed_set)
closure = self.gen_closure(seed_set)
closure_core = CoreSet(s.core for s in closure)
closed_cores[seed_core] = closure_core
existing = F.get(closure_core)
if existing is not None:
existing.append(closure)
else:
F[closure_core] = [closure]
for symbol, successor in self.gen_all_successors(closure):
successor_seed_core = CoreSet(s.core for s in successor)
successors.append((closure_core, symbol, successor_seed_core))
pending.append((successor, successor_seed_core))
# Now we gathered the sets, merge them all.
final_sets: dict[CoreSet, ConfigSet] = {}
for key, config_sets in F.items():
la_merge: dict[ConfigurationCore, set[int]] = {}
for config_set in config_sets:
for config in config_set:
la_key = config.core
la_set = la_merge.get(la_key)
if la_set is None:
la_merge[la_key] = set(config.lookahead)
else:
la_set.update(config.lookahead)
final_set = ConfigSet(
Configuration(core=core, lookahead=tuple(sorted(la)))
for core, la in la_merge.items()
)
final_sets[key] = final_set
# Register all the actually merged, final config sets.
result = ConfigurationSetInfo()
for config_set in final_sets.values():
# Because we're building this so late we don't distinguish.
# This is probably a hack, and a sign the tracker should be better.
id, _ = result.register_core(config_set)
result.register_config_closure(id, config_set)
# Now record all the successors that we found. Of course, the actual
# sets that wound up in the ConfigurationSetInfo don't match anything
# we found during the previous phase.
#
# *Fortunately* we recorded the no-lookahead keys in the successors
# so we can find the final sets, then look them up in the registered
# sets, and actually register the successor.
for config_core, symbol, successor_seed_core in successors:
actual_config_set = final_sets[config_core]
from_index = result.config_set_key[actual_config_set]
successor_no_la = closed_cores[successor_seed_core]
actual_successor = final_sets[successor_no_la]
to_index = result.config_set_key[actual_successor]
result.add_successor(from_index, symbol, to_index)
return result
# Here we have a slightly different definition of a ConfigurationSet; we keep the # Here we have a slightly different definition of a ConfigurationSet; we keep the
# lookaheads outside and use a dictionary to check for containment quickly. # lookaheads outside and use a dictionary to check for containment quickly.
# ItemSet is used in the GRM/Pager/Chen algorithm. # ItemSet is used in the GRM/Pager/Chen algorithm.
@ -1673,16 +1568,41 @@ class ItemSet:
class GeneratePager(GenerateLR1): class GeneratePager(GenerateLR1):
"""Pager's algorithm as interpreted through GRMTools""" """Pager's algorithm.
I'll be honest, I don't understand this one as well as the pure LR1
algorithm. It proceeds as LR1, generating successor states, but every
time it makes a new state it searches the states it has already made for
one that is "weakly compatible;" if it finds one it merges the new state
with the old state and marks the old state to be re-visited.
The implementation here follows from the implementation in
`GRMTools<https://github.com/softdevteam/grmtools/blob/master/lrtable/src/lib/pager.rs>`_.
As they explain there:
> The general algorithms that form the basis of what's used in this file
> can be found in:
>
> A Practical General Method for Constructing LR(k) Parsers
> David Pager, Acta Informatica 7, 249--268, 1977
>
> However Pager's paper is dense, and doesn't name sub-parts of the
> algorithm. We mostly reference the (still incomplete, but less
> incomplete) version of the algorithm found in:
>
> Measuring and extending LR(1) parser generation
> Xin Chen, PhD thesis, University of Hawaii, 2009
"""
def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo: def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo:
# This function can be seen as a modified version of items() from # This function can be seen as a modified version of items() from
# Chen's dissertation. # Chen's dissertation.
# #
# (It is also (practically) a converted version from grmtools into # DOTY: It is also (practically) a converted version from grmtools
# python, more or less verbatim at this point. I have no idea what's # into python, more or less verbatim at this point. I have some
# going on.) # sense of what is going on, and attempt to elaborate with
# firsts = self._firsts # these comments.
# closed_states and core_states are both equally sized vectors of # closed_states and core_states are both equally sized vectors of
# states. Core states are smaller, and used for the weakly compatible # states. Core states are smaller, and used for the weakly compatible
@ -1693,34 +1613,20 @@ class GeneratePager(GenerateLR1):
core_states: list[ItemSet] = [] core_states: list[ItemSet] = []
edges: list[dict[int, int]] = [] edges: list[dict[int, int]] = []
# Because we GC states later, it's possible that we will end up with # Convert the incoming seed configurations into item sets.
# more states before GC than `StorageT` can hold. We thus do all our # TODO: Convert everything to ItemSet natively.
# calculations in this function in terms of `usize`s before
# converting them to `StorageT` later.
#
# DOTY: This comment is useless for us: we don't optimize the storage
# of the state graph so StorageT is useless.
#
# DOTY: This next bit here is basically figuring out the seeds, which
# we have already done. We just need to convert them into an
# itemset.
#
state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds}) state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds})
core_states.append(state0) core_states.append(state0)
closed_states.append(None) closed_states.append(None)
edges.append({}) edges.append({})
# We maintain two lists of which rules and tokens we've seen; when # We maintain a set of which rules and tokens we've seen; when
# processing a given state there's no point processing a rule or token # processing a given state there's no point processing a rule or
# more than once. # token more than once.
#
# DOTY: Our alphabet is in a single range so we just have a single set.
seen: set[int] = set() seen: set[int] = set()
# new_states is used to separate out iterating over states vs. # new_states is used to separate out iterating over states vs.
# mutating it # mutating it
#
# DOTY: TODO: Do we need this?
new_states: list[tuple[int, ItemSet]] = [] new_states: list[tuple[int, ItemSet]] = []
# cnd_[rule|token]_weaklies represent which states are possible weakly # cnd_[rule|token]_weaklies represent which states are possible weakly

View file

@ -89,7 +89,6 @@ def test_all_generators():
parser.GenerateLR0, parser.GenerateLR0,
parser.GeneratePager, parser.GeneratePager,
parser.GenerateLR1, parser.GenerateLR1,
parser.GenerateLALR,
] ]
for generator in GENERATORS: for generator in GENERATORS:
table = G().build_table(generator=generator) table = G().build_table(generator=generator)
@ -234,7 +233,6 @@ def test_grammar_aho_ullman_2():
TestGrammar().build_table() TestGrammar().build_table()
TestGrammar().build_table(generator=parser.GenerateLR1) TestGrammar().build_table(generator=parser.GenerateLR1)
TestGrammar().build_table(generator=parser.GenerateLALR)
TestGrammar().build_table(generator=parser.GeneratePager) TestGrammar().build_table(generator=parser.GeneratePager)