Compare commits

...

5 commits

Author SHA1 Message Date
923b01f6fd [parser] Simplify StateGraph 2024-10-26 07:35:28 -07:00
27e6bb413c [parser] Remove Canonical LR1 generator
This is fine probably.
2024-10-26 07:25:37 -07:00
2b72811486 [parser] ConfigurationSetInfo -> StateGraph 2024-10-26 06:56:30 -07:00
e501caa073 [parser] Remove unused import 2024-10-26 06:53:53 -07:00
e55bc140f9 [parser] Move ItemSet 2024-10-26 06:53:36 -07:00
2 changed files with 338 additions and 523 deletions

View file

@ -135,7 +135,6 @@ import bisect
import collections
import dataclasses
import enum
import functools
import inspect
import itertools
import json
@ -274,7 +273,100 @@ class ConfigSet(frozenset[Configuration]):
pass
class ConfigurationSetInfo:
# Here we have a slightly different definition of a ConfigurationSet; we keep
# the lookaheads outside and use a dictionary to check for containment
# quickly. ItemSet is used in the GRM/Pager/Chin algorithm.
@dataclasses.dataclass
class ItemSet:
"""An ItemSet is a group of configuration cores together with their
"contexts", or lookahead sets.
An ItemSet is comparable for equality, and also supports this lesser notion
of "weakly compatible" which is used to collapse states in the pager
algorithm.
"""
items: dict[ConfigurationCore, set[int]]
def __init__(self, items=None):
self.items = items or {}
@classmethod
def from_config_set(cls, config_set: ConfigSet) -> "ItemSet":
return ItemSet({config.core: set(config.lookahead) for config in config_set})
def weakly_compatible(self, other: "ItemSet") -> bool:
a = self.items
b = other.items
if len(a) != len(b):
return False
for acore in a:
if acore not in b:
return False
if len(a) == 1:
return True
# DOTY: This loop I do not understand, truly. What the heck is happening here?
a_keys = list(a.keys())
for i, i_key in enumerate(itertools.islice(a_keys, 0, len(a_keys) - 1)):
for j_key in itertools.islice(a_keys, i + 1, None):
a_i_key = a[i_key]
b_i_key = b[i_key]
a_j_key = a[j_key]
b_j_key = b[j_key]
# DOTY: GRMTools written with intersects(); we don't have that we have
# `not disjoint()`. :P There are many double negatives....
#
# not (intersect(a_i, b_j) or intersect(a_j, b_i))
# not ((not disjoint(a_i, b_j)) or (not disjoint(a_j, b_i)))
# ((not not disjoint(a_i, b_j)) and (not not disjoint(a_j, b_i)))
# disjoint(a_i, b_j) and disjoint(a_j, b_i)
if a_i_key.isdisjoint(b_j_key) and a_j_key.isdisjoint(b_i_key):
continue
# intersect(a_i, a_j) or intersect(b_i, b_j)
# (not disjoint(a_i, a_j)) or (not disjoint(b_i, b_j))
# not (disjoint(a_i, a_j) and disjoint(b_i, b_j))
if not (a_i_key.isdisjoint(a_j_key) and b_i_key.isdisjoint(b_j_key)):
continue
return False
return True
def weakly_merge(self, other: "ItemSet") -> bool:
"""Merge b into a, returning True if this led to any changes."""
a = self.items
b = other.items
changed = False
for a_key, a_ctx in a.items():
start_len = len(a_ctx)
a_ctx.update(b[a_key]) # Python doesn't tell us changes
changed = changed or (start_len != len(a_ctx))
return changed
def goto(self, symbol: int) -> "ItemSet":
result = ItemSet()
for core, context in self.items.items():
if core.next == symbol:
next = core.replace_position(core.position + 1)
result.items[next] = set(context)
return result
def to_config_set(self) -> ConfigSet:
return ConfigSet(
{Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()}
)
@dataclasses.dataclass
class StateGraph:
"""When we build a grammar into a table, the first thing we need to do is
generate all the configuration sets and their successors.
@ -289,65 +381,23 @@ class ConfigurationSetInfo:
structure, but they all compute this information.)
"""
core_key: dict[ConfigSet, int] # Map a ConfigSet into an index
config_set_key: dict[ConfigSet, int] # Map a ConfigSet into an index
sets: list[ConfigSet] # Map the index back into a set
closures: list[ConfigSet | None] # Track closures
closures: list[ConfigSet]
# All the successors for all of the sets. `successors[i]` is the mapping
# from grammar symbol to the index of the set you get by processing that
# symbol.
successors: list[dict[int, int]]
def __init__(self):
self.core_key = {}
self.config_set_key = {}
self.sets = []
self.closures = []
self.successors = []
def register_core(self, c: ConfigSet) -> typing.Tuple[int, bool]:
"""Potentially add a new config set to the set of sets. Returns the
canonical ID of the set within this structure, along with a boolean
indicating whether the set was just added or not.
(You can use this integer to get the set back, if you need it, and
also access the successors table.)
"""
existing = self.core_key.get(c)
if existing is not None:
return existing, False
index = len(self.sets)
self.sets.append(c)
self.closures.append(None)
self.successors.append({})
self.core_key[c] = index
return index, True
def register_config_closure(self, c_id: int, closure: ConfigSet):
assert self.closures[c_id] is None
self.closures[c_id] = closure
self.config_set_key[closure] = c_id
def add_successor(self, c_id: int, symbol: int, successor: int):
"""Register successor(`c_id`, `symbol`) -> `successor`, where c_id
is the id of the set in this structure, and symbol is the id of a
symbol in the alphabet of the grammar.
"""
self.successors[c_id][symbol] = successor
def dump_state(self, alphabet: list[str]) -> str:
return json.dumps(
{
str(set_index): {
"configs": [c.format(alphabet) for c in config_set],
"closures": [c.format(alphabet) for c in self.closures[set_index] or []],
"successors": {
alphabet[k]: str(v) for k, v in self.successors[set_index].items()
},
"closures": [c.format(alphabet) for c in closure],
"successors": {alphabet[k]: str(v) for k, v in successors.items()},
}
for set_index, config_set in enumerate(self.sets)
for set_index, (closure, successors) in enumerate(
zip(self.closures, self.successors)
)
},
indent=4,
sort_keys=True,
@ -364,7 +414,8 @@ class ConfigurationSetInfo:
This function raises KeyError if no path is found.
"""
target_index = self.config_set_key[target_set]
# TODO: This should be tested.
target_index = self.closures.index(target_set)
visited = set()
queue: collections.deque = collections.deque()
@ -507,7 +558,7 @@ class ErrorCollection:
def gen_exception(
self,
alphabet: list[str],
all_sets: ConfigurationSetInfo,
all_sets: StateGraph,
) -> AmbiguityError | None:
"""Format all the errors into an error, or return None if there are no
errors.
@ -644,7 +695,7 @@ class TableBuilder(object):
self.action_row = None
self.goto_row = None
def flush(self, all_sets: ConfigurationSetInfo) -> ParseTable:
def flush(self, all_sets: StateGraph) -> ParseTable:
"""Finish building the table and return it.
Raises ValueError if there were any conflicts during construction.
@ -1007,108 +1058,36 @@ class FollowInfo:
return FollowInfo(follows=follows)
# Here we have a slightly different definition of a ConfigurationSet; we keep the
# lookaheads outside and use a dictionary to check for containment quickly.
# ItemSet is used in the GRM/Pager/Chin algorithm.
@dataclasses.dataclass
class ItemSet:
"""An ItemSet is a group of configuration cores together with their
"contexts", or lookahead sets.
class ParserGenerator:
"""Generate parse tables for LR1 grammars.
An ItemSet is comparable for equality, and also supports this lesser notion
of "weakly compatible" which is used to collapse states in the pager
algorithm.
"""
This class implements a variant of pager's algorithm to generate the parse
tables, which support the same set of languages as Canonical LR1 but with
much smaller resulting parse tables.
items: dict[ConfigurationCore, set[int]]
I'll be honest, I don't understand this one as well as the pure LR1
algorithm. It proceeds as LR1, generating successor states, but every
time it makes a new state it searches the states it has already made for
one that is "weakly compatible;" if it finds one it merges the new state
with the old state and marks the old state to be re-visited.
def __init__(self, items=None):
self.items = items or {}
The implementation here follows from the implementation in
`GRMTools<https://github.com/softdevteam/grmtools/blob/master/lrtable/src/lib/pager.rs>`_.
@classmethod
def from_config_set(cls, config_set: ConfigSet) -> "ItemSet":
return ItemSet({config.core: set(config.lookahead) for config in config_set})
As they explain there:
def weakly_compatible(self, other: "ItemSet") -> bool:
a = self.items
b = other.items
if len(a) != len(b):
return False
for acore in a:
if acore not in b:
return False
if len(a) == 1:
return True
# DOTY: This loop I do not understand, truly. What the heck is happening here?
a_keys = list(a.keys())
for i, i_key in enumerate(itertools.islice(a_keys, 0, len(a_keys) - 1)):
for j_key in itertools.islice(a_keys, i + 1, None):
a_i_key = a[i_key]
b_i_key = b[i_key]
a_j_key = a[j_key]
b_j_key = b[j_key]
# DOTY: GRMTools written with intersects(); we don't have that we have
# `not disjoint()`. :P There are many double negatives....
#
# not (intersect(a_i, b_j) or intersect(a_j, b_i))
# not ((not disjoint(a_i, b_j)) or (not disjoint(a_j, b_i)))
# ((not not disjoint(a_i, b_j)) and (not not disjoint(a_j, b_i)))
# disjoint(a_i, b_j) and disjoint(a_j, b_i)
if a_i_key.isdisjoint(b_j_key) and a_j_key.isdisjoint(b_i_key):
continue
# intersect(a_i, a_j) or intersect(b_i, b_j)
# (not disjoint(a_i, a_j)) or (not disjoint(b_i, b_j))
# not (disjoint(a_i, a_j) and disjoint(b_i, b_j))
if not (a_i_key.isdisjoint(a_j_key) and b_i_key.isdisjoint(b_j_key)):
continue
return False
return True
def weakly_merge(self, other: "ItemSet") -> bool:
"""Merge b into a, returning True if this led to any changes."""
a = self.items
b = other.items
changed = False
for a_key, a_ctx in a.items():
start_len = len(a_ctx)
a_ctx.update(b[a_key]) # Python doesn't tell us changes
changed = changed or (start_len != len(a_ctx))
return changed
def goto(self, symbol: int) -> "ItemSet":
result = ItemSet()
for core, context in self.items.items():
if core.next == symbol:
next = core.replace_position(core.position + 1)
result.items[next] = set(context)
return result
def to_config_set(self) -> ConfigSet:
return ConfigSet(
{Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()}
)
class GenerateLR1:
"""Generate parse tables for LR1, or "canonical LR" grammars.
LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they
are choosier about when they reduce. But unlike SLR parsers, they specify
the terminals on which they reduce by carrying a 'lookahead' terminal in
the configuration. The lookahead of a configuration is computed as the
closure of a configuration set is computed, so see gen_closure_next for
details. (Except for the start configuration, which has '$' as its
lookahead.)
> The general algorithms that form the basis of what's used in this file
> can be found in:
>
> A Practical General Method for Constructing LR(k) Parsers
> David Pager, Acta Informatica 7, 249--268, 1977
>
> However Pager's paper is dense, and doesn't name sub-parts of the
> algorithm. We mostly reference the (still incomplete, but less
> incomplete) version of the algorithm found in:
>
> Measuring and extending LR(1) parser generation
> Xin Chen, PhD thesis, University of Hawaii, 2009
"""
# Internally we use integers as symbols, not strings. Mostly this is fine,
@ -1171,9 +1150,9 @@ class GenerateLR1:
non-terminal being added, and the second element of the tuple is the
list of terminals and non-terminals that make up the production.
There is currently no support for custom actions or alternation or
anything like that. If you want alternations you'll have to lower
the grammar by hand into the simpler form first.
There is no support for alternation. If you want alternations
you'll have to lower the grammar by hand into the simpler form first,
but that's what the Grammar and NonTerminal classes are for.
Don't name anything with double-underscores; those are reserved for
the generator. Don't add '$' either, as it is reserved to mean
@ -1273,105 +1252,215 @@ class GenerateLR1:
self._firsts,
)
def gen_closure(self, seeds: typing.Iterable[Configuration]) -> ConfigSet:
"""Compute the closure for the specified configs. The closure is all
of the configurations we could be in. Specifically, if the position
for a config is just before a non-terminal then we must also consider
configurations where the rule is the rule for the non-terminal and
the position is just before the beginning of the rule.
def gen_sets(self, seeds: list[Configuration]) -> StateGraph:
# This function can be seen as a modified version of items() from
# Chen's dissertation.
#
# DOTY: It is also (practically) a converted version from grmtools
# into python, more or less verbatim at this point. I have some
# sense of what is going on, and attempt to elaborate with
# these comments.
(We have replaced a recursive version with an iterative one.)
"""
closure: set[Configuration] = set()
pending = list(seeds)
pending_next = []
while len(pending) > 0:
for config in pending:
if config in closure:
# closed_states and core_states are both equally sized vectors of
# states. Core states are smaller, and used for the weakly compatible
# checks, but we ultimately need to return closed states. Closed
# states which are None are those which require processing; thus
# closed_states also implicitly serves as a todo list.
closed_states: list[ItemSet | None] = []
core_states: list[ItemSet] = []
edges: list[dict[int, int]] = []
# Convert the incoming seed configurations into item sets.
# TODO: Convert everything to ItemSet natively.
state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds})
core_states.append(state0)
closed_states.append(None)
edges.append({})
# We maintain a set of which rules and tokens we've seen; when
# processing a given state there's no point processing a rule or
# token more than once.
seen: set[int] = set()
# cnd_[rule|token]_weaklies represent which states are possible weakly
# compatible matches for a given symbol.
#
# DOTY: As with `seen`, we have a uniform space so we can have a
# uniform one of these too.
cnd_weaklies: list[list[int]] = [[] for _ in range(len(self.alphabet))]
todo = 1 # How many None values are there in closed_states?
todo_off = 0 # Offset in closed states to start searching for the next todo.
while todo > 0:
assert len(core_states) == len(closed_states)
assert len(core_states) == len(edges)
# state_i is the next item to process. We don't want to
# continually search for the next None from the beginning, so we
# remember where we last saw a None (todo_off) and search from
# that point onwards, wrapping as necessary. Since processing a
# state x disproportionately causes state x + 1 to require
# processing, this prevents the search from becoming horribly
# non-linear.
try:
state_i = closed_states.index(None, todo_off)
except ValueError:
state_i = closed_states.index(None) # DOTY: Will not raise, given todo > 0
todo_off = state_i + 1
todo -= 1
cl_state = self.gen_closure(core_states[state_i])
closed_states[state_i] = cl_state
seen.clear()
for core in cl_state.items.keys():
sym = core.next
if sym is None or sym in seen:
continue
seen.add(sym)
nstate = cl_state.goto(sym)
# Try and find a compatible match for this state.
cnd_states = cnd_weaklies[sym]
# First of all see if any of the candidate states are exactly
# the same as the new state, in which case we only need to
# add an edge to the candidate state. This isn't just an
# optimisation (though it does avoid the expense of change
# propagation), but has a correctness aspect: there's no
# guarantee that the weakly compatible check is reflexive
# (i.e. a state may not be weakly compatible with itself).
found = False
for cnd in cnd_states:
if core_states[cnd] == nstate:
edges[state_i][sym] = cnd
found = True
break
if found:
continue
closure.add(config)
pending_next.extend(self.gen_closure_next(config))
# No candidate states were equal to the new state, so we need
# to look for a candidate state which is weakly compatible.
m: int | None = None
for cnd in cnd_states:
if core_states[cnd].weakly_compatible(nstate):
m = cnd
break
temp = pending
pending = pending_next
pending_next = temp
pending_next.clear()
if m is not None:
# A weakly compatible match has been found.
edges[state_i][sym] = m
assert core_states[m].weakly_compatible(nstate) # TODO: REMOVE, TOO SLOW
if core_states[m].weakly_merge(nstate):
# We only do the simplest change propagation, forcing possibly
# affected sets to be entirely reprocessed (which will recursively
# force propagation too). Even though this does unnecessary
# computation, it is still pretty fast.
#
# Note also that edges[k] will be completely regenerated, overwriting
# all existing entries and possibly adding new ones. We thus don't
# need to clear it manually.
if closed_states[m] is not None:
closed_states[m] = None
todo += 1
# NOTE: The generation of this closure *might* have generated
# multiple cores with different lookaheads; if that's
# the case we need to merge.
merged: dict[ConfigurationCore, set[int]] = {}
for c in closure:
existing = merged.get(c.core)
if existing is not None:
existing.update(c.lookahead)
else:
merged[c.core] = set(c.lookahead)
else:
stidx = len(core_states)
return ConfigSet(Configuration(k, tuple(sorted(v))) for k, v in merged.items())
cnd_weaklies[sym].append(stidx)
edges[state_i][sym] = stidx
def gen_all_successors(
self, config_set: typing.Iterable[Configuration]
) -> list[typing.Tuple[int, ConfigSet]]:
"""Return all of the non-empty successors for the given config set.
edges.append({})
closed_states.append(None)
core_states.append(nstate)
todo += 1
(That is, given the config set, pretend we see all the symbols we
could possibly see, and figure out which configs sets we get from
those symbols. Those are the successors of this set.)
"""
possible = {config.core.next for config in config_set if config.core.next is not None}
# Although the Pager paper doesn't talk about it, the algorithm above
# can create unreachable states due to the non-determinism inherent
# in working with hashsets. Indeed, this can even happen with the
# example from Pager's paper (on perhaps 1 out of 100 runs, 24 or 25
# states will be created instead of 23). We thus need to weed out
# unreachable states and update edges accordingly.
assert len(core_states) == len(closed_states)
next = []
for symbol in possible:
seeds = ConfigSet(
config.replace_position(config.core.position + 1)
for config in config_set
if config.core.next == symbol
)
if len(seeds) > 0:
next.append((symbol, seeds))
all_states = []
for core_state, closed_state in zip(core_states, closed_states):
assert closed_state is not None
all_states.append((core_state, closed_state))
gc_states, gc_edges = self.gc(all_states, edges)
return next
# DOTY: UGH this is so bad, we should rewrite to use ItemSet everywhere
# probably, which actually means getting rid of the pluggable
# generator because who actually needs that?
def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo:
"""Generate all configuration sets starting from the provided seeds."""
result = ConfigurationSetInfo()
# Register all the actually merged, final config sets. I should *not*
# have to do all this work. Really really garbage.
return StateGraph(
closures=[closed_state.to_config_set() for _, closed_state in gc_states],
successors=gc_edges,
)
successors = []
pending = [ConfigSet(seeds)]
pending_next = []
while len(pending) > 0:
for core in pending:
id, is_new = result.register_core(core)
if is_new:
config_set = self.gen_closure(core)
result.register_config_closure(id, config_set)
for symbol, successor in self.gen_all_successors(config_set):
successors.append((id, symbol, successor))
pending_next.append(successor)
def gc(
self,
states: list[tuple[ItemSet, ItemSet]],
edges: list[dict[int, int]],
) -> tuple[list[tuple[ItemSet, ItemSet]], list[dict[int, int]]]:
# First of all, do a simple pass over all states. All state indexes
# reachable from the start state will be inserted into the 'seen'
# set.
todo = [0]
seen = set()
while len(todo) > 0:
item = todo.pop()
if item in seen:
continue
seen.add(item)
todo.extend(e for e in edges[item].values() if e not in seen)
temp = pending
pending = pending_next
pending_next = temp
pending_next.clear()
if len(seen) == len(states):
# Every state is reachable.
return states, edges
for id, symbol, successor in successors:
result.add_successor(id, symbol, result.core_key[successor])
# Imagine we started with 3 states and their edges:
# states: [0, 1, 2]
# edges : [[_ => 2]]
#
# At this point, 'seen' will be the set {0, 2}. What we need to do is
# to create a new list of states that doesn't have state 1 in it.
# That will cause state 2 to become to state 1, meaning that we need
# to adjust edges so that the pointer to state 2 is updated to state
# 1. In other words we want to achieve this output:
#
# states: [0, 2]
# edges : [_ => 1]
#
# The way we do this is to first iterate over all states, working out
# what the mapping from seen states to their new offsets is.
gc_states: list[tuple[ItemSet, ItemSet]] = []
offsets: list[int] = []
offset = 0
for state_i, zstate in enumerate(states):
offsets.append(state_i - offset)
if state_i not in seen:
offset += 1
continue
return result
gc_states.append(zstate)
def gen_follow(self, symbol: int) -> set[int]:
"""Generate the follow set for the given nonterminal.
# At this point the offsets list will be [0, 1, 1]. We now create new
# edges where each offset is corrected by looking it up in the
# offsets list.
gc_edges: list[dict[int, int]] = []
for st_edge_i, st_edges in enumerate(edges):
if st_edge_i not in seen:
continue
The follow set for a nonterminal is the set of terminals that can
follow the nonterminal in a valid sentence. The resulting set never
contains epsilon and is never empty, since we should always at least
ground out at '$', which is the end-of-stream marker.
gc_edges.append({k: offsets[v] for k, v in st_edges.items()})
See FollowInfo for more information on how this is determined.
"""
return self._follows.follows[symbol]
return (gc_states, gc_edges)
def gen_first(self, symbols: typing.Iterable[int]) -> typing.Tuple[set[int], bool]:
"""Return the first set for a *sequence* of symbols.
@ -1394,45 +1483,15 @@ class GenerateLR1:
return (result, True)
def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]:
"""Return the set of symbols that indicate we should reduce the given
config.
def gen_closure(self, items: ItemSet) -> ItemSet:
"""Generate the closure of the given ItemSet.
In an LR1 parser, this is the lookahead of the configuration.
Some of the configurations the ItemSet might be positioned right before
nonterminals. In that case, obviously, we should *also* behave as if we
were right at the beginning of each production for that nonterminal. The
set of all those productions combined with all the incoming productions
is the closure.
"""
return config.lookahead
def gen_closure_next(self, config: Configuration):
"""Return the next set of configurations in the closure for config.
In LR1 parsers, we must compute the lookahead for the configurations
we're adding to the closure. The lookahead for the new configurations
is the first() of the rest of this config's production. If that
contains epsilon, then the lookahead *also* contains the lookahead we
already have. (This lookahead was presumably generated by the same
process, so in some sense it is a 'parent' lookahead, or a lookahead
from an upstream production in the grammar.)
(See the documentation in GenerateLR0 for more information on how
this function fits into the whole process, specifically `gen_closure`.)
"""
config_next = config.core.next
if config_next is None:
return ()
else:
lookahead, epsilon = self.gen_first(config.rest)
if epsilon:
lookahead.update(config.lookahead)
lookahead_tuple = tuple(sorted(lookahead))
next = []
for rule in self.grammar[config_next]:
rr = Configuration.from_rule(config_next, rule, lookahead=lookahead_tuple)
next.append(rr)
return tuple(next)
def gen_closure_x(self, items: ItemSet) -> ItemSet:
closure: dict[ConfigurationCore, set[int]] = {}
# We're going to maintain a set of things to look at, rules that we
@ -1524,7 +1583,7 @@ class GenerateLR1:
config_next = config.core.next
if config_next is None:
if config.core.name != self.start_symbol:
for a in self.gen_reduce_set(config):
for a in config.lookahead:
builder.set_table_reduce(a, config)
else:
builder.set_table_accept(self.end_symbol, config)
@ -1541,249 +1600,6 @@ class GenerateLR1:
return builder.flush(config_sets)
class GeneratePager(GenerateLR1):
"""Pager's algorithm.
I'll be honest, I don't understand this one as well as the pure LR1
algorithm. It proceeds as LR1, generating successor states, but every
time it makes a new state it searches the states it has already made for
one that is "weakly compatible;" if it finds one it merges the new state
with the old state and marks the old state to be re-visited.
The implementation here follows from the implementation in
`GRMTools<https://github.com/softdevteam/grmtools/blob/master/lrtable/src/lib/pager.rs>`_.
As they explain there:
> The general algorithms that form the basis of what's used in this file
> can be found in:
>
> A Practical General Method for Constructing LR(k) Parsers
> David Pager, Acta Informatica 7, 249--268, 1977
>
> However Pager's paper is dense, and doesn't name sub-parts of the
> algorithm. We mostly reference the (still incomplete, but less
> incomplete) version of the algorithm found in:
>
> Measuring and extending LR(1) parser generation
> Xin Chen, PhD thesis, University of Hawaii, 2009
"""
def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo:
# This function can be seen as a modified version of items() from
# Chen's dissertation.
#
# DOTY: It is also (practically) a converted version from grmtools
# into python, more or less verbatim at this point. I have some
# sense of what is going on, and attempt to elaborate with
# these comments.
# closed_states and core_states are both equally sized vectors of
# states. Core states are smaller, and used for the weakly compatible
# checks, but we ultimately need to return closed states. Closed
# states which are None are those which require processing; thus
# closed_states also implicitly serves as a todo list.
closed_states: list[ItemSet | None] = []
core_states: list[ItemSet] = []
edges: list[dict[int, int]] = []
# Convert the incoming seed configurations into item sets.
# TODO: Convert everything to ItemSet natively.
state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds})
core_states.append(state0)
closed_states.append(None)
edges.append({})
# We maintain a set of which rules and tokens we've seen; when
# processing a given state there's no point processing a rule or
# token more than once.
seen: set[int] = set()
# cnd_[rule|token]_weaklies represent which states are possible weakly
# compatible matches for a given symbol.
#
# DOTY: As with `seen`, we have a uniform space so we can have a
# uniform one of these too.
cnd_weaklies: list[list[int]] = [[] for _ in range(len(self.alphabet))]
todo = 1 # How many None values are there in closed_states?
todo_off = 0 # Offset in closed states to start searching for the next todo.
while todo > 0:
assert len(core_states) == len(closed_states)
assert len(core_states) == len(edges)
# state_i is the next item to process. We don't want to
# continually search for the next None from the beginning, so we
# remember where we last saw a None (todo_off) and search from
# that point onwards, wrapping as necessary. Since processing a
# state x disproportionately causes state x + 1 to require
# processing, this prevents the search from becoming horribly
# non-linear.
try:
state_i = closed_states.index(None, todo_off)
except ValueError:
state_i = closed_states.index(None) # DOTY: Will not raise, given todo > 0
todo_off = state_i + 1
todo -= 1
cl_state = self.gen_closure_x(core_states[state_i])
closed_states[state_i] = cl_state
seen.clear()
for core in cl_state.items.keys():
sym = core.next
if sym is None or sym in seen:
continue
seen.add(sym)
nstate = cl_state.goto(sym)
# Try and find a compatible match for this state.
cnd_states = cnd_weaklies[sym]
# First of all see if any of the candidate states are exactly
# the same as the new state, in which case we only need to
# add an edge to the candidate state. This isn't just an
# optimisation (though it does avoid the expense of change
# propagation), but has a correctness aspect: there's no
# guarantee that the weakly compatible check is reflexive
# (i.e. a state may not be weakly compatible with itself).
found = False
for cnd in cnd_states:
if core_states[cnd] == nstate:
edges[state_i][sym] = cnd
found = True
break
if found:
continue
# No candidate states were equal to the new state, so we need
# to look for a candidate state which is weakly compatible.
m: int | None = None
for cnd in cnd_states:
if core_states[cnd].weakly_compatible(nstate):
m = cnd
break
if m is not None:
# A weakly compatible match has been found.
edges[state_i][sym] = m
assert core_states[m].weakly_compatible(nstate) # TODO: REMOVE, TOO SLOW
if core_states[m].weakly_merge(nstate):
# We only do the simplest change propagation, forcing possibly
# affected sets to be entirely reprocessed (which will recursively
# force propagation too). Even though this does unnecessary
# computation, it is still pretty fast.
#
# Note also that edges[k] will be completely regenerated, overwriting
# all existing entries and possibly adding new ones. We thus don't
# need to clear it manually.
if closed_states[m] is not None:
closed_states[m] = None
todo += 1
else:
stidx = len(core_states)
cnd_weaklies[sym].append(stidx)
edges[state_i][sym] = stidx
edges.append({})
closed_states.append(None)
core_states.append(nstate)
todo += 1
# Although the Pager paper doesn't talk about it, the algorithm above
# can create unreachable states due to the non-determinism inherent
# in working with hashsets. Indeed, this can even happen with the
# example from Pager's paper (on perhaps 1 out of 100 runs, 24 or 25
# states will be created instead of 23). We thus need to weed out
# unreachable states and update edges accordingly.
assert len(core_states) == len(closed_states)
all_states = []
for core_state, closed_state in zip(core_states, closed_states):
assert closed_state is not None
all_states.append((core_state, closed_state))
gc_states, gc_edges = self.gc(all_states, edges)
# DOTY: UGH this is so bad, we should rewrite to use ItemSet everywhere
# probably, which actually means getting rid of the pluggable
# generator because who actually needs that?
# Register all the actually merged, final config sets. I should *not*
# have to do all this work. Really really garbage.
result = ConfigurationSetInfo()
result.sets = [core_state.to_config_set() for core_state, _ in gc_states]
result.core_key = {s: i for i, s in enumerate(result.sets)}
result.closures = [closed_state.to_config_set() for _, closed_state in gc_states]
result.config_set_key = {s: i for i, s in enumerate(result.closures) if s is not None}
result.successors = gc_edges
return result
def gc(
self,
states: list[tuple[ItemSet, ItemSet]],
edges: list[dict[int, int]],
) -> tuple[list[tuple[ItemSet, ItemSet]], list[dict[int, int]]]:
# First of all, do a simple pass over all states. All state indexes
# reachable from the start state will be inserted into the 'seen'
# set.
todo = [0]
seen = set()
while len(todo) > 0:
item = todo.pop()
if item in seen:
continue
seen.add(item)
todo.extend(e for e in edges[item].values() if e not in seen)
if len(seen) == len(states):
# Every state is reachable.
return states, edges
# Imagine we started with 3 states and their edges:
# states: [0, 1, 2]
# edges : [[_ => 2]]
#
# At this point, 'seen' will be the set {0, 2}. What we need to do is
# to create a new list of states that doesn't have state 1 in it.
# That will cause state 2 to become to state 1, meaning that we need
# to adjust edges so that the pointer to state 2 is updated to state
# 1. In other words we want to achieve this output:
#
# states: [0, 2]
# edges : [_ => 1]
#
# The way we do this is to first iterate over all states, working out
# what the mapping from seen states to their new offsets is.
gc_states: list[tuple[ItemSet, ItemSet]] = []
offsets: list[int] = []
offset = 0
for state_i, zstate in enumerate(states):
offsets.append(state_i - offset)
if state_i not in seen:
offset += 1
continue
gc_states.append(zstate)
# At this point the offsets list will be [0, 1, 1]. We now create new
# edges where each offset is corrected by looking it up in the
# offsets list.
gc_edges: list[dict[int, int]] = []
for st_edge_i, st_edges in enumerate(edges):
if st_edge_i not in seen:
continue
gc_edges.append({k: offsets[v] for k, v in st_edges.items()})
return (gc_states, gc_edges)
FlattenedWithMetadata = list["str|Terminal|tuple[dict[str,typing.Any],FlattenedWithMetadata]"]
@ -3009,7 +2825,7 @@ class Grammar:
"""
_precedence: dict[str, typing.Tuple[Assoc, int]]
_generator: type[GenerateLR1]
_generator: type[ParserGenerator]
_terminals: dict[str, Terminal]
_nonterminals: dict[str, NonTerminal]
_trivia: list[Terminal]
@ -3018,7 +2834,7 @@ class Grammar:
self,
start: str | NonTerminal | None = None,
precedence: PrecedenceList | None = None,
generator: type[GenerateLR1] | None = None,
generator: type[ParserGenerator] | None = None,
trivia: list[str | Terminal] | None = None,
name: str | None = None,
):
@ -3037,7 +2853,7 @@ class Grammar:
assert precedence is not None
if generator is None:
generator = getattr(self, "generator", GeneratePager)
generator = getattr(self, "generator", ParserGenerator)
assert generator is not None
if trivia is None:

View file

@ -87,8 +87,8 @@ def test_all_generators():
GENERATORS = [
# parser.GenerateLR0,
parser.GeneratePager,
parser.GenerateLR1,
# parser.GeneratePager,
parser.ParserGenerator,
]
for generator in GENERATORS:
table = G().build_table(generator=generator)
@ -119,15 +119,14 @@ def test_grammar_aho_ullman_2():
A = Terminal("a")
B = Terminal("b")
TestGrammar().build_table(generator=parser.GenerateLR1)
TestGrammar().build_table(generator=parser.GeneratePager)
TestGrammar().build_table(generator=parser.ParserGenerator)
# TestGrammar().build_table(generator=parser.GeneratePager)
def test_fun_lalr():
class TestGrammar(Grammar):
start = "S"
generator = parser.GeneratePager
@rule
def S(self):