diff --git a/parser/parser.py b/parser/parser.py index ee228ad..6faaf6e 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -135,6 +135,7 @@ import bisect import collections import dataclasses import enum +import functools import inspect import itertools import json @@ -273,100 +274,7 @@ class ConfigSet(frozenset[Configuration]): pass -# Here we have a slightly different definition of a ConfigurationSet; we keep -# the lookaheads outside and use a dictionary to check for containment -# quickly. ItemSet is used in the GRM/Pager/Chin algorithm. -@dataclasses.dataclass -class ItemSet: - """An ItemSet is a group of configuration cores together with their - "contexts", or lookahead sets. - - An ItemSet is comparable for equality, and also supports this lesser notion - of "weakly compatible" which is used to collapse states in the pager - algorithm. - """ - - items: dict[ConfigurationCore, set[int]] - - def __init__(self, items=None): - self.items = items or {} - - @classmethod - def from_config_set(cls, config_set: ConfigSet) -> "ItemSet": - return ItemSet({config.core: set(config.lookahead) for config in config_set}) - - def weakly_compatible(self, other: "ItemSet") -> bool: - a = self.items - b = other.items - - if len(a) != len(b): - return False - - for acore in a: - if acore not in b: - return False - - if len(a) == 1: - return True - - # DOTY: This loop I do not understand, truly. What the heck is happening here? - a_keys = list(a.keys()) - for i, i_key in enumerate(itertools.islice(a_keys, 0, len(a_keys) - 1)): - for j_key in itertools.islice(a_keys, i + 1, None): - a_i_key = a[i_key] - b_i_key = b[i_key] - a_j_key = a[j_key] - b_j_key = b[j_key] - - # DOTY: GRMTools written with intersects(); we don't have that we have - # `not disjoint()`. :P There are many double negatives.... 
- # - # not (intersect(a_i, b_j) or intersect(a_j, b_i)) - # not ((not disjoint(a_i, b_j)) or (not disjoint(a_j, b_i))) - # ((not not disjoint(a_i, b_j)) and (not not disjoint(a_j, b_i))) - # disjoint(a_i, b_j) and disjoint(a_j, b_i) - if a_i_key.isdisjoint(b_j_key) and a_j_key.isdisjoint(b_i_key): - continue - - # intersect(a_i, a_j) or intersect(b_i, b_j) - # (not disjoint(a_i, a_j)) or (not disjoint(b_i, b_j)) - # not (disjoint(a_i, a_j) and disjoint(b_i, b_j)) - if not (a_i_key.isdisjoint(a_j_key) and b_i_key.isdisjoint(b_j_key)): - continue - - return False - - return True - - def weakly_merge(self, other: "ItemSet") -> bool: - """Merge b into a, returning True if this lead to any changes.""" - a = self.items - b = other.items - - changed = False - for a_key, a_ctx in a.items(): - start_len = len(a_ctx) - a_ctx.update(b[a_key]) # Python doesn't tell us changes - changed = changed or (start_len != len(a_ctx)) - - return changed - - def goto(self, symbol: int) -> "ItemSet": - result = ItemSet() - for core, context in self.items.items(): - if core.next == symbol: - next = core.replace_position(core.position + 1) - result.items[next] = set(context) - return result - - def to_config_set(self) -> ConfigSet: - return ConfigSet( - {Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()} - ) - - -@dataclasses.dataclass -class StateGraph: +class ConfigurationSetInfo: """When we build a grammar into a table, the first thing we need to do is generate all the configuration sets and their successors. @@ -381,23 +289,65 @@ class StateGraph: structure, but they all compute this information.) """ - closures: list[ConfigSet] + core_key: dict[ConfigSet, int] # Map a ConfigSet into an index + config_set_key: dict[ConfigSet, int] # Map a ConfigSet into an index + sets: list[ConfigSet] # Map the index back into a set + closures: list[ConfigSet | None] # Track closures # All the sucessors for all of the sets. 
`successors[i]` is the mapping # from grammar symbol to the index of the set you get by processing that # symbol. successors: list[dict[int, int]] + def __init__(self): + self.core_key = {} + self.config_set_key = {} + self.sets = [] + self.closures = [] + self.successors = [] + + def register_core(self, c: ConfigSet) -> typing.Tuple[int, bool]: + """Potentially add a new config set to the set of sets. Returns the + canonical ID of the set within this structure, along with a boolean + indicating whether the set was just added or not. + + (You can use this integer to get the set back, if you need it, and + also access the successors table.) + """ + existing = self.core_key.get(c) + if existing is not None: + return existing, False + + index = len(self.sets) + self.sets.append(c) + self.closures.append(None) + self.successors.append({}) + self.core_key[c] = index + return index, True + + def register_config_closure(self, c_id: int, closure: ConfigSet): + assert self.closures[c_id] is None + self.closures[c_id] = closure + self.config_set_key[closure] = c_id + + def add_successor(self, c_id: int, symbol: int, successor: int): + """Register successor(`c_id`, `symbol`) -> `successor`, where c_id + is the id of the set in this structure, and symbol is the id of a + symbol in the alphabet of the grammar. 
+ """ + self.successors[c_id][symbol] = successor + def dump_state(self, alphabet: list[str]) -> str: return json.dumps( { str(set_index): { - "closures": [c.format(alphabet) for c in closure], - "successors": {alphabet[k]: str(v) for k, v in successors.items()}, + "configs": [c.format(alphabet) for c in config_set], + "closures": [c.format(alphabet) for c in self.closures[set_index] or []], + "successors": { + alphabet[k]: str(v) for k, v in self.successors[set_index].items() + }, } - for set_index, (closure, successors) in enumerate( - zip(self.closures, self.successors) - ) + for set_index, config_set in enumerate(self.sets) }, indent=4, sort_keys=True, @@ -414,8 +364,7 @@ class StateGraph: This function raises KeyError if no path is found. """ - # TODO: This should be tested. - target_index = self.closures.index(target_set) + target_index = self.config_set_key[target_set] visited = set() queue: collections.deque = collections.deque() @@ -558,7 +507,7 @@ class ErrorCollection: def gen_exception( self, alphabet: list[str], - all_sets: StateGraph, + all_sets: ConfigurationSetInfo, ) -> AmbiguityError | None: """Format all the errors into an error, or return None if there are no errors. @@ -695,7 +644,7 @@ class TableBuilder(object): self.action_row = None self.goto_row = None - def flush(self, all_sets: StateGraph) -> ParseTable: + def flush(self, all_sets: ConfigurationSetInfo) -> ParseTable: """Finish building the table and return it. Raises ValueError if there were any conflicts during construction. @@ -1058,36 +1007,108 @@ class FollowInfo: return FollowInfo(follows=follows) -class ParserGenerator: - """Generate parse tables for LR1 grammars. +# Here we have a slightly different definition of a ConfigurationSet; we keep the +# lookaheads outside and use a dictionary to check for containment quickly. +# ItemSet is used in the GRM/Pager/Chin algorithm. 
+@dataclasses.dataclass +class ItemSet: + """An ItemSet is a group of configuration cores together with their + "contexts", or lookahead sets. - This class implements a variant of pager's algorithm to generate the parse - tables, which support the same set of languages as Canonical LR1 but with - much smaller resulting parse tables. + An ItemSet is comparable for equality, and also supports this lesser notion + of "weakly compatible" which is used to collapse states in the pager + algorithm. + """ - I'll be honest, I don't understnd this one as well as the pure LR1 - algorithm. It proceeds as LR1, generating successor states, but every - time it makes a new state it searches the states it has already made for - one that is "weakly compatible;" if it finds one it merges the new state - with the old state and marks the old state to be re-visited. + items: dict[ConfigurationCore, set[int]] - The implementation here follows from the implementation in - `GRMTools`_. + def __init__(self, items=None): + self.items = items or {} - As they explain there: + @classmethod + def from_config_set(cls, config_set: ConfigSet) -> "ItemSet": + return ItemSet({config.core: set(config.lookahead) for config in config_set}) - > The general algorithms that form the basis of what's used in this file - > can be found in: - > - > A Practical General Method for Constructing LR(k) Parsers - > David Pager, Acta Informatica 7, 249--268, 1977 - > - > However Pager's paper is dense, and doesn't name sub-parts of the - > algorithm. 
We mostly reference the (still incomplete, but less - > incomplete) version of the algorithm found in: - > - > Measuring and extending LR(1) parser generation - > Xin Chen, PhD thesis, University of Hawaii, 2009 + def weakly_compatible(self, other: "ItemSet") -> bool: + a = self.items + b = other.items + + if len(a) != len(b): + return False + + for acore in a: + if acore not in b: + return False + + if len(a) == 1: + return True + + # DOTY: This loop I do not understand, truly. What the heck is happening here? + a_keys = list(a.keys()) + for i, i_key in enumerate(itertools.islice(a_keys, 0, len(a_keys) - 1)): + for j_key in itertools.islice(a_keys, i + 1, None): + a_i_key = a[i_key] + b_i_key = b[i_key] + a_j_key = a[j_key] + b_j_key = b[j_key] + + # DOTY: GRMTools written with intersects(); we don't have that we have + # `not disjoint()`. :P There are many double negatives.... + # + # not (intersect(a_i, b_j) or intersect(a_j, b_i)) + # not ((not disjoint(a_i, b_j)) or (not disjoint(a_j, b_i))) + # ((not not disjoint(a_i, b_j)) and (not not disjoint(a_j, b_i))) + # disjoint(a_i, b_j) and disjoint(a_j, b_i) + if a_i_key.isdisjoint(b_j_key) and a_j_key.isdisjoint(b_i_key): + continue + + # intersect(a_i, a_j) or intersect(b_i, b_j) + # (not disjoint(a_i, a_j)) or (not disjoint(b_i, b_j)) + # not (disjoint(a_i, a_j) and disjoint(b_i, b_j)) + if not (a_i_key.isdisjoint(a_j_key) and b_i_key.isdisjoint(b_j_key)): + continue + + return False + + return True + + def weakly_merge(self, other: "ItemSet") -> bool: + """Merge b into a, returning True if this lead to any changes.""" + a = self.items + b = other.items + + changed = False + for a_key, a_ctx in a.items(): + start_len = len(a_ctx) + a_ctx.update(b[a_key]) # Python doesn't tell us changes + changed = changed or (start_len != len(a_ctx)) + + return changed + + def goto(self, symbol: int) -> "ItemSet": + result = ItemSet() + for core, context in self.items.items(): + if core.next == symbol: + next = 
core.replace_position(core.position + 1) + result.items[next] = set(context) + return result + + def to_config_set(self) -> ConfigSet: + return ConfigSet( + {Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()} + ) + + +class GenerateLR1: + """Generate parse tables for LR1, or "canonical LR" grammars. + + LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they + are choosier about when they reduce. But unlike SLR parsers, they specify + the terminals on which they reduce by carrying a 'lookahead' terminal in + the configuration. The lookahead of a configuration is computed as the + closure of a configuration set is computed, so see gen_closure_next for + details. (Except for the start configuration, which has '$' as its + lookahead.) """ # Internally we use integers as symbols, not strings. Mostly this is fine, @@ -1150,9 +1171,9 @@ class ParserGenerator: non-terminal being added, and the second elment of the tuple is the list of terminals and non-terminals that make up the production. - There is no support for alternation. If you want alternations that - you'll have to lower the grammar by hand into the simpler form first, - but that's what the Grammar and NonTerminal classes are for. + There is currently no support for custom actions or alternation or + anything like that. If you want alternations that you'll have to lower + the grammar by hand into the simpler form first. Don't name anything with double-underscores; those are reserved for the generator. Don't add '$' either, as it is reserved to mean @@ -1252,215 +1273,105 @@ class ParserGenerator: self._firsts, ) - def gen_sets(self, seeds: list[Configuration]) -> StateGraph: - # This function can be seen as a modified version of items() from - # Chen's dissertation. - # - # DOTY: It is also (practically) a converted version from grmtools - # into python, more or less verbatim at this point. 
I have some - # sense of what is going on, and attempt to elaborate with - # these comments. + def gen_closure(self, seeds: typing.Iterable[Configuration]) -> ConfigSet: + """Compute the closure for the specified configs. The closure is all + of the configurations we could be in. Specifically, if the position + for a config is just before a non-terminal then we must also consider + configurations where the rule is the rule for the non-terminal and + the position is just before the beginning of the rule. - # closed_states and core_states are both equally sized vectors of - # states. Core states are smaller, and used for the weakly compatible - # checks, but we ultimately need to return closed states. Closed - # states which are None are those which require processing; thus - # closed_states also implicitly serves as a todo list. - closed_states: list[ItemSet | None] = [] - core_states: list[ItemSet] = [] - edges: list[dict[int, int]] = [] - - # Convert the incoming seed configurations into item sets. - # TODO: Convert everything to ItemSet natively. - state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds}) - core_states.append(state0) - closed_states.append(None) - edges.append({}) - - # We maintain a set of which rules and tokens we've seen; when - # processing a given state there's no point processing a rule or - # token more than once. - seen: set[int] = set() - - # cnd_[rule|token]_weaklies represent which states are possible weakly - # compatible matches for a given symbol. - # - # DOTY: As with `seen`, we have a uniform space so we can have a - # uniform one of these too. - cnd_weaklies: list[list[int]] = [[] for _ in range(len(self.alphabet))] - - todo = 1 # How many None values are there in closed_states? - todo_off = 0 # Offset in closed states to start searching for the next todo. - while todo > 0: - assert len(core_states) == len(closed_states) - assert len(core_states) == len(edges) - - # state_i is the next item to process. 
We don't want to - # continually search for the next None from the beginning, so we - # remember where we last saw a None (todo_off) and search from - # that point onwards, wrapping as necessary. Since processing a - # state x disproportionately causes state x + 1 to require - # processing, this prevents the search from becoming horribly - # non-linear. - try: - state_i = closed_states.index(None, todo_off) - except ValueError: - state_i = closed_states.index(None) # DOTY: Will not raise, given todo > 0 - - todo_off = state_i + 1 - todo -= 1 - - cl_state = self.gen_closure(core_states[state_i]) - closed_states[state_i] = cl_state - - seen.clear() - for core in cl_state.items.keys(): - sym = core.next - if sym is None or sym in seen: - continue - seen.add(sym) - - nstate = cl_state.goto(sym) - - # Try and find a compatible match for this state. - cnd_states = cnd_weaklies[sym] - - # First of all see if any of the candidate states are exactly - # the same as the new state, in which case we only need to - # add an edge to the candidate state. This isn't just an - # optimisation (though it does avoid the expense of change - # propagation), but has a correctness aspect: there's no - # guarantee that the weakly compatible check is reflexive - # (i.e. a state may not be weakly compatible with itself). - found = False - for cnd in cnd_states: - if core_states[cnd] == nstate: - edges[state_i][sym] = cnd - found = True - break - - if found: + (We have replaced a recursive version with an iterative one.) + """ + closure: set[Configuration] = set() + pending = list(seeds) + pending_next = [] + while len(pending) > 0: + for config in pending: + if config in closure: continue - # No candidate states were equal to the new state, so we need - # to look for a candidate state which is weakly compatible. 
- m: int | None = None - for cnd in cnd_states: - if core_states[cnd].weakly_compatible(nstate): - m = cnd - break + closure.add(config) + pending_next.extend(self.gen_closure_next(config)) - if m is not None: - # A weakly compatible match has been found. - edges[state_i][sym] = m - assert core_states[m].weakly_compatible(nstate) # TODO: REMOVE, TOO SLOW - if core_states[m].weakly_merge(nstate): - # We only do the simplest change propagation, forcing possibly - # affected sets to be entirely reprocessed (which will recursively - # force propagation too). Even though this does unnecessary - # computation, it is still pretty fast. - # - # Note also that edges[k] will be completely regenerated, overwriting - # all existing entries and possibly adding new ones. We thus don't - # need to clear it manually. - if closed_states[m] is not None: - closed_states[m] = None - todo += 1 + temp = pending + pending = pending_next + pending_next = temp + pending_next.clear() - else: - stidx = len(core_states) + # NOTE: The generation of this closure *might* have generated + # multiple cores with different lookaheads; if that's + # the case we need to merge. + merged: dict[ConfigurationCore, set[int]] = {} + for c in closure: + existing = merged.get(c.core) + if existing is not None: + existing.update(c.lookahead) + else: + merged[c.core] = set(c.lookahead) - cnd_weaklies[sym].append(stidx) - edges[state_i][sym] = stidx + return ConfigSet(Configuration(k, tuple(sorted(v))) for k, v in merged.items()) - edges.append({}) - closed_states.append(None) - core_states.append(nstate) - todo += 1 + def gen_all_successors( + self, config_set: typing.Iterable[Configuration] + ) -> list[typing.Tuple[int, ConfigSet]]: + """Return all of the non-empty successors for the given config set. - # Although the Pager paper doesn't talk about it, the algorithm above - # can create unreachable states due to the non-determinism inherent - # in working with hashsets. 
Indeed, this can even happen with the - # example from Pager's paper (on perhaps 1 out of 100 runs, 24 or 25 - # states will be created instead of 23). We thus need to weed out - # unreachable states and update edges accordingly. - assert len(core_states) == len(closed_states) + (That is, given the config set, pretend we see all the symbols we + could possibly see, and figure out which configs sets we get from + those symbols. Those are the successors of this set.) + """ + possible = {config.core.next for config in config_set if config.core.next is not None} - all_states = [] - for core_state, closed_state in zip(core_states, closed_states): - assert closed_state is not None - all_states.append((core_state, closed_state)) - gc_states, gc_edges = self.gc(all_states, edges) + next = [] + for symbol in possible: + seeds = ConfigSet( + config.replace_position(config.core.position + 1) + for config in config_set + if config.core.next == symbol + ) + if len(seeds) > 0: + next.append((symbol, seeds)) - # DOTY: UGH this is so bad, we should rewrite to use ItemSet everywehre - # probably, which actually means getting rid of the pluggable - # generator because who actually needs that? + return next - # Register all the actually merged, final config sets. I should *not* - # have to do all this work. Really really garbage. - return StateGraph( - closures=[closed_state.to_config_set() for _, closed_state in gc_states], - successors=gc_edges, - ) + def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo: + """Generate all configuration sets starting from the provided seeds.""" + result = ConfigurationSetInfo() - def gc( - self, - states: list[tuple[ItemSet, ItemSet]], - edges: list[dict[int, int]], - ) -> tuple[list[tuple[ItemSet, ItemSet]], list[dict[int, int]]]: - # First of all, do a simple pass over all states. All state indexes - # reachable from the start state will be inserted into the 'seen' - # set. 
- todo = [0] - seen = set() - while len(todo) > 0: - item = todo.pop() - if item in seen: - continue - seen.add(item) - todo.extend(e for e in edges[item].values() if e not in seen) + successors = [] + pending = [ConfigSet(seeds)] + pending_next = [] + while len(pending) > 0: + for core in pending: + id, is_new = result.register_core(core) + if is_new: + config_set = self.gen_closure(core) + result.register_config_closure(id, config_set) + for symbol, successor in self.gen_all_successors(config_set): + successors.append((id, symbol, successor)) + pending_next.append(successor) - if len(seen) == len(states): - # Every state is reachable. - return states, edges + temp = pending + pending = pending_next + pending_next = temp + pending_next.clear() - # Imagine we started with 3 states and their edges: - # states: [0, 1, 2] - # edges : [[_ => 2]] - # - # At this point, 'seen' will be the set {0, 2}. What we need to do is - # to create a new list of states that doesn't have state 1 in it. - # That will cause state 2 to become to state 1, meaning that we need - # to adjust edges so that the pointer to state 2 is updated to state - # 1. In other words we want to achieve this output: - # - # states: [0, 2] - # edges : [_ => 1] - # - # The way we do this is to first iterate over all states, working out - # what the mapping from seen states to their new offsets is. - gc_states: list[tuple[ItemSet, ItemSet]] = [] - offsets: list[int] = [] - offset = 0 - for state_i, zstate in enumerate(states): - offsets.append(state_i - offset) - if state_i not in seen: - offset += 1 - continue + for id, symbol, successor in successors: + result.add_successor(id, symbol, result.core_key[successor]) - gc_states.append(zstate) + return result - # At this point the offsets list will be [0, 1, 1]. We now create new - # edges where each offset is corrected by looking it up in the - # offsets list. 
- gc_edges: list[dict[int, int]] = [] - for st_edge_i, st_edges in enumerate(edges): - if st_edge_i not in seen: - continue + def gen_follow(self, symbol: int) -> set[int]: + """Generate the follow set for the given nonterminal. - gc_edges.append({k: offsets[v] for k, v in st_edges.items()}) + The follow set for a nonterminal is the set of terminals that can + follow the nonterminal in a valid sentence. The resulting set never + contains epsilon and is never empty, since we should always at least + ground out at '$', which is the end-of-stream marker. - return (gc_states, gc_edges) + See FollowInfo for more information on how this is determined. + """ + return self._follows.follows[symbol] def gen_first(self, symbols: typing.Iterable[int]) -> typing.Tuple[set[int], bool]: """Return the first set for a *sequence* of symbols. @@ -1483,15 +1394,45 @@ class ParserGenerator: return (result, True) - def gen_closure(self, items: ItemSet) -> ItemSet: - """Generate the closure of the given ItemSet. + def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: + """Return the set of symbols that indicate we should reduce the given + config. - Some of the configurations the ItemSet might be positioned right before - nonterminals. In that case, obviously, we should *also* behave as if we - were right at the beginning of each production for that nonterminal. The - set of all those productions combined with all the incoming productions - is the closure. + In an LR1 parser, this is the lookahead of the configuration. """ + return config.lookahead + + def gen_closure_next(self, config: Configuration): + """Return the next set of configurations in the closure for config. + + In LR1 parsers, we must compute the lookahead for the configurations + we're adding to the closure. The lookahead for the new configurations + is the first() of the rest of this config's production. If that + contains epsilon, then the lookahead *also* contains the lookahead we + already have. 
(This lookahead was presumably generated by the same + process, so in some sense it is a 'parent' lookahead, or a lookahead + from an upstream production in the grammar.) + + (See the documentation in GenerateLR0 for more information on how + this function fits into the whole process, specifically `gen_closure`.) + """ + config_next = config.core.next + if config_next is None: + return () + else: + lookahead, epsilon = self.gen_first(config.rest) + if epsilon: + lookahead.update(config.lookahead) + lookahead_tuple = tuple(sorted(lookahead)) + + next = [] + for rule in self.grammar[config_next]: + rr = Configuration.from_rule(config_next, rule, lookahead=lookahead_tuple) + next.append(rr) + + return tuple(next) + + def gen_closure_x(self, items: ItemSet) -> ItemSet: closure: dict[ConfigurationCore, set[int]] = {} # We're going to maintain a set of things to look at, rules that we @@ -1583,7 +1524,7 @@ class ParserGenerator: config_next = config.core.next if config_next is None: if config.core.name != self.start_symbol: - for a in config.lookahead: + for a in self.gen_reduce_set(config): builder.set_table_reduce(a, config) else: builder.set_table_accept(self.end_symbol, config) @@ -1600,6 +1541,249 @@ class ParserGenerator: return builder.flush(config_sets) +class GeneratePager(GenerateLR1): + """Pager's algorithm. + + I'll be honest, I don't understand this one as well as the pure LR1 + algorithm. It proceeds as LR1, generating successor states, but every + time it makes a new state it searches the states it has already made for + one that is "weakly compatible;" if it finds one it merges the new state + with the old state and marks the old state to be re-visited. + + The implementation here follows from the implementation in + `GRMTools`_. 
+ + As they explain there: + + > The general algorithms that form the basis of what's used in this file + > can be found in: + > + > A Practical General Method for Constructing LR(k) Parsers + > David Pager, Acta Informatica 7, 249--268, 1977 + > + > However Pager's paper is dense, and doesn't name sub-parts of the + > algorithm. We mostly reference the (still incomplete, but less + > incomplete) version of the algorithm found in: + > + > Measuring and extending LR(1) parser generation + > Xin Chen, PhD thesis, University of Hawaii, 2009 + """ + + def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo: + # This function can be seen as a modified version of items() from + # Chen's dissertation. + # + # DOTY: It is also (practically) a converted version from grmtools + # into python, more or less verbatim at this point. I have some + # sense of what is going on, and attempt to elaborate with + # these comments. + + # closed_states and core_states are both equally sized vectors of + # states. Core states are smaller, and used for the weakly compatible + # checks, but we ultimately need to return closed states. Closed + # states which are None are those which require processing; thus + # closed_states also implicitly serves as a todo list. + closed_states: list[ItemSet | None] = [] + core_states: list[ItemSet] = [] + edges: list[dict[int, int]] = [] + + # Convert the incoming seed configurations into item sets. + # TODO: Convert everything to ItemSet natively. + state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds}) + core_states.append(state0) + closed_states.append(None) + edges.append({}) + + # We maintain a set of which rules and tokens we've seen; when + # processing a given state there's no point processing a rule or + # token more than once. + seen: set[int] = set() + + # cnd_[rule|token]_weaklies represent which states are possible weakly + # compatible matches for a given symbol. 
+ # + # DOTY: As with `seen`, we have a uniform space so we can have a + # uniform one of these too. + cnd_weaklies: list[list[int]] = [[] for _ in range(len(self.alphabet))] + + todo = 1 # How many None values are there in closed_states? + todo_off = 0 # Offset in closed states to start searching for the next todo. + while todo > 0: + assert len(core_states) == len(closed_states) + assert len(core_states) == len(edges) + + # state_i is the next item to process. We don't want to + # continually search for the next None from the beginning, so we + # remember where we last saw a None (todo_off) and search from + # that point onwards, wrapping as necessary. Since processing a + # state x disproportionately causes state x + 1 to require + # processing, this prevents the search from becoming horribly + # non-linear. + try: + state_i = closed_states.index(None, todo_off) + except ValueError: + state_i = closed_states.index(None) # DOTY: Will not raise, given todo > 0 + + todo_off = state_i + 1 + todo -= 1 + + cl_state = self.gen_closure_x(core_states[state_i]) + closed_states[state_i] = cl_state + + seen.clear() + for core in cl_state.items.keys(): + sym = core.next + if sym is None or sym in seen: + continue + seen.add(sym) + + nstate = cl_state.goto(sym) + + # Try and find a compatible match for this state. + cnd_states = cnd_weaklies[sym] + + # First of all see if any of the candidate states are exactly + # the same as the new state, in which case we only need to + # add an edge to the candidate state. This isn't just an + # optimisation (though it does avoid the expense of change + # propagation), but has a correctness aspect: there's no + # guarantee that the weakly compatible check is reflexive + # (i.e. a state may not be weakly compatible with itself). 
+ found = False + for cnd in cnd_states: + if core_states[cnd] == nstate: + edges[state_i][sym] = cnd + found = True + break + + if found: + continue + + # No candidate states were equal to the new state, so we need + # to look for a candidate state which is weakly compatible. + m: int | None = None + for cnd in cnd_states: + if core_states[cnd].weakly_compatible(nstate): + m = cnd + break + + if m is not None: + # A weakly compatible match has been found. + edges[state_i][sym] = m + assert core_states[m].weakly_compatible(nstate) # TODO: REMOVE, TOO SLOW + if core_states[m].weakly_merge(nstate): + # We only do the simplest change propagation, forcing possibly + # affected sets to be entirely reprocessed (which will recursively + # force propagation too). Even though this does unnecessary + # computation, it is still pretty fast. + # + # Note also that edges[k] will be completely regenerated, overwriting + # all existing entries and possibly adding new ones. We thus don't + # need to clear it manually. + if closed_states[m] is not None: + closed_states[m] = None + todo += 1 + + else: + stidx = len(core_states) + + cnd_weaklies[sym].append(stidx) + edges[state_i][sym] = stidx + + edges.append({}) + closed_states.append(None) + core_states.append(nstate) + todo += 1 + + # Although the Pager paper doesn't talk about it, the algorithm above + # can create unreachable states due to the non-determinism inherent + # in working with hashsets. Indeed, this can even happen with the + # example from Pager's paper (on perhaps 1 out of 100 runs, 24 or 25 + # states will be created instead of 23). We thus need to weed out + # unreachable states and update edges accordingly. 
+ assert len(core_states) == len(closed_states) + + all_states = [] + for core_state, closed_state in zip(core_states, closed_states): + assert closed_state is not None + all_states.append((core_state, closed_state)) + gc_states, gc_edges = self.gc(all_states, edges) + + # DOTY: UGH this is so bad, we should rewrite to use ItemSet everywhere + # probably, which actually means getting rid of the pluggable + # generator because who actually needs that? + + # Register all the actually merged, final config sets. I should *not* + # have to do all this work. Really really garbage. + result = ConfigurationSetInfo() + result.sets = [core_state.to_config_set() for core_state, _ in gc_states] + result.core_key = {s: i for i, s in enumerate(result.sets)} + result.closures = [closed_state.to_config_set() for _, closed_state in gc_states] + result.config_set_key = {s: i for i, s in enumerate(result.closures) if s is not None} + result.successors = gc_edges + + return result + + def gc( + self, + states: list[tuple[ItemSet, ItemSet]], + edges: list[dict[int, int]], + ) -> tuple[list[tuple[ItemSet, ItemSet]], list[dict[int, int]]]: + # First of all, do a simple pass over all states. All state indexes + # reachable from the start state will be inserted into the 'seen' + # set. + todo = [0] + seen = set() + while len(todo) > 0: + item = todo.pop() + if item in seen: + continue + seen.add(item) + todo.extend(e for e in edges[item].values() if e not in seen) + + if len(seen) == len(states): + # Every state is reachable. + return states, edges + + # Imagine we started with 3 states and their edges: + # states: [0, 1, 2] + # edges : [[_ => 2]] + # + # At this point, 'seen' will be the set {0, 2}. What we need to do is + # to create a new list of states that doesn't have state 1 in it. + # That will cause state 2 to become state 1, meaning that we need + # to adjust edges so that the pointer to state 2 is updated to state + # 1. 
In other words we want to achieve this output: + # + # states: [0, 2] + # edges : [_ => 1] + # + # The way we do this is to first iterate over all states, working out + # what the mapping from seen states to their new offsets is. + gc_states: list[tuple[ItemSet, ItemSet]] = [] + offsets: list[int] = [] + offset = 0 + for state_i, zstate in enumerate(states): + offsets.append(state_i - offset) + if state_i not in seen: + offset += 1 + continue + + gc_states.append(zstate) + + # At this point the offsets list will be [0, 1, 1]. We now create new + # edges where each offset is corrected by looking it up in the + # offsets list. + gc_edges: list[dict[int, int]] = [] + for st_edge_i, st_edges in enumerate(edges): + if st_edge_i not in seen: + continue + + gc_edges.append({k: offsets[v] for k, v in st_edges.items()}) + + return (gc_states, gc_edges) + + FlattenedWithMetadata = list["str|Terminal|tuple[dict[str,typing.Any],FlattenedWithMetadata]"] @@ -2825,7 +3009,7 @@ class Grammar: """ _precedence: dict[str, typing.Tuple[Assoc, int]] - _generator: type[ParserGenerator] + _generator: type[GenerateLR1] _terminals: dict[str, Terminal] _nonterminals: dict[str, NonTerminal] _trivia: list[Terminal] @@ -2834,7 +3018,7 @@ class Grammar: self, start: str | NonTerminal | None = None, precedence: PrecedenceList | None = None, - generator: type[ParserGenerator] | None = None, + generator: type[GenerateLR1] | None = None, trivia: list[str | Terminal] | None = None, name: str | None = None, ): @@ -2853,7 +3037,7 @@ class Grammar: assert precedence is not None if generator is None: - generator = getattr(self, "generator", ParserGenerator) + generator = getattr(self, "generator", GeneratePager) assert generator is not None if trivia is None: diff --git a/tests/test_grammar.py b/tests/test_grammar.py index c12380b..870e5b8 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -87,8 +87,8 @@ def test_all_generators(): GENERATORS = [ # parser.GenerateLR0, - # 
parser.GeneratePager, - parser.ParserGenerator, + parser.GeneratePager, + parser.GenerateLR1, ] for generator in GENERATORS: table = G().build_table(generator=generator) @@ -119,14 +119,15 @@ def test_grammar_aho_ullman_2(): A = Terminal("a") B = Terminal("b") - TestGrammar().build_table(generator=parser.ParserGenerator) - # TestGrammar().build_table(generator=parser.GeneratePager) + TestGrammar().build_table(generator=parser.GenerateLR1) + TestGrammar().build_table(generator=parser.GeneratePager) def test_fun_lalr(): class TestGrammar(Grammar): start = "S" + generator = parser.GeneratePager @rule def S(self):