diff --git a/parser/parser.py b/parser/parser.py index 6faaf6e..ee228ad 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -135,7 +135,6 @@ import bisect import collections import dataclasses import enum -import functools import inspect import itertools import json @@ -274,7 +273,100 @@ class ConfigSet(frozenset[Configuration]): pass -class ConfigurationSetInfo: +# Here we have a slightly different definition of a ConfigurationSet; we keep +# the lookaheads outside and use a dictionary to check for containment +# quickly. ItemSet is used in the GRM/Pager/Chin algorithm. +@dataclasses.dataclass +class ItemSet: + """An ItemSet is a group of configuration cores together with their + "contexts", or lookahead sets. + + An ItemSet is comparable for equality, and also supports this lesser notion + of "weakly compatible" which is used to collapse states in the pager + algorithm. + """ + + items: dict[ConfigurationCore, set[int]] + + def __init__(self, items=None): + self.items = items or {} + + @classmethod + def from_config_set(cls, config_set: ConfigSet) -> "ItemSet": + return ItemSet({config.core: set(config.lookahead) for config in config_set}) + + def weakly_compatible(self, other: "ItemSet") -> bool: + a = self.items + b = other.items + + if len(a) != len(b): + return False + + for acore in a: + if acore not in b: + return False + + if len(a) == 1: + return True + + # DOTY: This loop I do not understand, truly. What the heck is happening here? + a_keys = list(a.keys()) + for i, i_key in enumerate(itertools.islice(a_keys, 0, len(a_keys) - 1)): + for j_key in itertools.islice(a_keys, i + 1, None): + a_i_key = a[i_key] + b_i_key = b[i_key] + a_j_key = a[j_key] + b_j_key = b[j_key] + + # DOTY: GRMTools written with intersects(); we don't have that we have + # `not disjoint()`. :P There are many double negatives.... + # + # not (intersect(a_i, b_j) or intersect(a_j, b_i)) + # not ((not disjoint(a_i, b_j)) or (not disjoint(a_j, b_i))) + # ((not not disjoint(a_i, b_j)) and (not not disjoint(a_j, b_i))) + # disjoint(a_i, b_j) and disjoint(a_j, b_i) + if a_i_key.isdisjoint(b_j_key) and a_j_key.isdisjoint(b_i_key): + continue + + # intersect(a_i, a_j) or intersect(b_i, b_j) + # (not disjoint(a_i, a_j)) or (not disjoint(b_i, b_j)) + # not (disjoint(a_i, a_j) and disjoint(b_i, b_j)) + if not (a_i_key.isdisjoint(a_j_key) and b_i_key.isdisjoint(b_j_key)): + continue + + return False + + return True + + def weakly_merge(self, other: "ItemSet") -> bool: + """Merge b into a, returning True if this lead to any changes.""" + a = self.items + b = other.items + + changed = False + for a_key, a_ctx in a.items(): + start_len = len(a_ctx) + a_ctx.update(b[a_key]) # Python doesn't tell us changes + changed = changed or (start_len != len(a_ctx)) + + return changed + + def goto(self, symbol: int) -> "ItemSet": + result = ItemSet() + for core, context in self.items.items(): + if core.next == symbol: + next = core.replace_position(core.position + 1) + result.items[next] = set(context) + return result + + def to_config_set(self) -> ConfigSet: + return ConfigSet( + {Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()} + ) + + +@dataclasses.dataclass +class StateGraph: """When we build a grammar into a table, the first thing we need to do is generate all the configuration sets and their successors. @@ -289,65 +381,23 @@ class ConfigurationSetInfo: structure, but they all compute this information.) """ - core_key: dict[ConfigSet, int] # Map a ConfigSet into am index - config_set_key: dict[ConfigSet, int] # Map a ConfigSet into am index - sets: list[ConfigSet] # Map the index back into a set - closures: list[ConfigSet | None] # Track closures + closures: list[ConfigSet] # All the sucessors for all of the sets. `successors[i]` is the mapping # from grammar symbol to the index of the set you get by processing that # symbol. successors: list[dict[int, int]] - def __init__(self): - self.core_key = {} - self.config_set_key = {} - self.sets = [] - self.closures = [] - self.successors = [] - - def register_core(self, c: ConfigSet) -> typing.Tuple[int, bool]: - """Potentially add a new config set to the set of sets. Returns the - canonical ID of the set within this structure, along with a boolean - indicating whether the set was just added or not. - - (You can use this integer to get the set back, if you need it, and - also access the successors table.) - """ - existing = self.core_key.get(c) - if existing is not None: - return existing, False - - index = len(self.sets) - self.sets.append(c) - self.closures.append(None) - self.successors.append({}) - self.core_key[c] = index - return index, True - - def register_config_closure(self, c_id: int, closure: ConfigSet): - assert self.closures[c_id] is None - self.closures[c_id] = closure - self.config_set_key[closure] = c_id - - def add_successor(self, c_id: int, symbol: int, successor: int): - """Register sucessor(`c_id`, `symbol`) -> `successor`, where c_id - is the id of the set in this structure, and symbol is the id of a - symbol in the alphabet of the grammar. - """ - self.successors[c_id][symbol] = successor - def dump_state(self, alphabet: list[str]) -> str: return json.dumps( { str(set_index): { - "configs": [c.format(alphabet) for c in config_set], - "closures": [c.format(alphabet) for c in self.closures[set_index] or []], - "successors": { - alphabet[k]: str(v) for k, v in self.successors[set_index].items() - }, + "closures": [c.format(alphabet) for c in closure], + "successors": {alphabet[k]: str(v) for k, v in successors.items()}, } - for set_index, config_set in enumerate(self.sets) + for set_index, (closure, successors) in enumerate( + zip(self.closures, self.successors) + ) }, indent=4, sort_keys=True, @@ -364,7 +414,8 @@ class ConfigurationSetInfo: This function raises KeyError if no path is found. """ - target_index = self.config_set_key[target_set] + # TODO: This should be tested. + target_index = self.closures.index(target_set) visited = set() queue: collections.deque = collections.deque() @@ -507,7 +558,7 @@ class ErrorCollection: def gen_exception( self, alphabet: list[str], - all_sets: ConfigurationSetInfo, + all_sets: StateGraph, ) -> AmbiguityError | None: """Format all the errors into an error, or return None if there are no errors. @@ -644,7 +695,7 @@ class TableBuilder(object): self.action_row = None self.goto_row = None - def flush(self, all_sets: ConfigurationSetInfo) -> ParseTable: + def flush(self, all_sets: StateGraph) -> ParseTable: """Finish building the table and return it. Raises ValueError if there were any conflicts during construction. @@ -1007,108 +1058,36 @@ class FollowInfo: return FollowInfo(follows=follows) -# Here we have a slightly different definition of a ConfigurationSet; we keep the -# lookaheads outside and use a dictionary to check for containment quickly. -# ItemSet is used in the GRM/Pager/Chin algorithm. -@dataclasses.dataclass -class ItemSet: - """An ItemSet is a group of configuration cores together with their - "contexts", or lookahead sets. +class ParserGenerator: + """Generate parse tables for LR1 grammars. - An ItemSet is comparable for equality, and also supports this lesser notion - of "weakly compatible" which is used to collapse states in the pager - algorithm. - """ + This class implements a variant of pager's algorithm to generate the parse + tables, which support the same set of languages as Canonical LR1 but with + much smaller resulting parse tables. - items: dict[ConfigurationCore, set[int]] + I'll be honest, I don't understnd this one as well as the pure LR1 + algorithm. It proceeds as LR1, generating successor states, but every + time it makes a new state it searches the states it has already made for + one that is "weakly compatible;" if it finds one it merges the new state + with the old state and marks the old state to be re-visited. - def __init__(self, items=None): - self.items = items or {} + The implementation here follows from the implementation in + `GRMTools`_. - @classmethod - def from_config_set(cls, config_set: ConfigSet) -> "ItemSet": - return ItemSet({config.core: set(config.lookahead) for config in config_set}) + As they explain there: - def weakly_compatible(self, other: "ItemSet") -> bool: - a = self.items - b = other.items - - if len(a) != len(b): - return False - - for acore in a: - if acore not in b: - return False - - if len(a) == 1: - return True - - # DOTY: This loop I do not understand, truly. What the heck is happening here? - a_keys = list(a.keys()) - for i, i_key in enumerate(itertools.islice(a_keys, 0, len(a_keys) - 1)): - for j_key in itertools.islice(a_keys, i + 1, None): - a_i_key = a[i_key] - b_i_key = b[i_key] - a_j_key = a[j_key] - b_j_key = b[j_key] - - # DOTY: GRMTools written with intersects(); we don't have that we have - # `not disjoint()`. :P There are many double negatives.... - # - # not (intersect(a_i, b_j) or intersect(a_j, b_i)) - # not ((not disjoint(a_i, b_j)) or (not disjoint(a_j, b_i))) - # ((not not disjoint(a_i, b_j)) and (not not disjoint(a_j, b_i))) - # disjoint(a_i, b_j) and disjoint(a_j, b_i) - if a_i_key.isdisjoint(b_j_key) and a_j_key.isdisjoint(b_i_key): - continue - - # intersect(a_i, a_j) or intersect(b_i, b_j) - # (not disjoint(a_i, a_j)) or (not disjoint(b_i, b_j)) - # not (disjoint(a_i, a_j) and disjoint(b_i, b_j)) - if not (a_i_key.isdisjoint(a_j_key) and b_i_key.isdisjoint(b_j_key)): - continue - - return False - - return True - - def weakly_merge(self, other: "ItemSet") -> bool: - """Merge b into a, returning True if this lead to any changes.""" - a = self.items - b = other.items - - changed = False - for a_key, a_ctx in a.items(): - start_len = len(a_ctx) - a_ctx.update(b[a_key]) # Python doesn't tell us changes - changed = changed or (start_len != len(a_ctx)) - - return changed - - def goto(self, symbol: int) -> "ItemSet": - result = ItemSet() - for core, context in self.items.items(): - if core.next == symbol: - next = core.replace_position(core.position + 1) - result.items[next] = set(context) - return result - - def to_config_set(self) -> ConfigSet: - return ConfigSet( - {Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()} - ) - - -class GenerateLR1: - """Generate parse tables for LR1, or "canonical LR" grammars. - - LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they - are choosier about when they reduce. But unlike SLR parsers, they specify - the terminals on which they reduce by carrying a 'lookahead' terminal in - the configuration. The lookahead of a configuration is computed as the - closure of a configuration set is computed, so see gen_closure_next for - details. (Except for the start configuration, which has '$' as its - lookahead.) + > The general algorithms that form the basis of what's used in this file + > can be found in: + > + > A Practical General Method for Constructing LR(k) Parsers + > David Pager, Acta Informatica 7, 249--268, 1977 + > + > However Pager's paper is dense, and doesn't name sub-parts of the + > algorithm. We mostly reference the (still incomplete, but less + > incomplete) version of the algorithm found in: + > + > Measuring and extending LR(1) parser generation + > Xin Chen, PhD thesis, University of Hawaii, 2009 """ # Internally we use integers as symbols, not strings. Mostly this is fine, @@ -1171,9 +1150,9 @@ class GenerateLR1: non-terminal being added, and the second elment of the tuple is the list of terminals and non-terminals that make up the production. - There is currently no support for custom actions or alternation or - anything like that. If you want alternations that you'll have to lower - the grammar by hand into the simpler form first. + There is no support for alternation. If you want alternations that + you'll have to lower the grammar by hand into the simpler form first, + but that's what the Grammar and NonTerminal classes are for. Don't name anything with double-underscores; those are reserved for the generator. Don't add '$' either, as it is reserved to mean @@ -1273,105 +1252,215 @@ class GenerateLR1: self._firsts, ) - def gen_closure(self, seeds: typing.Iterable[Configuration]) -> ConfigSet: - """Compute the closure for the specified configs. The closure is all - of the configurations we could be in. Specifically, if the position - for a config is just before a non-terminal then we must also consider - configurations where the rule is the rule for the non-terminal and - the position is just before the beginning of the rule. + def gen_sets(self, seeds: list[Configuration]) -> StateGraph: + # This function can be seen as a modified version of items() from + # Chen's dissertation. + # + # DOTY: It is also (practically) a converted version from grmtools + # into python, more or less verbatim at this point. I have some + # sense of what is going on, and attempt to elaborate with + # these comments. - (We have replaced a recursive version with an iterative one.) - """ - closure: set[Configuration] = set() - pending = list(seeds) - pending_next = [] - while len(pending) > 0: - for config in pending: - if config in closure: + # closed_states and core_states are both equally sized vectors of + # states. Core states are smaller, and used for the weakly compatible + # checks, but we ultimately need to return closed states. Closed + # states which are None are those which require processing; thus + # closed_states also implicitly serves as a todo list. + closed_states: list[ItemSet | None] = [] + core_states: list[ItemSet] = [] + edges: list[dict[int, int]] = [] + + # Convert the incoming seed configurations into item sets. + # TODO: Convert everything to ItemSet natively. + state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds}) + core_states.append(state0) + closed_states.append(None) + edges.append({}) + + # We maintain a set of which rules and tokens we've seen; when + # processing a given state there's no point processing a rule or + # token more than once. + seen: set[int] = set() + + # cnd_[rule|token]_weaklies represent which states are possible weakly + # compatible matches for a given symbol. + # + # DOTY: As with `seen`, we have a uniform space so we can have a + # uniform one of these too. + cnd_weaklies: list[list[int]] = [[] for _ in range(len(self.alphabet))] + + todo = 1 # How many None values are there in closed_states? + todo_off = 0 # Offset in closed states to start searching for the next todo. + while todo > 0: + assert len(core_states) == len(closed_states) + assert len(core_states) == len(edges) + + # state_i is the next item to process. We don't want to + # continually search for the next None from the beginning, so we + # remember where we last saw a None (todo_off) and search from + # that point onwards, wrapping as necessary. Since processing a + # state x disproportionately causes state x + 1 to require + # processing, this prevents the search from becoming horribly + # non-linear. + try: + state_i = closed_states.index(None, todo_off) + except ValueError: + state_i = closed_states.index(None) # DOTY: Will not raise, given todo > 0 + + todo_off = state_i + 1 + todo -= 1 + + cl_state = self.gen_closure(core_states[state_i]) + closed_states[state_i] = cl_state + + seen.clear() + for core in cl_state.items.keys(): + sym = core.next + if sym is None or sym in seen: + continue + seen.add(sym) + + nstate = cl_state.goto(sym) + + # Try and find a compatible match for this state. + cnd_states = cnd_weaklies[sym] + + # First of all see if any of the candidate states are exactly + # the same as the new state, in which case we only need to + # add an edge to the candidate state. This isn't just an + # optimisation (though it does avoid the expense of change + # propagation), but has a correctness aspect: there's no + # guarantee that the weakly compatible check is reflexive + # (i.e. a state may not be weakly compatible with itself). + found = False + for cnd in cnd_states: + if core_states[cnd] == nstate: + edges[state_i][sym] = cnd + found = True + break + + if found: continue - closure.add(config) - pending_next.extend(self.gen_closure_next(config)) + # No candidate states were equal to the new state, so we need + # to look for a candidate state which is weakly compatible. + m: int | None = None + for cnd in cnd_states: + if core_states[cnd].weakly_compatible(nstate): + m = cnd + break - temp = pending - pending = pending_next - pending_next = temp - pending_next.clear() + if m is not None: + # A weakly compatible match has been found. + edges[state_i][sym] = m + assert core_states[m].weakly_compatible(nstate) # TODO: REMOVE, TOO SLOW + if core_states[m].weakly_merge(nstate): + # We only do the simplest change propagation, forcing possibly + # affected sets to be entirely reprocessed (which will recursively + # force propagation too). Even though this does unnecessary + # computation, it is still pretty fast. + # + # Note also that edges[k] will be completely regenerated, overwriting + # all existing entries and possibly adding new ones. We thus don't + # need to clear it manually. + if closed_states[m] is not None: + closed_states[m] = None + todo += 1 - # NOTE: The generation of this closure *might* have generated - # multiple cores with different lookaheads; if that's - # the case we need to merge. - merged: dict[ConfigurationCore, set[int]] = {} - for c in closure: - existing = merged.get(c.core) - if existing is not None: - existing.update(c.lookahead) - else: - merged[c.core] = set(c.lookahead) + else: + stidx = len(core_states) - return ConfigSet(Configuration(k, tuple(sorted(v))) for k, v in merged.items()) + cnd_weaklies[sym].append(stidx) + edges[state_i][sym] = stidx - def gen_all_successors( - self, config_set: typing.Iterable[Configuration] - ) -> list[typing.Tuple[int, ConfigSet]]: - """Return all of the non-empty successors for the given config set. + edges.append({}) + closed_states.append(None) + core_states.append(nstate) + todo += 1 - (That is, given the config set, pretend we see all the symbols we - could possibly see, and figure out which configs sets we get from - those symbols. Those are the successors of this set.) - """ - possible = {config.core.next for config in config_set if config.core.next is not None} + # Although the Pager paper doesn't talk about it, the algorithm above + # can create unreachable states due to the non-determinism inherent + # in working with hashsets. Indeed, this can even happen with the + # example from Pager's paper (on perhaps 1 out of 100 runs, 24 or 25 + # states will be created instead of 23). We thus need to weed out + # unreachable states and update edges accordingly. + assert len(core_states) == len(closed_states) - next = [] - for symbol in possible: - seeds = ConfigSet( - config.replace_position(config.core.position + 1) - for config in config_set - if config.core.next == symbol - ) - if len(seeds) > 0: - next.append((symbol, seeds)) + all_states = [] + for core_state, closed_state in zip(core_states, closed_states): + assert closed_state is not None + all_states.append((core_state, closed_state)) + gc_states, gc_edges = self.gc(all_states, edges) - return next + # DOTY: UGH this is so bad, we should rewrite to use ItemSet everywehre + # probably, which actually means getting rid of the pluggable + # generator because who actually needs that? - def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo: - """Generate all configuration sets starting from the provided seeds.""" - result = ConfigurationSetInfo() + # Register all the actually merged, final config sets. I should *not* + # have to do all this work. Really really garbage. + return StateGraph( + closures=[closed_state.to_config_set() for _, closed_state in gc_states], + successors=gc_edges, + ) - successors = [] - pending = [ConfigSet(seeds)] - pending_next = [] - while len(pending) > 0: - for core in pending: - id, is_new = result.register_core(core) - if is_new: - config_set = self.gen_closure(core) - result.register_config_closure(id, config_set) - for symbol, successor in self.gen_all_successors(config_set): - successors.append((id, symbol, successor)) - pending_next.append(successor) + def gc( + self, + states: list[tuple[ItemSet, ItemSet]], + edges: list[dict[int, int]], + ) -> tuple[list[tuple[ItemSet, ItemSet]], list[dict[int, int]]]: + # First of all, do a simple pass over all states. All state indexes + # reachable from the start state will be inserted into the 'seen' + # set. + todo = [0] + seen = set() + while len(todo) > 0: + item = todo.pop() + if item in seen: + continue + seen.add(item) + todo.extend(e for e in edges[item].values() if e not in seen) - temp = pending - pending = pending_next - pending_next = temp - pending_next.clear() + if len(seen) == len(states): + # Every state is reachable. + return states, edges - for id, symbol, successor in successors: - result.add_successor(id, symbol, result.core_key[successor]) + # Imagine we started with 3 states and their edges: + # states: [0, 1, 2] + # edges : [[_ => 2]] + # + # At this point, 'seen' will be the set {0, 2}. What we need to do is + # to create a new list of states that doesn't have state 1 in it. + # That will cause state 2 to become to state 1, meaning that we need + # to adjust edges so that the pointer to state 2 is updated to state + # 1. In other words we want to achieve this output: + # + # states: [0, 2] + # edges : [_ => 1] + # + # The way we do this is to first iterate over all states, working out + # what the mapping from seen states to their new offsets is. + gc_states: list[tuple[ItemSet, ItemSet]] = [] + offsets: list[int] = [] + offset = 0 + for state_i, zstate in enumerate(states): + offsets.append(state_i - offset) + if state_i not in seen: + offset += 1 + continue - return result + gc_states.append(zstate) - def gen_follow(self, symbol: int) -> set[int]: - """Generate the follow set for the given nonterminal. + # At this point the offsets list will be [0, 1, 1]. We now create new + # edges where each offset is corrected by looking it up in the + # offsets list. + gc_edges: list[dict[int, int]] = [] + for st_edge_i, st_edges in enumerate(edges): + if st_edge_i not in seen: + continue - The follow set for a nonterminal is the set of terminals that can - follow the nonterminal in a valid sentence. The resulting set never - contains epsilon and is never empty, since we should always at least - ground out at '$', which is the end-of-stream marker. + gc_edges.append({k: offsets[v] for k, v in st_edges.items()}) - See FollowInfo for more information on how this is determined. - """ - return self._follows.follows[symbol] + return (gc_states, gc_edges) def gen_first(self, symbols: typing.Iterable[int]) -> typing.Tuple[set[int], bool]: """Return the first set for a *sequence* of symbols. @@ -1394,45 +1483,15 @@ class GenerateLR1: return (result, True) - def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: - """Return the set of symbols that indicate we should reduce the given - config. + def gen_closure(self, items: ItemSet) -> ItemSet: + """Generate the closure of the given ItemSet. - In an LR1 parser, this is the lookahead of the configuration. + Some of the configurations the ItemSet might be positioned right before + nonterminals. In that case, obviously, we should *also* behave as if we + were right at the beginning of each production for that nonterminal. The + set of all those productions combined with all the incoming productions + is the closure. """ - return config.lookahead - - def gen_closure_next(self, config: Configuration): - """Return the next set of configurations in the closure for config. - - In LR1 parsers, we must compute the lookahead for the configurations - we're adding to the closure. The lookahead for the new configurations - is the first() of the rest of this config's production. If that - contains epsilon, then the lookahead *also* contains the lookahead we - already have. (This lookahead was presumably generated by the same - process, so in some sense it is a 'parent' lookahead, or a lookahead - from an upstream production in the grammar.) - - (See the documentation in GenerateLR0 for more information on how - this function fits into the whole process, specifically `gen_closure`.) - """ - config_next = config.core.next - if config_next is None: - return () - else: - lookahead, epsilon = self.gen_first(config.rest) - if epsilon: - lookahead.update(config.lookahead) - lookahead_tuple = tuple(sorted(lookahead)) - - next = [] - for rule in self.grammar[config_next]: - rr = Configuration.from_rule(config_next, rule, lookahead=lookahead_tuple) - next.append(rr) - - return tuple(next) - - def gen_closure_x(self, items: ItemSet) -> ItemSet: closure: dict[ConfigurationCore, set[int]] = {} # We're going to maintain a set of things to look at, rules that we @@ -1524,7 +1583,7 @@ class GenerateLR1: config_next = config.core.next if config_next is None: if config.core.name != self.start_symbol: - for a in self.gen_reduce_set(config): + for a in config.lookahead: builder.set_table_reduce(a, config) else: builder.set_table_accept(self.end_symbol, config) @@ -1541,249 +1600,6 @@ class GenerateLR1: return builder.flush(config_sets) -class GeneratePager(GenerateLR1): - """Pager's algorithm. - - I'll be honest, I don't understnd this one as well as the pure LR1 - algorithm. It proceeds as LR1, generating successor states, but every - time it makes a new state it searches the states it has already made for - one that is "weakly compatible;" ifit finds one it merges the new state - with the old state and marks the old state to be re-visited. - - The implementation here follows from the implementation in - `GRMTools`_. - - As they explain there: - - > The general algorithms that form the basis of what's used in this file - > can be found in: - > - > A Practical General Method for Constructing LR(k) Parsers - > David Pager, Acta Informatica 7, 249--268, 1977 - > - > However Pager's paper is dense, and doesn't name sub-parts of the - > algorithm. We mostly reference the (still incomplete, but less - > incomplete) version of the algorithm found in: - > - > Measuring and extending LR(1) parser generation - > Xin Chen, PhD thesis, University of Hawaii, 2009 - """ - - def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo: - # This function can be seen as a modified version of items() from - # Chen's dissertation. - # - # DOTY: It is also (practically) a converted version from grmtools - # into python, more or less verbatim at this point. I have some - # sense of what is going on, and attempt to elaborate with - # these comments. - - # closed_states and core_states are both equally sized vectors of - # states. Core states are smaller, and used for the weakly compatible - # checks, but we ultimately need to return closed states. Closed - # states which are None are those which require processing; thus - # closed_states also implicitly serves as a todo list. - closed_states: list[ItemSet | None] = [] - core_states: list[ItemSet] = [] - edges: list[dict[int, int]] = [] - - # Convert the incoming seed configurations into item sets. - # TODO: Convert everything to ItemSet natively. - state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds}) - core_states.append(state0) - closed_states.append(None) - edges.append({}) - - # We maintain a set of which rules and tokens we've seen; when - # processing a given state there's no point processing a rule or - # token more than once. - seen: set[int] = set() - - # cnd_[rule|token]_weaklies represent which states are possible weakly - # compatible matches for a given symbol. - # - # DOTY: As with `seen`, we have a uniform space so we can have a - # uniform one of these too. - cnd_weaklies: list[list[int]] = [[] for _ in range(len(self.alphabet))] - - todo = 1 # How many None values are there in closed_states? - todo_off = 0 # Offset in closed states to start searching for the next todo. - while todo > 0: - assert len(core_states) == len(closed_states) - assert len(core_states) == len(edges) - - # state_i is the next item to process. We don't want to - # continually search for the next None from the beginning, so we - # remember where we last saw a None (todo_off) and search from - # that point onwards, wrapping as necessary. Since processing a - # state x disproportionately causes state x + 1 to require - # processing, this prevents the search from becoming horribly - # non-linear. - try: - state_i = closed_states.index(None, todo_off) - except ValueError: - state_i = closed_states.index(None) # DOTY: Will not raise, given todo > 0 - - todo_off = state_i + 1 - todo -= 1 - - cl_state = self.gen_closure_x(core_states[state_i]) - closed_states[state_i] = cl_state - - seen.clear() - for core in cl_state.items.keys(): - sym = core.next - if sym is None or sym in seen: - continue - seen.add(sym) - - nstate = cl_state.goto(sym) - - # Try and find a compatible match for this state. - cnd_states = cnd_weaklies[sym] - - # First of all see if any of the candidate states are exactly - # the same as the new state, in which case we only need to - # add an edge to the candidate state. This isn't just an - # optimisation (though it does avoid the expense of change - # propagation), but has a correctness aspect: there's no - # guarantee that the weakly compatible check is reflexive - # (i.e. a state may not be weakly compatible with itself). - found = False - for cnd in cnd_states: - if core_states[cnd] == nstate: - edges[state_i][sym] = cnd - found = True - break - - if found: - continue - - # No candidate states were equal to the new state, so we need - # to look for a candidate state which is weakly compatible. - m: int | None = None - for cnd in cnd_states: - if core_states[cnd].weakly_compatible(nstate): - m = cnd - break - - if m is not None: - # A weakly compatible match has been found. - edges[state_i][sym] = m - assert core_states[m].weakly_compatible(nstate) # TODO: REMOVE, TOO SLOW - if core_states[m].weakly_merge(nstate): - # We only do the simplest change propagation, forcing possibly - # affected sets to be entirely reprocessed (which will recursively - # force propagation too). Even though this does unnecessary - # computation, it is still pretty fast. - # - # Note also that edges[k] will be completely regenerated, overwriting - # all existing entries and possibly adding new ones. We thus don't - # need to clear it manually. - if closed_states[m] is not None: - closed_states[m] = None - todo += 1 - - else: - stidx = len(core_states) - - cnd_weaklies[sym].append(stidx) - edges[state_i][sym] = stidx - - edges.append({}) - closed_states.append(None) - core_states.append(nstate) - todo += 1 - - # Although the Pager paper doesn't talk about it, the algorithm above - # can create unreachable states due to the non-determinism inherent - # in working with hashsets. Indeed, this can even happen with the - # example from Pager's paper (on perhaps 1 out of 100 runs, 24 or 25 - # states will be created instead of 23). We thus need to weed out - # unreachable states and update edges accordingly. - assert len(core_states) == len(closed_states) - - all_states = [] - for core_state, closed_state in zip(core_states, closed_states): - assert closed_state is not None - all_states.append((core_state, closed_state)) - gc_states, gc_edges = self.gc(all_states, edges) - - # DOTY: UGH this is so bad, we should rewrite to use ItemSet everywehre - # probably, which actually means getting rid of the pluggable - # generator because who actually needs that? - - # Register all the actually merged, final config sets. I should *not* - # have to do all this work. Really really garbage. - result = ConfigurationSetInfo() - result.sets = [core_state.to_config_set() for core_state, _ in gc_states] - result.core_key = {s: i for i, s in enumerate(result.sets)} - result.closures = [closed_state.to_config_set() for _, closed_state in gc_states] - result.config_set_key = {s: i for i, s in enumerate(result.closures) if s is not None} - result.successors = gc_edges - - return result - - def gc( - self, - states: list[tuple[ItemSet, ItemSet]], - edges: list[dict[int, int]], - ) -> tuple[list[tuple[ItemSet, ItemSet]], list[dict[int, int]]]: - # First of all, do a simple pass over all states. All state indexes - # reachable from the start state will be inserted into the 'seen' - # set. - todo = [0] - seen = set() - while len(todo) > 0: - item = todo.pop() - if item in seen: - continue - seen.add(item) - todo.extend(e for e in edges[item].values() if e not in seen) - - if len(seen) == len(states): - # Every state is reachable. - return states, edges - - # Imagine we started with 3 states and their edges: - # states: [0, 1, 2] - # edges : [[_ => 2]] - # - # At this point, 'seen' will be the set {0, 2}. What we need to do is - # to create a new list of states that doesn't have state 1 in it. - # That will cause state 2 to become to state 1, meaning that we need - # to adjust edges so that the pointer to state 2 is updated to state - # 1. In other words we want to achieve this output: - # - # states: [0, 2] - # edges : [_ => 1] - # - # The way we do this is to first iterate over all states, working out - # what the mapping from seen states to their new offsets is. - gc_states: list[tuple[ItemSet, ItemSet]] = [] - offsets: list[int] = [] - offset = 0 - for state_i, zstate in enumerate(states): - offsets.append(state_i - offset) - if state_i not in seen: - offset += 1 - continue - - gc_states.append(zstate) - - # At this point the offsets list will be [0, 1, 1]. We now create new - # edges where each offset is corrected by looking it up in the - # offsets list. - gc_edges: list[dict[int, int]] = [] - for st_edge_i, st_edges in enumerate(edges): - if st_edge_i not in seen: - continue - - gc_edges.append({k: offsets[v] for k, v in st_edges.items()}) - - return (gc_states, gc_edges) - - FlattenedWithMetadata = list["str|Terminal|tuple[dict[str,typing.Any],FlattenedWithMetadata]"] @@ -3009,7 +2825,7 @@ class Grammar: """ _precedence: dict[str, typing.Tuple[Assoc, int]] - _generator: type[GenerateLR1] + _generator: type[ParserGenerator] _terminals: dict[str, Terminal] _nonterminals: dict[str, NonTerminal] _trivia: list[Terminal] @@ -3018,7 +2834,7 @@ class Grammar: self, start: str | NonTerminal | None = None, precedence: PrecedenceList | None = None, - generator: type[GenerateLR1] | None = None, + generator: type[ParserGenerator] | None = None, trivia: list[str | Terminal] | None = None, name: str | None = None, ): @@ -3037,7 +2853,7 @@ class Grammar: assert precedence is not None if generator is None: - generator = getattr(self, "generator", GeneratePager) + generator = getattr(self, "generator", ParserGenerator) assert generator is not None if trivia is None: diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 870e5b8..c12380b 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -87,8 +87,8 @@ def test_all_generators(): GENERATORS = [ # parser.GenerateLR0, - parser.GeneratePager, - parser.GenerateLR1, + # parser.GeneratePager, + parser.ParserGenerator, ] for generator in GENERATORS: table = G().build_table(generator=generator) @@ -119,15 +119,14 @@ def test_grammar_aho_ullman_2(): A = Terminal("a") B = Terminal("b") - TestGrammar().build_table(generator=parser.GenerateLR1) - TestGrammar().build_table(generator=parser.GeneratePager) + TestGrammar().build_table(generator=parser.ParserGenerator) + # TestGrammar().build_table(generator=parser.GeneratePager) def test_fun_lalr(): class TestGrammar(Grammar): start = "S" - generator = parser.GeneratePager @rule def S(self):