From 2d5c73f0b023d87b4a1f806de48542209082c32b Mon Sep 17 00:00:00 2001
From: John Doty
Date: Tue, 15 Oct 2024 07:43:52 -0700
Subject: [PATCH] [parser] Remove LR0 and SLR1

Sorry: when this project was educational it was nice to have the other
generators, but as part of cleanup I'm just getting rid of them.
GenerateLR1 and GeneratePager are the only generators left.
---
 parser/parser.py      | 917 ++++++++++++++++++++----------------------
 tests/test_grammar.py | 117 +-----
 2 files changed, 443 insertions(+), 591 deletions(-)

diff --git a/parser/parser.py b/parser/parser.py
index 10da061..6faaf6e 100644
--- a/parser/parser.py
+++ b/parser/parser.py
@@ -158,7 +158,8 @@ class ConfigurationCore(typing.NamedTuple):
 
     # TODO: Possible improvement: make `symbols` an index into a production
     #       list. This would not make this smaller but it might make comparisons
-    #       faster.
+    #       faster. This could also just be a production index and a position,
+    #       we could find the name from the production index, etc.
     name: int
     symbols: typing.Tuple[int, ...]
     position: int
@@ -273,98 +274,6 @@ class ConfigSet(frozenset[Configuration]):
     pass
 
 
-# Here we have a slightly different definition of a ConfigurationSet; we keep the
-# lookaheads outside and use a dictionary to check for containment quickly.
-# ItemSet is used in the GRM/Pager/Chin algorithm.
-@dataclasses.dataclass
-class ItemSet:
-    """An ItemSet is a group of configuration cores together with their
-    "contexts", or lookahead sets.
-
-    An ItemSet is comparable for equality, and also supports this lesser notion
-    of "weakly compatible" which is used to collapse states in the pager
-    algorithm.
-    """
-
-    items: dict[ConfigurationCore, set[int]]
-
-    def __init__(self, items=None):
-        self.items = items or {}
-
-    @classmethod
-    def from_config_set(cls, config_set: ConfigSet) -> "ItemSet":
-        return ItemSet({config.core: set(config.lookahead) for config in config_set})
-
-    def weakly_compatible(self, other: "ItemSet") -> bool:
-        a = self.items
-        b = other.items
-
-        if len(a) != len(b):
-            return False
-
-        for acore in a:
-            if acore not in b:
-                return False
-
-        if len(a) == 1:
-            return True
-
-        # DOTY: This loop I do not understand, truly. What the heck is happening here?
-        a_keys = list(a.keys())
-        for i, i_key in enumerate(itertools.islice(a_keys, 0, len(a_keys) - 1)):
-            for j_key in itertools.islice(a_keys, i + 1, None):
-                a_i_key = a[i_key]
-                b_i_key = b[i_key]
-                a_j_key = a[j_key]
-                b_j_key = b[j_key]
-
-                # DOTY: GRMTools written with intersects(); we don't have that we have
-                #       `not disjoint()`. :P There are many double negatives....
- # - # not (intersect(a_i, b_j) or intersect(a_j, b_i)) - # not ((not disjoint(a_i, b_j)) or (not disjoint(a_j, b_i))) - # ((not not disjoint(a_i, b_j)) and (not not disjoint(a_j, b_i))) - # disjoint(a_i, b_j) and disjoint(a_j, b_i) - if a_i_key.isdisjoint(b_j_key) and a_j_key.isdisjoint(b_i_key): - continue - - # intersect(a_i, a_j) or intersect(b_i, b_j) - # (not disjoint(a_i, a_j)) or (not disjoint(b_i, b_j)) - # not (disjoint(a_i, a_j) and disjoint(b_i, b_j)) - if not (a_i_key.isdisjoint(a_j_key) and b_i_key.isdisjoint(b_j_key)): - continue - - return False - - return True - - def weakly_merge(self, other: "ItemSet") -> bool: - """Merge b into a, returning True if this lead to any changes.""" - a = self.items - b = other.items - - changed = False - for a_key, a_ctx in a.items(): - start_len = len(a_ctx) - a_ctx.update(b[a_key]) # Python doesn't tell us changes - changed = changed or (start_len != len(a_ctx)) - - return changed - - def goto(self, symbol: int) -> "ItemSet": - result = ItemSet() - for core, context in self.items.items(): - if core.next == symbol: - next = core.replace_position(core.position + 1) - result.items[next] = set(context) - return result - - def to_config_set(self) -> ConfigSet: - return ConfigSet( - {Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()} - ) - - class ConfigurationSetInfo: """When we build a grammar into a table, the first thing we need to do is generate all the configuration sets and their successors. @@ -876,338 +785,6 @@ class TableBuilder(object): self.action_row[symbol_id] = (action, config) -class GenerateLR0: - """Generate parser tables for an LR0 parser.""" - - # Internally we use integers as symbols, not strings. Mostly this is fine, - # but when we need to map back from integer to string we index this list. - alphabet: list[str] - - # The grammar we work with. The outer list is indexed by grammar symbol, - # terminal *and* non-terminal. The inner list is the list of productions - # for the given nonterminal symbol. (If you have a terminal `t` and look it - # up you'll just get an empty list.) - grammar: list[list[typing.Tuple[int, ...]]] - - # nonterminal[i] is True if alphabet[i] is a nonterminal. - nonterminal: typing.Tuple[bool, ...] - # The complement of nonterminal. terminal[i] is True if alphabet[i] is a - # terminal. - terminal: typing.Tuple[bool, ...] - - # The precedence of every symbol. If no precedence was explicitly provided - # for a symbol, then its entry in this tuple will be (NONE, 0). - precedence: typing.Tuple[typing.Tuple[Assoc, int], ...] - - # The set of symbols for which we should reduce "transparently." This doesn't - # affect state generation at all, only the generation of the final table. - transparents: set[str] - - # The lookup that maps a particular symbol to an integer. (Only really used - # for debugging.) - symbol_key: dict[str, int] - # The start symbol of the grammar. - start_symbol: int - # The end symbol of the grammar. - end_symbol: int - - def __init__( - self, - start: str, - grammar: list[typing.Tuple[str, list[str]]], - precedence: None | dict[str, typing.Tuple[Assoc, int]] = None, - transparents: None | set[str] = None, - ): - """Initialize the parser generator with the specified grammar and - start symbol. - - The input grammars are of the form: - - grammar_simple = [ - ('E', ['E', '+', 'T']), - ('E', ['T']), - ('T', ['(', 'E', ')']), - ('T', ['id']), - ] - - Which is to say, they are a list of productions. 
Each production is a - tuple where the first element of the tuple is the name of the - non-terminal being added, and the second elment of the tuple is the - list of terminals and non-terminals that make up the production. - - There is currently no support for custom actions or alternation or - anything like that. If you want alternations that you'll have to lower - the grammar by hand into the simpler form first. - - Don't name anything with double-underscores; those are reserved for - the generator. Don't add '$' either, as it is reserved to mean - end-of-stream. Use an empty list to indicate nullability, that is: - - ('O', []), - - means that O can be matched with nothing. - - This isn't a *great* way to author these things, but it is very simple - and flexible. You probably don't want to author this on your own; see - the Grammar class for a high-level API. - - The precedence dictionary, if provided, maps a given symbol to an - associativity and a precedence. Any symbol not in the dictionary is - presumed to have an associativity of NONE and a precedence of zero. - """ - - # Work out the alphabet. - alphabet = set() - for name, rule in grammar: - alphabet.add(name) - alphabet.update(symbol for symbol in rule) - - # Check to make sure they didn't use anything that will give us - # heartburn later. - reserved = [a for a in alphabet if a.startswith("__") or a == "$"] - if reserved: - raise ValueError( - "Can't use {symbols} in grammars, {what} reserved.".format( - symbols=" or ".join(reserved), - what="it's" if len(reserved) == 1 else "they're", - ) - ) - - alphabet.add("__start") - alphabet.add("$") - self.alphabet = list(sorted(alphabet)) - - symbol_key = {symbol: index for index, symbol in enumerate(self.alphabet)} - - start_symbol = symbol_key["__start"] - end_symbol = symbol_key["$"] - - assert self.alphabet[start_symbol] == "__start" - assert self.alphabet[end_symbol] == "$" - - # Turn the incoming grammar into a dictionary, indexed by nonterminal. - # - # We count on python dictionaries retaining the insertion order, like - # it or not. - full_grammar: list[list] = [list() for _ in self.alphabet] - terminal: list[bool] = [True for _ in self.alphabet] - assert terminal[end_symbol] - - nonterminal = [False for _ in self.alphabet] - - for name, rule in grammar: - name_symbol = symbol_key[name] - - terminal[name_symbol] = False - nonterminal[name_symbol] = True - - rules = full_grammar[name_symbol] - rules.append(tuple(symbol_key[symbol] for symbol in rule)) - - self.grammar = full_grammar - self.grammar[start_symbol].append((symbol_key[start],)) - terminal[start_symbol] = False - nonterminal[start_symbol] = True - - self.terminal = tuple(terminal) - self.nonterminal = tuple(nonterminal) - - assert self.terminal[end_symbol] - assert self.nonterminal[start_symbol] - - if precedence is None: - precedence = {} - self.precedence = tuple(precedence.get(a, (Assoc.NONE, 0)) for a in self.alphabet) - - if transparents is None: - transparents = set() - self.transparents = transparents - - self.symbol_key = symbol_key - self.start_symbol = start_symbol - self.end_symbol = end_symbol - - def gen_closure_next(self, config: Configuration): - """Return the next set of configurations in the closure for config. - - If the position for config is just before a non-terminal, then the - next set of configurations is configurations for all of the - productions for that non-terminal, with the position at the - beginning. 
(If the position for config is just before a terminal, - or at the end of the production, then the next set is empty.) - """ - next = config.core.next - if next is None: - return () - else: - return tuple(Configuration.from_rule(next, rule) for rule in self.grammar[next]) - - def gen_closure(self, seeds: typing.Iterable[Configuration]) -> ConfigSet: - """Compute the closure for the specified configs. The closure is all - of the configurations we could be in. Specifically, if the position - for a config is just before a non-terminal then we must also consider - configurations where the rule is the rule for the non-terminal and - the position is just before the beginning of the rule. - - (We have replaced a recursive version with an iterative one.) - """ - closure: set[Configuration] = set() - pending = list(seeds) - pending_next = [] - while len(pending) > 0: - for config in pending: - if config in closure: - continue - - closure.add(config) - pending_next.extend(self.gen_closure_next(config)) - - temp = pending - pending = pending_next - pending_next = temp - pending_next.clear() - - # NOTE: The generation of this closure *might* have generated - # multiple cores with different lookaheads; if that's - # the case we need to merge. - merged: dict[ConfigurationCore, set[int]] = {} - for c in closure: - existing = merged.get(c.core) - if existing is not None: - existing.update(c.lookahead) - else: - merged[c.core] = set(c.lookahead) - - return ConfigSet(Configuration(k, tuple(sorted(v))) for k, v in merged.items()) - - def gen_all_successors( - self, config_set: typing.Iterable[Configuration] - ) -> list[typing.Tuple[int, ConfigSet]]: - """Return all of the non-empty successors for the given config set. - - (That is, given the config set, pretend we see all the symbols we - could possibly see, and figure out which configs sets we get from - those symbols. Those are the successors of this set.) - """ - possible = {config.core.next for config in config_set if config.core.next is not None} - - next = [] - for symbol in possible: - seeds = ConfigSet( - config.replace_position(config.core.position + 1) - for config in config_set - if config.core.next == symbol - ) - if len(seeds) > 0: - next.append((symbol, seeds)) - - return next - - def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo: - """Generate all configuration sets starting from the provided seeds.""" - result = ConfigurationSetInfo() - - successors = [] - pending = [ConfigSet(seeds)] - pending_next = [] - while len(pending) > 0: - for core in pending: - id, is_new = result.register_core(core) - if is_new: - config_set = self.gen_closure(core) - result.register_config_closure(id, config_set) - for symbol, successor in self.gen_all_successors(config_set): - successors.append((id, symbol, successor)) - pending_next.append(successor) - - temp = pending - pending = pending_next - pending_next = temp - pending_next.clear() - - for id, symbol, successor in successors: - result.add_successor(id, symbol, result.core_key[successor]) - - return result - - def gen_all_sets(self) -> ConfigurationSetInfo: - """Generate all of the configuration sets for the grammar.""" - seeds = [ - Configuration.from_rule(self.start_symbol, rule) - for rule in self.grammar[self.start_symbol] - ] - return self.gen_sets(seeds) - - def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: - """Return the set of symbols that indicate we should reduce the given - configuration. - - In an LR0 parser, this is just the set of all terminals. 
- """ - del config - return [index for index, value in enumerate(self.terminal) if value] - - def gen_table(self) -> ParseTable: - """Generate the parse table. - - The parse table is a list of states. The first state in the list is - the starting state. Each state is a dictionary that maps a symbol to an - action. Each action is a tuple. The first element of the tuple is a - string describing what to do: - - - 'shift': The second element of the tuple is the state - number. Consume the input and push that state onto the stack. - - - 'reduce': The second element is the name of the non-terminal being - reduced, and the third element is the number of states to remove - from the stack. Don't consume the input; just remove the specified - number of things from the stack, and then consult the table again, - this time using the new top-of-stack as the current state and the - name of the non-terminal to find out what to do. - - - 'goto': The second element is the state number to push onto the - stack. In the literature, these entries are treated distinctly from - the actions, but we mix them here because they never overlap with the - other actions. (These are always associated with non-terminals, and - the other actions are always associated with terminals.) - - - 'accept': Accept the result of the parse, it worked. - - Anything missing from the row indicates an error. - """ - config_sets = self.gen_all_sets() - # print(config_sets.dump_state(self.alphabet)) - builder = TableBuilder(self.alphabet, self.precedence, self.transparents) - - for config_set_id, config_set in enumerate(config_sets.closures): - assert config_set is not None - builder.new_row(config_set) - successors = config_sets.successors[config_set_id] - - for config in config_set: - config_next = config.core.next - if config_next is None: - if config.core.name != self.start_symbol: - for a in self.gen_reduce_set(config): - builder.set_table_reduce(a, config) - else: - builder.set_table_accept(self.end_symbol, config) - - elif self.terminal[config_next]: - index = successors[config_next] - builder.set_table_shift(config_next, index, config) - - # Gotos - for symbol, index in successors.items(): - if self.nonterminal[symbol]: - builder.set_table_goto(symbol, index) - - return builder.flush(config_sets) - - -############################################################################### -# SLR(1) -############################################################################### def update_changed(items: set[int], other: set[int]) -> bool: """Merge the `other` set into the `items` set, and return True if this changed the items set. @@ -1430,32 +1007,264 @@ class FollowInfo: return FollowInfo(follows=follows) -class GenerateSLR1(GenerateLR0): - """Generate parse tables for SLR1 grammars. +# Here we have a slightly different definition of a ConfigurationSet; we keep the +# lookaheads outside and use a dictionary to check for containment quickly. +# ItemSet is used in the GRM/Pager/Chin algorithm. +@dataclasses.dataclass +class ItemSet: + """An ItemSet is a group of configuration cores together with their + "contexts", or lookahead sets. - SLR1 parsers can recognize more than LR0 parsers, because they have a - little bit more information: instead of generating reduce actions for a - production on all possible inputs, as LR0 parsers do, they generate - reduce actions only for inputs that are in the 'follow' set of the - non-terminal. 
-
-    That means SLR1 parsers need to know how to generate 'follow(A)', which
-    means they need to know how to generate 'first(A)'. See FirstInfo and
-    FollowInfo for the details on how this is computed.
+    An ItemSet is comparable for equality, and also supports this lesser notion
+    of "weakly compatible" which is used to collapse states in the pager
+    algorithm.
     """
 
+    items: dict[ConfigurationCore, set[int]]
+
+    def __init__(self, items=None):
+        self.items = items or {}
+
+    @classmethod
+    def from_config_set(cls, config_set: ConfigSet) -> "ItemSet":
+        return ItemSet({config.core: set(config.lookahead) for config in config_set})
+
+    def weakly_compatible(self, other: "ItemSet") -> bool:
+        a = self.items
+        b = other.items
+
+        if len(a) != len(b):
+            return False
+
+        for acore in a:
+            if acore not in b:
+                return False
+
+        if len(a) == 1:
+            return True
+
+        # DOTY: This loop is Pager's weak-compatibility check: for each pair
+        #       of cores, refuse the merge only if the cross contexts overlap
+        #       (so merging could manufacture a new reduce-reduce conflict)
+        #       and neither state already contains that overlap on its own.
+        a_keys = list(a.keys())
+        for i, i_key in enumerate(itertools.islice(a_keys, 0, len(a_keys) - 1)):
+            for j_key in itertools.islice(a_keys, i + 1, None):
+                a_i_key = a[i_key]
+                b_i_key = b[i_key]
+                a_j_key = a[j_key]
+                b_j_key = b[j_key]
+
+                # DOTY: GRMTools is written with intersects(); we don't have
+                #       that, we have `not disjoint()`. :P There are many
+                #       double negatives....
+                #
+                #    not (intersect(a_i, b_j) or intersect(a_j, b_i))
+                #    not ((not disjoint(a_i, b_j)) or (not disjoint(a_j, b_i)))
+                #    ((not not disjoint(a_i, b_j)) and (not not disjoint(a_j, b_i)))
+                #    disjoint(a_i, b_j) and disjoint(a_j, b_i)
+                if a_i_key.isdisjoint(b_j_key) and a_j_key.isdisjoint(b_i_key):
+                    continue
+
+                #    intersect(a_i, a_j) or intersect(b_i, b_j)
+                #    (not disjoint(a_i, a_j)) or (not disjoint(b_i, b_j))
+                #    not (disjoint(a_i, a_j) and disjoint(b_i, b_j))
+                if not (a_i_key.isdisjoint(a_j_key) and b_i_key.isdisjoint(b_j_key)):
+                    continue
+
+                return False
+
+        return True
+
+    def weakly_merge(self, other: "ItemSet") -> bool:
+        """Merge b into a, returning True if this led to any changes."""
+        a = self.items
+        b = other.items
+
+        changed = False
+        for a_key, a_ctx in a.items():
+            start_len = len(a_ctx)
+            a_ctx.update(b[a_key])  # Python doesn't tell us changes
+            changed = changed or (start_len != len(a_ctx))
+
+        return changed
+
+    def goto(self, symbol: int) -> "ItemSet":
+        result = ItemSet()
+        for core, context in self.items.items():
+            if core.next == symbol:
+                next = core.replace_position(core.position + 1)
+                result.items[next] = set(context)
+        return result
+
+    def to_config_set(self) -> ConfigSet:
+        return ConfigSet(
+            {Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()}
+        )
+
+
+class GenerateLR1:
+    """Generate parse tables for LR1, or "canonical LR" grammars.
+
+    LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they
+    are choosier about when they reduce. But unlike SLR parsers, they specify
+    the terminals on which they reduce by carrying a 'lookahead' terminal in
+    the configuration. The lookahead of a configuration is computed as the
+    closure of a configuration set is computed, so see gen_closure_next for
+    details. (Except for the start configuration, which has '$' as its
+    lookahead.)
+    """
+
+    # Internally we use integers as symbols, not strings. Mostly this is fine,
+    # but when we need to map back from integer to string we index this list.
+    alphabet: list[str]
+
+    # The grammar we work with. The outer list is indexed by grammar symbol,
+    # terminal *and* non-terminal. The inner list is the list of productions
+    # for the given nonterminal symbol. (If you have a terminal `t` and look it
+    # up you'll just get an empty list.)
+    grammar: list[list[typing.Tuple[int, ...]]]
+
+    # nonterminal[i] is True if alphabet[i] is a nonterminal.
+    nonterminal: typing.Tuple[bool, ...]
+    # The complement of nonterminal. terminal[i] is True if alphabet[i] is a
+    # terminal.
+    terminal: typing.Tuple[bool, ...]
+
+    # The precedence of every symbol. If no precedence was explicitly provided
+    # for a symbol, then its entry in this tuple will be (NONE, 0).
+    precedence: typing.Tuple[typing.Tuple[Assoc, int], ...]
+
+    # The set of symbols for which we should reduce "transparently." This doesn't
+    # affect state generation at all, only the generation of the final table.
+    transparents: set[str]
+
+    # The lookup that maps a particular symbol to an integer. (Only really used
+    # for debugging.)
+    symbol_key: dict[str, int]
+    # The start symbol of the grammar.
+    start_symbol: int
+    # The end symbol of the grammar.
+    end_symbol: int
+    _firsts: FirstInfo
+    _follows: FollowInfo
+
+    def __init__(
+        self,
+        start: str,
+        grammar: list[typing.Tuple[str, list[str]]],
+        precedence: None | dict[str, typing.Tuple[Assoc, int]] = None,
+        transparents: None | set[str] = None,
+    ):
+        """Initialize the parser generator with the specified grammar and
+        start symbol.
+
+        The input grammars are of the form:
+
+          grammar_simple = [
+              ('E', ['E', '+', 'T']),
+              ('E', ['T']),
+              ('T', ['(', 'E', ')']),
+              ('T', ['id']),
+          ]
+
+        Which is to say, they are a list of productions. Each production is a
+        tuple where the first element of the tuple is the name of the
+        non-terminal being added, and the second element of the tuple is the
+        list of terminals and non-terminals that make up the production.
+
+        There is currently no support for custom actions or alternation or
+        anything like that. If you want alternations, you'll have to lower
+        the grammar by hand into the simpler form first.
+
+        Don't name anything with double-underscores; those are reserved for
+        the generator. Don't add '$' either, as it is reserved to mean
+        end-of-stream. Use an empty list to indicate nullability, that is:
+
+          ('O', []),
+
+        means that O can be matched with nothing.
+
+        This isn't a *great* way to author these things, but it is very simple
+        and flexible. You probably don't want to author this on your own; see
+        the Grammar class for a high-level API.
+
+        The precedence dictionary, if provided, maps a given symbol to an
+        associativity and a precedence. Any symbol not in the dictionary is
+        presumed to have an associativity of NONE and a precedence of zero.
+        """
+
+        # Work out the alphabet.
+        alphabet = set()
+        for name, rule in grammar:
+            alphabet.add(name)
+            alphabet.update(symbol for symbol in rule)
+
+        # Check to make sure they didn't use anything that will give us
+        # heartburn later.
+ reserved = [a for a in alphabet if a.startswith("__") or a == "$"] + if reserved: + raise ValueError( + "Can't use {symbols} in grammars, {what} reserved.".format( + symbols=" or ".join(reserved), + what="it's" if len(reserved) == 1 else "they're", + ) + ) + + alphabet.add("__start") + alphabet.add("$") + self.alphabet = list(sorted(alphabet)) + + symbol_key = {symbol: index for index, symbol in enumerate(self.alphabet)} + + start_symbol = symbol_key["__start"] + end_symbol = symbol_key["$"] + + assert self.alphabet[start_symbol] == "__start" + assert self.alphabet[end_symbol] == "$" + + # Turn the incoming grammar into a dictionary, indexed by nonterminal. + # + # We count on python dictionaries retaining the insertion order, like + # it or not. + full_grammar: list[list] = [list() for _ in self.alphabet] + terminal: list[bool] = [True for _ in self.alphabet] + assert terminal[end_symbol] + + nonterminal = [False for _ in self.alphabet] + + for name, rule in grammar: + name_symbol = symbol_key[name] + + terminal[name_symbol] = False + nonterminal[name_symbol] = True + + rules = full_grammar[name_symbol] + rules.append(tuple(symbol_key[symbol] for symbol in rule)) + + self.grammar = full_grammar + self.grammar[start_symbol].append((symbol_key[start],)) + terminal[start_symbol] = False + nonterminal[start_symbol] = True + + self.terminal = tuple(terminal) + self.nonterminal = tuple(nonterminal) + + assert self.terminal[end_symbol] + assert self.nonterminal[start_symbol] + + if precedence is None: + precedence = {} + self.precedence = tuple(precedence.get(a, (Assoc.NONE, 0)) for a in self.alphabet) + + if transparents is None: + transparents = set() + self.transparents = transparents + + self.symbol_key = symbol_key + self.start_symbol = start_symbol + self.end_symbol = end_symbol - # We store the firsts not because we need them here, but because LR1 - # and Pager need them. self._firsts = FirstInfo.from_grammar(self.grammar, self.terminal) + self._follows = FollowInfo.from_grammar( self.grammar, self.terminal, @@ -1464,6 +1273,94 @@ class GenerateSLR1(GenerateLR0): self._firsts, ) + def gen_closure(self, seeds: typing.Iterable[Configuration]) -> ConfigSet: + """Compute the closure for the specified configs. The closure is all + of the configurations we could be in. Specifically, if the position + for a config is just before a non-terminal then we must also consider + configurations where the rule is the rule for the non-terminal and + the position is just before the beginning of the rule. + + (We have replaced a recursive version with an iterative one.) + """ + closure: set[Configuration] = set() + pending = list(seeds) + pending_next = [] + while len(pending) > 0: + for config in pending: + if config in closure: + continue + + closure.add(config) + pending_next.extend(self.gen_closure_next(config)) + + temp = pending + pending = pending_next + pending_next = temp + pending_next.clear() + + # NOTE: The generation of this closure *might* have generated + # multiple cores with different lookaheads; if that's + # the case we need to merge. 
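+        #       (Two closure entries that share a core are the same LR(0)
+        #       item, so we union their lookahead sets and keep a single
+        #       entry per core.)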
+        merged: dict[ConfigurationCore, set[int]] = {}
+        for c in closure:
+            existing = merged.get(c.core)
+            if existing is not None:
+                existing.update(c.lookahead)
+            else:
+                merged[c.core] = set(c.lookahead)
+
+        return ConfigSet(Configuration(k, tuple(sorted(v))) for k, v in merged.items())
+
+    def gen_all_successors(
+        self, config_set: typing.Iterable[Configuration]
+    ) -> list[typing.Tuple[int, ConfigSet]]:
+        """Return all of the non-empty successors for the given config set.
+
+        (That is, given the config set, pretend we see all the symbols we
+        could possibly see, and figure out which configs sets we get from
+        those symbols. Those are the successors of this set.)
+        """
+        possible = {config.core.next for config in config_set if config.core.next is not None}
+
+        next = []
+        for symbol in possible:
+            seeds = ConfigSet(
+                config.replace_position(config.core.position + 1)
+                for config in config_set
+                if config.core.next == symbol
+            )
+            if len(seeds) > 0:
+                next.append((symbol, seeds))
+
+        return next
+
+    def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo:
+        """Generate all configuration sets starting from the provided seeds."""
+        result = ConfigurationSetInfo()
+
+        successors = []
+        pending = [ConfigSet(seeds)]
+        pending_next = []
+        while len(pending) > 0:
+            for core in pending:
+                id, is_new = result.register_core(core)
+                if is_new:
+                    config_set = self.gen_closure(core)
+                    result.register_config_closure(id, config_set)
+                    for symbol, successor in self.gen_all_successors(config_set):
+                        successors.append((id, symbol, successor))
+                        pending_next.append(successor)
+
+            temp = pending
+            pending = pending_next
+            pending_next = temp
+            pending_next.clear()
+
+        for id, symbol, successor in successors:
+            result.add_successor(id, symbol, result.core_key[successor])
+
+        return result
+
     def gen_follow(self, symbol: int) -> set[int]:
         """Generate the follow set for the given nonterminal.
 
@@ -1476,27 +1373,6 @@ class GenerateSLR1(GenerateLR0):
         """
         return self._follows.follows[symbol]
 
-    def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]:
-        """Return the set of symbols that indicate we should reduce the given
-        config.
-
-        In an SLR1 parser, this is the follow set of the config nonterminal.
-        """
-        return self.gen_follow(config.core.name)
-
-
-class GenerateLR1(GenerateSLR1):
-    """Generate parse tables for LR1, or "canonical LR" grammars.
-
-    LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they
-    are choosier about when they reduce. But unlike SLR parsers, they specify
-    the terminals on which they reduce by carrying a 'lookahead' terminal in
-    the configuration. The lookahead of a configuration is computed as the
-    closure of a configuration set is computed, so see gen_closure_next for
-    details. (Except for the start configuration, which has '$' as its
-    lookahead.)
-    """
-
     def gen_first(self, symbols: typing.Iterable[int]) -> typing.Tuple[set[int], bool]:
         """Return the first set for a *sequence* of symbols.
 
@@ -1551,10 +1427,50 @@
 
         next = []
         for rule in self.grammar[config_next]:
-            next.append(Configuration.from_rule(config_next, rule, lookahead=lookahead_tuple))
+            rr = Configuration.from_rule(config_next, rule, lookahead=lookahead_tuple)
+            next.append(rr)
 
         return tuple(next)
 
+    def gen_closure_x(self, items: ItemSet) -> ItemSet:
+        closure: dict[ConfigurationCore, set[int]] = {}
+
+        # We're going to maintain a worklist of items that we still need to
+        # close over; assume it starts with everything in this item set.
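+        # Each pass pops an item: if its context adds nothing to what we
+        # have already recorded for its core, we are done with it. Otherwise
+        # we fold the context in and, when the dot sits before a nonterminal,
+        # queue that nonterminal's productions with FIRST(rest) as their
+        # context (plus our own context when rest can derive epsilon).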
+        todo = [(core, context) for core, context in items.items.items()]
+        while len(todo) > 0:
+            core, context = todo.pop()
+
+            existing_context = closure.get(core)
+            if existing_context is None or not context <= existing_context:
+                # Either we haven't seen this core yet or something in context
+                # is not in existing_context, so we need to process this one.
+                if existing_context is not None:
+                    existing_context.update(context)
+                else:
+                    # NOTE: The incoming context is a lookahead set that was
+                    #       generated once and is shared by all the child
+                    #       rules, so we have to copy it somewhere; here
+                    #       seems best.
+                    closure[core] = set(context)
+
+                config_next = core.next
+                if config_next is None:
+                    # No closure for this one, we're at the end.
+                    continue
+
+                rules = self.grammar[config_next]
+                if len(rules) > 0:
+                    lookahead, epsilon = self.gen_first(core.rest)
+                    # print(f"  LA {core.rest} -> {lookahead} e:{epsilon}")
+                    if epsilon:
+                        lookahead.update(context)
+
+                    for rule in rules:
+                        new_core = ConfigurationCore.from_rule(config_next, rule)
+                        todo.append((new_core, lookahead))
+
+        return ItemSet(closure)
+
     def gen_all_sets(self):
         """Generate all of the configuration sets for the grammar.
 
@@ -1567,6 +1483,63 @@
         ]
         return self.gen_sets(seeds)
 
+    def gen_table(self) -> ParseTable:
+        """Generate the parse table.
+
+        The parse table is a list of states. The first state in the list is
+        the starting state. Each state is a dictionary that maps a symbol to an
+        action. Each action is a tuple. The first element of the tuple is a
+        string describing what to do:
+
+        - 'shift': The second element of the tuple is the state
+          number. Consume the input and push that state onto the stack.
+
+        - 'reduce': The second element is the name of the non-terminal being
+          reduced, and the third element is the number of states to remove
+          from the stack. Don't consume the input; just remove the specified
+          number of things from the stack, and then consult the table again,
+          this time using the new top-of-stack as the current state and the
+          name of the non-terminal to find out what to do.
+
+        - 'goto': The second element is the state number to push onto the
+          stack. In the literature, these entries are treated distinctly from
+          the actions, but we mix them here because they never overlap with the
+          other actions. (These are always associated with non-terminals, and
+          the other actions are always associated with terminals.)
+
+        - 'accept': Accept the result of the parse, it worked.
+
+        Anything missing from the row indicates an error.
+        """
+        config_sets = self.gen_all_sets()
+        # print(config_sets.dump_state(self.alphabet))
+        builder = TableBuilder(self.alphabet, self.precedence, self.transparents)
+
+        for config_set_id, config_set in enumerate(config_sets.closures):
+            assert config_set is not None
+            builder.new_row(config_set)
+            successors = config_sets.successors[config_set_id]
+
+            for config in config_set:
+                config_next = config.core.next
+                if config_next is None:
+                    if config.core.name != self.start_symbol:
+                        for a in self.gen_reduce_set(config):
+                            builder.set_table_reduce(a, config)
+                    else:
+                        builder.set_table_accept(self.end_symbol, config)
+
+                elif self.terminal[config_next]:
+                    index = successors[config_next]
+                    builder.set_table_shift(config_next, index, config)
+
+            # Gotos
+            for symbol, index in successors.items():
+                if self.nonterminal[symbol]:
+                    builder.set_table_goto(symbol, index)
+
+        return builder.flush(config_sets)
+
 
 class GeneratePager(GenerateLR1):
     """Pager's algorithm.
@@ -1654,15 +1627,7 @@ class GeneratePager(GenerateLR1): todo_off = state_i + 1 todo -= 1 - # DOTY: TODO: We convert here back and forth to Configuration - # objects, but maybe we can make ItemSet our core - # representation throughout this file. (Even in LR0.) So - # never use Configuration, always ItemSet and ConfigCore. - # - # Or just rebuild gen_closure inside ItemSet. shrug - temp_set = core_states[state_i].to_config_set() - closure = self.gen_closure(temp_set) - cl_state = ItemSet.from_config_set(closure) + cl_state = self.gen_closure_x(core_states[state_i]) closed_states[state_i] = cl_state seen.clear() @@ -3044,7 +3009,7 @@ class Grammar: """ _precedence: dict[str, typing.Tuple[Assoc, int]] - _generator: type[GenerateLR0] + _generator: type[GenerateLR1] _terminals: dict[str, Terminal] _nonterminals: dict[str, NonTerminal] _trivia: list[Terminal] @@ -3053,7 +3018,7 @@ class Grammar: self, start: str | NonTerminal | None = None, precedence: PrecedenceList | None = None, - generator: type[GenerateLR0] | None = None, + generator: type[GenerateLR1] | None = None, trivia: list[str | Terminal] | None = None, name: str | None = None, ): diff --git a/tests/test_grammar.py b/tests/test_grammar.py index af0f1d5..870e5b8 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -42,7 +42,7 @@ def test_lr0_lr0(): class G(Grammar): start = "E" - generator = parser.GenerateLR0 + # generator = parser.GenerateLR0 @rule def E(self): @@ -86,7 +86,7 @@ def test_all_generators(): IDENTIFIER = Terminal("id", name="id") GENERATORS = [ - parser.GenerateLR0, + # parser.GenerateLR0, parser.GeneratePager, parser.GenerateLR1, ] @@ -104,121 +104,9 @@ def test_all_generators(): assert tree == _tree(("E", ("E", ("T", "id")), "+", ("T", "(", ("E", ("T", "id")), ")"))) -def test_lr0_shift_reduce(): - """This one should not work in LR0- it has a shift/reduce conflict, but works in SLR1.""" - - class G(Grammar): - start = "E" - generator = parser.GenerateLR0 - - @rule - def E(self): - return seq(self.E, self.PLUS, self.T) | self.T - - @rule - def T(self): - return ( - seq(self.LPAREN, self.E, self.RPAREN) - | self.IDENTIFIER - | seq(self.IDENTIFIER, self.LSQUARE, self.E, self.RSQUARE) - ) - - PLUS = Terminal("+") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - LSQUARE = Terminal("[") - RSQUARE = Terminal("]") - IDENTIFIER = Terminal("id") - - with pytest.raises(parser.AmbiguityError): - G().build_table() - - G().build_table(generator=parser.GenerateSLR1) - - -def test_lr0_reduce_reduce(): - """This one should not work, it has a reduce-reduce conflict.""" - - class G(Grammar): - start = "E" - generator = parser.GenerateLR0 - - @rule - def E(self): - return seq(self.E, self.PLUS, self.T) | self.T | seq(self.V, self.EQUAL, self.E) - - @rule - def T(self): - return seq(self.LPAREN, self.E, self.RPAREN) | self.IDENTIFIER - - @rule - def V(self): - return self.IDENTIFIER - - PLUS = Terminal("+") - EQUAL = Terminal("=") - LPAREN = Terminal("(") - RPAREN = Terminal(")") - IDENTIFIER = Terminal("id") - - with pytest.raises(parser.AmbiguityError): - G().build_table() - - -def test_lr0_empty(): - """LR0 can't handle empty productions because it doesn't know when to reduce.""" - - class G(Grammar): - start = "E" - generator = parser.GenerateLR0 - - @rule - def E(self): - return seq(self.F, self.BOOP) - - @rule - def F(self): - return self.BEEP | parser.Nothing - - BOOP = Terminal("boop") - BEEP = Terminal("beep") - - with pytest.raises(parser.AmbiguityError): - G().build_table() - - -def 
test_grammar_aho_ullman_1(): - class G(Grammar): - start = "S" - generator = parser.GenerateSLR1 - - @rule - def S(self): - return seq(self.L, self.EQUAL, self.R) | self.R - - @rule - def L(self): - return seq(self.STAR, self.R) | self.ID - - @rule - def R(self): - return self.L - - EQUAL = Terminal("=") - STAR = Terminal("*") - ID = Terminal("id") - - with pytest.raises(parser.AmbiguityError): - G().build_table() - - G().build_table(generator=parser.GenerateLR1) - G().build_table(generator=parser.GeneratePager) - - def test_grammar_aho_ullman_2(): class TestGrammar(Grammar): start = "S" - generator = parser.GenerateSLR1 @rule def S(self): @@ -231,7 +119,6 @@ def test_grammar_aho_ullman_2(): A = Terminal("a") B = Terminal("b") - TestGrammar().build_table() TestGrammar().build_table(generator=parser.GenerateLR1) TestGrammar().build_table(generator=parser.GeneratePager)
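
--
A note on ItemSet.weakly_compatible, since it survives this cleanup: the
pairwise loop is Pager's weak-compatibility condition. The sketch below
restates the check over bare lookahead sets, outside the parser machinery.
The helper and the toy states are made up for illustration and are not part
of parser.py:

    def weakly_compatible(a: dict, b: dict) -> bool:
        # The two states must have the same cores (here, the dict keys).
        if a.keys() != b.keys():
            return False
        keys = list(a.keys())
        for i, ki in enumerate(keys):
            for kj in keys[i + 1 :]:
                # A pair of cores is fine if the cross contexts are disjoint:
                # merging cannot create an overlap that was not already there.
                if a[ki].isdisjoint(b[kj]) and a[kj].isdisjoint(b[ki]):
                    continue
                # A cross overlap is also fine if one of the two states
                # already has the same overlap internally.
                if not (a[ki].isdisjoint(a[kj]) and b[ki].isdisjoint(b[kj])):
                    continue
                return False
        return True

    # These two may be merged: unioning the contexts cannot manufacture a
    # reduce-reduce conflict that neither state had on its own.
    assert weakly_compatible({"A": {"x"}, "B": {"y"}}, {"A": {"x", "z"}, "B": {"y"}})

    # These two may not: after a merge both cores would carry "y", a
    # brand-new conflict, so the states must stay separate.
    assert not weakly_compatible({"A": {"x"}, "B": {"y"}}, {"A": {"y"}, "B": {"z"}})

For grammars that used to lean on GenerateSLR1, the updated tests build the
same tables with parser.GenerateLR1 or parser.GeneratePager instead.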