From e55bc140f93b8bd8c079d0236e5e3529bfea2abc Mon Sep 17 00:00:00 2001 From: John Doty Date: Sat, 26 Oct 2024 06:53:36 -0700 Subject: [PATCH 1/5] [parser] Move ItemSet --- parser/parser.py | 184 +++++++++++++++++++++++------------------------ 1 file changed, 92 insertions(+), 92 deletions(-) diff --git a/parser/parser.py b/parser/parser.py index 6faaf6e..6cfc2fb 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -274,6 +274,98 @@ class ConfigSet(frozenset[Configuration]): pass +# Here we have a slightly different definition of a ConfigurationSet; we keep +# the lookaheads outside and use a dictionary to check for containment +# quickly. ItemSet is used in the GRM/Pager/Chin algorithm. +@dataclasses.dataclass +class ItemSet: + """An ItemSet is a group of configuration cores together with their + "contexts", or lookahead sets. + + An ItemSet is comparable for equality, and also supports this lesser notion + of "weakly compatible" which is used to collapse states in the pager + algorithm. + """ + + items: dict[ConfigurationCore, set[int]] + + def __init__(self, items=None): + self.items = items or {} + + @classmethod + def from_config_set(cls, config_set: ConfigSet) -> "ItemSet": + return ItemSet({config.core: set(config.lookahead) for config in config_set}) + + def weakly_compatible(self, other: "ItemSet") -> bool: + a = self.items + b = other.items + + if len(a) != len(b): + return False + + for acore in a: + if acore not in b: + return False + + if len(a) == 1: + return True + + # DOTY: This loop I do not understand, truly. What the heck is happening here? + a_keys = list(a.keys()) + for i, i_key in enumerate(itertools.islice(a_keys, 0, len(a_keys) - 1)): + for j_key in itertools.islice(a_keys, i + 1, None): + a_i_key = a[i_key] + b_i_key = b[i_key] + a_j_key = a[j_key] + b_j_key = b[j_key] + + # DOTY: GRMTools written with intersects(); we don't have that we have + # `not disjoint()`. :P There are many double negatives.... 
+ # + # not (intersect(a_i, b_j) or intersect(a_j, b_i)) + # not ((not disjoint(a_i, b_j)) or (not disjoint(a_j, b_i))) + # ((not not disjoint(a_i, b_j)) and (not not disjoint(a_j, b_i))) + # disjoint(a_i, b_j) and disjoint(a_j, b_i) + if a_i_key.isdisjoint(b_j_key) and a_j_key.isdisjoint(b_i_key): + continue + + # intersect(a_i, a_j) or intersect(b_i, b_j) + # (not disjoint(a_i, a_j)) or (not disjoint(b_i, b_j)) + # not (disjoint(a_i, a_j) and disjoint(b_i, b_j)) + if not (a_i_key.isdisjoint(a_j_key) and b_i_key.isdisjoint(b_j_key)): + continue + + return False + + return True + + def weakly_merge(self, other: "ItemSet") -> bool: + """Merge b into a, returning True if this lead to any changes.""" + a = self.items + b = other.items + + changed = False + for a_key, a_ctx in a.items(): + start_len = len(a_ctx) + a_ctx.update(b[a_key]) # Python doesn't tell us changes + changed = changed or (start_len != len(a_ctx)) + + return changed + + def goto(self, symbol: int) -> "ItemSet": + result = ItemSet() + for core, context in self.items.items(): + if core.next == symbol: + next = core.replace_position(core.position + 1) + result.items[next] = set(context) + return result + + def to_config_set(self) -> ConfigSet: + return ConfigSet( + {Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()} + ) + + class ConfigurationSetInfo: """When we build a grammar into a table, the first thing we need to do is generate all the configuration sets and their successors. @@ -1007,98 +1099,6 @@ class FollowInfo: return FollowInfo(follows=follows) -# Here we have a slightly different definition of a ConfigurationSet; we keep the -# lookaheads outside and use a dictionary to check for containment quickly. -# ItemSet is used in the GRM/Pager/Chin algorithm. -@dataclasses.dataclass -class ItemSet: - """An ItemSet is a group of configuration cores together with their - "contexts", or lookahead sets. 
- - An ItemSet is comparable for equality, and also supports this lesser notion - of "weakly compatible" which is used to collapse states in the pager - algorithm. - """ - - items: dict[ConfigurationCore, set[int]] - - def __init__(self, items=None): - self.items = items or {} - - @classmethod - def from_config_set(cls, config_set: ConfigSet) -> "ItemSet": - return ItemSet({config.core: set(config.lookahead) for config in config_set}) - - def weakly_compatible(self, other: "ItemSet") -> bool: - a = self.items - b = other.items - - if len(a) != len(b): - return False - - for acore in a: - if acore not in b: - return False - - if len(a) == 1: - return True - - # DOTY: This loop I do not understand, truly. What the heck is happening here? - a_keys = list(a.keys()) - for i, i_key in enumerate(itertools.islice(a_keys, 0, len(a_keys) - 1)): - for j_key in itertools.islice(a_keys, i + 1, None): - a_i_key = a[i_key] - b_i_key = b[i_key] - a_j_key = a[j_key] - b_j_key = b[j_key] - - # DOTY: GRMTools written with intersects(); we don't have that we have - # `not disjoint()`. :P There are many double negatives.... 
- # - # not (intersect(a_i, b_j) or intersect(a_j, b_i)) - # not ((not disjoint(a_i, b_j)) or (not disjoint(a_j, b_i))) - # ((not not disjoint(a_i, b_j)) and (not not disjoint(a_j, b_i))) - # disjoint(a_i, b_j) and disjoint(a_j, b_i) - if a_i_key.isdisjoint(b_j_key) and a_j_key.isdisjoint(b_i_key): - continue - - # intersect(a_i, a_j) or intersect(b_i, b_j) - # (not disjoint(a_i, a_j)) or (not disjoint(b_i, b_j)) - # not (disjoint(a_i, a_j) and disjoint(b_i, b_j)) - if not (a_i_key.isdisjoint(a_j_key) and b_i_key.isdisjoint(b_j_key)): - continue - - return False - - return True - - def weakly_merge(self, other: "ItemSet") -> bool: - """Merge b into a, returning True if this lead to any changes.""" - a = self.items - b = other.items - - changed = False - for a_key, a_ctx in a.items(): - start_len = len(a_ctx) - a_ctx.update(b[a_key]) # Python doesn't tell us changes - changed = changed or (start_len != len(a_ctx)) - - return changed - - def goto(self, symbol: int) -> "ItemSet": - result = ItemSet() - for core, context in self.items.items(): - if core.next == symbol: - next = core.replace_position(core.position + 1) - result.items[next] = set(context) - return result - - def to_config_set(self) -> ConfigSet: - return ConfigSet( - {Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()} - ) - - class GenerateLR1: """Generate parse tables for LR1, or "canonical LR" grammars. 
From e501caa073b2bf8a6426c6960e0b0dfae50ce475 Mon Sep 17 00:00:00 2001 From: John Doty Date: Sat, 26 Oct 2024 06:53:53 -0700 Subject: [PATCH 2/5] [parser] Remove unused import --- parser/parser.py | 1 - 1 file changed, 1 deletion(-) diff --git a/parser/parser.py b/parser/parser.py index 6cfc2fb..a901ec2 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -135,7 +135,6 @@ import bisect import collections import dataclasses import enum -import functools import inspect import itertools import json From 2b72811486b3cb60d47ccd452519a016506d9293 Mon Sep 17 00:00:00 2001 From: John Doty Date: Sat, 26 Oct 2024 06:56:30 -0700 Subject: [PATCH 3/5] [parser] ConfigurationSetInfo -> StateGraph --- parser/parser.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/parser/parser.py b/parser/parser.py index a901ec2..fdd724e 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -365,7 +365,7 @@ class ItemSet: ) -class ConfigurationSetInfo: +class StateGraph: """When we build a grammar into a table, the first thing we need to do is generate all the configuration sets and their successors. @@ -598,7 +598,7 @@ class ErrorCollection: def gen_exception( self, alphabet: list[str], - all_sets: ConfigurationSetInfo, + all_sets: StateGraph, ) -> AmbiguityError | None: """Format all the errors into an error, or return None if there are no errors. @@ -735,7 +735,7 @@ class TableBuilder(object): self.action_row = None self.goto_row = None - def flush(self, all_sets: ConfigurationSetInfo) -> ParseTable: + def flush(self, all_sets: StateGraph) -> ParseTable: """Finish building the table and return it. Raises ValueError if there were any conflicts during construction. 
@@ -1333,9 +1333,9 @@ class GenerateLR1: return next - def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo: + def gen_sets(self, seeds: list[Configuration]) -> StateGraph: """Generate all configuration sets starting from the provided seeds.""" - result = ConfigurationSetInfo() + result = StateGraph() successors = [] pending = [ConfigSet(seeds)] @@ -1568,7 +1568,7 @@ class GeneratePager(GenerateLR1): > Xin Chen, PhD thesis, University of Hawaii, 2009 """ - def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo: + def gen_sets(self, seeds: list[Configuration]) -> StateGraph: # This function can be seen as a modified version of items() from # Chen's dissertation. # @@ -1714,7 +1714,7 @@ class GeneratePager(GenerateLR1): # Register all the actually merged, final config sets. I should *not* # have to do all this work. Really really garbage. - result = ConfigurationSetInfo() + result = StateGraph() result.sets = [core_state.to_config_set() for core_state, _ in gc_states] result.core_key = {s: i for i, s in enumerate(result.sets)} result.closures = [closed_state.to_config_set() for _, closed_state in gc_states] From 27e6bb413cbd75c1757994f008954eaf73ee4ec9 Mon Sep 17 00:00:00 2001 From: John Doty Date: Sat, 26 Oct 2024 07:25:37 -0700 Subject: [PATCH 4/5] [parser] Remove Canonical LR1 generator This is fine probably. --- parser/parser.py | 485 +++++++++++++++--------------------------- tests/test_grammar.py | 9 +- 2 files changed, 177 insertions(+), 317 deletions(-) diff --git a/parser/parser.py b/parser/parser.py index fdd724e..c99bed5 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -1098,16 +1098,36 @@ class FollowInfo: return FollowInfo(follows=follows) -class GenerateLR1: - """Generate parse tables for LR1, or "canonical LR" grammars. +class ParserGenerator: + """Generate parse tables for LR1 grammars. - LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they - are choosier about when they reduce. 
But unlike SLR parsers, they specify - the terminals on which they reduce by carrying a 'lookahead' terminal in - the configuration. The lookahead of a configuration is computed as the - closure of a configuration set is computed, so see gen_closure_next for - details. (Except for the start configuration, which has '$' as its - lookahead.) + This class implements a variant of Pager's algorithm to generate the parse + tables, which support the same set of languages as Canonical LR1 but with + much smaller resulting parse tables. + + I'll be honest, I don't understand this one as well as the pure LR1 + algorithm. It proceeds as LR1, generating successor states, but every + time it makes a new state it searches the states it has already made for + one that is "weakly compatible;" if it finds one it merges the new state + with the old state and marks the old state to be re-visited. + + The implementation here follows from the implementation in + `GRMTools`_. + + As they explain there: + + > The general algorithms that form the basis of what's used in this file + > can be found in: + > + > A Practical General Method for Constructing LR(k) Parsers + > David Pager, Acta Informatica 7, 249--268, 1977 + > + > However Pager's paper is dense, and doesn't name sub-parts of the + > algorithm. We mostly reference the (still incomplete, but less + > incomplete) version of the algorithm found in: + > + > Measuring and extending LR(1) parser generation + > Xin Chen, PhD thesis, University of Hawaii, 2009 """ # Internally we use integers as symbols, not strings. Mostly this is fine, @@ -1170,9 +1190,9 @@ class GenerateLR1: non-terminal being added, and the second elment of the tuple is the list of terminals and non-terminals that make up the production. - There is currently no support for custom actions or alternation or - anything like that. If you want alternations that you'll have to lower - the grammar by hand into the simpler form first. + There is no support for alternation. 
If you want alternations then + you'll have to lower the grammar by hand into the simpler form first, + but that's what the Grammar and NonTerminal classes are for. Don't name anything with double-underscores; those are reserved for the generator. Don't add '$' either, as it is reserved to mean @@ -1272,302 +1292,6 @@ class GenerateLR1: self._firsts, ) - def gen_closure(self, seeds: typing.Iterable[Configuration]) -> ConfigSet: - """Compute the closure for the specified configs. The closure is all - of the configurations we could be in. Specifically, if the position - for a config is just before a non-terminal then we must also consider - configurations where the rule is the rule for the non-terminal and - the position is just before the beginning of the rule. - - (We have replaced a recursive version with an iterative one.) - """ - closure: set[Configuration] = set() - pending = list(seeds) - pending_next = [] - while len(pending) > 0: - for config in pending: - if config in closure: - continue - - closure.add(config) - pending_next.extend(self.gen_closure_next(config)) - - temp = pending - pending = pending_next - pending_next = temp - pending_next.clear() - - # NOTE: The generation of this closure *might* have generated - # multiple cores with different lookaheads; if that's - # the case we need to merge. - merged: dict[ConfigurationCore, set[int]] = {} - for c in closure: - existing = merged.get(c.core) - if existing is not None: - existing.update(c.lookahead) - else: - merged[c.core] = set(c.lookahead) - - return ConfigSet(Configuration(k, tuple(sorted(v))) for k, v in merged.items()) - - def gen_all_successors( - self, config_set: typing.Iterable[Configuration] - ) -> list[typing.Tuple[int, ConfigSet]]: - """Return all of the non-empty successors for the given config set. - - (That is, given the config set, pretend we see all the symbols we - could possibly see, and figure out which configs sets we get from - those symbols. 
Those are the successors of this set.) - """ - possible = {config.core.next for config in config_set if config.core.next is not None} - - next = [] - for symbol in possible: - seeds = ConfigSet( - config.replace_position(config.core.position + 1) - for config in config_set - if config.core.next == symbol - ) - if len(seeds) > 0: - next.append((symbol, seeds)) - - return next - - def gen_sets(self, seeds: list[Configuration]) -> StateGraph: - """Generate all configuration sets starting from the provided seeds.""" - result = StateGraph() - - successors = [] - pending = [ConfigSet(seeds)] - pending_next = [] - while len(pending) > 0: - for core in pending: - id, is_new = result.register_core(core) - if is_new: - config_set = self.gen_closure(core) - result.register_config_closure(id, config_set) - for symbol, successor in self.gen_all_successors(config_set): - successors.append((id, symbol, successor)) - pending_next.append(successor) - - temp = pending - pending = pending_next - pending_next = temp - pending_next.clear() - - for id, symbol, successor in successors: - result.add_successor(id, symbol, result.core_key[successor]) - - return result - - def gen_follow(self, symbol: int) -> set[int]: - """Generate the follow set for the given nonterminal. - - The follow set for a nonterminal is the set of terminals that can - follow the nonterminal in a valid sentence. The resulting set never - contains epsilon and is never empty, since we should always at least - ground out at '$', which is the end-of-stream marker. - - See FollowInfo for more information on how this is determined. - """ - return self._follows.follows[symbol] - - def gen_first(self, symbols: typing.Iterable[int]) -> typing.Tuple[set[int], bool]: - """Return the first set for a *sequence* of symbols. - - (This is more than FIRST: we need to know the first thing that can - happen in this particular sequence right here.) 
- - Build the set by combining the first sets of the symbols from left to - right as long as epsilon remains in the first set. If we reach the end - and every symbol has had epsilon, then this set also has epsilon. - - Otherwise we can stop as soon as we get to a non-epsilon first(), and - our result does not have epsilon. - """ - result = set() - for s in symbols: - result.update(self._firsts.firsts[s]) - if not self._firsts.is_epsilon[s]: - return (result, False) - - return (result, True) - - def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]: - """Return the set of symbols that indicate we should reduce the given - config. - - In an LR1 parser, this is the lookahead of the configuration. - """ - return config.lookahead - - def gen_closure_next(self, config: Configuration): - """Return the next set of configurations in the closure for config. - - In LR1 parsers, we must compute the lookahead for the configurations - we're adding to the closure. The lookahead for the new configurations - is the first() of the rest of this config's production. If that - contains epsilon, then the lookahead *also* contains the lookahead we - already have. (This lookahead was presumably generated by the same - process, so in some sense it is a 'parent' lookahead, or a lookahead - from an upstream production in the grammar.) - - (See the documentation in GenerateLR0 for more information on how - this function fits into the whole process, specifically `gen_closure`.) 
- """ - config_next = config.core.next - if config_next is None: - return () - else: - lookahead, epsilon = self.gen_first(config.rest) - if epsilon: - lookahead.update(config.lookahead) - lookahead_tuple = tuple(sorted(lookahead)) - - next = [] - for rule in self.grammar[config_next]: - rr = Configuration.from_rule(config_next, rule, lookahead=lookahead_tuple) - next.append(rr) - - return tuple(next) - - def gen_closure_x(self, items: ItemSet) -> ItemSet: - closure: dict[ConfigurationCore, set[int]] = {} - - # We're going to maintain a set of things to look at, rules that we - # still need to close over. Assume that starts with everything in us. - todo = [(core, context) for core, context in items.items.items()] - while len(todo) > 0: - core, context = todo.pop() - - existing_context = closure.get(core) - if existing_context is None or not context <= existing_context: - # Either context is none or something in context is not in - # existing_context, so we need to process this one. - if existing_context is not None: - existing_context.update(context) - else: - # NOTE: context in the set is a lookahead and got - # generated exactly once for all the child rules. - # we have to copy somewhere, this here seems best. - closure[core] = set(context) - - config_next = core.next - if config_next is None: - # No closure for this one, we're at the end. - continue - - rules = self.grammar[config_next] - if len(rules) > 0: - lookahead, epsilon = self.gen_first(core.rest) - print(f" LA {core.rest} -> {lookahead} e:{epsilon}") - if epsilon: - lookahead.update(context) - - for rule in rules: - new_core = ConfigurationCore.from_rule(config_next, rule) - todo.append((new_core, lookahead)) - - return ItemSet(closure) - - def gen_all_sets(self): - """Generate all of the configuration sets for the grammar. - - In LR1 parsers, we must remember to set the lookahead of the start - symbol to '$'. 
- """ - seeds = [ - Configuration.from_rule(self.start_symbol, rule, lookahead=(self.end_symbol,)) - for rule in self.grammar[self.start_symbol] - ] - return self.gen_sets(seeds) - - def gen_table(self) -> ParseTable: - """Generate the parse table. - - The parse table is a list of states. The first state in the list is - the starting state. Each state is a dictionary that maps a symbol to an - action. Each action is a tuple. The first element of the tuple is a - string describing what to do: - - - 'shift': The second element of the tuple is the state - number. Consume the input and push that state onto the stack. - - - 'reduce': The second element is the name of the non-terminal being - reduced, and the third element is the number of states to remove - from the stack. Don't consume the input; just remove the specified - number of things from the stack, and then consult the table again, - this time using the new top-of-stack as the current state and the - name of the non-terminal to find out what to do. - - - 'goto': The second element is the state number to push onto the - stack. In the literature, these entries are treated distinctly from - the actions, but we mix them here because they never overlap with the - other actions. (These are always associated with non-terminals, and - the other actions are always associated with terminals.) - - - 'accept': Accept the result of the parse, it worked. - - Anything missing from the row indicates an error. 
- """ - config_sets = self.gen_all_sets() - # print(config_sets.dump_state(self.alphabet)) - builder = TableBuilder(self.alphabet, self.precedence, self.transparents) - - for config_set_id, config_set in enumerate(config_sets.closures): - assert config_set is not None - builder.new_row(config_set) - successors = config_sets.successors[config_set_id] - - for config in config_set: - config_next = config.core.next - if config_next is None: - if config.core.name != self.start_symbol: - for a in self.gen_reduce_set(config): - builder.set_table_reduce(a, config) - else: - builder.set_table_accept(self.end_symbol, config) - - elif self.terminal[config_next]: - index = successors[config_next] - builder.set_table_shift(config_next, index, config) - - # Gotos - for symbol, index in successors.items(): - if self.nonterminal[symbol]: - builder.set_table_goto(symbol, index) - - return builder.flush(config_sets) - - -class GeneratePager(GenerateLR1): - """Pager's algorithm. - - I'll be honest, I don't understnd this one as well as the pure LR1 - algorithm. It proceeds as LR1, generating successor states, but every - time it makes a new state it searches the states it has already made for - one that is "weakly compatible;" ifit finds one it merges the new state - with the old state and marks the old state to be re-visited. - - The implementation here follows from the implementation in - `GRMTools`_. - - As they explain there: - - > The general algorithms that form the basis of what's used in this file - > can be found in: - > - > A Practical General Method for Constructing LR(k) Parsers - > David Pager, Acta Informatica 7, 249--268, 1977 - > - > However Pager's paper is dense, and doesn't name sub-parts of the - > algorithm. 
We mostly reference the (still incomplete, but less - > incomplete) version of the algorithm found in: - > - > Measuring and extending LR(1) parser generation - > Xin Chen, PhD thesis, University of Hawaii, 2009 - """ - def gen_sets(self, seeds: list[Configuration]) -> StateGraph: # This function can be seen as a modified version of items() from # Chen's dissertation. @@ -1626,7 +1350,7 @@ class GeneratePager(GenerateLR1): todo_off = state_i + 1 todo -= 1 - cl_state = self.gen_closure_x(core_states[state_i]) + cl_state = self.gen_closure(core_states[state_i]) closed_states[state_i] = cl_state seen.clear() @@ -1782,6 +1506,143 @@ class GeneratePager(GenerateLR1): return (gc_states, gc_edges) + def gen_first(self, symbols: typing.Iterable[int]) -> typing.Tuple[set[int], bool]: + """Return the first set for a *sequence* of symbols. + + (This is more than FIRST: we need to know the first thing that can + happen in this particular sequence right here.) + + Build the set by combining the first sets of the symbols from left to + right as long as epsilon remains in the first set. If we reach the end + and every symbol has had epsilon, then this set also has epsilon. + + Otherwise we can stop as soon as we get to a non-epsilon first(), and + our result does not have epsilon. + """ + result = set() + for s in symbols: + result.update(self._firsts.firsts[s]) + if not self._firsts.is_epsilon[s]: + return (result, False) + + return (result, True) + + def gen_closure(self, items: ItemSet) -> ItemSet: + """Generate the closure of the given ItemSet. + + Some of the configurations in the ItemSet might be positioned right before + nonterminals. In that case, obviously, we should *also* behave as if we + were right at the beginning of each production for that nonterminal. The + set of all those productions combined with all the incoming productions + is the closure. 
+ """ + closure: dict[ConfigurationCore, set[int]] = {} + + # We're going to maintain a set of things to look at, rules that we + # still need to close over. Assume that starts with everything in us. + todo = [(core, context) for core, context in items.items.items()] + while len(todo) > 0: + core, context = todo.pop() + + existing_context = closure.get(core) + if existing_context is None or not context <= existing_context: + # Either context is none or something in context is not in + # existing_context, so we need to process this one. + if existing_context is not None: + existing_context.update(context) + else: + # NOTE: context in the set is a lookahead and got + # generated exactly once for all the child rules. + # we have to copy somewhere, this here seems best. + closure[core] = set(context) + + config_next = core.next + if config_next is None: + # No closure for this one, we're at the end. + continue + + rules = self.grammar[config_next] + if len(rules) > 0: + lookahead, epsilon = self.gen_first(core.rest) + print(f" LA {core.rest} -> {lookahead} e:{epsilon}") + if epsilon: + lookahead.update(context) + + for rule in rules: + new_core = ConfigurationCore.from_rule(config_next, rule) + todo.append((new_core, lookahead)) + + return ItemSet(closure) + + def gen_all_sets(self): + """Generate all of the configuration sets for the grammar. + + In LR1 parsers, we must remember to set the lookahead of the start + symbol to '$'. + """ + seeds = [ + Configuration.from_rule(self.start_symbol, rule, lookahead=(self.end_symbol,)) + for rule in self.grammar[self.start_symbol] + ] + return self.gen_sets(seeds) + + def gen_table(self) -> ParseTable: + """Generate the parse table. + + The parse table is a list of states. The first state in the list is + the starting state. Each state is a dictionary that maps a symbol to an + action. Each action is a tuple. 
The first element of the tuple is a + string describing what to do: + + - 'shift': The second element of the tuple is the state + number. Consume the input and push that state onto the stack. + + - 'reduce': The second element is the name of the non-terminal being + reduced, and the third element is the number of states to remove + from the stack. Don't consume the input; just remove the specified + number of things from the stack, and then consult the table again, + this time using the new top-of-stack as the current state and the + name of the non-terminal to find out what to do. + + - 'goto': The second element is the state number to push onto the + stack. In the literature, these entries are treated distinctly from + the actions, but we mix them here because they never overlap with the + other actions. (These are always associated with non-terminals, and + the other actions are always associated with terminals.) + + - 'accept': Accept the result of the parse, it worked. + + Anything missing from the row indicates an error. 
+ """ + config_sets = self.gen_all_sets() + # print(config_sets.dump_state(self.alphabet)) + builder = TableBuilder(self.alphabet, self.precedence, self.transparents) + + for config_set_id, config_set in enumerate(config_sets.closures): + assert config_set is not None + builder.new_row(config_set) + successors = config_sets.successors[config_set_id] + + for config in config_set: + config_next = config.core.next + if config_next is None: + if config.core.name != self.start_symbol: + for a in config.lookahead: + builder.set_table_reduce(a, config) + else: + builder.set_table_accept(self.end_symbol, config) + + elif self.terminal[config_next]: + index = successors[config_next] + builder.set_table_shift(config_next, index, config) + + # Gotos + for symbol, index in successors.items(): + if self.nonterminal[symbol]: + builder.set_table_goto(symbol, index) + + return builder.flush(config_sets) + FlattenedWithMetadata = list["str|Terminal|tuple[dict[str,typing.Any],FlattenedWithMetadata]"] @@ -3008,7 +2869,7 @@ class Grammar: """ _precedence: dict[str, typing.Tuple[Assoc, int]] - _generator: type[GenerateLR1] + _generator: type[ParserGenerator] _terminals: dict[str, Terminal] _nonterminals: dict[str, NonTerminal] _trivia: list[Terminal] @@ -3017,7 +2878,7 @@ class Grammar: self, start: str | NonTerminal | None = None, precedence: PrecedenceList | None = None, - generator: type[GenerateLR1] | None = None, + generator: type[ParserGenerator] | None = None, trivia: list[str | Terminal] | None = None, name: str | None = None, ): @@ -3036,7 +2897,7 @@ class Grammar: assert precedence is not None if generator is None: - generator = getattr(self, "generator", GeneratePager) + generator = getattr(self, "generator", ParserGenerator) assert generator is not None if trivia is None: diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 870e5b8..c12380b 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -87,8 +87,8 @@ def test_all_generators(): GENERATORS = [ 
# parser.GenerateLR0, - parser.GeneratePager, - parser.GenerateLR1, + # parser.GeneratePager, + parser.ParserGenerator, ] for generator in GENERATORS: table = G().build_table(generator=generator) @@ -119,15 +119,14 @@ def test_grammar_aho_ullman_2(): A = Terminal("a") B = Terminal("b") - TestGrammar().build_table(generator=parser.GenerateLR1) - TestGrammar().build_table(generator=parser.GeneratePager) + TestGrammar().build_table(generator=parser.ParserGenerator) + # TestGrammar().build_table(generator=parser.GeneratePager) def test_fun_lalr(): class TestGrammar(Grammar): start = "S" - generator = parser.GeneratePager @rule def S(self): From 923b01f6fde917d641b6d26cffb690d771e46ee3 Mon Sep 17 00:00:00 2001 From: John Doty Date: Sat, 26 Oct 2024 07:35:28 -0700 Subject: [PATCH 5/5] [parser] Simplify StateGraph --- parser/parser.py | 70 +++++++++--------------------------------------- 1 file changed, 13 insertions(+), 57 deletions(-) diff --git a/parser/parser.py b/parser/parser.py index c99bed5..ee228ad 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -365,6 +365,7 @@ class ItemSet: ) +@dataclasses.dataclass class StateGraph: """When we build a grammar into a table, the first thing we need to do is generate all the configuration sets and their successors. @@ -380,65 +381,23 @@ class StateGraph: structure, but they all compute this information.) """ - core_key: dict[ConfigSet, int] # Map a ConfigSet into am index - config_set_key: dict[ConfigSet, int] # Map a ConfigSet into am index - sets: list[ConfigSet] # Map the index back into a set - closures: list[ConfigSet | None] # Track closures + closures: list[ConfigSet] # All the sucessors for all of the sets. `successors[i]` is the mapping # from grammar symbol to the index of the set you get by processing that # symbol. 
successors: list[dict[int, int]] - def __init__(self): - self.core_key = {} - self.config_set_key = {} - self.sets = [] - self.closures = [] - self.successors = [] - - def register_core(self, c: ConfigSet) -> typing.Tuple[int, bool]: - """Potentially add a new config set to the set of sets. Returns the - canonical ID of the set within this structure, along with a boolean - indicating whether the set was just added or not. - - (You can use this integer to get the set back, if you need it, and - also access the successors table.) - """ - existing = self.core_key.get(c) - if existing is not None: - return existing, False - - index = len(self.sets) - self.sets.append(c) - self.closures.append(None) - self.successors.append({}) - self.core_key[c] = index - return index, True - - def register_config_closure(self, c_id: int, closure: ConfigSet): - assert self.closures[c_id] is None - self.closures[c_id] = closure - self.config_set_key[closure] = c_id - - def add_successor(self, c_id: int, symbol: int, successor: int): - """Register sucessor(`c_id`, `symbol`) -> `successor`, where c_id - is the id of the set in this structure, and symbol is the id of a - symbol in the alphabet of the grammar. - """ - self.successors[c_id][symbol] = successor - def dump_state(self, alphabet: list[str]) -> str: return json.dumps( { str(set_index): { - "configs": [c.format(alphabet) for c in config_set], - "closures": [c.format(alphabet) for c in self.closures[set_index] or []], - "successors": { - alphabet[k]: str(v) for k, v in self.successors[set_index].items() - }, + "closures": [c.format(alphabet) for c in closure], + "successors": {alphabet[k]: str(v) for k, v in successors.items()}, } - for set_index, config_set in enumerate(self.sets) + for set_index, (closure, successors) in enumerate( + zip(self.closures, self.successors) + ) }, indent=4, sort_keys=True, @@ -455,7 +414,8 @@ class StateGraph: This function raises KeyError if no path is found. 
""" - target_index = self.config_set_key[target_set] + # TODO: This should be tested. + target_index = self.closures.index(target_set) visited = set() queue: collections.deque = collections.deque() @@ -1438,14 +1398,10 @@ class ParserGenerator: # Register all the actually merged, final config sets. I should *not* # have to do all this work. Really really garbage. - result = StateGraph() - result.sets = [core_state.to_config_set() for core_state, _ in gc_states] - result.core_key = {s: i for i, s in enumerate(result.sets)} - result.closures = [closed_state.to_config_set() for _, closed_state in gc_states] - result.config_set_key = {s: i for i, s in enumerate(result.closures) if s is not None} - result.successors = gc_edges - - return result + return StateGraph( + closures=[closed_state.to_config_set() for _, closed_state in gc_states], + successors=gc_edges, + ) def gc( self,