From 385c378edbd8ebeeef3a5eee2356a6c5024276fc Mon Sep 17 00:00:00 2001 From: John Doty Date: Sat, 26 Oct 2024 07:51:13 -0700 Subject: [PATCH] [parser] Everything is an ItemSet now --- parser/parser.py | 128 ++++++++++++----------------------------------- 1 file changed, 33 insertions(+), 95 deletions(-) diff --git a/parser/parser.py b/parser/parser.py index ee228ad..2a7c872 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -146,7 +146,7 @@ import typing # # We start with LR0 parsers, because they form the basis of everything else. ############################################################################### -class ConfigurationCore(typing.NamedTuple): +class Configuration(typing.NamedTuple): """A core configuration, basically, a position within a rule. These need to be as small and as tight as you can make them. They are @@ -170,7 +170,7 @@ class ConfigurationCore(typing.NamedTuple): next = None else: next = symbols[0] - return ConfigurationCore( + return Configuration( name=name, symbols=symbols, position=0, @@ -186,7 +186,7 @@ class ConfigurationCore(typing.NamedTuple): next = None else: next = self.symbols[new_position] - return ConfigurationCore( + return Configuration( name=self.name, symbols=self.symbols, position=new_position, @@ -222,57 +222,6 @@ class ConfigurationCore(typing.NamedTuple): ) -class Configuration(typing.NamedTuple): - """A rule being tracked in a state. That is, a specific position within a - specific rule, with an associated lookahead state. - - (Note: technically, lookahead isn't used until we get to LR(1) parsers, - but if left at its default it's harmless. Ignore it until you get to - the part about LR(1).) - """ - - core: ConfigurationCore - lookahead: typing.Tuple[int, ...] - - @classmethod - def from_rule(cls, name: int, symbols: typing.Tuple[int, ...], lookahead=()): - # Consider adding at_end and next to the namedtuple. - return Configuration( - core=ConfigurationCore.from_rule(name, symbols), - lookahead=lookahead, - ) - - @property - def at_end(self) -> bool: - return self.core.next is None - - def replace_position(self, new_position): - return Configuration( - core=self.core.replace_position(new_position), - lookahead=self.lookahead, - ) - - @property - def rest(self): - return self.core.symbols[(self.core.position + 1) :] - - def __repr__(self) -> str: - la = ", " + str(self.lookahead) if self.lookahead != () else "" - return f"{repr(self.core)}{la}" - - def format(self, alphabet: list[str]) -> str: - if self.lookahead != (): - la = " ctx:{" + ",".join(alphabet[i] for i in self.lookahead) + "}" - else: - la = " ctx:{}" - - return f"{self.core.format(alphabet)}{la}" - - -class ConfigSet(frozenset[Configuration]): - pass - - # Here we have a slightly different definition of a ConfigurationSet; we keep # the lookaheads outside and use a dictionary to check for containment # quickly. ItemSet is used in the GRM/Pager/Chin algorithm. @@ -286,15 +235,11 @@ class ItemSet: algorithm. """ - items: dict[ConfigurationCore, set[int]] + items: dict[Configuration, set[int]] def __init__(self, items=None): self.items = items or {} - @classmethod - def from_config_set(cls, config_set: ConfigSet) -> "ItemSet": - return ItemSet({config.core: set(config.lookahead) for config in config_set}) - def weakly_compatible(self, other: "ItemSet") -> bool: a = self.items b = other.items @@ -359,11 +304,6 @@ class ItemSet: result.items[next] = set(context) return result - def to_config_set(self) -> ConfigSet: - return ConfigSet( - {Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()} - ) - @dataclasses.dataclass class StateGraph: @@ -381,7 +321,7 @@ class StateGraph: structure, but they all compute this information.) """ - closures: list[ConfigSet] + closures: list[ItemSet] # All the sucessors for all of the sets. `successors[i]` is the mapping # from grammar symbol to the index of the set you get by processing that @@ -392,7 +332,7 @@ class StateGraph: return json.dumps( { str(set_index): { - "closures": [c.format(alphabet) for c in closure], + "closures": [f"{c.format(alphabet)} -> {l}" for c, l in closure.items.items()], "successors": {alphabet[k]: str(v) for k, v in successors.items()}, } for set_index, (closure, successors) in enumerate( @@ -403,14 +343,14 @@ class StateGraph: sort_keys=True, ) - def find_path_to_set(self, target_set: ConfigSet) -> list[int]: + def find_path_to_set(self, target_set: ItemSet) -> list[int]: """Trace the path of grammar symbols from the first set (which always set 0) to the target set. This is useful in conflict reporting, - because we'll be *at* a ConfigSet and want to show the grammar symbols + because we'll be *at* an ItemSet and want to show the grammar symbols that get us to where we found the conflict. The return value is a list of grammar symbols to get to the specified - ConfigSet. + ItemSet. This function raises KeyError if no path is found. """ @@ -518,7 +458,7 @@ class ErrorCollection: the error. """ - errors: dict[ConfigSet, dict[int, dict[Configuration, Action]]] + errors: dict[ItemSet, dict[int, dict[Configuration, Action]]] def __init__(self): self.errors = {} @@ -529,7 +469,7 @@ class ErrorCollection: def add_error( self, - config_set: ConfigSet, + config_set: ItemSet, symbol: int, config: Configuration, action: Action, @@ -581,11 +521,10 @@ class ErrorCollection: for symbol, symbol_errors in set_errors.items(): actions = [] for config, action in symbol_errors.items(): - core = config.core - name = alphabet[core.name] + name = alphabet[config.name] rule = " ".join( - f"{'* ' if core.position == i else ''}{alphabet[s]}" - for i, s in enumerate(core.symbols) + f"{'* ' if config.position == i else ''}{alphabet[s]}" + for i, s in enumerate(config.symbols) ) if config.at_end: rule += " *" @@ -707,7 +646,7 @@ class TableBuilder(object): return ParseTable(actions=self.actions, gotos=self.gotos, trivia=set()) - def new_row(self, config_set: ConfigSet): + def new_row(self, config_set: ItemSet): """Start a new row, processing the given config set. Call this before doing anything else. """ @@ -735,9 +674,9 @@ class TableBuilder(object): """Mark a reduce of the given configuration for the given symbol in the current row. """ - name = self.alphabet[config.core.name] + name = self.alphabet[config.name] transparent = name in self.transparents - action = Reduce(name, len(config.core.symbols), transparent) + action = Reduce(name, len(config.symbols), transparent) self._set_table_action(symbol, action, config) def set_table_accept(self, symbol: int, config: Configuration): @@ -768,7 +707,7 @@ class TableBuilder(object): if isinstance(action, Shift): return self.precedence[symbol] else: - return self.precedence[config.core.name] + return self.precedence[config.name] def _set_table_action(self, symbol_id: int, action: ParseAction, config: Configuration | None): """Set the action for 'symbol' in the table row to 'action'. @@ -1252,7 +1191,7 @@ class ParserGenerator: self._firsts, ) - def gen_sets(self, seeds: list[Configuration]) -> StateGraph: + def gen_sets(self, seeds: ItemSet) -> StateGraph: # This function can be seen as a modified version of items() from # Chen's dissertation. # @@ -1270,10 +1209,7 @@ class ParserGenerator: core_states: list[ItemSet] = [] edges: list[dict[int, int]] = [] - # Convert the incoming seed configurations into item sets. - # TODO: Convert everything to ItemSet natively. - state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds}) - core_states.append(state0) + core_states.append(seeds) closed_states.append(None) edges.append({}) @@ -1399,7 +1335,7 @@ class ParserGenerator: # Register all the actually merged, final config sets. I should *not* # have to do all this work. Really really garbage. return StateGraph( - closures=[closed_state.to_config_set() for _, closed_state in gc_states], + closures=[closed_state for _, closed_state in gc_states], successors=gc_edges, ) @@ -1492,7 +1428,7 @@ class ParserGenerator: set of all those productions combined with all the incoming productions is the closure. """ - closure: dict[ConfigurationCore, set[int]] = {} + closure: dict[Configuration, set[int]] = {} # We're going to maintain a set of things to look at, rules that we # still need to close over. Assume that starts with everything in us. @@ -1525,7 +1461,7 @@ class ParserGenerator: lookahead.update(context) for rule in rules: - new_core = ConfigurationCore.from_rule(config_next, rule) + new_core = Configuration.from_rule(config_next, rule) todo.append((new_core, lookahead)) return ItemSet(closure) @@ -1536,10 +1472,12 @@ class ParserGenerator: In LR1 parsers, we must remember to set the lookahead of the start symbol to '$'. """ - seeds = [ - Configuration.from_rule(self.start_symbol, rule, lookahead=(self.end_symbol,)) - for rule in self.grammar[self.start_symbol] - ] + seeds = ItemSet( + { + Configuration.from_rule(self.start_symbol, rule): {self.end_symbol} + for rule in self.grammar[self.start_symbol] + } + ) return self.gen_sets(seeds) def gen_table(self) -> ParseTable: @@ -1579,11 +1517,11 @@ class ParserGenerator: builder.new_row(config_set) successors = config_sets.successors[config_set_id] - for config in config_set: - config_next = config.core.next + for config, lookahead in config_set.items.items(): + config_next = config.next if config_next is None: - if config.core.name != self.start_symbol: - for a in config.lookahead: + if config.name != self.start_symbol: + for a in lookahead: builder.set_table_reduce(a, config) else: builder.set_table_accept(self.end_symbol, config)