[parser] Everything is an ItemSet now

This commit is contained in:
John Doty 2024-10-26 07:51:13 -07:00
parent 923b01f6fd
commit 385c378edb

View file

@ -146,7 +146,7 @@ import typing
# #
# We start with LR0 parsers, because they form the basis of everything else. # We start with LR0 parsers, because they form the basis of everything else.
############################################################################### ###############################################################################
class ConfigurationCore(typing.NamedTuple): class Configuration(typing.NamedTuple):
"""A core configuration, basically, a position within a rule. """A core configuration, basically, a position within a rule.
These need to be as small and as tight as you can make them. They are These need to be as small and as tight as you can make them. They are
@ -170,7 +170,7 @@ class ConfigurationCore(typing.NamedTuple):
next = None next = None
else: else:
next = symbols[0] next = symbols[0]
return ConfigurationCore( return Configuration(
name=name, name=name,
symbols=symbols, symbols=symbols,
position=0, position=0,
@ -186,7 +186,7 @@ class ConfigurationCore(typing.NamedTuple):
next = None next = None
else: else:
next = self.symbols[new_position] next = self.symbols[new_position]
return ConfigurationCore( return Configuration(
name=self.name, name=self.name,
symbols=self.symbols, symbols=self.symbols,
position=new_position, position=new_position,
@ -222,57 +222,6 @@ class ConfigurationCore(typing.NamedTuple):
) )
class Configuration(typing.NamedTuple):
    """A position within a rule (the core) paired with its lookahead symbols.

    The lookahead only matters once LR(1) parsing is involved; when it is
    left at the empty default it has no effect on the core, so it can be
    ignored until then.
    """

    core: ConfigurationCore
    lookahead: typing.Tuple[int, ...]

    @classmethod
    def from_rule(cls, name: int, symbols: typing.Tuple[int, ...], lookahead=()):
        """Build a configuration for rule `name` -> `symbols`.

        The core is produced by `ConfigurationCore.from_rule`; `lookahead`
        is stored unchanged.
        """
        # Consider adding at_end and next to the namedtuple.
        start = ConfigurationCore.from_rule(name, symbols)
        return Configuration(start, lookahead)

    @property
    def at_end(self) -> bool:
        """True when the core has no next symbol."""
        upcoming = self.core.next
        return upcoming is None

    def replace_position(self, new_position):
        """Return a configuration whose core is moved to `new_position`,
        keeping the same lookahead.
        """
        moved = self.core.replace_position(new_position)
        return Configuration(moved, self.lookahead)

    @property
    def rest(self):
        """The core's symbols after the one at the current position."""
        core = self.core
        first_after = core.position + 1
        return core.symbols[first_after:]

    def __repr__(self) -> str:
        text = repr(self.core)
        if self.lookahead != ():
            # Only mention the lookahead when there is one to show.
            text += ", " + str(self.lookahead)
        return text

    def format(self, alphabet: list[str]) -> str:
        """Render the core via its own `format`, followed by the lookahead
        symbols (looked up in `alphabet`) as ` ctx:{a,b,...}`.
        """
        # Joining an empty lookahead yields "", so the empty case comes out
        # as " ctx:{}" without needing a separate branch.
        names = ",".join(alphabet[i] for i in self.lookahead)
        return f"{self.core.format(alphabet)}" + " ctx:{" + names + "}"
class ConfigSet(frozenset[Configuration]):
    """A frozenset of Configuration values; adds nothing beyond frozenset."""
# Here we have a slightly different definition of a ConfigurationSet; we keep # Here we have a slightly different definition of a ConfigurationSet; we keep
# the lookaheads outside and use a dictionary to check for containment # the lookaheads outside and use a dictionary to check for containment
# quickly. ItemSet is used in the GRM/Pager/Chin algorithm. # quickly. ItemSet is used in the GRM/Pager/Chin algorithm.
@ -286,15 +235,11 @@ class ItemSet:
algorithm. algorithm.
""" """
items: dict[ConfigurationCore, set[int]] items: dict[Configuration, set[int]]
def __init__(self, items=None): def __init__(self, items=None):
self.items = items or {} self.items = items or {}
@classmethod
def from_config_set(cls, config_set: ConfigSet) -> "ItemSet":
return ItemSet({config.core: set(config.lookahead) for config in config_set})
def weakly_compatible(self, other: "ItemSet") -> bool: def weakly_compatible(self, other: "ItemSet") -> bool:
a = self.items a = self.items
b = other.items b = other.items
@ -359,11 +304,6 @@ class ItemSet:
result.items[next] = set(context) result.items[next] = set(context)
return result return result
def to_config_set(self) -> ConfigSet:
return ConfigSet(
{Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()}
)
@dataclasses.dataclass @dataclasses.dataclass
class StateGraph: class StateGraph:
@ -381,7 +321,7 @@ class StateGraph:
structure, but they all compute this information.) structure, but they all compute this information.)
""" """
closures: list[ConfigSet] closures: list[ItemSet]
# All the successors for all of the sets. `successors[i]` is the mapping # All the successors for all of the sets. `successors[i]` is the mapping
# from grammar symbol to the index of the set you get by processing that # from grammar symbol to the index of the set you get by processing that
@ -392,7 +332,7 @@ class StateGraph:
return json.dumps( return json.dumps(
{ {
str(set_index): { str(set_index): {
"closures": [c.format(alphabet) for c in closure], "closures": [f"{c.format(alphabet)} -> {l}" for c, l in closure.items.items()],
"successors": {alphabet[k]: str(v) for k, v in successors.items()}, "successors": {alphabet[k]: str(v) for k, v in successors.items()},
} }
for set_index, (closure, successors) in enumerate( for set_index, (closure, successors) in enumerate(
@ -403,14 +343,14 @@ class StateGraph:
sort_keys=True, sort_keys=True,
) )
def find_path_to_set(self, target_set: ConfigSet) -> list[int]: def find_path_to_set(self, target_set: ItemSet) -> list[int]:
"""Trace the path of grammar symbols from the first set (which always """Trace the path of grammar symbols from the first set (which always
set 0) to the target set. This is useful in conflict reporting, set 0) to the target set. This is useful in conflict reporting,
because we'll be *at* a ConfigSet and want to show the grammar symbols because we'll be *at* an ItemSet and want to show the grammar symbols
that get us to where we found the conflict. that get us to where we found the conflict.
The return value is a list of grammar symbols to get to the specified The return value is a list of grammar symbols to get to the specified
ConfigSet. ItemSet.
This function raises KeyError if no path is found. This function raises KeyError if no path is found.
""" """
@ -518,7 +458,7 @@ class ErrorCollection:
the error. the error.
""" """
errors: dict[ConfigSet, dict[int, dict[Configuration, Action]]] errors: dict[ItemSet, dict[int, dict[Configuration, Action]]]
def __init__(self): def __init__(self):
self.errors = {} self.errors = {}
@ -529,7 +469,7 @@ class ErrorCollection:
def add_error( def add_error(
self, self,
config_set: ConfigSet, config_set: ItemSet,
symbol: int, symbol: int,
config: Configuration, config: Configuration,
action: Action, action: Action,
@ -581,11 +521,10 @@ class ErrorCollection:
for symbol, symbol_errors in set_errors.items(): for symbol, symbol_errors in set_errors.items():
actions = [] actions = []
for config, action in symbol_errors.items(): for config, action in symbol_errors.items():
core = config.core name = alphabet[config.name]
name = alphabet[core.name]
rule = " ".join( rule = " ".join(
f"{'* ' if core.position == i else ''}{alphabet[s]}" f"{'* ' if config.position == i else ''}{alphabet[s]}"
for i, s in enumerate(core.symbols) for i, s in enumerate(config.symbols)
) )
if config.at_end: if config.at_end:
rule += " *" rule += " *"
@ -707,7 +646,7 @@ class TableBuilder(object):
return ParseTable(actions=self.actions, gotos=self.gotos, trivia=set()) return ParseTable(actions=self.actions, gotos=self.gotos, trivia=set())
def new_row(self, config_set: ConfigSet): def new_row(self, config_set: ItemSet):
"""Start a new row, processing the given config set. Call this before """Start a new row, processing the given config set. Call this before
doing anything else. doing anything else.
""" """
@ -735,9 +674,9 @@ class TableBuilder(object):
"""Mark a reduce of the given configuration for the given symbol in the """Mark a reduce of the given configuration for the given symbol in the
current row. current row.
""" """
name = self.alphabet[config.core.name] name = self.alphabet[config.name]
transparent = name in self.transparents transparent = name in self.transparents
action = Reduce(name, len(config.core.symbols), transparent) action = Reduce(name, len(config.symbols), transparent)
self._set_table_action(symbol, action, config) self._set_table_action(symbol, action, config)
def set_table_accept(self, symbol: int, config: Configuration): def set_table_accept(self, symbol: int, config: Configuration):
@ -768,7 +707,7 @@ class TableBuilder(object):
if isinstance(action, Shift): if isinstance(action, Shift):
return self.precedence[symbol] return self.precedence[symbol]
else: else:
return self.precedence[config.core.name] return self.precedence[config.name]
def _set_table_action(self, symbol_id: int, action: ParseAction, config: Configuration | None): def _set_table_action(self, symbol_id: int, action: ParseAction, config: Configuration | None):
"""Set the action for 'symbol' in the table row to 'action'. """Set the action for 'symbol' in the table row to 'action'.
@ -1252,7 +1191,7 @@ class ParserGenerator:
self._firsts, self._firsts,
) )
def gen_sets(self, seeds: list[Configuration]) -> StateGraph: def gen_sets(self, seeds: ItemSet) -> StateGraph:
# This function can be seen as a modified version of items() from # This function can be seen as a modified version of items() from
# Chen's dissertation. # Chen's dissertation.
# #
@ -1270,10 +1209,7 @@ class ParserGenerator:
core_states: list[ItemSet] = [] core_states: list[ItemSet] = []
edges: list[dict[int, int]] = [] edges: list[dict[int, int]] = []
# Convert the incoming seed configurations into item sets. core_states.append(seeds)
# TODO: Convert everything to ItemSet natively.
state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds})
core_states.append(state0)
closed_states.append(None) closed_states.append(None)
edges.append({}) edges.append({})
@ -1399,7 +1335,7 @@ class ParserGenerator:
# Register all the actually merged, final config sets. I should *not* # Register all the actually merged, final config sets. I should *not*
# have to do all this work. Really really garbage. # have to do all this work. Really really garbage.
return StateGraph( return StateGraph(
closures=[closed_state.to_config_set() for _, closed_state in gc_states], closures=[closed_state for _, closed_state in gc_states],
successors=gc_edges, successors=gc_edges,
) )
@ -1492,7 +1428,7 @@ class ParserGenerator:
set of all those productions combined with all the incoming productions set of all those productions combined with all the incoming productions
is the closure. is the closure.
""" """
closure: dict[ConfigurationCore, set[int]] = {} closure: dict[Configuration, set[int]] = {}
# We're going to maintain a set of things to look at, rules that we # We're going to maintain a set of things to look at, rules that we
# still need to close over. Assume that starts with everything in us. # still need to close over. Assume that starts with everything in us.
@ -1525,7 +1461,7 @@ class ParserGenerator:
lookahead.update(context) lookahead.update(context)
for rule in rules: for rule in rules:
new_core = ConfigurationCore.from_rule(config_next, rule) new_core = Configuration.from_rule(config_next, rule)
todo.append((new_core, lookahead)) todo.append((new_core, lookahead))
return ItemSet(closure) return ItemSet(closure)
@ -1536,10 +1472,12 @@ class ParserGenerator:
In LR1 parsers, we must remember to set the lookahead of the start In LR1 parsers, we must remember to set the lookahead of the start
symbol to '$'. symbol to '$'.
""" """
seeds = [ seeds = ItemSet(
Configuration.from_rule(self.start_symbol, rule, lookahead=(self.end_symbol,)) {
for rule in self.grammar[self.start_symbol] Configuration.from_rule(self.start_symbol, rule): {self.end_symbol}
] for rule in self.grammar[self.start_symbol]
}
)
return self.gen_sets(seeds) return self.gen_sets(seeds)
def gen_table(self) -> ParseTable: def gen_table(self) -> ParseTable:
@ -1579,11 +1517,11 @@ class ParserGenerator:
builder.new_row(config_set) builder.new_row(config_set)
successors = config_sets.successors[config_set_id] successors = config_sets.successors[config_set_id]
for config in config_set: for config, lookahead in config_set.items.items():
config_next = config.core.next config_next = config.next
if config_next is None: if config_next is None:
if config.core.name != self.start_symbol: if config.name != self.start_symbol:
for a in config.lookahead: for a in lookahead:
builder.set_table_reduce(a, config) builder.set_table_reduce(a, config)
else: else:
builder.set_table_accept(self.end_symbol, config) builder.set_table_accept(self.end_symbol, config)