[parser] Everything is an ItemSet now

This commit is contained in:
John Doty 2024-10-26 07:51:13 -07:00
parent 923b01f6fd
commit 385c378edb

View file

@ -146,7 +146,7 @@ import typing
#
# We start with LR0 parsers, because they form the basis of everything else.
###############################################################################
class ConfigurationCore(typing.NamedTuple):
class Configuration(typing.NamedTuple):
"""A core configuration, basically, a position within a rule.
These need to be as small and as tight as you can make them. They are
@ -170,7 +170,7 @@ class ConfigurationCore(typing.NamedTuple):
next = None
else:
next = symbols[0]
return ConfigurationCore(
return Configuration(
name=name,
symbols=symbols,
position=0,
@ -186,7 +186,7 @@ class ConfigurationCore(typing.NamedTuple):
next = None
else:
next = self.symbols[new_position]
return ConfigurationCore(
return Configuration(
name=self.name,
symbols=self.symbols,
position=new_position,
@ -222,57 +222,6 @@ class ConfigurationCore(typing.NamedTuple):
)
class Configuration(typing.NamedTuple):
"""A rule being tracked in a state. That is, a specific position within a
specific rule, with an associated lookahead state.
(Note: technically, lookahead isn't used until we get to LR(1) parsers,
but if left at its default it's harmless. Ignore it until you get to
the part about LR(1).)
"""
core: ConfigurationCore
lookahead: typing.Tuple[int, ...]
@classmethod
def from_rule(cls, name: int, symbols: typing.Tuple[int, ...], lookahead=()):
# Consider adding at_end and next to the namedtuple.
return Configuration(
core=ConfigurationCore.from_rule(name, symbols),
lookahead=lookahead,
)
@property
def at_end(self) -> bool:
return self.core.next is None
def replace_position(self, new_position):
return Configuration(
core=self.core.replace_position(new_position),
lookahead=self.lookahead,
)
@property
def rest(self):
return self.core.symbols[(self.core.position + 1) :]
def __repr__(self) -> str:
la = ", " + str(self.lookahead) if self.lookahead != () else ""
return f"{repr(self.core)}{la}"
def format(self, alphabet: list[str]) -> str:
if self.lookahead != ():
la = " ctx:{" + ",".join(alphabet[i] for i in self.lookahead) + "}"
else:
la = " ctx:{}"
return f"{self.core.format(alphabet)}{la}"
class ConfigSet(frozenset[Configuration]):
pass
# Here we have a slightly different definition of a ConfigurationSet; we keep
# the lookaheads outside and use a dictionary to check for containment
# quickly. ItemSet is used in the GRM/Pager/Chin algorithm.
@ -286,15 +235,11 @@ class ItemSet:
algorithm.
"""
items: dict[ConfigurationCore, set[int]]
items: dict[Configuration, set[int]]
def __init__(self, items=None):
self.items = items or {}
@classmethod
def from_config_set(cls, config_set: ConfigSet) -> "ItemSet":
return ItemSet({config.core: set(config.lookahead) for config in config_set})
def weakly_compatible(self, other: "ItemSet") -> bool:
a = self.items
b = other.items
@ -359,11 +304,6 @@ class ItemSet:
result.items[next] = set(context)
return result
def to_config_set(self) -> ConfigSet:
return ConfigSet(
{Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()}
)
@dataclasses.dataclass
class StateGraph:
@ -381,7 +321,7 @@ class StateGraph:
structure, but they all compute this information.)
"""
closures: list[ConfigSet]
closures: list[ItemSet]
# All the sucessors for all of the sets. `successors[i]` is the mapping
# from grammar symbol to the index of the set you get by processing that
@ -392,7 +332,7 @@ class StateGraph:
return json.dumps(
{
str(set_index): {
"closures": [c.format(alphabet) for c in closure],
"closures": [f"{c.format(alphabet)} -> {l}" for c, l in closure.items.items()],
"successors": {alphabet[k]: str(v) for k, v in successors.items()},
}
for set_index, (closure, successors) in enumerate(
@ -403,14 +343,14 @@ class StateGraph:
sort_keys=True,
)
def find_path_to_set(self, target_set: ConfigSet) -> list[int]:
def find_path_to_set(self, target_set: ItemSet) -> list[int]:
"""Trace the path of grammar symbols from the first set (which always
set 0) to the target set. This is useful in conflict reporting,
because we'll be *at* a ConfigSet and want to show the grammar symbols
because we'll be *at* an ItemSet and want to show the grammar symbols
that get us to where we found the conflict.
The return value is a list of grammar symbols to get to the specified
ConfigSet.
ItemSet.
This function raises KeyError if no path is found.
"""
@ -518,7 +458,7 @@ class ErrorCollection:
the error.
"""
errors: dict[ConfigSet, dict[int, dict[Configuration, Action]]]
errors: dict[ItemSet, dict[int, dict[Configuration, Action]]]
def __init__(self):
self.errors = {}
@ -529,7 +469,7 @@ class ErrorCollection:
def add_error(
self,
config_set: ConfigSet,
config_set: ItemSet,
symbol: int,
config: Configuration,
action: Action,
@ -581,11 +521,10 @@ class ErrorCollection:
for symbol, symbol_errors in set_errors.items():
actions = []
for config, action in symbol_errors.items():
core = config.core
name = alphabet[core.name]
name = alphabet[config.name]
rule = " ".join(
f"{'* ' if core.position == i else ''}{alphabet[s]}"
for i, s in enumerate(core.symbols)
f"{'* ' if config.position == i else ''}{alphabet[s]}"
for i, s in enumerate(config.symbols)
)
if config.at_end:
rule += " *"
@ -707,7 +646,7 @@ class TableBuilder(object):
return ParseTable(actions=self.actions, gotos=self.gotos, trivia=set())
def new_row(self, config_set: ConfigSet):
def new_row(self, config_set: ItemSet):
"""Start a new row, processing the given config set. Call this before
doing anything else.
"""
@ -735,9 +674,9 @@ class TableBuilder(object):
"""Mark a reduce of the given configuration for the given symbol in the
current row.
"""
name = self.alphabet[config.core.name]
name = self.alphabet[config.name]
transparent = name in self.transparents
action = Reduce(name, len(config.core.symbols), transparent)
action = Reduce(name, len(config.symbols), transparent)
self._set_table_action(symbol, action, config)
def set_table_accept(self, symbol: int, config: Configuration):
@ -768,7 +707,7 @@ class TableBuilder(object):
if isinstance(action, Shift):
return self.precedence[symbol]
else:
return self.precedence[config.core.name]
return self.precedence[config.name]
def _set_table_action(self, symbol_id: int, action: ParseAction, config: Configuration | None):
"""Set the action for 'symbol' in the table row to 'action'.
@ -1252,7 +1191,7 @@ class ParserGenerator:
self._firsts,
)
def gen_sets(self, seeds: list[Configuration]) -> StateGraph:
def gen_sets(self, seeds: ItemSet) -> StateGraph:
# This function can be seen as a modified version of items() from
# Chen's dissertation.
#
@ -1270,10 +1209,7 @@ class ParserGenerator:
core_states: list[ItemSet] = []
edges: list[dict[int, int]] = []
# Convert the incoming seed configurations into item sets.
# TODO: Convert everything to ItemSet natively.
state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds})
core_states.append(state0)
core_states.append(seeds)
closed_states.append(None)
edges.append({})
@ -1399,7 +1335,7 @@ class ParserGenerator:
# Register all the actually merged, final config sets. I should *not*
# have to do all this work. Really really garbage.
return StateGraph(
closures=[closed_state.to_config_set() for _, closed_state in gc_states],
closures=[closed_state for _, closed_state in gc_states],
successors=gc_edges,
)
@ -1492,7 +1428,7 @@ class ParserGenerator:
set of all those productions combined with all the incoming productions
is the closure.
"""
closure: dict[ConfigurationCore, set[int]] = {}
closure: dict[Configuration, set[int]] = {}
# We're going to maintain a set of things to look at, rules that we
# still need to close over. Assume that starts with everything in us.
@ -1525,7 +1461,7 @@ class ParserGenerator:
lookahead.update(context)
for rule in rules:
new_core = ConfigurationCore.from_rule(config_next, rule)
new_core = Configuration.from_rule(config_next, rule)
todo.append((new_core, lookahead))
return ItemSet(closure)
@ -1536,10 +1472,12 @@ class ParserGenerator:
In LR1 parsers, we must remember to set the lookahead of the start
symbol to '$'.
"""
seeds = [
Configuration.from_rule(self.start_symbol, rule, lookahead=(self.end_symbol,))
for rule in self.grammar[self.start_symbol]
]
seeds = ItemSet(
{
Configuration.from_rule(self.start_symbol, rule): {self.end_symbol}
for rule in self.grammar[self.start_symbol]
}
)
return self.gen_sets(seeds)
def gen_table(self) -> ParseTable:
@ -1579,11 +1517,11 @@ class ParserGenerator:
builder.new_row(config_set)
successors = config_sets.successors[config_set_id]
for config in config_set:
config_next = config.core.next
for config, lookahead in config_set.items.items():
config_next = config.next
if config_next is None:
if config.core.name != self.start_symbol:
for a in config.lookahead:
if config.name != self.start_symbol:
for a in lookahead:
builder.set_table_reduce(a, config)
else:
builder.set_table_accept(self.end_symbol, config)