Compare commits

...

5 commits

Author SHA1 Message Date
923b01f6fd [parser] Simplify StateGraph 2024-10-26 07:35:28 -07:00
27e6bb413c [parser] Remove Canonical LR1 generator
This is fine probably.
2024-10-26 07:25:37 -07:00
2b72811486 [parser] ConfigurationSetInfo -> StateGraph 2024-10-26 06:56:30 -07:00
e501caa073 [parser] Remove unused import 2024-10-26 06:53:53 -07:00
e55bc140f9 [parser] Move ItemSet 2024-10-26 06:53:36 -07:00
2 changed files with 338 additions and 523 deletions

View file

@ -135,7 +135,6 @@ import bisect
import collections
import dataclasses
import enum
import functools
import inspect
import itertools
import json
@ -274,7 +273,100 @@ class ConfigSet(frozenset[Configuration]):
pass
class ConfigurationSetInfo:
# Here we have a slightly different definition of a ConfigurationSet; we keep
# the lookaheads outside and use a dictionary to check for containment
# quickly. ItemSet is used in the GRM/Pager/Chin algorithm.
@dataclasses.dataclass
class ItemSet:
"""An ItemSet is a group of configuration cores together with their
"contexts", or lookahead sets.
An ItemSet is comparable for equality, and also supports this lesser notion
of "weakly compatible" which is used to collapse states in the pager
algorithm.
"""
items: dict[ConfigurationCore, set[int]]
def __init__(self, items=None):
self.items = items or {}
@classmethod
def from_config_set(cls, config_set: ConfigSet) -> "ItemSet":
return ItemSet({config.core: set(config.lookahead) for config in config_set})
def weakly_compatible(self, other: "ItemSet") -> bool:
a = self.items
b = other.items
if len(a) != len(b):
return False
for acore in a:
if acore not in b:
return False
if len(a) == 1:
return True
# DOTY: This loop I do not understand, truly. What the heck is happening here?
a_keys = list(a.keys())
for i, i_key in enumerate(itertools.islice(a_keys, 0, len(a_keys) - 1)):
for j_key in itertools.islice(a_keys, i + 1, None):
a_i_key = a[i_key]
b_i_key = b[i_key]
a_j_key = a[j_key]
b_j_key = b[j_key]
# DOTY: GRMTools written with intersects(); we don't have that we have
# `not disjoint()`. :P There are many double negatives....
#
# not (intersect(a_i, b_j) or intersect(a_j, b_i))
# not ((not disjoint(a_i, b_j)) or (not disjoint(a_j, b_i)))
# ((not not disjoint(a_i, b_j)) and (not not disjoint(a_j, b_i)))
# disjoint(a_i, b_j) and disjoint(a_j, b_i)
if a_i_key.isdisjoint(b_j_key) and a_j_key.isdisjoint(b_i_key):
continue
# intersect(a_i, a_j) or intersect(b_i, b_j)
# (not disjoint(a_i, a_j)) or (not disjoint(b_i, b_j))
# not (disjoint(a_i, a_j) and disjoint(b_i, b_j))
if not (a_i_key.isdisjoint(a_j_key) and b_i_key.isdisjoint(b_j_key)):
continue
return False
return True
def weakly_merge(self, other: "ItemSet") -> bool:
"""Merge b into a, returning True if this led to any changes."""
a = self.items
b = other.items
changed = False
for a_key, a_ctx in a.items():
start_len = len(a_ctx)
a_ctx.update(b[a_key]) # Python doesn't tell us changes
changed = changed or (start_len != len(a_ctx))
return changed
def goto(self, symbol: int) -> "ItemSet":
result = ItemSet()
for core, context in self.items.items():
if core.next == symbol:
next = core.replace_position(core.position + 1)
result.items[next] = set(context)
return result
def to_config_set(self) -> ConfigSet:
return ConfigSet(
{Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()}
)
@dataclasses.dataclass
class StateGraph:
"""When we build a grammar into a table, the first thing we need to do is
generate all the configuration sets and their successors.
@ -289,65 +381,23 @@ class ConfigurationSetInfo:
structure, but they all compute this information.)
"""
core_key: dict[ConfigSet, int] # Map a ConfigSet into an index
config_set_key: dict[ConfigSet, int] # Map a ConfigSet into an index
sets: list[ConfigSet] # Map the index back into a set
closures: list[ConfigSet | None] # Track closures
closures: list[ConfigSet]
# All the successors for all of the sets. `successors[i]` is the mapping
# from grammar symbol to the index of the set you get by processing that
# symbol.
successors: list[dict[int, int]]
def __init__(self):
self.core_key = {}
self.config_set_key = {}
self.sets = []
self.closures = []
self.successors = []
def register_core(self, c: ConfigSet) -> typing.Tuple[int, bool]:
"""Potentially add a new config set to the set of sets. Returns the
canonical ID of the set within this structure, along with a boolean
indicating whether the set was just added or not.
(You can use this integer to get the set back, if you need it, and
also access the successors table.)
"""
existing = self.core_key.get(c)
if existing is not None:
return existing, False
index = len(self.sets)
self.sets.append(c)
self.closures.append(None)
self.successors.append({})
self.core_key[c] = index
return index, True
def register_config_closure(self, c_id: int, closure: ConfigSet):
assert self.closures[c_id] is None
self.closures[c_id] = closure
self.config_set_key[closure] = c_id
def add_successor(self, c_id: int, symbol: int, successor: int):
"""Register successor(`c_id`, `symbol`) -> `successor`, where c_id
is the id of the set in this structure, and symbol is the id of a
symbol in the alphabet of the grammar.
"""
self.successors[c_id][symbol] = successor
def dump_state(self, alphabet: list[str]) -> str:
return json.dumps(
{
str(set_index): {
"configs": [c.format(alphabet) for c in config_set],
"closures": [c.format(alphabet) for c in self.closures[set_index] or []],
"successors": {
alphabet[k]: str(v) for k, v in self.successors[set_index].items()
},
"closures": [c.format(alphabet) for c in closure],
"successors": {alphabet[k]: str(v) for k, v in successors.items()},
}
for set_index, config_set in enumerate(self.sets)
for set_index, (closure, successors) in enumerate(
zip(self.closures, self.successors)
)
},
indent=4,
sort_keys=True,
@ -364,7 +414,8 @@ class ConfigurationSetInfo:
This function raises KeyError if no path is found.
"""
target_index = self.config_set_key[target_set]
# TODO: This should be tested.
target_index = self.closures.index(target_set)
visited = set()
queue: collections.deque = collections.deque()
@ -507,7 +558,7 @@ class ErrorCollection:
def gen_exception(
self,
alphabet: list[str],
all_sets: ConfigurationSetInfo,
all_sets: StateGraph,
) -> AmbiguityError | None:
"""Format all the errors into an error, or return None if there are no
errors.
@ -644,7 +695,7 @@ class TableBuilder(object):
self.action_row = None
self.goto_row = None
def flush(self, all_sets: ConfigurationSetInfo) -> ParseTable:
def flush(self, all_sets: StateGraph) -> ParseTable:
"""Finish building the table and return it.
Raises ValueError if there were any conflicts during construction.
@ -1007,108 +1058,36 @@ class FollowInfo:
return FollowInfo(follows=follows)
# Here we have a slightly different definition of a ConfigurationSet; we keep the
# lookaheads outside and use a dictionary to check for containment quickly.
# ItemSet is used in the GRM/Pager/Chin algorithm.
@dataclasses.dataclass
class ItemSet:
"""An ItemSet is a group of configuration cores together with their
"contexts", or lookahead sets.
class ParserGenerator:
"""Generate parse tables for LR1 grammars.
An ItemSet is comparable for equality, and also supports this lesser notion
of "weakly compatible" which is used to collapse states in the pager
algorithm.
"""
This class implements a variant of pager's algorithm to generate the parse
tables, which support the same set of languages as Canonical LR1 but with
much smaller resulting parse tables.
items: dict[ConfigurationCore, set[int]]
I'll be honest, I don't understand this one as well as the pure LR1
algorithm. It proceeds as LR1, generating successor states, but every
time it makes a new state it searches the states it has already made for
one that is "weakly compatible;" if it finds one it merges the new state
with the old state and marks the old state to be re-visited.
def __init__(self, items=None):
self.items = items or {}
The implementation here follows from the implementation in
`GRMTools<https://github.com/softdevteam/grmtools/blob/master/lrtable/src/lib/pager.rs>`_.
@classmethod
def from_config_set(cls, config_set: ConfigSet) -> "ItemSet":
return ItemSet({config.core: set(config.lookahead) for config in config_set})
As they explain there:
def weakly_compatible(self, other: "ItemSet") -> bool:
a = self.items
b = other.items
if len(a) != len(b):
return False
for acore in a:
if acore not in b:
return False
if len(a) == 1:
return True
# DOTY: This loop I do not understand, truly. What the heck is happening here?
a_keys = list(a.keys())
for i, i_key in enumerate(itertools.islice(a_keys, 0, len(a_keys) - 1)):
for j_key in itertools.islice(a_keys, i + 1, None):
a_i_key = a[i_key]
b_i_key = b[i_key]
a_j_key = a[j_key]
b_j_key = b[j_key]
# DOTY: GRMTools written with intersects(); we don't have that we have
# `not disjoint()`. :P There are many double negatives....
#
# not (intersect(a_i, b_j) or intersect(a_j, b_i))
# not ((not disjoint(a_i, b_j)) or (not disjoint(a_j, b_i)))
# ((not not disjoint(a_i, b_j)) and (not not disjoint(a_j, b_i)))
# disjoint(a_i, b_j) and disjoint(a_j, b_i)
if a_i_key.isdisjoint(b_j_key) and a_j_key.isdisjoint(b_i_key):
continue
# intersect(a_i, a_j) or intersect(b_i, b_j)
# (not disjoint(a_i, a_j)) or (not disjoint(b_i, b_j))
# not (disjoint(a_i, a_j) and disjoint(b_i, b_j))
if not (a_i_key.isdisjoint(a_j_key) and b_i_key.isdisjoint(b_j_key)):
continue
return False
return True
def weakly_merge(self, other: "ItemSet") -> bool:
"""Merge b into a, returning True if this led to any changes."""
a = self.items
b = other.items
changed = False
for a_key, a_ctx in a.items():
start_len = len(a_ctx)
a_ctx.update(b[a_key]) # Python doesn't tell us changes
changed = changed or (start_len != len(a_ctx))
return changed
def goto(self, symbol: int) -> "ItemSet":
result = ItemSet()
for core, context in self.items.items():
if core.next == symbol:
next = core.replace_position(core.position + 1)
result.items[next] = set(context)
return result
def to_config_set(self) -> ConfigSet:
return ConfigSet(
{Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()}
)
class GenerateLR1:
"""Generate parse tables for LR1, or "canonical LR" grammars.
LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they
are choosier about when they reduce. But unlike SLR parsers, they specify
the terminals on which they reduce by carrying a 'lookahead' terminal in
the configuration. The lookahead of a configuration is computed as the
closure of a configuration set is computed, so see gen_closure_next for
details. (Except for the start configuration, which has '$' as its
lookahead.)
> The general algorithms that form the basis of what's used in this file
> can be found in:
>
> A Practical General Method for Constructing LR(k) Parsers
> David Pager, Acta Informatica 7, 249--268, 1977
>
> However Pager's paper is dense, and doesn't name sub-parts of the
> algorithm. We mostly reference the (still incomplete, but less
> incomplete) version of the algorithm found in:
>
> Measuring and extending LR(1) parser generation
> Xin Chen, PhD thesis, University of Hawaii, 2009
"""
# Internally we use integers as symbols, not strings. Mostly this is fine,
@ -1171,9 +1150,9 @@ class GenerateLR1:
non-terminal being added, and the second element of the tuple is the
list of terminals and non-terminals that make up the production.
There is currently no support for custom actions or alternation or
anything like that. If you want alternations you'll have to lower
the grammar by hand into the simpler form first.
There is no support for alternation. If you want alternations
you'll have to lower the grammar by hand into the simpler form first,
but that's what the Grammar and NonTerminal classes are for.
Don't name anything with double-underscores; those are reserved for
the generator. Don't add '$' either, as it is reserved to mean
@ -1273,105 +1252,215 @@ class GenerateLR1:
self._firsts,
)
def gen_closure(self, seeds: typing.Iterable[Configuration]) -> ConfigSet:
"""Compute the closure for the specified configs. The closure is all
of the configurations we could be in. Specifically, if the position
for a config is just before a non-terminal then we must also consider
configurations where the rule is the rule for the non-terminal and
the position is just before the beginning of the rule.
def gen_sets(self, seeds: list[Configuration]) -> StateGraph:
# This function can be seen as a modified version of items() from
# Chen's dissertation.
#
# DOTY: It is also (practically) a converted version from grmtools
# into python, more or less verbatim at this point. I have some
# sense of what is going on, and attempt to elaborate with
# these comments.
(We have replaced a recursive version with an iterative one.)
"""
closure: set[Configuration] = set()
pending = list(seeds)
pending_next = []
while len(pending) > 0:
for config in pending:
if config in closure:
# closed_states and core_states are both equally sized vectors of
# states. Core states are smaller, and used for the weakly compatible
# checks, but we ultimately need to return closed states. Closed
# states which are None are those which require processing; thus
# closed_states also implicitly serves as a todo list.
closed_states: list[ItemSet | None] = []
core_states: list[ItemSet] = []
edges: list[dict[int, int]] = []
# Convert the incoming seed configurations into item sets.
# TODO: Convert everything to ItemSet natively.
state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds})
core_states.append(state0)
closed_states.append(None)
edges.append({})
# We maintain a set of which rules and tokens we've seen; when
# processing a given state there's no point processing a rule or
# token more than once.
seen: set[int] = set()
# cnd_[rule|token]_weaklies represent which states are possible weakly
# compatible matches for a given symbol.
#
# DOTY: As with `seen`, we have a uniform space so we can have a
# uniform one of these too.
cnd_weaklies: list[list[int]] = [[] for _ in range(len(self.alphabet))]
todo = 1 # How many None values are there in closed_states?
todo_off = 0 # Offset in closed states to start searching for the next todo.
while todo > 0:
assert len(core_states) == len(closed_states)
assert len(core_states) == len(edges)
# state_i is the next item to process. We don't want to
# continually search for the next None from the beginning, so we
# remember where we last saw a None (todo_off) and search from
# that point onwards, wrapping as necessary. Since processing a
# state x disproportionately causes state x + 1 to require
# processing, this prevents the search from becoming horribly
# non-linear.
try:
state_i = closed_states.index(None, todo_off)
except ValueError:
state_i = closed_states.index(None) # DOTY: Will not raise, given todo > 0
todo_off = state_i + 1
todo -= 1
cl_state = self.gen_closure(core_states[state_i])
closed_states[state_i] = cl_state
seen.clear()
for core in cl_state.items.keys():
sym = core.next
if sym is None or sym in seen:
continue
seen.add(sym)
nstate = cl_state.goto(sym)
# Try and find a compatible match for this state.
cnd_states = cnd_weaklies[sym]
# First of all see if any of the candidate states are exactly
# the same as the new state, in which case we only need to
# add an edge to the candidate state. This isn't just an
# optimisation (though it does avoid the expense of change
# propagation), but has a correctness aspect: there's no
# guarantee that the weakly compatible check is reflexive
# (i.e. a state may not be weakly compatible with itself).
found = False
for cnd in cnd_states:
if core_states[cnd] == nstate:
edges[state_i][sym] = cnd
found = True
break
if found:
continue
closure.add(config)
pending_next.extend(self.gen_closure_next(config))
# No candidate states were equal to the new state, so we need
# to look for a candidate state which is weakly compatible.
m: int | None = None
for cnd in cnd_states:
if core_states[cnd].weakly_compatible(nstate):
m = cnd
break
temp = pending
pending = pending_next
pending_next = temp
pending_next.clear()
if m is not None:
# A weakly compatible match has been found.
edges[state_i][sym] = m
assert core_states[m].weakly_compatible(nstate) # TODO: REMOVE, TOO SLOW
if core_states[m].weakly_merge(nstate):
# We only do the simplest change propagation, forcing possibly
# affected sets to be entirely reprocessed (which will recursively
# force propagation too). Even though this does unnecessary
# computation, it is still pretty fast.
#
# Note also that edges[k] will be completely regenerated, overwriting
# all existing entries and possibly adding new ones. We thus don't
# need to clear it manually.
if closed_states[m] is not None:
closed_states[m] = None
todo += 1
# NOTE: The generation of this closure *might* have generated
# multiple cores with different lookaheads; if that's
# the case we need to merge.
merged: dict[ConfigurationCore, set[int]] = {}
for c in closure:
existing = merged.get(c.core)
if existing is not None:
existing.update(c.lookahead)
else:
merged[c.core] = set(c.lookahead)
else:
stidx = len(core_states)
return ConfigSet(Configuration(k, tuple(sorted(v))) for k, v in merged.items())
cnd_weaklies[sym].append(stidx)
edges[state_i][sym] = stidx
def gen_all_successors(
self, config_set: typing.Iterable[Configuration]
) -> list[typing.Tuple[int, ConfigSet]]:
"""Return all of the non-empty successors for the given config set.
edges.append({})
closed_states.append(None)
core_states.append(nstate)
todo += 1
(That is, given the config set, pretend we see all the symbols we
could possibly see, and figure out which configs sets we get from
those symbols. Those are the successors of this set.)
"""
possible = {config.core.next for config in config_set if config.core.next is not None}
# Although the Pager paper doesn't talk about it, the algorithm above
# can create unreachable states due to the non-determinism inherent
# in working with hashsets. Indeed, this can even happen with the
# example from Pager's paper (on perhaps 1 out of 100 runs, 24 or 25
# states will be created instead of 23). We thus need to weed out
# unreachable states and update edges accordingly.
assert len(core_states) == len(closed_states)
next = []
for symbol in possible:
seeds = ConfigSet(
config.replace_position(config.core.position + 1)
for config in config_set
if config.core.next == symbol
)
if len(seeds) > 0:
next.append((symbol, seeds))
all_states = []
for core_state, closed_state in zip(core_states, closed_states):
assert closed_state is not None
all_states.append((core_state, closed_state))
gc_states, gc_edges = self.gc(all_states, edges)
return next
# DOTY: UGH this is so bad, we should rewrite to use ItemSet everywhere
# probably, which actually means getting rid of the pluggable
# generator because who actually needs that?
def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo:
"""Generate all configuration sets starting from the provided seeds."""
result = ConfigurationSetInfo()
# Register all the actually merged, final config sets. I should *not*
# have to do all this work. Really really garbage.
return StateGraph(
closures=[closed_state.to_config_set() for _, closed_state in gc_states],
successors=gc_edges,
)
successors = []
pending = [ConfigSet(seeds)]
pending_next = []
while len(pending) > 0:
for core in pending:
id, is_new = result.register_core(core)
if is_new:
config_set = self.gen_closure(core)
result.register_config_closure(id, config_set)
for symbol, successor in self.gen_all_successors(config_set):
successors.append((id, symbol, successor))
pending_next.append(successor)
def gc(
self,
states: list[tuple[ItemSet, ItemSet]],
edges: list[dict[int, int]],
) -> tuple[list[tuple[ItemSet, ItemSet]], list[dict[int, int]]]:
# First of all, do a simple pass over all states. All state indexes
# reachable from the start state will be inserted into the 'seen'
# set.
todo = [0]
seen = set()
while len(todo) > 0:
item = todo.pop()
if item in seen:
continue
seen.add(item)
todo.extend(e for e in edges[item].values() if e not in seen)
temp = pending
pending = pending_next
pending_next = temp
pending_next.clear()
if len(seen) == len(states):
# Every state is reachable.
return states, edges
for id, symbol, successor in successors:
result.add_successor(id, symbol, result.core_key[successor])
# Imagine we started with 3 states and their edges:
# states: [0, 1, 2]
# edges : [[_ => 2]]
#
# At this point, 'seen' will be the set {0, 2}. What we need to do is
# to create a new list of states that doesn't have state 1 in it.
# That will cause state 2 to become to state 1, meaning that we need
# to adjust edges so that the pointer to state 2 is updated to state
# 1. In other words we want to achieve this output:
#
# states: [0, 2]
# edges : [_ => 1]
#
# The way we do this is to first iterate over all states, working out
# what the mapping from seen states to their new offsets is.
gc_states: list[tuple[ItemSet, ItemSet]] = []
offsets: list[int] = []
offset = 0
for state_i, zstate in enumerate(states):
offsets.append(state_i - offset)
if state_i not in seen:
offset += 1
continue
return result
gc_states.append(zstate)
def gen_follow(self, symbol: int) -> set[int]:
"""Generate the follow set for the given nonterminal.
# At this point the offsets list will be [0, 1, 1]. We now create new
# edges where each offset is corrected by looking it up in the
# offsets list.
gc_edges: list[dict[int, int]] = []
for st_edge_i, st_edges in enumerate(edges):
if st_edge_i not in seen:
continue
The follow set for a nonterminal is the set of terminals that can
follow the nonterminal in a valid sentence. The resulting set never
contains epsilon and is never empty, since we should always at least
ground out at '$', which is the end-of-stream marker.
gc_edges.append({k: offsets[v] for k, v in st_edges.items()})
See FollowInfo for more information on how this is determined.
"""
return self._follows.follows[symbol]
return (gc_states, gc_edges)
def gen_first(self, symbols: typing.Iterable[int]) -> typing.Tuple[set[int], bool]:
"""Return the first set for a *sequence* of symbols.
@ -1394,45 +1483,15 @@ class GenerateLR1:
return (result, True)
def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]:
"""Return the set of symbols that indicate we should reduce the given
config.
def gen_closure(self, items: ItemSet) -> ItemSet:
"""Generate the closure of the given ItemSet.
In an LR1 parser, this is the lookahead of the configuration.
Some of the configurations the ItemSet might be positioned right before
nonterminals. In that case, obviously, we should *also* behave as if we
were right at the beginning of each production for that nonterminal. The
set of all those productions combined with all the incoming productions
is the closure.
"""
return config.lookahead
def gen_closure_next(self, config: Configuration):
"""Return the next set of configurations in the closure for config.
In LR1 parsers, we must compute the lookahead for the configurations
we're adding to the closure. The lookahead for the new configurations
is the first() of the rest of this config's production. If that
contains epsilon, then the lookahead *also* contains the lookahead we
already have. (This lookahead was presumably generated by the same
process, so in some sense it is a 'parent' lookahead, or a lookahead
from an upstream production in the grammar.)
(See the documentation in GenerateLR0 for more information on how
this function fits into the whole process, specifically `gen_closure`.)
"""
config_next = config.core.next
if config_next is None:
return ()
else:
lookahead, epsilon = self.gen_first(config.rest)
if epsilon:
lookahead.update(config.lookahead)
lookahead_tuple = tuple(sorted(lookahead))
next = []
for rule in self.grammar[config_next]:
rr = Configuration.from_rule(config_next, rule, lookahead=lookahead_tuple)
next.append(rr)
return tuple(next)
def gen_closure_x(self, items: ItemSet) -> ItemSet:
closure: dict[ConfigurationCore, set[int]] = {}
# We're going to maintain a set of things to look at, rules that we
@ -1524,7 +1583,7 @@ class GenerateLR1:
config_next = config.core.next
if config_next is None:
if config.core.name != self.start_symbol:
for a in self.gen_reduce_set(config):
for a in config.lookahead:
builder.set_table_reduce(a, config)
else:
builder.set_table_accept(self.end_symbol, config)
@ -1541,249 +1600,6 @@ class GenerateLR1:
return builder.flush(config_sets)
class GeneratePager(GenerateLR1):
"""Pager's algorithm.
I'll be honest, I don't understand this one as well as the pure LR1
algorithm. It proceeds as LR1, generating successor states, but every
time it makes a new state it searches the states it has already made for
one that is "weakly compatible;" if it finds one it merges the new state
with the old state and marks the old state to be re-visited.
The implementation here follows from the implementation in
`GRMTools<https://github.com/softdevteam/grmtools/blob/master/lrtable/src/lib/pager.rs>`_.
As they explain there:
> The general algorithms that form the basis of what's used in this file
> can be found in:
>
> A Practical General Method for Constructing LR(k) Parsers
> David Pager, Acta Informatica 7, 249--268, 1977
>
> However Pager's paper is dense, and doesn't name sub-parts of the
> algorithm. We mostly reference the (still incomplete, but less
> incomplete) version of the algorithm found in:
>
> Measuring and extending LR(1) parser generation
> Xin Chen, PhD thesis, University of Hawaii, 2009
"""
def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo:
# This function can be seen as a modified version of items() from
# Chen's dissertation.
#
# DOTY: It is also (practically) a converted version from grmtools
# into python, more or less verbatim at this point. I have some
# sense of what is going on, and attempt to elaborate with
# these comments.
# closed_states and core_states are both equally sized vectors of
# states. Core states are smaller, and used for the weakly compatible
# checks, but we ultimately need to return closed states. Closed
# states which are None are those which require processing; thus
# closed_states also implicitly serves as a todo list.
closed_states: list[ItemSet | None] = []
core_states: list[ItemSet] = []
edges: list[dict[int, int]] = []
# Convert the incoming seed configurations into item sets.
# TODO: Convert everything to ItemSet natively.
state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds})
core_states.append(state0)
closed_states.append(None)
edges.append({})
# We maintain a set of which rules and tokens we've seen; when
# processing a given state there's no point processing a rule or
# token more than once.
seen: set[int] = set()
# cnd_[rule|token]_weaklies represent which states are possible weakly
# compatible matches for a given symbol.
#
# DOTY: As with `seen`, we have a uniform space so we can have a
# uniform one of these too.
cnd_weaklies: list[list[int]] = [[] for _ in range(len(self.alphabet))]
todo = 1 # How many None values are there in closed_states?
todo_off = 0 # Offset in closed states to start searching for the next todo.
while todo > 0:
assert len(core_states) == len(closed_states)
assert len(core_states) == len(edges)
# state_i is the next item to process. We don't want to
# continually search for the next None from the beginning, so we
# remember where we last saw a None (todo_off) and search from
# that point onwards, wrapping as necessary. Since processing a
# state x disproportionately causes state x + 1 to require
# processing, this prevents the search from becoming horribly
# non-linear.
try:
state_i = closed_states.index(None, todo_off)
except ValueError:
state_i = closed_states.index(None) # DOTY: Will not raise, given todo > 0
todo_off = state_i + 1
todo -= 1
cl_state = self.gen_closure_x(core_states[state_i])
closed_states[state_i] = cl_state
seen.clear()
for core in cl_state.items.keys():
sym = core.next
if sym is None or sym in seen:
continue
seen.add(sym)
nstate = cl_state.goto(sym)
# Try and find a compatible match for this state.
cnd_states = cnd_weaklies[sym]
# First of all see if any of the candidate states are exactly
# the same as the new state, in which case we only need to
# add an edge to the candidate state. This isn't just an
# optimisation (though it does avoid the expense of change
# propagation), but has a correctness aspect: there's no
# guarantee that the weakly compatible check is reflexive
# (i.e. a state may not be weakly compatible with itself).
found = False
for cnd in cnd_states:
if core_states[cnd] == nstate:
edges[state_i][sym] = cnd
found = True
break
if found:
continue
# No candidate states were equal to the new state, so we need
# to look for a candidate state which is weakly compatible.
m: int | None = None
for cnd in cnd_states:
if core_states[cnd].weakly_compatible(nstate):
m = cnd
break
if m is not None:
# A weakly compatible match has been found.
edges[state_i][sym] = m
assert core_states[m].weakly_compatible(nstate) # TODO: REMOVE, TOO SLOW
if core_states[m].weakly_merge(nstate):
# We only do the simplest change propagation, forcing possibly
# affected sets to be entirely reprocessed (which will recursively
# force propagation too). Even though this does unnecessary
# computation, it is still pretty fast.
#
# Note also that edges[k] will be completely regenerated, overwriting
# all existing entries and possibly adding new ones. We thus don't
# need to clear it manually.
if closed_states[m] is not None:
closed_states[m] = None
todo += 1
else:
stidx = len(core_states)
cnd_weaklies[sym].append(stidx)
edges[state_i][sym] = stidx
edges.append({})
closed_states.append(None)
core_states.append(nstate)
todo += 1
# Although the Pager paper doesn't talk about it, the algorithm above
# can create unreachable states due to the non-determinism inherent
# in working with hashsets. Indeed, this can even happen with the
# example from Pager's paper (on perhaps 1 out of 100 runs, 24 or 25
# states will be created instead of 23). We thus need to weed out
# unreachable states and update edges accordingly.
assert len(core_states) == len(closed_states)
all_states = []
for core_state, closed_state in zip(core_states, closed_states):
assert closed_state is not None
all_states.append((core_state, closed_state))
gc_states, gc_edges = self.gc(all_states, edges)
# DOTY: UGH this is so bad, we should rewrite to use ItemSet everywhere
# probably, which actually means getting rid of the pluggable
# generator because who actually needs that?
# Register all the actually merged, final config sets. I should *not*
# have to do all this work. Really really garbage.
result = ConfigurationSetInfo()
result.sets = [core_state.to_config_set() for core_state, _ in gc_states]
result.core_key = {s: i for i, s in enumerate(result.sets)}
result.closures = [closed_state.to_config_set() for _, closed_state in gc_states]
result.config_set_key = {s: i for i, s in enumerate(result.closures) if s is not None}
result.successors = gc_edges
return result
def gc(
self,
states: list[tuple[ItemSet, ItemSet]],
edges: list[dict[int, int]],
) -> tuple[list[tuple[ItemSet, ItemSet]], list[dict[int, int]]]:
# First of all, do a simple pass over all states. All state indexes
# reachable from the start state will be inserted into the 'seen'
# set.
todo = [0]
seen = set()
while len(todo) > 0:
item = todo.pop()
if item in seen:
continue
seen.add(item)
todo.extend(e for e in edges[item].values() if e not in seen)
if len(seen) == len(states):
# Every state is reachable.
return states, edges
# Imagine we started with 3 states and their edges:
# states: [0, 1, 2]
# edges : [[_ => 2]]
#
# At this point, 'seen' will be the set {0, 2}. What we need to do is
# to create a new list of states that doesn't have state 1 in it.
# That will cause state 2 to become to state 1, meaning that we need
# to adjust edges so that the pointer to state 2 is updated to state
# 1. In other words we want to achieve this output:
#
# states: [0, 2]
# edges : [_ => 1]
#
# The way we do this is to first iterate over all states, working out
# what the mapping from seen states to their new offsets is.
gc_states: list[tuple[ItemSet, ItemSet]] = []
offsets: list[int] = []
offset = 0
for state_i, zstate in enumerate(states):
offsets.append(state_i - offset)
if state_i not in seen:
offset += 1
continue
gc_states.append(zstate)
# At this point the offsets list will be [0, 1, 1]. We now create new
# edges where each offset is corrected by looking it up in the
# offsets list.
gc_edges: list[dict[int, int]] = []
for st_edge_i, st_edges in enumerate(edges):
if st_edge_i not in seen:
continue
gc_edges.append({k: offsets[v] for k, v in st_edges.items()})
return (gc_states, gc_edges)
FlattenedWithMetadata = list["str|Terminal|tuple[dict[str,typing.Any],FlattenedWithMetadata]"]
@ -3009,7 +2825,7 @@ class Grammar:
"""
_precedence: dict[str, typing.Tuple[Assoc, int]]
_generator: type[GenerateLR1]
_generator: type[ParserGenerator]
_terminals: dict[str, Terminal]
_nonterminals: dict[str, NonTerminal]
_trivia: list[Terminal]
@ -3018,7 +2834,7 @@ class Grammar:
self,
start: str | NonTerminal | None = None,
precedence: PrecedenceList | None = None,
generator: type[GenerateLR1] | None = None,
generator: type[ParserGenerator] | None = None,
trivia: list[str | Terminal] | None = None,
name: str | None = None,
):
@ -3037,7 +2853,7 @@ class Grammar:
assert precedence is not None
if generator is None:
generator = getattr(self, "generator", GeneratePager)
generator = getattr(self, "generator", ParserGenerator)
assert generator is not None
if trivia is None:

View file

@ -87,8 +87,8 @@ def test_all_generators():
GENERATORS = [
# parser.GenerateLR0,
parser.GeneratePager,
parser.GenerateLR1,
# parser.GeneratePager,
parser.ParserGenerator,
]
for generator in GENERATORS:
table = G().build_table(generator=generator)
@ -119,15 +119,14 @@ def test_grammar_aho_ullman_2():
A = Terminal("a")
B = Terminal("b")
TestGrammar().build_table(generator=parser.GenerateLR1)
TestGrammar().build_table(generator=parser.GeneratePager)
TestGrammar().build_table(generator=parser.ParserGenerator)
# TestGrammar().build_table(generator=parser.GeneratePager)
def test_fun_lalr():
class TestGrammar(Grammar):
start = "S"
generator = parser.GeneratePager
@rule
def S(self):