Compare commits

..

No commits in common. "923b01f6fde917d641b6d26cffb690d771e46ee3" and "2d5c73f0b023d87b4a1f806de48542209082c32b" have entirely different histories.

2 changed files with 523 additions and 338 deletions

View file

@ -135,6 +135,7 @@ import bisect
import collections import collections
import dataclasses import dataclasses
import enum import enum
import functools
import inspect import inspect
import itertools import itertools
import json import json
@ -273,100 +274,7 @@ class ConfigSet(frozenset[Configuration]):
pass pass
# Here we have a slightly different definition of a ConfigurationSet; we keep class ConfigurationSetInfo:
# the lookaheads outside and use a dictionary to check for containment
# quickly. ItemSet is used in the GRM/Pager/Chin algorithm.
@dataclasses.dataclass
class ItemSet:
    """A group of configuration cores together with their lookahead "contexts".

    Unlike a ConfigSet, the lookaheads are kept outside the cores, in a
    dictionary keyed by core, so that containment checks and in-place context
    merges are cheap. ItemSet is the state representation used by the
    Pager-style table construction.

    An ItemSet is comparable for equality (dataclass equality over `items`),
    and also supports the lesser notion of being "weakly compatible", which is
    what the Pager algorithm uses to decide whether two states can be
    collapsed into one.
    """

    # Maps each core to the (mutable) set of lookahead symbols for that core.
    items: "dict[ConfigurationCore, set[int]]"

    def __init__(self, items=None):
        # Only substitute a fresh dict when nothing was passed, so that a
        # caller-supplied dict (even an empty one) is the one we hold on to.
        self.items = {} if items is None else items

    @classmethod
    def from_config_set(cls, config_set: "ConfigSet") -> "ItemSet":
        """Build an ItemSet from configurations that carry their own lookahead.

        If several configurations share a core, their lookaheads are unioned
        into one context instead of the last configuration silently winning.
        """
        items: dict[ConfigurationCore, set[int]] = {}
        for config in config_set:
            items.setdefault(config.core, set()).update(config.lookahead)
        return cls(items)

    def weakly_compatible(self, other: "ItemSet") -> bool:
        """Return True if `other` satisfies Pager's weak-compatibility test
        against this item set.

        Both sets must contain exactly the same cores. Beyond that, for every
        pair of distinct cores (i, j), writing a = self and b = other, at
        least one of these must hold:

          * a[i] is disjoint from b[j] AND a[j] is disjoint from b[i]; or
          * a[i] intersects a[j], OR b[i] intersects b[j].

        Item sets with zero or one core have no pairs to check, so they are
        compatible whenever their cores match.
        """
        a = self.items
        b = other.items

        # Key views compare like sets: same cores, regardless of order.
        if a.keys() != b.keys():
            return False

        # Every unordered pair of cores, in dictionary order.
        for i_key, j_key in itertools.combinations(a, 2):
            a_i, a_j = a[i_key], a[j_key]
            b_i, b_j = b[i_key], b[j_key]

            # The reference implementation is phrased with intersects(); we
            # only have isdisjoint(), so each test is the negated form:
            #
            #   not (intersect(a_i, b_j) or intersect(a_j, b_i))
            #   == disjoint(a_i, b_j) and disjoint(a_j, b_i)
            if a_i.isdisjoint(b_j) and a_j.isdisjoint(b_i):
                continue

            #   intersect(a_i, a_j) or intersect(b_i, b_j)
            #   == not (disjoint(a_i, a_j) and disjoint(b_i, b_j))
            if not (a_i.isdisjoint(a_j) and b_i.isdisjoint(b_j)):
                continue

            return False

        return True

    def weakly_merge(self, other: "ItemSet") -> bool:
        """Union the contexts of `other` into this item set, in place.

        Returns True if any context in this set grew. `other` must have an
        entry for every core in this set (KeyError otherwise); callers are
        expected to have checked `weakly_compatible` first.
        """
        incoming = other.items
        changed = False
        for core, context in self.items.items():
            before = len(context)
            context.update(incoming[core])  # set.update() does not report changes
            if len(context) != before:
                changed = True
        return changed

    def goto(self, symbol: int) -> "ItemSet":
        """Return the item set reached by consuming `symbol`.

        Every core whose next symbol is `symbol` is advanced by one position
        and carries a copy of its context; all other cores are dropped. The
        result is empty if no core is positioned before `symbol`.
        """
        result = ItemSet()
        for core, context in self.items.items():
            if core.next == symbol:
                advanced = core.replace_position(core.position + 1)
                result.items[advanced] = set(context)
        return result

    def to_config_set(self) -> "ConfigSet":
        """Convert back to a ConfigSet, with each context stored on its
        configuration as a sorted tuple of lookahead symbols.
        """
        return ConfigSet(
            {Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()}
        )
@dataclasses.dataclass
class StateGraph:
"""When we build a grammar into a table, the first thing we need to do is """When we build a grammar into a table, the first thing we need to do is
generate all the configuration sets and their successors. generate all the configuration sets and their successors.
@ -381,23 +289,65 @@ class StateGraph:
structure, but they all compute this information.) structure, but they all compute this information.)
""" """
closures: list[ConfigSet] core_key: dict[ConfigSet, int] # Map a ConfigSet into an index
config_set_key: dict[ConfigSet, int] # Map a ConfigSet into an index
sets: list[ConfigSet] # Map the index back into a set
closures: list[ConfigSet | None] # Track closures
# All the successors for all of the sets. `successors[i]` is the mapping # All the successors for all of the sets. `successors[i]` is the mapping
# from grammar symbol to the index of the set you get by processing that # from grammar symbol to the index of the set you get by processing that
# symbol. # symbol.
successors: list[dict[int, int]] successors: list[dict[int, int]]
def __init__(self):
    """Start with no registered sets, closures, or successor edges."""
    # Reverse lookups from a config set to its index.
    self.core_key, self.config_set_key = {}, {}
    # Tables addressed by set index.
    self.sets, self.closures, self.successors = [], [], []
def register_core(self, c: ConfigSet) -> typing.Tuple[int, bool]:
    """Look up `c`, adding it to the set of sets if it is not yet known.

    Returns `(index, is_new)`: the canonical index of the set within this
    structure, and whether this call was the one that added it.

    The index can be used to get the set back from `sets`, and to address
    the matching entries of `closures` and `successors`. A newly added set
    starts with no closure (None) and an empty successor table.
    """
    if c in self.core_key:
        return self.core_key[c], False

    new_index = len(self.sets)
    self.core_key[c] = new_index
    # Grow the three index-addressed tables together so they stay aligned.
    self.sets.append(c)
    self.closures.append(None)
    self.successors.append({})
    return new_index, True
def register_config_closure(self, c_id: int, closure: ConfigSet):
    """Record `closure` as the closure of the set with index `c_id`.

    The closure is also entered into `config_set_key` so the index can be
    found from the closure later. A set's closure may only be registered
    once; this is checked with an assertion.
    """
    assert self.closures[c_id] is None
    self.config_set_key[closure] = c_id
    self.closures[c_id] = closure
def add_successor(self, c_id: int, symbol: int, successor: int):
    """Register successor(`c_id`, `symbol`) -> `successor`.

    `c_id` is the index of the source set in this structure, `symbol` is the
    id of a symbol in the alphabet of the grammar, and `successor` is the
    index of the set reached on that symbol. An existing entry for the same
    (`c_id`, `symbol`) pair is overwritten.
    """
    transitions = self.successors[c_id]
    transitions[symbol] = successor
def dump_state(self, alphabet: list[str]) -> str: def dump_state(self, alphabet: list[str]) -> str:
return json.dumps( return json.dumps(
{ {
str(set_index): { str(set_index): {
"closures": [c.format(alphabet) for c in closure], "configs": [c.format(alphabet) for c in config_set],
"successors": {alphabet[k]: str(v) for k, v in successors.items()}, "closures": [c.format(alphabet) for c in self.closures[set_index] or []],
"successors": {
alphabet[k]: str(v) for k, v in self.successors[set_index].items()
},
} }
for set_index, (closure, successors) in enumerate( for set_index, config_set in enumerate(self.sets)
zip(self.closures, self.successors)
)
}, },
indent=4, indent=4,
sort_keys=True, sort_keys=True,
@ -414,8 +364,7 @@ class StateGraph:
This function raises KeyError if no path is found. This function raises KeyError if no path is found.
""" """
# TODO: This should be tested. target_index = self.config_set_key[target_set]
target_index = self.closures.index(target_set)
visited = set() visited = set()
queue: collections.deque = collections.deque() queue: collections.deque = collections.deque()
@ -558,7 +507,7 @@ class ErrorCollection:
def gen_exception( def gen_exception(
self, self,
alphabet: list[str], alphabet: list[str],
all_sets: StateGraph, all_sets: ConfigurationSetInfo,
) -> AmbiguityError | None: ) -> AmbiguityError | None:
"""Format all the errors into an error, or return None if there are no """Format all the errors into an error, or return None if there are no
errors. errors.
@ -695,7 +644,7 @@ class TableBuilder(object):
self.action_row = None self.action_row = None
self.goto_row = None self.goto_row = None
def flush(self, all_sets: StateGraph) -> ParseTable: def flush(self, all_sets: ConfigurationSetInfo) -> ParseTable:
"""Finish building the table and return it. """Finish building the table and return it.
Raises ValueError if there were any conflicts during construction. Raises ValueError if there were any conflicts during construction.
@ -1058,36 +1007,108 @@ class FollowInfo:
return FollowInfo(follows=follows) return FollowInfo(follows=follows)
class ParserGenerator: # Here we have a slightly different definition of a ConfigurationSet; we keep the
"""Generate parse tables for LR1 grammars. # lookaheads outside and use a dictionary to check for containment quickly.
# ItemSet is used in the GRM/Pager/Chin algorithm.
@dataclasses.dataclass
class ItemSet:
"""An ItemSet is a group of configuration cores together with their
"contexts", or lookahead sets.
This class implements a variant of pager's algorithm to generate the parse An ItemSet is comparable for equality, and also supports this lesser notion
tables, which support the same set of languages as Canonical LR1 but with of "weakly compatible" which is used to collapse states in the pager
much smaller resulting parse tables. algorithm.
"""
I'll be honest, I don't understnd this one as well as the pure LR1 items: dict[ConfigurationCore, set[int]]
algorithm. It proceeds as LR1, generating successor states, but every
time it makes a new state it searches the states it has already made for
one that is "weakly compatible;" if it finds one it merges the new state
with the old state and marks the old state to be re-visited.
The implementation here follows from the implementation in def __init__(self, items=None):
`GRMTools<https://github.com/softdevteam/grmtools/blob/master/lrtable/src/lib/pager.rs>`_. self.items = items or {}
As they explain there: @classmethod
def from_config_set(cls, config_set: ConfigSet) -> "ItemSet":
return ItemSet({config.core: set(config.lookahead) for config in config_set})
> The general algorithms that form the basis of what's used in this file def weakly_compatible(self, other: "ItemSet") -> bool:
> can be found in: a = self.items
> b = other.items
> A Practical General Method for Constructing LR(k) Parsers
> David Pager, Acta Informatica 7, 249--268, 1977 if len(a) != len(b):
> return False
> However Pager's paper is dense, and doesn't name sub-parts of the
> algorithm. We mostly reference the (still incomplete, but less for acore in a:
> incomplete) version of the algorithm found in: if acore not in b:
> return False
> Measuring and extending LR(1) parser generation
> Xin Chen, PhD thesis, University of Hawaii, 2009 if len(a) == 1:
return True
# DOTY: This loop I do not understand, truly. What the heck is happening here?
a_keys = list(a.keys())
for i, i_key in enumerate(itertools.islice(a_keys, 0, len(a_keys) - 1)):
for j_key in itertools.islice(a_keys, i + 1, None):
a_i_key = a[i_key]
b_i_key = b[i_key]
a_j_key = a[j_key]
b_j_key = b[j_key]
# DOTY: GRMTools written with intersects(); we don't have that we have
# `not disjoint()`. :P There are many double negatives....
#
# not (intersect(a_i, b_j) or intersect(a_j, b_i))
# not ((not disjoint(a_i, b_j)) or (not disjoint(a_j, b_i)))
# ((not not disjoint(a_i, b_j)) and (not not disjoint(a_j, b_i)))
# disjoint(a_i, b_j) and disjoint(a_j, b_i)
if a_i_key.isdisjoint(b_j_key) and a_j_key.isdisjoint(b_i_key):
continue
# intersect(a_i, a_j) or intersect(b_i, b_j)
# (not disjoint(a_i, a_j)) or (not disjoint(b_i, b_j))
# not (disjoint(a_i, a_j) and disjoint(b_i, b_j))
if not (a_i_key.isdisjoint(a_j_key) and b_i_key.isdisjoint(b_j_key)):
continue
return False
return True
def weakly_merge(self, other: "ItemSet") -> bool:
    """Union the contexts of `other` into this item set, in place.

    Returns True if any context in this set grew as a result. `other` must
    have an entry for every core in this set (KeyError otherwise).
    """
    incoming = other.items
    grew = False
    for core, context in self.items.items():
        size_before = len(context)
        context.update(incoming[core])  # set.update() does not report changes
        if len(context) != size_before:
            grew = True
    return grew
def goto(self, symbol: int) -> "ItemSet":
    """Return the item set reached by consuming `symbol`.

    Each core whose next symbol is `symbol` is advanced one position and
    keeps a copy of its context; every other core is dropped.
    """
    return ItemSet(
        {
            core.replace_position(core.position + 1): set(context)
            for core, context in self.items.items()
            if core.next == symbol
        }
    )
def to_config_set(self) -> ConfigSet:
    """Convert to a ConfigSet, storing each context on its configuration as
    a sorted tuple of lookahead symbols.
    """
    configs = set()
    for core, context in self.items.items():
        lookahead = tuple(sorted(context))
        configs.add(Configuration(core, lookahead))
    return ConfigSet(configs)
class GenerateLR1:
"""Generate parse tables for LR1, or "canonical LR" grammars.
LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they
are choosier about when they reduce. But unlike SLR parsers, they specify
the terminals on which they reduce by carrying a 'lookahead' terminal in
the configuration. The lookahead of a configuration is computed as the
closure of a configuration set is computed, so see gen_closure_next for
details. (Except for the start configuration, which has '$' as its
lookahead.)
""" """
# Internally we use integers as symbols, not strings. Mostly this is fine, # Internally we use integers as symbols, not strings. Mostly this is fine,
@ -1150,9 +1171,9 @@ class ParserGenerator:
non-terminal being added, and the second element of the tuple is the non-terminal being added, and the second element of the tuple is the
list of terminals and non-terminals that make up the production. list of terminals and non-terminals that make up the production.
There is no support for alternation. If you want alternations that There is currently no support for custom actions or alternation or
you'll have to lower the grammar by hand into the simpler form first, anything like that. If you want alternations that you'll have to lower
but that's what the Grammar and NonTerminal classes are for. the grammar by hand into the simpler form first.
Don't name anything with double-underscores; those are reserved for Don't name anything with double-underscores; those are reserved for
the generator. Don't add '$' either, as it is reserved to mean the generator. Don't add '$' either, as it is reserved to mean
@ -1252,215 +1273,105 @@ class ParserGenerator:
self._firsts, self._firsts,
) )
def gen_sets(self, seeds: list[Configuration]) -> StateGraph: def gen_closure(self, seeds: typing.Iterable[Configuration]) -> ConfigSet:
# This function can be seen as a modified version of items() from """Compute the closure for the specified configs. The closure is all
# Chen's dissertation. of the configurations we could be in. Specifically, if the position
# for a config is just before a non-terminal then we must also consider
# DOTY: It is also (practically) a converted version from grmtools configurations where the rule is the rule for the non-terminal and
# into python, more or less verbatim at this point. I have some the position is just before the beginning of the rule.
# sense of what is going on, and attempt to elaborate with
# these comments.
# closed_states and core_states are both equally sized vectors of (We have replaced a recursive version with an iterative one.)
# states. Core states are smaller, and used for the weakly compatible """
# checks, but we ultimately need to return closed states. Closed closure: set[Configuration] = set()
# states which are None are those which require processing; thus pending = list(seeds)
# closed_states also implicitly serves as a todo list. pending_next = []
closed_states: list[ItemSet | None] = [] while len(pending) > 0:
core_states: list[ItemSet] = [] for config in pending:
edges: list[dict[int, int]] = [] if config in closure:
# Convert the incoming seed configurations into item sets.
# TODO: Convert everything to ItemSet natively.
state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds})
core_states.append(state0)
closed_states.append(None)
edges.append({})
# We maintain a set of which rules and tokens we've seen; when
# processing a given state there's no point processing a rule or
# token more than once.
seen: set[int] = set()
# cnd_[rule|token]_weaklies represent which states are possible weakly
# compatible matches for a given symbol.
#
# DOTY: As with `seen`, we have a uniform space so we can have a
# uniform one of these too.
cnd_weaklies: list[list[int]] = [[] for _ in range(len(self.alphabet))]
todo = 1 # How many None values are there in closed_states?
todo_off = 0 # Offset in closed states to start searching for the next todo.
while todo > 0:
assert len(core_states) == len(closed_states)
assert len(core_states) == len(edges)
# state_i is the next item to process. We don't want to
# continually search for the next None from the beginning, so we
# remember where we last saw a None (todo_off) and search from
# that point onwards, wrapping as necessary. Since processing a
# state x disproportionately causes state x + 1 to require
# processing, this prevents the search from becoming horribly
# non-linear.
try:
state_i = closed_states.index(None, todo_off)
except ValueError:
state_i = closed_states.index(None) # DOTY: Will not raise, given todo > 0
todo_off = state_i + 1
todo -= 1
cl_state = self.gen_closure(core_states[state_i])
closed_states[state_i] = cl_state
seen.clear()
for core in cl_state.items.keys():
sym = core.next
if sym is None or sym in seen:
continue
seen.add(sym)
nstate = cl_state.goto(sym)
# Try and find a compatible match for this state.
cnd_states = cnd_weaklies[sym]
# First of all see if any of the candidate states are exactly
# the same as the new state, in which case we only need to
# add an edge to the candidate state. This isn't just an
# optimisation (though it does avoid the expense of change
# propagation), but has a correctness aspect: there's no
# guarantee that the weakly compatible check is reflexive
# (i.e. a state may not be weakly compatible with itself).
found = False
for cnd in cnd_states:
if core_states[cnd] == nstate:
edges[state_i][sym] = cnd
found = True
break
if found:
continue continue
# No candidate states were equal to the new state, so we need closure.add(config)
# to look for a candidate state which is weakly compatible. pending_next.extend(self.gen_closure_next(config))
m: int | None = None
for cnd in cnd_states:
if core_states[cnd].weakly_compatible(nstate):
m = cnd
break
if m is not None: temp = pending
# A weakly compatible match has been found. pending = pending_next
edges[state_i][sym] = m pending_next = temp
assert core_states[m].weakly_compatible(nstate) # TODO: REMOVE, TOO SLOW pending_next.clear()
if core_states[m].weakly_merge(nstate):
# We only do the simplest change propagation, forcing possibly
# affected sets to be entirely reprocessed (which will recursively
# force propagation too). Even though this does unnecessary
# computation, it is still pretty fast.
#
# Note also that edges[k] will be completely regenerated, overwriting
# all existing entries and possibly adding new ones. We thus don't
# need to clear it manually.
if closed_states[m] is not None:
closed_states[m] = None
todo += 1
# NOTE: The generation of this closure *might* have generated
# multiple cores with different lookaheads; if that's
# the case we need to merge.
merged: dict[ConfigurationCore, set[int]] = {}
for c in closure:
existing = merged.get(c.core)
if existing is not None:
existing.update(c.lookahead)
else: else:
stidx = len(core_states) merged[c.core] = set(c.lookahead)
cnd_weaklies[sym].append(stidx) return ConfigSet(Configuration(k, tuple(sorted(v))) for k, v in merged.items())
edges[state_i][sym] = stidx
edges.append({}) def gen_all_successors(
closed_states.append(None) self, config_set: typing.Iterable[Configuration]
core_states.append(nstate) ) -> list[typing.Tuple[int, ConfigSet]]:
todo += 1 """Return all of the non-empty successors for the given config set.
# Although the Pager paper doesn't talk about it, the algorithm above (That is, given the config set, pretend we see all the symbols we
# can create unreachable states due to the non-determinism inherent could possibly see, and figure out which configs sets we get from
# in working with hashsets. Indeed, this can even happen with the those symbols. Those are the successors of this set.)
# example from Pager's paper (on perhaps 1 out of 100 runs, 24 or 25 """
# states will be created instead of 23). We thus need to weed out possible = {config.core.next for config in config_set if config.core.next is not None}
# unreachable states and update edges accordingly.
assert len(core_states) == len(closed_states)
all_states = [] next = []
for core_state, closed_state in zip(core_states, closed_states): for symbol in possible:
assert closed_state is not None seeds = ConfigSet(
all_states.append((core_state, closed_state)) config.replace_position(config.core.position + 1)
gc_states, gc_edges = self.gc(all_states, edges) for config in config_set
if config.core.next == symbol
# DOTY: UGH this is so bad, we should rewrite to use ItemSet everywehre
# probably, which actually means getting rid of the pluggable
# generator because who actually needs that?
# Register all the actually merged, final config sets. I should *not*
# have to do all this work. Really really garbage.
return StateGraph(
closures=[closed_state.to_config_set() for _, closed_state in gc_states],
successors=gc_edges,
) )
if len(seeds) > 0:
next.append((symbol, seeds))
def gc( return next
self,
states: list[tuple[ItemSet, ItemSet]],
edges: list[dict[int, int]],
) -> tuple[list[tuple[ItemSet, ItemSet]], list[dict[int, int]]]:
# First of all, do a simple pass over all states. All state indexes
# reachable from the start state will be inserted into the 'seen'
# set.
todo = [0]
seen = set()
while len(todo) > 0:
item = todo.pop()
if item in seen:
continue
seen.add(item)
todo.extend(e for e in edges[item].values() if e not in seen)
if len(seen) == len(states): def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo:
# Every state is reachable. """Generate all configuration sets starting from the provided seeds."""
return states, edges result = ConfigurationSetInfo()
# Imagine we started with 3 states and their edges: successors = []
# states: [0, 1, 2] pending = [ConfigSet(seeds)]
# edges : [[_ => 2]] pending_next = []
# while len(pending) > 0:
# At this point, 'seen' will be the set {0, 2}. What we need to do is for core in pending:
# to create a new list of states that doesn't have state 1 in it. id, is_new = result.register_core(core)
# That will cause state 2 to become to state 1, meaning that we need if is_new:
# to adjust edges so that the pointer to state 2 is updated to state config_set = self.gen_closure(core)
# 1. In other words we want to achieve this output: result.register_config_closure(id, config_set)
# for symbol, successor in self.gen_all_successors(config_set):
# states: [0, 2] successors.append((id, symbol, successor))
# edges : [_ => 1] pending_next.append(successor)
#
# The way we do this is to first iterate over all states, working out
# what the mapping from seen states to their new offsets is.
gc_states: list[tuple[ItemSet, ItemSet]] = []
offsets: list[int] = []
offset = 0
for state_i, zstate in enumerate(states):
offsets.append(state_i - offset)
if state_i not in seen:
offset += 1
continue
gc_states.append(zstate) temp = pending
pending = pending_next
pending_next = temp
pending_next.clear()
# At this point the offsets list will be [0, 1, 1]. We now create new for id, symbol, successor in successors:
# edges where each offset is corrected by looking it up in the result.add_successor(id, symbol, result.core_key[successor])
# offsets list.
gc_edges: list[dict[int, int]] = []
for st_edge_i, st_edges in enumerate(edges):
if st_edge_i not in seen:
continue
gc_edges.append({k: offsets[v] for k, v in st_edges.items()}) return result
return (gc_states, gc_edges) def gen_follow(self, symbol: int) -> set[int]:
"""Generate the follow set for the given nonterminal.
The follow set for a nonterminal is the set of terminals that can
follow the nonterminal in a valid sentence. The resulting set never
contains epsilon and is never empty, since we should always at least
ground out at '$', which is the end-of-stream marker.
See FollowInfo for more information on how this is determined.
"""
return self._follows.follows[symbol]
def gen_first(self, symbols: typing.Iterable[int]) -> typing.Tuple[set[int], bool]: def gen_first(self, symbols: typing.Iterable[int]) -> typing.Tuple[set[int], bool]:
"""Return the first set for a *sequence* of symbols. """Return the first set for a *sequence* of symbols.
@ -1483,15 +1394,45 @@ class ParserGenerator:
return (result, True) return (result, True)
def gen_closure(self, items: ItemSet) -> ItemSet: def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]:
"""Generate the closure of the given ItemSet. """Return the set of symbols that indicate we should reduce the given
config.
Some of the configurations the ItemSet might be positioned right before In an LR1 parser, this is the lookahead of the configuration.
nonterminals. In that case, obviously, we should *also* behave as if we
were right at the beginning of each production for that nonterminal. The
set of all those productions combined with all the incoming productions
is the closure.
""" """
return config.lookahead
def gen_closure_next(self, config: Configuration):
    """Return the configurations that `config` directly adds to a closure.

    If `config` is positioned before a symbol, one new configuration is
    produced for each rule listed in the grammar for that symbol. If it is
    positioned at the end of its production, nothing is produced.

    In LR1 parsers the new configurations need a lookahead. It is the
    first() of the rest of this config's production; if that can be epsilon,
    then the lookahead *also* contains the lookahead this config already
    has. (That lookahead was presumably generated by the same process, so in
    some sense it is a 'parent' lookahead, from an upstream production in
    the grammar.)

    Returns a tuple of configurations, possibly empty.
    """
    next_symbol = config.core.next
    if next_symbol is None:
        # Already at the end of the production; nothing to expand.
        return ()

    lookahead, epsilon = self.gen_first(config.rest)
    if epsilon:
        lookahead.update(config.lookahead)
    # Every new configuration shares the same sorted lookahead tuple.
    lookahead_tuple = tuple(sorted(lookahead))

    return tuple(
        Configuration.from_rule(next_symbol, rule, lookahead=lookahead_tuple)
        for rule in self.grammar[next_symbol]
    )
def gen_closure_x(self, items: ItemSet) -> ItemSet:
closure: dict[ConfigurationCore, set[int]] = {} closure: dict[ConfigurationCore, set[int]] = {}
# We're going to maintain a set of things to look at, rules that we # We're going to maintain a set of things to look at, rules that we
@ -1583,7 +1524,7 @@ class ParserGenerator:
config_next = config.core.next config_next = config.core.next
if config_next is None: if config_next is None:
if config.core.name != self.start_symbol: if config.core.name != self.start_symbol:
for a in config.lookahead: for a in self.gen_reduce_set(config):
builder.set_table_reduce(a, config) builder.set_table_reduce(a, config)
else: else:
builder.set_table_accept(self.end_symbol, config) builder.set_table_accept(self.end_symbol, config)
@ -1600,6 +1541,249 @@ class ParserGenerator:
return builder.flush(config_sets) return builder.flush(config_sets)
class GeneratePager(GenerateLR1):
"""Pager's algorithm.
I'll be honest, I don't understnd this one as well as the pure LR1
algorithm. It proceeds as LR1, generating successor states, but every
time it makes a new state it searches the states it has already made for
one that is "weakly compatible;" ifit finds one it merges the new state
with the old state and marks the old state to be re-visited.
The implementation here follows from the implementation in
`GRMTools<https://github.com/softdevteam/grmtools/blob/master/lrtable/src/lib/pager.rs>`_.
As they explain there:
> The general algorithms that form the basis of what's used in this file
> can be found in:
>
> A Practical General Method for Constructing LR(k) Parsers
> David Pager, Acta Informatica 7, 249--268, 1977
>
> However Pager's paper is dense, and doesn't name sub-parts of the
> algorithm. We mostly reference the (still incomplete, but less
> incomplete) version of the algorithm found in:
>
> Measuring and extending LR(1) parser generation
> Xin Chen, PhD thesis, University of Hawaii, 2009
"""
def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo:
# This function can be seen as a modified version of items() from
# Chen's dissertation.
#
# DOTY: It is also (practically) a converted version from grmtools
# into python, more or less verbatim at this point. I have some
# sense of what is going on, and attempt to elaborate with
# these comments.
# closed_states and core_states are both equally sized vectors of
# states. Core states are smaller, and used for the weakly compatible
# checks, but we ultimately need to return closed states. Closed
# states which are None are those which require processing; thus
# closed_states also implicitly serves as a todo list.
closed_states: list[ItemSet | None] = []
core_states: list[ItemSet] = []
edges: list[dict[int, int]] = []
# Convert the incoming seed configurations into item sets.
# TODO: Convert everything to ItemSet natively.
state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds})
core_states.append(state0)
closed_states.append(None)
edges.append({})
# We maintain a set of which rules and tokens we've seen; when
# processing a given state there's no point processing a rule or
# token more than once.
seen: set[int] = set()
# cnd_[rule|token]_weaklies represent which states are possible weakly
# compatible matches for a given symbol.
#
# DOTY: As with `seen`, we have a uniform space so we can have a
# uniform one of these too.
cnd_weaklies: list[list[int]] = [[] for _ in range(len(self.alphabet))]
todo = 1 # How many None values are there in closed_states?
todo_off = 0 # Offset in closed states to start searching for the next todo.
while todo > 0:
assert len(core_states) == len(closed_states)
assert len(core_states) == len(edges)
# state_i is the next item to process. We don't want to
# continually search for the next None from the beginning, so we
# remember where we last saw a None (todo_off) and search from
# that point onwards, wrapping as necessary. Since processing a
# state x disproportionately causes state x + 1 to require
# processing, this prevents the search from becoming horribly
# non-linear.
try:
state_i = closed_states.index(None, todo_off)
except ValueError:
state_i = closed_states.index(None) # DOTY: Will not raise, given todo > 0
todo_off = state_i + 1
todo -= 1
cl_state = self.gen_closure_x(core_states[state_i])
closed_states[state_i] = cl_state
seen.clear()
for core in cl_state.items.keys():
sym = core.next
if sym is None or sym in seen:
continue
seen.add(sym)
nstate = cl_state.goto(sym)
# Try and find a compatible match for this state.
cnd_states = cnd_weaklies[sym]
# First of all see if any of the candidate states are exactly
# the same as the new state, in which case we only need to
# add an edge to the candidate state. This isn't just an
# optimisation (though it does avoid the expense of change
# propagation), but has a correctness aspect: there's no
# guarantee that the weakly compatible check is reflexive
# (i.e. a state may not be weakly compatible with itself).
found = False
for cnd in cnd_states:
if core_states[cnd] == nstate:
edges[state_i][sym] = cnd
found = True
break
if found:
continue
# No candidate states were equal to the new state, so we need
# to look for a candidate state which is weakly compatible.
m: int | None = None
for cnd in cnd_states:
if core_states[cnd].weakly_compatible(nstate):
m = cnd
break
if m is not None:
# A weakly compatible match has been found.
edges[state_i][sym] = m
assert core_states[m].weakly_compatible(nstate) # TODO: REMOVE, TOO SLOW
if core_states[m].weakly_merge(nstate):
# We only do the simplest change propagation, forcing possibly
# affected sets to be entirely reprocessed (which will recursively
# force propagation too). Even though this does unnecessary
# computation, it is still pretty fast.
#
# Note also that edges[k] will be completely regenerated, overwriting
# all existing entries and possibly adding new ones. We thus don't
# need to clear it manually.
if closed_states[m] is not None:
closed_states[m] = None
todo += 1
else:
stidx = len(core_states)
cnd_weaklies[sym].append(stidx)
edges[state_i][sym] = stidx
edges.append({})
closed_states.append(None)
core_states.append(nstate)
todo += 1
# Although the Pager paper doesn't talk about it, the algorithm above
# can create unreachable states due to the non-determinism inherent
# in working with hashsets. Indeed, this can even happen with the
# example from Pager's paper (on perhaps 1 out of 100 runs, 24 or 25
# states will be created instead of 23). We thus need to weed out
# unreachable states and update edges accordingly.
assert len(core_states) == len(closed_states)
all_states = []
for core_state, closed_state in zip(core_states, closed_states):
assert closed_state is not None
all_states.append((core_state, closed_state))
gc_states, gc_edges = self.gc(all_states, edges)
# DOTY: UGH this is so bad, we should rewrite to use ItemSet everywehre
# probably, which actually means getting rid of the pluggable
# generator because who actually needs that?
# Register all the actually merged, final config sets. I should *not*
# have to do all this work. Really really garbage.
result = ConfigurationSetInfo()
result.sets = [core_state.to_config_set() for core_state, _ in gc_states]
result.core_key = {s: i for i, s in enumerate(result.sets)}
result.closures = [closed_state.to_config_set() for _, closed_state in gc_states]
result.config_set_key = {s: i for i, s in enumerate(result.closures) if s is not None}
result.successors = gc_edges
return result
def gc(
    self,
    states: list[tuple[ItemSet, ItemSet]],
    edges: list[dict[int, int]],
) -> tuple[list[tuple[ItemSet, ItemSet]], list[dict[int, int]]]:
    """Drop every state that cannot be reached from state 0 and renumber.

    Args:
        states: One entry per state; the caller passes (core, closed)
            pairs. Entries are never inspected here, only kept or dropped.
        edges: Parallel to `states`. `edges[i]` maps an edge key to the
            index of the state that edge leads to.

    Returns:
        A `(states, edges)` pair containing only the reachable states, in
        their original relative order, with every edge target rewritten to
        the state's new index. When every state is reachable the two input
        lists are returned as-is (same objects, no copy). When `states` is
        empty, a pair of empty lists is returned.
    """
    if not states:
        # There is no start state to walk from; without this guard the
        # `edges[0]` lookup below would raise IndexError.
        return [], []

    # First of all, do a simple pass over all states. Every state index
    # reachable from the start state (index 0) ends up in 'seen'.
    seen: set[int] = set()
    todo = [0]
    while todo:
        item = todo.pop()
        if item in seen:
            continue
        seen.add(item)
        todo.extend(e for e in edges[item].values() if e not in seen)

    if len(seen) == len(states):
        # Every state is reachable.
        return states, edges

    # Imagine we started with 3 states and their edges:
    #   states: [0, 1, 2]
    #   edges : [{_ => 2}, {}, {}]
    #
    # At this point 'seen' is the set {0, 2}. We need a new list of states
    # without state 1 in it. That makes the old state 2 the new state 1,
    # so the edge pointing at 2 has to be updated to point at 1:
    #
    #   states: [0, 2]
    #   edges : [{_ => 1}, {}]
    #
    # offsets[i] is the new index of old state i: the number of reachable
    # states that precede it, which is exactly len(gc_states) at the
    # moment state i is visited. (Entries for unreachable states are
    # never looked up, because no reachable state has an edge to them.)
    gc_states: list[tuple[ItemSet, ItemSet]] = []
    offsets: list[int] = []
    for state_i, zstate in enumerate(states):
        offsets.append(len(gc_states))
        if state_i in seen:
            gc_states.append(zstate)

    # For the example above the offsets list is [0, 1, 1]. Now rebuild the
    # edge maps of the surviving states, translating each target index.
    gc_edges: list[dict[int, int]] = [
        {k: offsets[v] for k, v in st_edges.items()}
        for st_edge_i, st_edges in enumerate(edges)
        if st_edge_i in seen
    ]
    return (gc_states, gc_edges)
FlattenedWithMetadata = list["str|Terminal|tuple[dict[str,typing.Any],FlattenedWithMetadata]"] FlattenedWithMetadata = list["str|Terminal|tuple[dict[str,typing.Any],FlattenedWithMetadata]"]
@ -2825,7 +3009,7 @@ class Grammar:
""" """
_precedence: dict[str, typing.Tuple[Assoc, int]] _precedence: dict[str, typing.Tuple[Assoc, int]]
_generator: type[ParserGenerator] _generator: type[GenerateLR1]
_terminals: dict[str, Terminal] _terminals: dict[str, Terminal]
_nonterminals: dict[str, NonTerminal] _nonterminals: dict[str, NonTerminal]
_trivia: list[Terminal] _trivia: list[Terminal]
@ -2834,7 +3018,7 @@ class Grammar:
self, self,
start: str | NonTerminal | None = None, start: str | NonTerminal | None = None,
precedence: PrecedenceList | None = None, precedence: PrecedenceList | None = None,
generator: type[ParserGenerator] | None = None, generator: type[GenerateLR1] | None = None,
trivia: list[str | Terminal] | None = None, trivia: list[str | Terminal] | None = None,
name: str | None = None, name: str | None = None,
): ):
@ -2853,7 +3037,7 @@ class Grammar:
assert precedence is not None assert precedence is not None
if generator is None: if generator is None:
generator = getattr(self, "generator", ParserGenerator) generator = getattr(self, "generator", GeneratePager)
assert generator is not None assert generator is not None
if trivia is None: if trivia is None:

View file

@ -87,8 +87,8 @@ def test_all_generators():
GENERATORS = [ GENERATORS = [
# parser.GenerateLR0, # parser.GenerateLR0,
# parser.GeneratePager, parser.GeneratePager,
parser.ParserGenerator, parser.GenerateLR1,
] ]
for generator in GENERATORS: for generator in GENERATORS:
table = G().build_table(generator=generator) table = G().build_table(generator=generator)
@ -119,14 +119,15 @@ def test_grammar_aho_ullman_2():
A = Terminal("a") A = Terminal("a")
B = Terminal("b") B = Terminal("b")
TestGrammar().build_table(generator=parser.ParserGenerator) TestGrammar().build_table(generator=parser.GenerateLR1)
# TestGrammar().build_table(generator=parser.GeneratePager) TestGrammar().build_table(generator=parser.GeneratePager)
def test_fun_lalr(): def test_fun_lalr():
class TestGrammar(Grammar): class TestGrammar(Grammar):
start = "S" start = "S"
generator = parser.GeneratePager
@rule @rule
def S(self): def S(self):