Compare commits
5 commits
2d5c73f0b0
...
923b01f6fd
| Author | SHA1 | Date | |
|---|---|---|---|
| 923b01f6fd | |||
| 27e6bb413c | |||
| 2b72811486 | |||
| e501caa073 | |||
| e55bc140f9 |
2 changed files with 338 additions and 523 deletions
852
parser/parser.py
852
parser/parser.py
|
|
@ -135,7 +135,6 @@ import bisect
|
|||
import collections
|
||||
import dataclasses
|
||||
import enum
|
||||
import functools
|
||||
import inspect
|
||||
import itertools
|
||||
import json
|
||||
|
|
@ -274,7 +273,100 @@ class ConfigSet(frozenset[Configuration]):
|
|||
pass
|
||||
|
||||
|
||||
class ConfigurationSetInfo:
|
||||
# Here we have a slightly different definition of a ConfigurationSet; we keep
|
||||
# the lookaheads outside and use a dictionary to check for containment
|
||||
# quickly. ItemSet is used in the GRM/Pager/Chin algorithm.
|
||||
@dataclasses.dataclass
class ItemSet:
    """An ItemSet is a group of configuration cores together with their
    "contexts", or lookahead sets.

    An ItemSet is comparable for equality (the dataclass-generated ``__eq__``
    compares the ``items`` dictionaries), and also supports this lesser notion
    of "weakly compatible" which is used to collapse states in the pager
    algorithm.
    """

    # Maps each configuration core to its context: the set of lookahead
    # symbol ids attached to that core.
    items: "dict[ConfigurationCore, set[int]]"

    def __init__(self, items=None):
        # A falsy argument (None or an empty dict) gets a fresh, private
        # dictionary; a non-empty dictionary is stored as-is (not copied).
        self.items = items or {}

    @classmethod
    def from_config_set(cls, config_set: "ConfigSet") -> "ItemSet":
        """Build an item set from the configurations in `config_set`.

        Each configuration contributes its core as a key and a fresh set made
        from its lookahead as the value.
        """
        # Use `cls` rather than a hard-coded class name so that subclasses
        # get instances of themselves back.
        return cls({config.core: set(config.lookahead) for config in config_set})

    def weakly_compatible(self, other: "ItemSet") -> bool:
        """Return True if `self` and `other` are weakly compatible.

        The two item sets must contain exactly the same cores. Beyond that,
        every unordered pair of cores (i, j) must satisfy at least one of:

        - the contexts do not cross-intersect: a[i] is disjoint from b[j]
          and a[j] is disjoint from b[i]; or
        - the pair already intersects within one of the sets: a[i] meets
          a[j], or b[i] meets b[j].

        Item sets with fewer than two cores have no pairs to check and are
        therefore compatible whenever their cores match.
        """
        a = self.items
        b = other.items

        # Same number of cores, and every core of `a` present in `b`, means
        # the two key sets are identical.
        if len(a) != len(b):
            return False
        if any(core not in b for core in a):
            return False

        # combinations() yields each (i, j) pair with i before j exactly
        # once, and yields nothing at all for zero or one cores.
        for i_key, j_key in itertools.combinations(a, 2):
            a_i, b_i = a[i_key], b[i_key]
            a_j, b_j = a[j_key], b[j_key]

            # GRMTools phrases these checks with intersects(); Python only
            # has isdisjoint(), so each test is the negation of an
            # intersection test.
            #
            #   not (intersect(a_i, b_j) or intersect(a_j, b_i))
            #   == disjoint(a_i, b_j) and disjoint(a_j, b_i)
            if a_i.isdisjoint(b_j) and a_j.isdisjoint(b_i):
                continue

            #   intersect(a_i, a_j) or intersect(b_i, b_j)
            #   == not (disjoint(a_i, a_j) and disjoint(b_i, b_j))
            if not (a_i.isdisjoint(a_j) and b_i.isdisjoint(b_j)):
                continue

            return False

        return True

    def weakly_merge(self, other: "ItemSet") -> bool:
        """Merge the contexts of `other` into `self`, in place.

        Returns True if this led to any changes. Every core in `self` must
        also be present in `other`; a missing core raises KeyError.
        """
        theirs = other.items

        changed = False
        for core, context in self.items.items():
            size_before = len(context)
            # set.update() does not report whether anything was added, so
            # detect growth by comparing sizes.
            context.update(theirs[core])
            if len(context) != size_before:
                changed = True

        return changed

    def goto(self, symbol: int) -> "ItemSet":
        """Return the item set reached from this one by consuming `symbol`.

        Every core whose next symbol is `symbol` is advanced by one position
        and carries a copy of its context; all other cores are dropped.
        """
        result = ItemSet()
        for core, context in self.items.items():
            if core.next == symbol:
                advanced = core.replace_position(core.position + 1)
                # Copy the context so later merges into the result do not
                # mutate this item set.
                result.items[advanced] = set(context)
        return result

    def to_config_set(self) -> "ConfigSet":
        """Convert back into a ConfigSet, with each context turned into a
        sorted tuple of lookahead symbols.
        """
        return ConfigSet(
            {Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()}
        )
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class StateGraph:
|
||||
"""When we build a grammar into a table, the first thing we need to do is
|
||||
generate all the configuration sets and their successors.
|
||||
|
||||
|
|
@ -289,65 +381,23 @@ class ConfigurationSetInfo:
|
|||
structure, but they all compute this information.)
|
||||
"""
|
||||
|
||||
core_key: dict[ConfigSet, int] # Map a ConfigSet into am index
|
||||
config_set_key: dict[ConfigSet, int] # Map a ConfigSet into am index
|
||||
sets: list[ConfigSet] # Map the index back into a set
|
||||
closures: list[ConfigSet | None] # Track closures
|
||||
closures: list[ConfigSet]
|
||||
|
||||
# All the successors for all of the sets. `successors[i]` is the mapping
|
||||
# from grammar symbol to the index of the set you get by processing that
|
||||
# symbol.
|
||||
successors: list[dict[int, int]]
|
||||
|
||||
def __init__(self):
|
||||
self.core_key = {}
|
||||
self.config_set_key = {}
|
||||
self.sets = []
|
||||
self.closures = []
|
||||
self.successors = []
|
||||
|
||||
def register_core(self, c: ConfigSet) -> typing.Tuple[int, bool]:
|
||||
"""Potentially add a new config set to the set of sets. Returns the
|
||||
canonical ID of the set within this structure, along with a boolean
|
||||
indicating whether the set was just added or not.
|
||||
|
||||
(You can use this integer to get the set back, if you need it, and
|
||||
also access the successors table.)
|
||||
"""
|
||||
existing = self.core_key.get(c)
|
||||
if existing is not None:
|
||||
return existing, False
|
||||
|
||||
index = len(self.sets)
|
||||
self.sets.append(c)
|
||||
self.closures.append(None)
|
||||
self.successors.append({})
|
||||
self.core_key[c] = index
|
||||
return index, True
|
||||
|
||||
def register_config_closure(self, c_id: int, closure: ConfigSet):
|
||||
assert self.closures[c_id] is None
|
||||
self.closures[c_id] = closure
|
||||
self.config_set_key[closure] = c_id
|
||||
|
||||
def add_successor(self, c_id: int, symbol: int, successor: int):
|
||||
"""Register sucessor(`c_id`, `symbol`) -> `successor`, where c_id
|
||||
is the id of the set in this structure, and symbol is the id of a
|
||||
symbol in the alphabet of the grammar.
|
||||
"""
|
||||
self.successors[c_id][symbol] = successor
|
||||
|
||||
def dump_state(self, alphabet: list[str]) -> str:
|
||||
return json.dumps(
|
||||
{
|
||||
str(set_index): {
|
||||
"configs": [c.format(alphabet) for c in config_set],
|
||||
"closures": [c.format(alphabet) for c in self.closures[set_index] or []],
|
||||
"successors": {
|
||||
alphabet[k]: str(v) for k, v in self.successors[set_index].items()
|
||||
},
|
||||
"closures": [c.format(alphabet) for c in closure],
|
||||
"successors": {alphabet[k]: str(v) for k, v in successors.items()},
|
||||
}
|
||||
for set_index, config_set in enumerate(self.sets)
|
||||
for set_index, (closure, successors) in enumerate(
|
||||
zip(self.closures, self.successors)
|
||||
)
|
||||
},
|
||||
indent=4,
|
||||
sort_keys=True,
|
||||
|
|
@ -364,7 +414,8 @@ class ConfigurationSetInfo:
|
|||
|
||||
This function raises KeyError if no path is found.
|
||||
"""
|
||||
target_index = self.config_set_key[target_set]
|
||||
# TODO: This should be tested.
|
||||
target_index = self.closures.index(target_set)
|
||||
visited = set()
|
||||
|
||||
queue: collections.deque = collections.deque()
|
||||
|
|
@ -507,7 +558,7 @@ class ErrorCollection:
|
|||
def gen_exception(
|
||||
self,
|
||||
alphabet: list[str],
|
||||
all_sets: ConfigurationSetInfo,
|
||||
all_sets: StateGraph,
|
||||
) -> AmbiguityError | None:
|
||||
"""Format all the errors into an error, or return None if there are no
|
||||
errors.
|
||||
|
|
@ -644,7 +695,7 @@ class TableBuilder(object):
|
|||
self.action_row = None
|
||||
self.goto_row = None
|
||||
|
||||
def flush(self, all_sets: ConfigurationSetInfo) -> ParseTable:
|
||||
def flush(self, all_sets: StateGraph) -> ParseTable:
|
||||
"""Finish building the table and return it.
|
||||
|
||||
Raises ValueError if there were any conflicts during construction.
|
||||
|
|
@ -1007,108 +1058,36 @@ class FollowInfo:
|
|||
return FollowInfo(follows=follows)
|
||||
|
||||
|
||||
# Here we have a slightly different definition of a ConfigurationSet; we keep the
|
||||
# lookaheads outside and use a dictionary to check for containment quickly.
|
||||
# ItemSet is used in the GRM/Pager/Chin algorithm.
|
||||
@dataclasses.dataclass
|
||||
class ItemSet:
|
||||
"""An ItemSet is a group of configuration cores together with their
|
||||
"contexts", or lookahead sets.
|
||||
class ParserGenerator:
|
||||
"""Generate parse tables for LR1 grammars.
|
||||
|
||||
An ItemSet is comparable for equality, and also supports this lesser notion
|
||||
of "weakly compatible" which is used to collapse states in the pager
|
||||
algorithm.
|
||||
"""
|
||||
This class implements a variant of pager's algorithm to generate the parse
|
||||
tables, which support the same set of languages as Canonical LR1 but with
|
||||
much smaller resulting parse tables.
|
||||
|
||||
items: dict[ConfigurationCore, set[int]]
|
||||
I'll be honest, I don't understand this one as well as the pure LR1
|
||||
algorithm. It proceeds as LR1, generating successor states, but every
|
||||
time it makes a new state it searches the states it has already made for
|
||||
one that is "weakly compatible;" if it finds one it merges the new state
|
||||
with the old state and marks the old state to be re-visited.
|
||||
|
||||
def __init__(self, items=None):
|
||||
self.items = items or {}
|
||||
The implementation here follows from the implementation in
|
||||
`GRMTools<https://github.com/softdevteam/grmtools/blob/master/lrtable/src/lib/pager.rs>`_.
|
||||
|
||||
@classmethod
|
||||
def from_config_set(cls, config_set: ConfigSet) -> "ItemSet":
|
||||
return ItemSet({config.core: set(config.lookahead) for config in config_set})
|
||||
As they explain there:
|
||||
|
||||
def weakly_compatible(self, other: "ItemSet") -> bool:
|
||||
a = self.items
|
||||
b = other.items
|
||||
|
||||
if len(a) != len(b):
|
||||
return False
|
||||
|
||||
for acore in a:
|
||||
if acore not in b:
|
||||
return False
|
||||
|
||||
if len(a) == 1:
|
||||
return True
|
||||
|
||||
# DOTY: This loop I do not understand, truly. What the heck is happening here?
|
||||
a_keys = list(a.keys())
|
||||
for i, i_key in enumerate(itertools.islice(a_keys, 0, len(a_keys) - 1)):
|
||||
for j_key in itertools.islice(a_keys, i + 1, None):
|
||||
a_i_key = a[i_key]
|
||||
b_i_key = b[i_key]
|
||||
a_j_key = a[j_key]
|
||||
b_j_key = b[j_key]
|
||||
|
||||
# DOTY: GRMTools written with intersects(); we don't have that we have
|
||||
# `not disjoint()`. :P There are many double negatives....
|
||||
#
|
||||
# not (intersect(a_i, b_j) or intersect(a_j, b_i))
|
||||
# not ((not disjoint(a_i, b_j)) or (not disjoint(a_j, b_i)))
|
||||
# ((not not disjoint(a_i, b_j)) and (not not disjoint(a_j, b_i)))
|
||||
# disjoint(a_i, b_j) and disjoint(a_j, b_i)
|
||||
if a_i_key.isdisjoint(b_j_key) and a_j_key.isdisjoint(b_i_key):
|
||||
continue
|
||||
|
||||
# intersect(a_i, a_j) or intersect(b_i, b_j)
|
||||
# (not disjoint(a_i, a_j)) or (not disjoint(b_i, b_j))
|
||||
# not (disjoint(a_i, a_j) and disjoint(b_i, b_j))
|
||||
if not (a_i_key.isdisjoint(a_j_key) and b_i_key.isdisjoint(b_j_key)):
|
||||
continue
|
||||
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def weakly_merge(self, other: "ItemSet") -> bool:
|
||||
"""Merge b into a, returning True if this lead to any changes."""
|
||||
a = self.items
|
||||
b = other.items
|
||||
|
||||
changed = False
|
||||
for a_key, a_ctx in a.items():
|
||||
start_len = len(a_ctx)
|
||||
a_ctx.update(b[a_key]) # Python doesn't tell us changes
|
||||
changed = changed or (start_len != len(a_ctx))
|
||||
|
||||
return changed
|
||||
|
||||
def goto(self, symbol: int) -> "ItemSet":
|
||||
result = ItemSet()
|
||||
for core, context in self.items.items():
|
||||
if core.next == symbol:
|
||||
next = core.replace_position(core.position + 1)
|
||||
result.items[next] = set(context)
|
||||
return result
|
||||
|
||||
def to_config_set(self) -> ConfigSet:
|
||||
return ConfigSet(
|
||||
{Configuration(core, tuple(sorted(ctx))) for core, ctx in self.items.items()}
|
||||
)
|
||||
|
||||
|
||||
class GenerateLR1:
|
||||
"""Generate parse tables for LR1, or "canonical LR" grammars.
|
||||
|
||||
LR1 parsers can recognize more than SLR parsers. Like SLR parsers, they
|
||||
are choosier about when they reduce. But unlike SLR parsers, they specify
|
||||
the terminals on which they reduce by carrying a 'lookahead' terminal in
|
||||
the configuration. The lookahead of a configuration is computed as the
|
||||
closure of a configuration set is computed, so see gen_closure_next for
|
||||
details. (Except for the start configuration, which has '$' as its
|
||||
lookahead.)
|
||||
> The general algorithms that form the basis of what's used in this file
|
||||
> can be found in:
|
||||
>
|
||||
> A Practical General Method for Constructing LR(k) Parsers
|
||||
> David Pager, Acta Informatica 7, 249--268, 1977
|
||||
>
|
||||
> However Pager's paper is dense, and doesn't name sub-parts of the
|
||||
> algorithm. We mostly reference the (still incomplete, but less
|
||||
> incomplete) version of the algorithm found in:
|
||||
>
|
||||
> Measuring and extending LR(1) parser generation
|
||||
> Xin Chen, PhD thesis, University of Hawaii, 2009
|
||||
"""
|
||||
|
||||
# Internally we use integers as symbols, not strings. Mostly this is fine,
|
||||
|
|
@ -1171,9 +1150,9 @@ class GenerateLR1:
|
|||
non-terminal being added, and the second element of the tuple is the
|
||||
list of terminals and non-terminals that make up the production.
|
||||
|
||||
There is currently no support for custom actions or alternation or
|
||||
anything like that. If you want alternations that you'll have to lower
|
||||
the grammar by hand into the simpler form first.
|
||||
There is no support for alternation. If you want alternations then
|
||||
you'll have to lower the grammar by hand into the simpler form first,
|
||||
but that's what the Grammar and NonTerminal classes are for.
|
||||
|
||||
Don't name anything with double-underscores; those are reserved for
|
||||
the generator. Don't add '$' either, as it is reserved to mean
|
||||
|
|
@ -1273,105 +1252,215 @@ class GenerateLR1:
|
|||
self._firsts,
|
||||
)
|
||||
|
||||
def gen_closure(self, seeds: typing.Iterable[Configuration]) -> ConfigSet:
|
||||
"""Compute the closure for the specified configs. The closure is all
|
||||
of the configurations we could be in. Specifically, if the position
|
||||
for a config is just before a non-terminal then we must also consider
|
||||
configurations where the rule is the rule for the non-terminal and
|
||||
the position is just before the beginning of the rule.
|
||||
def gen_sets(self, seeds: list[Configuration]) -> StateGraph:
|
||||
# This function can be seen as a modified version of items() from
|
||||
# Chen's dissertation.
|
||||
#
|
||||
# DOTY: It is also (practically) a converted version from grmtools
|
||||
# into python, more or less verbatim at this point. I have some
|
||||
# sense of what is going on, and attempt to elaborate with
|
||||
# these comments.
|
||||
|
||||
(We have replaced a recursive version with an iterative one.)
|
||||
"""
|
||||
closure: set[Configuration] = set()
|
||||
pending = list(seeds)
|
||||
pending_next = []
|
||||
while len(pending) > 0:
|
||||
for config in pending:
|
||||
if config in closure:
|
||||
# closed_states and core_states are both equally sized vectors of
|
||||
# states. Core states are smaller, and used for the weakly compatible
|
||||
# checks, but we ultimately need to return closed states. Closed
|
||||
# states which are None are those which require processing; thus
|
||||
# closed_states also implicitly serves as a todo list.
|
||||
closed_states: list[ItemSet | None] = []
|
||||
core_states: list[ItemSet] = []
|
||||
edges: list[dict[int, int]] = []
|
||||
|
||||
# Convert the incoming seed configurations into item sets.
|
||||
# TODO: Convert everything to ItemSet natively.
|
||||
state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds})
|
||||
core_states.append(state0)
|
||||
closed_states.append(None)
|
||||
edges.append({})
|
||||
|
||||
# We maintain a set of which rules and tokens we've seen; when
|
||||
# processing a given state there's no point processing a rule or
|
||||
# token more than once.
|
||||
seen: set[int] = set()
|
||||
|
||||
# cnd_[rule|token]_weaklies represent which states are possible weakly
|
||||
# compatible matches for a given symbol.
|
||||
#
|
||||
# DOTY: As with `seen`, we have a uniform space so we can have a
|
||||
# uniform one of these too.
|
||||
cnd_weaklies: list[list[int]] = [[] for _ in range(len(self.alphabet))]
|
||||
|
||||
todo = 1 # How many None values are there in closed_states?
|
||||
todo_off = 0 # Offset in closed states to start searching for the next todo.
|
||||
while todo > 0:
|
||||
assert len(core_states) == len(closed_states)
|
||||
assert len(core_states) == len(edges)
|
||||
|
||||
# state_i is the next item to process. We don't want to
|
||||
# continually search for the next None from the beginning, so we
|
||||
# remember where we last saw a None (todo_off) and search from
|
||||
# that point onwards, wrapping as necessary. Since processing a
|
||||
# state x disproportionately causes state x + 1 to require
|
||||
# processing, this prevents the search from becoming horribly
|
||||
# non-linear.
|
||||
try:
|
||||
state_i = closed_states.index(None, todo_off)
|
||||
except ValueError:
|
||||
state_i = closed_states.index(None) # DOTY: Will not raise, given todo > 0
|
||||
|
||||
todo_off = state_i + 1
|
||||
todo -= 1
|
||||
|
||||
cl_state = self.gen_closure(core_states[state_i])
|
||||
closed_states[state_i] = cl_state
|
||||
|
||||
seen.clear()
|
||||
for core in cl_state.items.keys():
|
||||
sym = core.next
|
||||
if sym is None or sym in seen:
|
||||
continue
|
||||
seen.add(sym)
|
||||
|
||||
nstate = cl_state.goto(sym)
|
||||
|
||||
# Try and find a compatible match for this state.
|
||||
cnd_states = cnd_weaklies[sym]
|
||||
|
||||
# First of all see if any of the candidate states are exactly
|
||||
# the same as the new state, in which case we only need to
|
||||
# add an edge to the candidate state. This isn't just an
|
||||
# optimisation (though it does avoid the expense of change
|
||||
# propagation), but has a correctness aspect: there's no
|
||||
# guarantee that the weakly compatible check is reflexive
|
||||
# (i.e. a state may not be weakly compatible with itself).
|
||||
found = False
|
||||
for cnd in cnd_states:
|
||||
if core_states[cnd] == nstate:
|
||||
edges[state_i][sym] = cnd
|
||||
found = True
|
||||
break
|
||||
|
||||
if found:
|
||||
continue
|
||||
|
||||
closure.add(config)
|
||||
pending_next.extend(self.gen_closure_next(config))
|
||||
# No candidate states were equal to the new state, so we need
|
||||
# to look for a candidate state which is weakly compatible.
|
||||
m: int | None = None
|
||||
for cnd in cnd_states:
|
||||
if core_states[cnd].weakly_compatible(nstate):
|
||||
m = cnd
|
||||
break
|
||||
|
||||
temp = pending
|
||||
pending = pending_next
|
||||
pending_next = temp
|
||||
pending_next.clear()
|
||||
if m is not None:
|
||||
# A weakly compatible match has been found.
|
||||
edges[state_i][sym] = m
|
||||
assert core_states[m].weakly_compatible(nstate) # TODO: REMOVE, TOO SLOW
|
||||
if core_states[m].weakly_merge(nstate):
|
||||
# We only do the simplest change propagation, forcing possibly
|
||||
# affected sets to be entirely reprocessed (which will recursively
|
||||
# force propagation too). Even though this does unnecessary
|
||||
# computation, it is still pretty fast.
|
||||
#
|
||||
# Note also that edges[k] will be completely regenerated, overwriting
|
||||
# all existing entries and possibly adding new ones. We thus don't
|
||||
# need to clear it manually.
|
||||
if closed_states[m] is not None:
|
||||
closed_states[m] = None
|
||||
todo += 1
|
||||
|
||||
# NOTE: The generation of this closure *might* have generated
|
||||
# multiple cores with different lookaheads; if that's
|
||||
# the case we need to merge.
|
||||
merged: dict[ConfigurationCore, set[int]] = {}
|
||||
for c in closure:
|
||||
existing = merged.get(c.core)
|
||||
if existing is not None:
|
||||
existing.update(c.lookahead)
|
||||
else:
|
||||
merged[c.core] = set(c.lookahead)
|
||||
else:
|
||||
stidx = len(core_states)
|
||||
|
||||
return ConfigSet(Configuration(k, tuple(sorted(v))) for k, v in merged.items())
|
||||
cnd_weaklies[sym].append(stidx)
|
||||
edges[state_i][sym] = stidx
|
||||
|
||||
def gen_all_successors(
|
||||
self, config_set: typing.Iterable[Configuration]
|
||||
) -> list[typing.Tuple[int, ConfigSet]]:
|
||||
"""Return all of the non-empty successors for the given config set.
|
||||
edges.append({})
|
||||
closed_states.append(None)
|
||||
core_states.append(nstate)
|
||||
todo += 1
|
||||
|
||||
(That is, given the config set, pretend we see all the symbols we
|
||||
could possibly see, and figure out which configs sets we get from
|
||||
those symbols. Those are the successors of this set.)
|
||||
"""
|
||||
possible = {config.core.next for config in config_set if config.core.next is not None}
|
||||
# Although the Pager paper doesn't talk about it, the algorithm above
|
||||
# can create unreachable states due to the non-determinism inherent
|
||||
# in working with hashsets. Indeed, this can even happen with the
|
||||
# example from Pager's paper (on perhaps 1 out of 100 runs, 24 or 25
|
||||
# states will be created instead of 23). We thus need to weed out
|
||||
# unreachable states and update edges accordingly.
|
||||
assert len(core_states) == len(closed_states)
|
||||
|
||||
next = []
|
||||
for symbol in possible:
|
||||
seeds = ConfigSet(
|
||||
config.replace_position(config.core.position + 1)
|
||||
for config in config_set
|
||||
if config.core.next == symbol
|
||||
)
|
||||
if len(seeds) > 0:
|
||||
next.append((symbol, seeds))
|
||||
all_states = []
|
||||
for core_state, closed_state in zip(core_states, closed_states):
|
||||
assert closed_state is not None
|
||||
all_states.append((core_state, closed_state))
|
||||
gc_states, gc_edges = self.gc(all_states, edges)
|
||||
|
||||
return next
|
||||
# DOTY: UGH this is so bad, we should rewrite to use ItemSet everywhere
|
||||
# probably, which actually means getting rid of the pluggable
|
||||
# generator because who actually needs that?
|
||||
|
||||
def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo:
|
||||
"""Generate all configuration sets starting from the provided seeds."""
|
||||
result = ConfigurationSetInfo()
|
||||
# Register all the actually merged, final config sets. I should *not*
|
||||
# have to do all this work. Really really garbage.
|
||||
return StateGraph(
|
||||
closures=[closed_state.to_config_set() for _, closed_state in gc_states],
|
||||
successors=gc_edges,
|
||||
)
|
||||
|
||||
successors = []
|
||||
pending = [ConfigSet(seeds)]
|
||||
pending_next = []
|
||||
while len(pending) > 0:
|
||||
for core in pending:
|
||||
id, is_new = result.register_core(core)
|
||||
if is_new:
|
||||
config_set = self.gen_closure(core)
|
||||
result.register_config_closure(id, config_set)
|
||||
for symbol, successor in self.gen_all_successors(config_set):
|
||||
successors.append((id, symbol, successor))
|
||||
pending_next.append(successor)
|
||||
def gc(
|
||||
self,
|
||||
states: list[tuple[ItemSet, ItemSet]],
|
||||
edges: list[dict[int, int]],
|
||||
) -> tuple[list[tuple[ItemSet, ItemSet]], list[dict[int, int]]]:
|
||||
# First of all, do a simple pass over all states. All state indexes
|
||||
# reachable from the start state will be inserted into the 'seen'
|
||||
# set.
|
||||
todo = [0]
|
||||
seen = set()
|
||||
while len(todo) > 0:
|
||||
item = todo.pop()
|
||||
if item in seen:
|
||||
continue
|
||||
seen.add(item)
|
||||
todo.extend(e for e in edges[item].values() if e not in seen)
|
||||
|
||||
temp = pending
|
||||
pending = pending_next
|
||||
pending_next = temp
|
||||
pending_next.clear()
|
||||
if len(seen) == len(states):
|
||||
# Every state is reachable.
|
||||
return states, edges
|
||||
|
||||
for id, symbol, successor in successors:
|
||||
result.add_successor(id, symbol, result.core_key[successor])
|
||||
# Imagine we started with 3 states and their edges:
|
||||
# states: [0, 1, 2]
|
||||
# edges : [[_ => 2]]
|
||||
#
|
||||
# At this point, 'seen' will be the set {0, 2}. What we need to do is
|
||||
# to create a new list of states that doesn't have state 1 in it.
|
||||
# That will cause state 2 to become to state 1, meaning that we need
|
||||
# to adjust edges so that the pointer to state 2 is updated to state
|
||||
# 1. In other words we want to achieve this output:
|
||||
#
|
||||
# states: [0, 2]
|
||||
# edges : [_ => 1]
|
||||
#
|
||||
# The way we do this is to first iterate over all states, working out
|
||||
# what the mapping from seen states to their new offsets is.
|
||||
gc_states: list[tuple[ItemSet, ItemSet]] = []
|
||||
offsets: list[int] = []
|
||||
offset = 0
|
||||
for state_i, zstate in enumerate(states):
|
||||
offsets.append(state_i - offset)
|
||||
if state_i not in seen:
|
||||
offset += 1
|
||||
continue
|
||||
|
||||
return result
|
||||
gc_states.append(zstate)
|
||||
|
||||
def gen_follow(self, symbol: int) -> set[int]:
|
||||
"""Generate the follow set for the given nonterminal.
|
||||
# At this point the offsets list will be [0, 1, 1]. We now create new
|
||||
# edges where each offset is corrected by looking it up in the
|
||||
# offsets list.
|
||||
gc_edges: list[dict[int, int]] = []
|
||||
for st_edge_i, st_edges in enumerate(edges):
|
||||
if st_edge_i not in seen:
|
||||
continue
|
||||
|
||||
The follow set for a nonterminal is the set of terminals that can
|
||||
follow the nonterminal in a valid sentence. The resulting set never
|
||||
contains epsilon and is never empty, since we should always at least
|
||||
ground out at '$', which is the end-of-stream marker.
|
||||
gc_edges.append({k: offsets[v] for k, v in st_edges.items()})
|
||||
|
||||
See FollowInfo for more information on how this is determined.
|
||||
"""
|
||||
return self._follows.follows[symbol]
|
||||
return (gc_states, gc_edges)
|
||||
|
||||
def gen_first(self, symbols: typing.Iterable[int]) -> typing.Tuple[set[int], bool]:
|
||||
"""Return the first set for a *sequence* of symbols.
|
||||
|
|
@ -1394,45 +1483,15 @@ class GenerateLR1:
|
|||
|
||||
return (result, True)
|
||||
|
||||
def gen_reduce_set(self, config: Configuration) -> typing.Iterable[int]:
|
||||
"""Return the set of symbols that indicate we should reduce the given
|
||||
config.
|
||||
def gen_closure(self, items: ItemSet) -> ItemSet:
|
||||
"""Generate the closure of the given ItemSet.
|
||||
|
||||
In an LR1 parser, this is the lookahead of the configuration.
|
||||
Some of the configurations the ItemSet might be positioned right before
|
||||
nonterminals. In that case, obviously, we should *also* behave as if we
|
||||
were right at the beginning of each production for that nonterminal. The
|
||||
set of all those productions combined with all the incoming productions
|
||||
is the closure.
|
||||
"""
|
||||
return config.lookahead
|
||||
|
||||
def gen_closure_next(self, config: Configuration):
|
||||
"""Return the next set of configurations in the closure for config.
|
||||
|
||||
In LR1 parsers, we must compute the lookahead for the configurations
|
||||
we're adding to the closure. The lookahead for the new configurations
|
||||
is the first() of the rest of this config's production. If that
|
||||
contains epsilon, then the lookahead *also* contains the lookahead we
|
||||
already have. (This lookahead was presumably generated by the same
|
||||
process, so in some sense it is a 'parent' lookahead, or a lookahead
|
||||
from an upstream production in the grammar.)
|
||||
|
||||
(See the documentation in GenerateLR0 for more information on how
|
||||
this function fits into the whole process, specifically `gen_closure`.)
|
||||
"""
|
||||
config_next = config.core.next
|
||||
if config_next is None:
|
||||
return ()
|
||||
else:
|
||||
lookahead, epsilon = self.gen_first(config.rest)
|
||||
if epsilon:
|
||||
lookahead.update(config.lookahead)
|
||||
lookahead_tuple = tuple(sorted(lookahead))
|
||||
|
||||
next = []
|
||||
for rule in self.grammar[config_next]:
|
||||
rr = Configuration.from_rule(config_next, rule, lookahead=lookahead_tuple)
|
||||
next.append(rr)
|
||||
|
||||
return tuple(next)
|
||||
|
||||
def gen_closure_x(self, items: ItemSet) -> ItemSet:
|
||||
closure: dict[ConfigurationCore, set[int]] = {}
|
||||
|
||||
# We're going to maintain a set of things to look at, rules that we
|
||||
|
|
@ -1524,7 +1583,7 @@ class GenerateLR1:
|
|||
config_next = config.core.next
|
||||
if config_next is None:
|
||||
if config.core.name != self.start_symbol:
|
||||
for a in self.gen_reduce_set(config):
|
||||
for a in config.lookahead:
|
||||
builder.set_table_reduce(a, config)
|
||||
else:
|
||||
builder.set_table_accept(self.end_symbol, config)
|
||||
|
|
@ -1541,249 +1600,6 @@ class GenerateLR1:
|
|||
return builder.flush(config_sets)
|
||||
|
||||
|
||||
class GeneratePager(GenerateLR1):
|
||||
"""Pager's algorithm.
|
||||
|
||||
I'll be honest, I don't understand this one as well as the pure LR1
|
||||
algorithm. It proceeds as LR1, generating successor states, but every
|
||||
time it makes a new state it searches the states it has already made for
|
||||
one that is "weakly compatible;" if it finds one it merges the new state
|
||||
with the old state and marks the old state to be re-visited.
|
||||
|
||||
The implementation here follows from the implementation in
|
||||
`GRMTools<https://github.com/softdevteam/grmtools/blob/master/lrtable/src/lib/pager.rs>`_.
|
||||
|
||||
As they explain there:
|
||||
|
||||
> The general algorithms that form the basis of what's used in this file
|
||||
> can be found in:
|
||||
>
|
||||
> A Practical General Method for Constructing LR(k) Parsers
|
||||
> David Pager, Acta Informatica 7, 249--268, 1977
|
||||
>
|
||||
> However Pager's paper is dense, and doesn't name sub-parts of the
|
||||
> algorithm. We mostly reference the (still incomplete, but less
|
||||
> incomplete) version of the algorithm found in:
|
||||
>
|
||||
> Measuring and extending LR(1) parser generation
|
||||
> Xin Chen, PhD thesis, University of Hawaii, 2009
|
||||
"""
|
||||
|
||||
def gen_sets(self, seeds: list[Configuration]) -> ConfigurationSetInfo:
|
||||
# This function can be seen as a modified version of items() from
|
||||
# Chen's dissertation.
|
||||
#
|
||||
# DOTY: It is also (practically) a converted version from grmtools
|
||||
# into python, more or less verbatim at this point. I have some
|
||||
# sense of what is going on, and attempt to elaborate with
|
||||
# these comments.
|
||||
|
||||
# closed_states and core_states are both equally sized vectors of
|
||||
# states. Core states are smaller, and used for the weakly compatible
|
||||
# checks, but we ultimately need to return closed states. Closed
|
||||
# states which are None are those which require processing; thus
|
||||
# closed_states also implicitly serves as a todo list.
|
||||
closed_states: list[ItemSet | None] = []
|
||||
core_states: list[ItemSet] = []
|
||||
edges: list[dict[int, int]] = []
|
||||
|
||||
# Convert the incoming seed configurations into item sets.
|
||||
# TODO: Convert everything to ItemSet natively.
|
||||
state0 = ItemSet({seed.core: set(seed.lookahead) for seed in seeds})
|
||||
core_states.append(state0)
|
||||
closed_states.append(None)
|
||||
edges.append({})
|
||||
|
||||
# We maintain a set of which rules and tokens we've seen; when
|
||||
# processing a given state there's no point processing a rule or
|
||||
# token more than once.
|
||||
seen: set[int] = set()
|
||||
|
||||
# cnd_[rule|token]_weaklies represent which states are possible weakly
|
||||
# compatible matches for a given symbol.
|
||||
#
|
||||
# DOTY: As with `seen`, we have a uniform space so we can have a
|
||||
# uniform one of these too.
|
||||
cnd_weaklies: list[list[int]] = [[] for _ in range(len(self.alphabet))]
|
||||
|
||||
todo = 1 # How many None values are there in closed_states?
|
||||
todo_off = 0 # Offset in closed states to start searching for the next todo.
|
||||
while todo > 0:
|
||||
assert len(core_states) == len(closed_states)
|
||||
assert len(core_states) == len(edges)
|
||||
|
||||
# state_i is the next item to process. We don't want to
|
||||
# continually search for the next None from the beginning, so we
|
||||
# remember where we last saw a None (todo_off) and search from
|
||||
# that point onwards, wrapping as necessary. Since processing a
|
||||
# state x disproportionately causes state x + 1 to require
|
||||
# processing, this prevents the search from becoming horribly
|
||||
# non-linear.
|
||||
try:
|
||||
state_i = closed_states.index(None, todo_off)
|
||||
except ValueError:
|
||||
state_i = closed_states.index(None) # DOTY: Will not raise, given todo > 0
|
||||
|
||||
todo_off = state_i + 1
|
||||
todo -= 1
|
||||
|
||||
cl_state = self.gen_closure_x(core_states[state_i])
|
||||
closed_states[state_i] = cl_state
|
||||
|
||||
seen.clear()
|
||||
for core in cl_state.items.keys():
|
||||
sym = core.next
|
||||
if sym is None or sym in seen:
|
||||
continue
|
||||
seen.add(sym)
|
||||
|
||||
nstate = cl_state.goto(sym)
|
||||
|
||||
# Try and find a compatible match for this state.
|
||||
cnd_states = cnd_weaklies[sym]
|
||||
|
||||
# First of all see if any of the candidate states are exactly
|
||||
# the same as the new state, in which case we only need to
|
||||
# add an edge to the candidate state. This isn't just an
|
||||
# optimisation (though it does avoid the expense of change
|
||||
# propagation), but has a correctness aspect: there's no
|
||||
# guarantee that the weakly compatible check is reflexive
|
||||
# (i.e. a state may not be weakly compatible with itself).
|
||||
found = False
|
||||
for cnd in cnd_states:
|
||||
if core_states[cnd] == nstate:
|
||||
edges[state_i][sym] = cnd
|
||||
found = True
|
||||
break
|
||||
|
||||
if found:
|
||||
continue
|
||||
|
||||
# No candidate states were equal to the new state, so we need
|
||||
# to look for a candidate state which is weakly compatible.
|
||||
m: int | None = None
|
||||
for cnd in cnd_states:
|
||||
if core_states[cnd].weakly_compatible(nstate):
|
||||
m = cnd
|
||||
break
|
||||
|
||||
if m is not None:
|
||||
# A weakly compatible match has been found.
|
||||
edges[state_i][sym] = m
|
||||
assert core_states[m].weakly_compatible(nstate) # TODO: REMOVE, TOO SLOW
|
||||
if core_states[m].weakly_merge(nstate):
|
||||
# We only do the simplest change propagation, forcing possibly
|
||||
# affected sets to be entirely reprocessed (which will recursively
|
||||
# force propagation too). Even though this does unnecessary
|
||||
# computation, it is still pretty fast.
|
||||
#
|
||||
# Note also that edges[k] will be completely regenerated, overwriting
|
||||
# all existing entries and possibly adding new ones. We thus don't
|
||||
# need to clear it manually.
|
||||
if closed_states[m] is not None:
|
||||
closed_states[m] = None
|
||||
todo += 1
|
||||
|
||||
else:
|
||||
stidx = len(core_states)
|
||||
|
||||
cnd_weaklies[sym].append(stidx)
|
||||
edges[state_i][sym] = stidx
|
||||
|
||||
edges.append({})
|
||||
closed_states.append(None)
|
||||
core_states.append(nstate)
|
||||
todo += 1
|
||||
|
||||
# Although the Pager paper doesn't talk about it, the algorithm above
|
||||
# can create unreachable states due to the non-determinism inherent
|
||||
# in working with hashsets. Indeed, this can even happen with the
|
||||
# example from Pager's paper (on perhaps 1 out of 100 runs, 24 or 25
|
||||
# states will be created instead of 23). We thus need to weed out
|
||||
# unreachable states and update edges accordingly.
|
||||
assert len(core_states) == len(closed_states)
|
||||
|
||||
all_states = []
|
||||
for core_state, closed_state in zip(core_states, closed_states):
|
||||
assert closed_state is not None
|
||||
all_states.append((core_state, closed_state))
|
||||
gc_states, gc_edges = self.gc(all_states, edges)
|
||||
|
||||
# DOTY: UGH this is so bad, we should rewrite to use ItemSet everywehre
|
||||
# probably, which actually means getting rid of the pluggable
|
||||
# generator because who actually needs that?
|
||||
|
||||
# Register all the actually merged, final config sets. I should *not*
|
||||
# have to do all this work. Really really garbage.
|
||||
result = ConfigurationSetInfo()
|
||||
result.sets = [core_state.to_config_set() for core_state, _ in gc_states]
|
||||
result.core_key = {s: i for i, s in enumerate(result.sets)}
|
||||
result.closures = [closed_state.to_config_set() for _, closed_state in gc_states]
|
||||
result.config_set_key = {s: i for i, s in enumerate(result.closures) if s is not None}
|
||||
result.successors = gc_edges
|
||||
|
||||
return result
|
||||
|
||||
def gc(
|
||||
self,
|
||||
states: list[tuple[ItemSet, ItemSet]],
|
||||
edges: list[dict[int, int]],
|
||||
) -> tuple[list[tuple[ItemSet, ItemSet]], list[dict[int, int]]]:
|
||||
# First of all, do a simple pass over all states. All state indexes
|
||||
# reachable from the start state will be inserted into the 'seen'
|
||||
# set.
|
||||
todo = [0]
|
||||
seen = set()
|
||||
while len(todo) > 0:
|
||||
item = todo.pop()
|
||||
if item in seen:
|
||||
continue
|
||||
seen.add(item)
|
||||
todo.extend(e for e in edges[item].values() if e not in seen)
|
||||
|
||||
if len(seen) == len(states):
|
||||
# Every state is reachable.
|
||||
return states, edges
|
||||
|
||||
# Imagine we started with 3 states and their edges:
|
||||
# states: [0, 1, 2]
|
||||
# edges : [[_ => 2]]
|
||||
#
|
||||
# At this point, 'seen' will be the set {0, 2}. What we need to do is
|
||||
# to create a new list of states that doesn't have state 1 in it.
|
||||
# That will cause state 2 to become to state 1, meaning that we need
|
||||
# to adjust edges so that the pointer to state 2 is updated to state
|
||||
# 1. In other words we want to achieve this output:
|
||||
#
|
||||
# states: [0, 2]
|
||||
# edges : [_ => 1]
|
||||
#
|
||||
# The way we do this is to first iterate over all states, working out
|
||||
# what the mapping from seen states to their new offsets is.
|
||||
gc_states: list[tuple[ItemSet, ItemSet]] = []
|
||||
offsets: list[int] = []
|
||||
offset = 0
|
||||
for state_i, zstate in enumerate(states):
|
||||
offsets.append(state_i - offset)
|
||||
if state_i not in seen:
|
||||
offset += 1
|
||||
continue
|
||||
|
||||
gc_states.append(zstate)
|
||||
|
||||
# At this point the offsets list will be [0, 1, 1]. We now create new
|
||||
# edges where each offset is corrected by looking it up in the
|
||||
# offsets list.
|
||||
gc_edges: list[dict[int, int]] = []
|
||||
for st_edge_i, st_edges in enumerate(edges):
|
||||
if st_edge_i not in seen:
|
||||
continue
|
||||
|
||||
gc_edges.append({k: offsets[v] for k, v in st_edges.items()})
|
||||
|
||||
return (gc_states, gc_edges)
|
||||
|
||||
|
||||
# Recursive type alias: a list whose elements are each a str, a Terminal, or a
# (dict[str, Any], nested FlattenedWithMetadata) pair. The element type is
# spelled as a string so that the self-reference is not evaluated at the point
# where the alias is defined.
# NOTE(review): the dict is presumably the "metadata" the name refers to --
# confirm against the code that builds these lists.
FlattenedWithMetadata = list["str|Terminal|tuple[dict[str,typing.Any],FlattenedWithMetadata]"]
|
||||
|
||||
|
||||
|
|
@ -3009,7 +2825,7 @@ class Grammar:
|
|||
"""
|
||||
|
||||
_precedence: dict[str, typing.Tuple[Assoc, int]]
|
||||
_generator: type[GenerateLR1]
|
||||
_generator: type[ParserGenerator]
|
||||
_terminals: dict[str, Terminal]
|
||||
_nonterminals: dict[str, NonTerminal]
|
||||
_trivia: list[Terminal]
|
||||
|
|
@ -3018,7 +2834,7 @@ class Grammar:
|
|||
self,
|
||||
start: str | NonTerminal | None = None,
|
||||
precedence: PrecedenceList | None = None,
|
||||
generator: type[GenerateLR1] | None = None,
|
||||
generator: type[ParserGenerator] | None = None,
|
||||
trivia: list[str | Terminal] | None = None,
|
||||
name: str | None = None,
|
||||
):
|
||||
|
|
@ -3037,7 +2853,7 @@ class Grammar:
|
|||
assert precedence is not None
|
||||
|
||||
if generator is None:
|
||||
generator = getattr(self, "generator", GeneratePager)
|
||||
generator = getattr(self, "generator", ParserGenerator)
|
||||
assert generator is not None
|
||||
|
||||
if trivia is None:
|
||||
|
|
|
|||
|
|
@ -87,8 +87,8 @@ def test_all_generators():
|
|||
|
||||
GENERATORS = [
|
||||
# parser.GenerateLR0,
|
||||
parser.GeneratePager,
|
||||
parser.GenerateLR1,
|
||||
# parser.GeneratePager,
|
||||
parser.ParserGenerator,
|
||||
]
|
||||
for generator in GENERATORS:
|
||||
table = G().build_table(generator=generator)
|
||||
|
|
@ -119,15 +119,14 @@ def test_grammar_aho_ullman_2():
|
|||
A = Terminal("a")
|
||||
B = Terminal("b")
|
||||
|
||||
TestGrammar().build_table(generator=parser.GenerateLR1)
|
||||
TestGrammar().build_table(generator=parser.GeneratePager)
|
||||
TestGrammar().build_table(generator=parser.ParserGenerator)
|
||||
# TestGrammar().build_table(generator=parser.GeneratePager)
|
||||
|
||||
|
||||
def test_fun_lalr():
|
||||
|
||||
class TestGrammar(Grammar):
|
||||
start = "S"
|
||||
generator = parser.GeneratePager
|
||||
|
||||
@rule
|
||||
def S(self):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue