Vendor things
This commit is contained in:
parent
5deceec006
commit
977e3c17e5
19434 changed files with 10682014 additions and 0 deletions
599
third-party/vendor/regex-automata/src/dfa/determinize.rs
vendored
Normal file
599
third-party/vendor/regex-automata/src/dfa/determinize.rs
vendored
Normal file
|
|
@ -0,0 +1,599 @@
|
|||
use alloc::{collections::BTreeMap, vec::Vec};
|
||||
|
||||
use crate::{
|
||||
dfa::{
|
||||
dense::{self, BuildError},
|
||||
DEAD,
|
||||
},
|
||||
nfa::thompson,
|
||||
util::{
|
||||
self,
|
||||
alphabet::{self, ByteSet},
|
||||
determinize::{State, StateBuilderEmpty, StateBuilderNFA},
|
||||
primitives::{PatternID, StateID},
|
||||
search::{Anchored, MatchKind},
|
||||
sparse_set::SparseSets,
|
||||
start::Start,
|
||||
},
|
||||
};
|
||||
|
||||
/// A builder for configuring and running a DFA determinizer.
#[derive(Clone, Debug)]
pub(crate) struct Config {
    /// The match semantics to build the DFA with. Defaults to
    /// `MatchKind::LeftmostFirst`. See `Config::match_kind`.
    match_kind: MatchKind,
    /// The set of bytes that cause the DFA to enter a quit state, stop
    /// searching and return an error. Empty by default. See `Config::quit`.
    quit: ByteSet,
    /// An optional limit, in bytes of the heap, on the size of the DFA being
    /// built. `None` means no limit.
    dfa_size_limit: Option<usize>,
    /// An optional limit, in bytes of the heap, on the auxiliary storage used
    /// by determinization itself (not counting the DFA). `None` means no
    /// limit.
    determinize_size_limit: Option<usize>,
}
|
||||
|
||||
impl Config {
|
||||
/// Create a new default config for a determinizer. The determinizer may be
|
||||
/// configured before calling `run`.
|
||||
pub fn new() -> Config {
|
||||
Config {
|
||||
match_kind: MatchKind::LeftmostFirst,
|
||||
quit: ByteSet::empty(),
|
||||
dfa_size_limit: None,
|
||||
determinize_size_limit: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Run determinization on the given NFA and write the resulting DFA into
|
||||
/// the one given. The DFA given should be initialized but otherwise empty.
|
||||
/// "Initialized" means that it is setup to handle the NFA's byte classes,
|
||||
/// number of patterns and whether to build start states for each pattern.
|
||||
pub fn run(
|
||||
&self,
|
||||
nfa: &thompson::NFA,
|
||||
dfa: &mut dense::OwnedDFA,
|
||||
) -> Result<(), BuildError> {
|
||||
let dead = State::dead();
|
||||
let quit = State::dead();
|
||||
let mut cache = StateMap::default();
|
||||
// We only insert the dead state here since its representation is
|
||||
// identical to the quit state. And we never want anything pointing
|
||||
// to the quit state other than specific transitions derived from the
|
||||
// determinizer's configured "quit" bytes.
|
||||
//
|
||||
// We do put the quit state into 'builder_states' below. This ensures
|
||||
// that a proper DFA state ID is allocated for it, and that no other
|
||||
// DFA state uses the "location after the DEAD state." That is, it
|
||||
// is assumed that the quit state is always the state immediately
|
||||
// following the DEAD state.
|
||||
cache.insert(dead.clone(), DEAD);
|
||||
|
||||
let runner = Runner {
|
||||
config: self.clone(),
|
||||
nfa,
|
||||
dfa,
|
||||
builder_states: alloc::vec![dead, quit],
|
||||
cache,
|
||||
memory_usage_state: 0,
|
||||
sparses: SparseSets::new(nfa.states().len()),
|
||||
stack: alloc::vec![],
|
||||
scratch_state_builder: StateBuilderEmpty::new(),
|
||||
};
|
||||
runner.run()
|
||||
}
|
||||
|
||||
/// The match semantics to use for determinization.
|
||||
///
|
||||
/// MatchKind::All corresponds to the standard textbook construction.
|
||||
/// All possible match states are represented in the DFA.
|
||||
/// MatchKind::LeftmostFirst permits greediness and otherwise tries to
|
||||
/// simulate the match semantics of backtracking regex engines. Namely,
|
||||
/// only a subset of match states are built, and dead states are used to
|
||||
/// stop searches with an unanchored prefix.
|
||||
///
|
||||
/// The default is MatchKind::LeftmostFirst.
|
||||
pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
|
||||
self.match_kind = kind;
|
||||
self
|
||||
}
|
||||
|
||||
/// The set of bytes to use that will cause the DFA to enter a quit state,
|
||||
/// stop searching and return an error. By default, this is empty.
|
||||
pub fn quit(&mut self, set: ByteSet) -> &mut Config {
|
||||
self.quit = set;
|
||||
self
|
||||
}
|
||||
|
||||
/// The limit, in bytes of the heap, that the DFA is permitted to use. This
|
||||
/// does not include the auxiliary heap storage used by determinization.
|
||||
pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config {
|
||||
self.dfa_size_limit = bytes;
|
||||
self
|
||||
}
|
||||
|
||||
/// The limit, in bytes of the heap, that determinization itself is allowed
|
||||
/// to use. This does not include the size of the DFA being built.
|
||||
pub fn determinize_size_limit(
|
||||
&mut self,
|
||||
bytes: Option<usize>,
|
||||
) -> &mut Config {
|
||||
self.determinize_size_limit = bytes;
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
/// The actual implementation of determinization that converts an NFA to a DFA
/// through powerset construction.
///
/// This determinizer roughly follows the typical powerset construction, where
/// each DFA state is comprised of one or more NFA states. In the worst case,
/// there is one DFA state for every possible combination of NFA states. In
/// practice, this only happens in certain conditions, typically when there are
/// bounded repetitions.
///
/// The main differences between this implementation and typical determinization
/// are that this implementation delays matches by one state and hackily makes
/// look-around work. Comments below attempt to explain this.
///
/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
/// whichever is shorter.
#[derive(Debug)]
struct Runner<'a> {
    /// The configuration used to initialize determinization.
    config: Config,
    /// The NFA we're converting into a DFA.
    nfa: &'a thompson::NFA,
    /// The DFA we're building.
    dfa: &'a mut dense::OwnedDFA,
    /// Each DFA state being built is defined as an *ordered* set of NFA
    /// states, along with some meta facts about the ordered set of NFA states.
    ///
    /// This is never empty. The first state is always a dummy state such that
    /// a state id == 0 corresponds to a dead state. The second state is always
    /// the quit state.
    ///
    /// Why do we have states in both a `Vec` and in a cache map below?
    /// Well, they serve two different roles based on access patterns.
    /// `builder_states` is the canonical home of each state, and provides
    /// constant random access by a DFA state's ID. The cache map below, on
    /// the other hand, provides a quick way of searching for identical DFA
    /// states by using the DFA state as a key in the map. Of course, we use
    /// reference counting to avoid actually duplicating the state's data
    /// itself. (Although this has never been benchmarked.) Note that the cache
    /// map does not give us full minimization; it just lets us avoid some very
    /// obvious redundant states.
    ///
    /// Note that the index into this Vec isn't quite the DFA's state ID.
    /// Rather, it's just an index. To get the state ID, you have to multiply
    /// it by the DFA's stride. That's done by self.dfa.from_index. And the
    /// inverse is self.dfa.to_index.
    ///
    /// Moreover, DFA states don't usually retain the IDs assigned to them
    /// by their position in this Vec. After determinization completes,
    /// states are shuffled around to support other optimizations. See the
    /// sibling 'special' module for more details on that. (The reason for
    /// mentioning this is that if you print out the DFA for debugging during
    /// determinization, and then print out the final DFA after it is fully
    /// built, then the state IDs likely won't match up.)
    builder_states: Vec<State>,
    /// A cache of DFA states that already exist and can be easily looked up
    /// via ordered sets of NFA states.
    ///
    /// See `builder_states` docs for why we store states in two different
    /// ways.
    cache: StateMap,
    /// The memory usage, in bytes, used by builder_states and cache. We track
    /// this as new states are added since states use a variable amount of
    /// heap. Tracking this as we add states makes it possible to compute the
    /// total amount of memory used by the determinizer in constant time.
    memory_usage_state: usize,
    /// A pair of sparse sets for tracking ordered sets of NFA state IDs.
    /// These are reused throughout determinization. A bounded sparse set
    /// gives us constant time insertion, membership testing and clearing.
    sparses: SparseSets,
    /// Scratch space for a stack of NFA states to visit, for depth first
    /// visiting without recursion.
    stack: Vec<StateID>,
    /// Scratch space for storing an ordered sequence of NFA states, for
    /// amortizing allocation. This is principally useful for when we avoid
    /// adding a new DFA state since it already exists. In order to detect this
    /// case though, we still need an ordered set of NFA state IDs. So we use
    /// this space to stage that ordered set before we know whether we need to
    /// create a new DFA state or not.
    scratch_state_builder: StateBuilderEmpty,
}
|
||||
|
||||
/// A map from states to state identifiers. When using std, we use a standard
/// hashmap, since it's a bit faster for this use case. (Other maps, like
/// ones based on FNV, have not yet been benchmarked.)
///
/// The main purpose of this map is to reuse states where possible. This won't
/// fully minimize the DFA, but it works well in a lot of cases.
#[cfg(feature = "std")]
type StateMap = std::collections::HashMap<State, StateID>;
#[cfg(not(feature = "std"))]
type StateMap = BTreeMap<State, StateID>;
|
||||
|
||||
impl<'a> Runner<'a> {
    /// Build the DFA. If there was a problem constructing the DFA (e.g., if
    /// the chosen state identifier representation is too small), then an error
    /// is returned.
    fn run(mut self) -> Result<(), BuildError> {
        // Unicode word boundaries cannot be handled by a byte-oriented DFA
        // unless all non-ASCII bytes are quit bytes.
        if self.nfa.look_set_any().contains_word_unicode()
            && !self.config.quit.contains_range(0x80, 0xFF)
        {
            return Err(BuildError::unsupported_dfa_word_boundary_unicode());
        }

        // A sequence of "representative" bytes drawn from each equivalence
        // class. These representative bytes are fed to the NFA to compute
        // state transitions. This allows us to avoid re-computing state
        // transitions for bytes that are guaranteed to produce identical
        // results. Since computing the representatives needs to do a little
        // work, we do it once here because we'll be iterating over them a lot.
        let representatives: Vec<alphabet::Unit> =
            self.dfa.byte_classes().representatives(..).collect();
        // The set of all DFA state IDs that still need to have their
        // transitions set. We start by seeding this with all starting states.
        let mut uncompiled = alloc::vec![];
        self.add_all_starts(&mut uncompiled)?;
        while let Some(dfa_id) = uncompiled.pop() {
            for &unit in &representatives {
                // Quit bytes keep their default transition (to the quit
                // state, set in 'add_state'), so skip computing one.
                if unit.as_u8().map_or(false, |b| self.config.quit.contains(b))
                {
                    continue;
                }
                // In many cases, the state we transition to has already been
                // computed. 'cached_state' will do the minimal amount of work
                // to check this, and if it exists, immediately return an
                // already existing state ID.
                let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
                self.dfa.set_transition(dfa_id, unit, next_dfa_id);
                // If the state ID we got back is newly created, then we need
                // to compile it, so add it to our uncompiled frontier.
                if is_new {
                    uncompiled.push(next_dfa_id);
                }
            }
        }
        debug!(
            "determinization complete, memory usage: {}, \
             dense DFA size: {}, \
             is reverse? {}",
            self.memory_usage(),
            self.dfa.memory_usage(),
            self.nfa.is_reverse(),
        );

        // A map from DFA state ID to one or more NFA match IDs. Each NFA match
        // ID corresponds to a distinct regex pattern that matches in the state
        // corresponding to the key.
        let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
        self.cache.clear();
        #[cfg(feature = "logging")]
        let mut total_pat_len = 0;
        for (i, state) in self.builder_states.into_iter().enumerate() {
            if let Some(pat_ids) = state.match_pattern_ids() {
                let id = self.dfa.to_state_id(i);
                log! {
                    total_pat_len += pat_ids.len();
                }
                matches.insert(id, pat_ids);
            }
        }
        log! {
            use core::mem::size_of;
            let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
            let pats = total_pat_len * size_of::<PatternID>();
            let mem = (matches.len() * per_elem) + pats;
            log::debug!("matches map built, memory usage: {}", mem);
        }
        // At this point, we shuffle the "special" states in the final DFA.
        // This permits a DFA's match loop to detect a match condition (among
        // other things) by merely inspecting the current state's identifier,
        // and avoids the need for any additional auxiliary storage.
        self.dfa.shuffle(matches)?;
        Ok(())
    }

    /// Return the identifier for the next DFA state given an existing DFA
    /// state and an input byte. If the next DFA state already exists, then
    /// return its identifier from the cache. Otherwise, build the state, cache
    /// it and return its identifier.
    ///
    /// This routine returns a boolean indicating whether a new state was
    /// built. If a new state is built, then the caller needs to add it to its
    /// frontier of uncompiled DFA states to compute transitions for.
    fn cached_state(
        &mut self,
        dfa_id: StateID,
        unit: alphabet::Unit,
    ) -> Result<(StateID, bool), BuildError> {
        // Compute the set of all reachable NFA states, including epsilons.
        let empty_builder = self.get_state_builder();
        let builder = util::determinize::next(
            self.nfa,
            self.config.match_kind,
            &mut self.sparses,
            &mut self.stack,
            &self.builder_states[self.dfa.to_index(dfa_id)],
            unit,
            empty_builder,
        );
        self.maybe_add_state(builder)
    }

    /// Compute the set of DFA start states and add their identifiers in
    /// 'dfa_state_ids' (no duplicates are added).
    fn add_all_starts(
        &mut self,
        dfa_state_ids: &mut Vec<StateID>,
    ) -> Result<(), BuildError> {
        // These should be the first states added.
        assert!(dfa_state_ids.is_empty());
        // We only want to add (un)anchored starting states that are consistent
        // with our DFA's configuration. Unconditionally adding both (although
        // it is the default) can make DFAs quite a bit bigger.
        if self.dfa.start_kind().has_unanchored() {
            self.add_start_group(Anchored::No, dfa_state_ids)?;
        }
        if self.dfa.start_kind().has_anchored() {
            self.add_start_group(Anchored::Yes, dfa_state_ids)?;
        }
        // I previously had an 'assert' here checking that either
        // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it
        // turns out this isn't always true. For example, the NFA might have
        // one or more patterns but where all such patterns are just 'fail'
        // states. These will ultimately just compile down to DFA dead states,
        // and since the dead state was added earlier, no new DFA states are
        // added. And thus, it is valid and okay for 'dfa_state_ids' to be
        // empty even if there are a non-zero number of patterns in the NFA.

        // We only need to compute anchored start states for each pattern if it
        // was requested to do so.
        if self.dfa.starts_for_each_pattern() {
            for pid in self.nfa.patterns() {
                self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?;
            }
        }
        Ok(())
    }

    /// Add a group of start states for the given match pattern ID. Any new
    /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are
    /// pushed.)
    ///
    /// When pattern_id is None, then this will compile a group of unanchored
    /// start states (if the DFA is unanchored). When the pattern_id is
    /// present, then this will compile a group of anchored start states that
    /// only match the given pattern.
    ///
    /// This panics if `anchored` corresponds to an invalid pattern ID.
    fn add_start_group(
        &mut self,
        anchored: Anchored,
        dfa_state_ids: &mut Vec<StateID>,
    ) -> Result<(), BuildError> {
        let nfa_start = match anchored {
            Anchored::No => self.nfa.start_unanchored(),
            Anchored::Yes => self.nfa.start_anchored(),
            Anchored::Pattern(pid) => {
                self.nfa.start_pattern(pid).expect("valid pattern ID")
            }
        };

        // When compiling start states, we're careful not to build additional
        // states that aren't necessary. For example, if the NFA has no word
        // boundary assertion, then there's no reason to have distinct start
        // states for 'NonWordByte' and 'WordByte' starting configurations.
        // Instead, the 'WordByte' starting configuration can just point
        // directly to the start state for the 'NonWordByte' config.
        //
        // Note though that we only need to care about assertions in the prefix
        // of an NFA since this only concerns the starting states. (Actually,
        // the most precise thing we could do is look at the prefix
        // assertions of each pattern when 'anchored == Anchored::Pattern',
        // and then only compile extra states if the prefix is non-empty.) But
        // we settle for simplicity here instead of absolute minimalism. It is
        // somewhat rare, after all, for multiple patterns in the same regex to
        // have different prefix look-arounds.

        let (id, is_new) =
            self.add_one_start(nfa_start, Start::NonWordByte)?;
        self.dfa.set_start_state(anchored, Start::NonWordByte, id);
        if is_new {
            dfa_state_ids.push(id);
        }

        if !self.nfa.look_set_prefix_any().contains_word() {
            self.dfa.set_start_state(anchored, Start::WordByte, id);
        } else {
            let (id, is_new) =
                self.add_one_start(nfa_start, Start::WordByte)?;
            self.dfa.set_start_state(anchored, Start::WordByte, id);
            if is_new {
                dfa_state_ids.push(id);
            }
        }
        if !self.nfa.look_set_prefix_any().contains_anchor() {
            self.dfa.set_start_state(anchored, Start::Text, id);
            self.dfa.set_start_state(anchored, Start::LineLF, id);
            self.dfa.set_start_state(anchored, Start::LineCR, id);
            self.dfa.set_start_state(
                anchored,
                Start::CustomLineTerminator,
                id,
            );
        } else {
            let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
            self.dfa.set_start_state(anchored, Start::Text, id);
            if is_new {
                dfa_state_ids.push(id);
            }

            let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?;
            self.dfa.set_start_state(anchored, Start::LineLF, id);
            if is_new {
                dfa_state_ids.push(id);
            }

            let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?;
            self.dfa.set_start_state(anchored, Start::LineCR, id);
            if is_new {
                dfa_state_ids.push(id);
            }

            let (id, is_new) =
                self.add_one_start(nfa_start, Start::CustomLineTerminator)?;
            self.dfa.set_start_state(
                anchored,
                Start::CustomLineTerminator,
                id,
            );
            if is_new {
                dfa_state_ids.push(id);
            }
        }

        Ok(())
    }

    /// Add a new DFA start state corresponding to the given starting NFA
    /// state, and the starting search configuration. (The starting search
    /// configuration essentially tells us which look-behind assertions are
    /// true for this particular state.)
    ///
    /// The boolean returned indicates whether the state ID returned is a newly
    /// created state, or a previously cached state.
    fn add_one_start(
        &mut self,
        nfa_start: StateID,
        start: Start,
    ) -> Result<(StateID, bool), BuildError> {
        // Compute the look-behind assertions that are true in this starting
        // configuration, and then determine the epsilon closure. While
        // computing the epsilon closure, we only follow conditional epsilon
        // transitions that satisfy the look-behind assertions in 'look_have'.
        let mut builder_matches = self.get_state_builder().into_matches();
        util::determinize::set_lookbehind_from_start(
            self.nfa,
            &start,
            &mut builder_matches,
        );
        self.sparses.set1.clear();
        util::determinize::epsilon_closure(
            self.nfa,
            nfa_start,
            builder_matches.look_have(),
            &mut self.stack,
            &mut self.sparses.set1,
        );
        let mut builder = builder_matches.into_nfa();
        util::determinize::add_nfa_states(
            &self.nfa,
            &self.sparses.set1,
            &mut builder,
        );
        self.maybe_add_state(builder)
    }

    /// Adds the given state to the DFA being built depending on whether it
    /// already exists in this determinizer's cache.
    ///
    /// If it does exist, then the memory used by 'state' is put back into the
    /// determinizer and the previously created state's ID is returned. (Along
    /// with 'false', indicating that no new state was added.)
    ///
    /// If it does not exist, then the state is added to the DFA being built
    /// and a fresh ID is allocated (if ID allocation fails, then an error is
    /// returned) and returned. (Along with 'true', indicating that a new state
    /// was added.)
    fn maybe_add_state(
        &mut self,
        builder: StateBuilderNFA,
    ) -> Result<(StateID, bool), BuildError> {
        if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
            // Since we have a cached state, put the constructed state's
            // memory back into our scratch space, so that it can be reused.
            self.put_state_builder(builder);
            return Ok((cached_id, false));
        }
        self.add_state(builder).map(|sid| (sid, true))
    }

    /// Add the given state to the DFA and make it available in the cache.
    ///
    /// The state initially has no transitions. That is, it transitions to the
    /// dead state for all possible inputs, and transitions to the quit state
    /// for all quit bytes.
    ///
    /// If adding the state would exceed the maximum value for StateID, then an
    /// error is returned.
    fn add_state(
        &mut self,
        builder: StateBuilderNFA,
    ) -> Result<StateID, BuildError> {
        let id = self.dfa.add_empty_state()?;
        // Pre-wire every configured quit byte to the quit state so that the
        // search loop doesn't need any special handling for them.
        if !self.config.quit.is_empty() {
            for b in self.config.quit.iter() {
                self.dfa.set_transition(
                    id,
                    alphabet::Unit::u8(b),
                    self.dfa.quit_id(),
                );
            }
        }
        let state = builder.to_state();
        // States use reference counting internally, so we only need to count
        // their memory usage once.
        self.memory_usage_state += state.memory_usage();
        self.builder_states.push(state.clone());
        self.cache.insert(state, id);
        self.put_state_builder(builder);
        // Enforce the configured size limits, if any, after each new state.
        if let Some(limit) = self.config.dfa_size_limit {
            if self.dfa.memory_usage() > limit {
                return Err(BuildError::dfa_exceeded_size_limit(limit));
            }
        }
        if let Some(limit) = self.config.determinize_size_limit {
            if self.memory_usage() > limit {
                return Err(BuildError::determinize_exceeded_size_limit(
                    limit,
                ));
            }
        }
        Ok(id)
    }

    /// Returns a state builder from this determinizer that might have existing
    /// capacity. This helps avoid allocs in cases where a state is built that
    /// turns out to already be cached.
    ///
    /// Callers must put the state builder back with 'put_state_builder',
    /// otherwise the allocation reuse won't work.
    fn get_state_builder(&mut self) -> StateBuilderEmpty {
        core::mem::replace(
            &mut self.scratch_state_builder,
            StateBuilderEmpty::new(),
        )
    }

    /// Puts the given state builder back into this determinizer for reuse.
    ///
    /// Note that building a 'State' from a builder always creates a new
    /// alloc, so callers should always put the builder back.
    fn put_state_builder(&mut self, builder: StateBuilderNFA) {
        let _ = core::mem::replace(
            &mut self.scratch_state_builder,
            builder.clear(),
        );
    }

    /// Return the memory usage, in bytes, of this determinizer at the current
    /// point in time. This does not include memory used by the NFA or the
    /// dense DFA itself.
    fn memory_usage(&self) -> usize {
        use core::mem::size_of;

        self.builder_states.len() * size_of::<State>()
        // Maps likely use more memory than this, but it's probably close.
        + self.cache.len() * (size_of::<State>() + size_of::<StateID>())
        + self.memory_usage_state
        + self.stack.capacity() * size_of::<StateID>()
        + self.scratch_state_builder.capacity()
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue