Vendor things

This commit is contained in:
John Doty 2024-03-08 11:03:01 -08:00
parent 5deceec006
commit 977e3c17e5
19434 changed files with 10682014 additions and 0 deletions

View file

@ -0,0 +1,517 @@
// This module defines some core types for dealing with accelerated DFA states.
// Briefly, a DFA state can be "accelerated" if all of its transitions except
// for a few loop back to itself. This directly implies that the only way out
// of such a state is if a byte corresponding to one of those non-loopback
// transitions is found. Such states are often found in simple repetitions in
// non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its
// DFA with regex-cli:
//
// $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table
// D 000000:
// Q 000001:
// *000002:
// A 000003: \x00-` => 3, a => 8, b-\xFF => 3
// A 000004: \x00-` => 4, a => 7, b-\xFF => 4
// 000005: \x00-` => 4, b-\xFF => 4
// 000006: \x00-` => 3, a => 6, b-\xFF => 3
// 000007: \x00-\xFF => 2, EOI => 2
// 000008: \x00-\xFF => 2, EOI => 2
//
// In particular, state 3 is accelerated (shown via the 'A' indicator) since
// the only way to leave that state once entered is to see an 'a' byte. If
// there is a long run of non-'a' bytes, then using something like 'memchr'
// to find the next 'a' byte can be significantly faster than just using the
// standard byte-at-a-time state machine.
//
// Unfortunately, this optimization rarely applies when Unicode is enabled.
// For example, patterns like '[^a]' don't actually match any byte that isn't
// 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't
// 'a'. This makes the state machine much more complex---far beyond a single
// state---and removes the ability to easily accelerate it. (Because if the
// machine sees a non-UTF-8 sequence, then the machine won't match through it.)
//
// In practice, we only consider accelerating states that have 3 or fewer
// non-loop transitions. At a certain point, you get diminishing returns, but
// also because that's what the memchr crate supports. The structures below
// hard-code this assumption and provide (de)serialization APIs for use inside
// a DFA.
//
// And finally, note that there is some trickery involved in making it very
// fast to not only check whether a state is accelerated at search time, but
// also to access the bytes to search for to implement the acceleration itself.
// dfa/special.rs provides more detail, but the short story is that all
// accelerated states appear contiguously in a DFA. This means we can represent
// the ID space of all accelerated DFA states with a single range. So given
// a state ID, we can determine whether it's accelerated via
//
// min_accel_id <= id <= max_accel_id
//
// And find its corresponding accelerator with:
//
// accels.get((id - min_accel_id) / dfa_stride)
#[cfg(feature = "dfa-build")]
use alloc::{vec, vec::Vec};
use crate::util::{
int::Pointer,
memchr,
wire::{self, DeserializeError, Endian, SerializeError},
};
/// The base type used to represent a collection of accelerators.
///
/// While an `Accel` is represented as a fixed size array of bytes, a
/// *collection* of `Accel`s (called `Accels`) is represented internally as a
/// slice of u32. While it's a bit unnatural to do this and costs us a bit of
/// fairly low-risk `unsafe` code, it lets us remove the need for a second type
/// parameter in the definition of dense::DFA. (Which really wants everything
/// to be a slice of u32.)
type AccelTy = u32;
/// The size of the unit of representation for accelerators.
///
/// ACCEL_CAP *must* be a multiple of this size.
const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>();
/// The maximum length in bytes that a single Accel can be. This is distinct
/// from the capacity of an accelerator in that the length represents only the
/// bytes that should be read.
///
/// Concretely: one length byte followed by up to 3 needle bytes.
const ACCEL_LEN: usize = 4;
/// The capacity of each accelerator, in bytes. We set this to 8 since it's a
/// multiple of 4 (our ID size) and because it gives us a little wiggle room
/// if we want to support more accel bytes in the future without a breaking
/// change.
///
/// This MUST be a multiple of ACCEL_TY_SIZE.
const ACCEL_CAP: usize = 8;
/// Search for between 1 and 3 needle bytes in the given haystack, starting the
/// search at the given position. If `needles` has a length other than 1-3,
/// then this panics.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn find_fwd(
    needles: &[u8],
    haystack: &[u8],
    at: usize,
) -> Option<usize> {
    // Restrict the search to the suffix beginning at `at`, then dispatch on
    // the shape of the needle slice directly.
    let hay = &haystack[at..];
    let found = match needles {
        &[b0] => memchr::memchr(b0, hay)?,
        &[b0, b1] => memchr::memchr2(b0, b1, hay)?,
        &[b0, b1, b2] => memchr::memchr3(b0, b1, b2, hay)?,
        &[] => panic!("cannot find with empty needles"),
        _ => panic!("invalid needles length: {}", needles.len()),
    };
    // Offsets reported by memchr are relative to `hay`, so translate back
    // into an absolute haystack position.
    Some(at + found)
}
/// Search for between 1 and 3 needle bytes in the given haystack in reverse,
/// starting the search at the given position. If `needles` has a length other
/// than 1-3, then this panics.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn find_rev(
    needles: &[u8],
    haystack: &[u8],
    at: usize,
) -> Option<usize> {
    // Restrict the search to the prefix ending at `at`, then dispatch on the
    // shape of the needle slice directly. Reverse offsets are already
    // absolute within `haystack[..at]`, so no translation is needed.
    let hay = &haystack[..at];
    match needles {
        &[b0] => memchr::memrchr(b0, hay),
        &[b0, b1] => memchr::memrchr2(b0, b1, hay),
        &[b0, b1, b2] => memchr::memrchr3(b0, b1, b2, hay),
        &[] => panic!("cannot find with empty needles"),
        _ => panic!("invalid needles length: {}", needles.len()),
    }
}
/// Represents the accelerators for all accelerated states in a dense DFA.
///
/// The `A` type parameter represents the type of the underlying bytes.
/// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`.
#[derive(Clone)]
pub(crate) struct Accels<A> {
    /// A length prefixed slice of contiguous accelerators. See the top comment
    /// in this module for more details on how we can jump from a DFA's state
    /// ID to an accelerator in this list.
    ///
    /// The first 4 bytes (i.e., the first AccelTy) always correspond to the
    /// number of accelerators that follow.
    accels: A,
}
#[cfg(feature = "dfa-build")]
impl Accels<Vec<AccelTy>> {
    /// Create an empty sequence of accelerators for a DFA.
    pub fn empty() -> Accels<Vec<AccelTy>> {
        // The single leading AccelTy is the length prefix, starting at 0.
        Accels { accels: vec![0] }
    }

    /// Add an accelerator to this sequence.
    ///
    /// The accelerator is appended to the end of the sequence, and so this
    /// should be done in correspondence with its state in the DFA.
    ///
    /// This panics if this results in more accelerators than AccelTy::MAX.
    pub fn add(&mut self, accel: Accel) {
        // Append the accelerator's two-AccelTy representation, then bump the
        // count stored in the length prefix.
        for &word in accel.as_accel_tys().iter() {
            self.accels.push(word);
        }
        let new_total = self.len() + 1;
        self.set_len(new_total);
    }

    /// Set the number of accelerators in this sequence, which is encoded in
    /// the first 4 bytes of the underlying bytes.
    fn set_len(&mut self, new_len: usize) {
        // The only way an accelerator gets added is if a state exists for
        // it, and if a state exists, then its index is guaranteed to be
        // representable by a AccelTy by virtue of the guarantees provided by
        // StateID.
        let encoded = AccelTy::try_from(new_len).unwrap();
        self.accels[0] = encoded;
    }
}
impl<'a> Accels<&'a [AccelTy]> {
    /// Deserialize a sequence of accelerators from the given bytes. If there
    /// was a problem deserializing, then an error is returned.
    ///
    /// This is guaranteed to run in constant time. This does not guarantee
    /// that every accelerator in the returned collection is valid. Thus,
    /// accessing one may panic, or not-safe code that relies on accelerators
    /// being correct may result in UB.
    ///
    /// Callers may check the validity of every accelerator with the `validate`
    /// method.
    pub fn from_bytes_unchecked(
        mut slice: &'a [u8],
    ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> {
        // Remember where the slice started so we can report how many bytes
        // were consumed at the end.
        let slice_start = slice.as_ptr().as_usize();
        let (accel_len, _) =
            wire::try_read_u32_as_usize(slice, "accelerators length")?;
        // The accelerator length is part of the accel_tys slice that
        // we deserialize. This is perhaps a bit idiosyncratic. It would
        // probably be better to split out the length into a real field.
        //
        // Each accelerator occupies 2 AccelTys (ACCEL_CAP = 8 bytes), plus 1
        // AccelTy for the length prefix itself. The checked arithmetic below
        // guards against overflow in untrusted input.
        let accel_tys_len = wire::add(
            wire::mul(accel_len, 2, "total number of accelerator accel_tys")?,
            1,
            "total number of accel_tys",
        )?;
        let accel_tys_bytes_len = wire::mul(
            ACCEL_TY_SIZE,
            accel_tys_len,
            "total number of bytes in accelerators",
        )?;
        wire::check_slice_len(slice, accel_tys_bytes_len, "accelerators")?;
        wire::check_alignment::<AccelTy>(slice)?;
        let accel_tys = &slice[..accel_tys_bytes_len];
        slice = &slice[accel_tys_bytes_len..];
        // SAFETY: We've checked the length and alignment above, and since
        // slice is just bytes and AccelTy is just a u32, we can safely cast to
        // a slice of &[AccelTy].
        let accels = unsafe {
            core::slice::from_raw_parts(
                accel_tys.as_ptr().cast::<AccelTy>(),
                accel_tys_len,
            )
        };
        Ok((Accels { accels }, slice.as_ptr().as_usize() - slice_start))
    }
}
impl<A: AsRef<[AccelTy]>> Accels<A> {
    /// Return an owned version of the accelerators.
    #[cfg(feature = "alloc")]
    pub fn to_owned(&self) -> Accels<alloc::vec::Vec<AccelTy>> {
        Accels { accels: self.accels.as_ref().to_vec() }
    }

    /// Return a borrowed version of the accelerators.
    pub fn as_ref(&self) -> Accels<&[AccelTy]> {
        Accels { accels: self.accels.as_ref() }
    }

    /// Return the bytes representing the serialization of the accelerators.
    pub fn as_bytes(&self) -> &[u8] {
        let accels = self.accels.as_ref();
        // SAFETY: This is safe because accels is a just a slice of AccelTy,
        // and u8 always has a smaller alignment.
        unsafe {
            core::slice::from_raw_parts(
                accels.as_ptr().cast::<u8>(),
                accels.len() * ACCEL_TY_SIZE,
            )
        }
    }

    /// Returns the memory usage, in bytes, of these accelerators.
    ///
    /// The memory usage is computed based on the number of bytes used to
    /// represent all of the accelerators.
    ///
    /// This does **not** include the stack size used by this value.
    pub fn memory_usage(&self) -> usize {
        self.as_bytes().len()
    }

    /// Return the bytes to search for corresponding to the accelerator in this
    /// sequence at index `i`. If no such accelerator exists, then this panics.
    ///
    /// The significance of the index is that it should be in correspondence
    /// with the index of the corresponding DFA. That is, accelerated DFA
    /// states are stored contiguously in the DFA and have an ordering implied
    /// by their respective state IDs. The state's index in that sequence
    /// corresponds to the index of its corresponding accelerator.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn needles(&self, i: usize) -> &[u8] {
        if i >= self.len() {
            panic!("invalid accelerator index {}", i);
        }
        let bytes = self.as_bytes();
        // Skip the 4-byte length prefix, then jump to the i'th ACCEL_CAP
        // sized accelerator. Its first byte is the needle count.
        let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
        let len = usize::from(bytes[offset]);
        &bytes[offset + 1..offset + 1 + len]
    }

    /// Return the total number of accelerators in this sequence.
    pub fn len(&self) -> usize {
        // This should never panic since deserialization checks that the
        // length can fit into a usize.
        usize::try_from(self.accels.as_ref()[0]).unwrap()
    }

    /// Return the accelerator in this sequence at index `i`. If no such
    /// accelerator exists, then this returns None.
    ///
    /// See the docs for `needles` on the significance of the index.
    fn get(&self, i: usize) -> Option<Accel> {
        if i >= self.len() {
            return None;
        }
        let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
        let accel = Accel::from_slice(&self.as_bytes()[offset..])
            .expect("Accels must contain valid accelerators");
        Some(accel)
    }

    /// Returns an iterator of accelerators in this sequence.
    fn iter(&self) -> IterAccels<'_, A> {
        IterAccels { accels: self, i: 0 }
    }

    /// Writes these accelerators to the given byte buffer using the indicated
    /// endianness. If the given buffer is too small, then an error is
    /// returned. Upon success, the total number of bytes written is returned.
    /// The number of bytes written is guaranteed to be a multiple of
    /// ACCEL_TY_SIZE.
    pub fn write_to<E: Endian>(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        let nwrite = self.write_to_len();
        assert_eq!(
            nwrite % ACCEL_TY_SIZE,
            0,
            "expected accelerator bytes written to be a multiple of {}",
            ACCEL_TY_SIZE,
        );
        if dst.len() < nwrite {
            return Err(SerializeError::buffer_too_small("accelerators"));
        }
        // The number of accelerators can never exceed AccelTy::MAX.
        E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst);
        // The actual accelerators are just raw bytes and thus their endianness
        // is irrelevant. So we can copy them as bytes.
        dst[ACCEL_TY_SIZE..nwrite]
            .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]);
        Ok(nwrite)
    }

    /// Validates that every accelerator in this collection can be successfully
    /// deserialized as a valid accelerator.
    pub fn validate(&self) -> Result<(), DeserializeError> {
        // Skip the length prefix and check each ACCEL_CAP sized chunk.
        for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) {
            let _ = Accel::from_slice(chunk)?;
        }
        Ok(())
    }

    /// Returns the total number of bytes written by `write_to`.
    pub fn write_to_len(&self) -> usize {
        self.as_bytes().len()
    }
}
impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Render as 'Accels([...])' with one entry per accelerator.
        write!(f, "Accels(")?;
        f.debug_list().entries(self.iter()).finish()?;
        write!(f, ")")
    }
}
/// An iterator over the `Accel`s in an `Accels` collection, created by
/// `Accels::iter`.
#[derive(Debug)]
struct IterAccels<'a, A: AsRef<[AccelTy]>> {
    /// The collection being iterated over.
    accels: &'a Accels<A>,
    /// The index of the next accelerator to yield.
    i: usize,
}
impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> {
    type Item = Accel;

    fn next(&mut self) -> Option<Accel> {
        // Yield the accelerator at the current index (if any) and then
        // advance the cursor.
        match self.accels.get(self.i) {
            None => None,
            Some(accel) => {
                self.i += 1;
                Some(accel)
            }
        }
    }
}
/// Accel represents a structure for determining how to "accelerate" a DFA
/// state.
///
/// Namely, it contains zero or more bytes that must be seen in order for the
/// DFA to leave the state it is associated with. In practice, the actual range
/// is 1 to 3 bytes.
///
/// The purpose of acceleration is to identify states whose vast majority
/// of transitions are just loops back to the same state. For example,
/// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state
/// (corresponding to `[^a]+`) where all transitions *except* for `a` and
/// `b` loop back to itself. Thus, this state can be "accelerated" by simply
/// looking for the next occurrence of either `a` or `b` instead of explicitly
/// following transitions. (In this case, `b` transitions to the next state
/// where as `a` would transition to the dead state.)
#[derive(Clone)]
pub(crate) struct Accel {
    /// The first byte is the length. Subsequent bytes are the accelerated
    /// bytes.
    ///
    /// Note that we make every accelerator 8 bytes as a slightly wasteful
    /// way of making sure alignment is always correct for state ID sizes of
    /// 1, 2, 4 and 8. This should be okay since accelerated states aren't
    /// particularly common, especially when Unicode is enabled.
    bytes: [u8; ACCEL_CAP],
}
impl Accel {
    /// Returns an empty accel, where no bytes are accelerated.
    #[cfg(feature = "dfa-build")]
    pub fn new() -> Accel {
        Accel { bytes: [0; ACCEL_CAP] }
    }

    /// Returns a verified accelerator derived from the beginning of the given
    /// slice.
    ///
    /// If the slice is not long enough or contains invalid bytes for an
    /// accelerator, then this returns an error.
    pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> {
        // Only the first ACCEL_LEN bytes are meaningful: one length byte
        // followed by up to 3 needle bytes. Trailing capacity bytes (if any)
        // are ignored.
        slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())];
        let bytes = slice
            .try_into()
            .map_err(|_| DeserializeError::buffer_too_small("accelerator"))?;
        Accel::from_bytes(bytes)
    }

    /// Returns a verified accelerator derived from raw bytes.
    ///
    /// If the given bytes are invalid, then this returns an error.
    fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> {
        // bytes[0] is the needle count, which must fit in the remaining
        // ACCEL_LEN - 1 = 3 bytes.
        if usize::from(bytes[0]) >= ACCEL_LEN {
            return Err(DeserializeError::generic(
                "accelerator bytes cannot have length more than 3",
            ));
        }
        Ok(Accel::from_bytes_unchecked(bytes))
    }

    /// Returns an accelerator derived from raw bytes.
    ///
    /// This does not check whether the given bytes are valid. Invalid bytes
    /// cannot sacrifice memory safety, but may result in panics or silent
    /// logic bugs.
    fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel {
        // Pad out to ACCEL_CAP (8) bytes; see the 'bytes' field docs for why
        // the capacity is larger than the meaningful length.
        Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] }
    }

    /// Attempts to add the given byte to this accelerator. If the accelerator
    /// is already full or thinks the byte is a poor accelerator, then this
    /// returns false. Otherwise, returns true.
    ///
    /// If the given byte is already in this accelerator, then it panics.
    #[cfg(feature = "dfa-build")]
    pub fn add(&mut self, byte: u8) -> bool {
        if self.len() >= 3 {
            return false;
        }
        // As a special case, we totally reject trying to accelerate a state
        // with an ASCII space. In most cases, it occurs very frequently, and
        // tends to result in worse overall performance.
        if byte == b' ' {
            return false;
        }
        assert!(
            !self.contains(byte),
            "accelerator already contains {:?}",
            crate::util::escape::DebugByte(byte)
        );
        // Needle bytes live at indices 1..=3; index 0 holds the count.
        self.bytes[self.len() + 1] = byte;
        self.bytes[0] += 1;
        true
    }

    /// Return the number of bytes in this accelerator.
    pub fn len(&self) -> usize {
        usize::from(self.bytes[0])
    }

    /// Returns true if and only if there are no bytes in this accelerator.
    #[cfg(feature = "dfa-build")]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the slice of bytes to accelerate.
    ///
    /// If this accelerator is empty, then this returns an empty slice.
    fn needles(&self) -> &[u8] {
        &self.bytes[1..1 + self.len()]
    }

    /// Returns true if and only if this accelerator will accelerate the given
    /// byte.
    #[cfg(feature = "dfa-build")]
    fn contains(&self, byte: u8) -> bool {
        // Use slice::contains directly instead of the roundabout
        // 'position(..).is_some()' (clippy: search_is_some).
        self.needles().contains(&byte)
    }

    /// Returns the accelerator bytes as an array of AccelTys.
    #[cfg(feature = "dfa-build")]
    fn as_accel_tys(&self) -> [AccelTy; 2] {
        assert_eq!(ACCEL_CAP, 8);
        // These unwraps are OK since ACCEL_CAP is set to 8.
        let first =
            AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap());
        let second =
            AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap());
        [first, second]
    }
}
impl core::fmt::Debug for Accel {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Render as 'Accel({...})' with each needle byte escaped for
        // readability.
        write!(f, "Accel(")?;
        f.debug_set()
            .entries(
                self.needles()
                    .iter()
                    .map(|&b| crate::util::escape::DebugByte(b)),
            )
            .finish()?;
        write!(f, ")")
    }
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,599 @@
use alloc::{collections::BTreeMap, vec::Vec};
use crate::{
dfa::{
dense::{self, BuildError},
DEAD,
},
nfa::thompson,
util::{
self,
alphabet::{self, ByteSet},
determinize::{State, StateBuilderEmpty, StateBuilderNFA},
primitives::{PatternID, StateID},
search::{Anchored, MatchKind},
sparse_set::SparseSets,
start::Start,
},
};
/// A builder for configuring and running a DFA determinizer.
#[derive(Clone, Debug)]
pub(crate) struct Config {
    /// The match semantics to use. See the `match_kind` setter.
    match_kind: MatchKind,
    /// The set of bytes that cause the DFA to enter a quit state. See the
    /// `quit` setter.
    quit: ByteSet,
    /// Heap limit, in bytes, for the DFA being built. See the
    /// `dfa_size_limit` setter.
    dfa_size_limit: Option<usize>,
    /// Heap limit, in bytes, for determinization's own auxiliary storage.
    /// See the `determinize_size_limit` setter.
    determinize_size_limit: Option<usize>,
}
impl Config {
    /// Create a new default config for a determinizer. The determinizer may be
    /// configured before calling `run`.
    pub fn new() -> Config {
        Config {
            match_kind: MatchKind::LeftmostFirst,
            quit: ByteSet::empty(),
            dfa_size_limit: None,
            determinize_size_limit: None,
        }
    }

    /// Run determinization on the given NFA and write the resulting DFA into
    /// the one given. The DFA given should be initialized but otherwise empty.
    /// "Initialized" means that it is setup to handle the NFA's byte classes,
    /// number of patterns and whether to build start states for each pattern.
    pub fn run(
        &self,
        nfa: &thompson::NFA,
        dfa: &mut dense::OwnedDFA,
    ) -> Result<(), BuildError> {
        // Note that the dead and quit states share the same representation
        // (an empty set of NFA states); they are distinguished purely by
        // their position in 'builder_states' below.
        let dead = State::dead();
        let quit = State::dead();
        let mut cache = StateMap::default();
        // We only insert the dead state here since its representation is
        // identical to the quit state. And we never want anything pointing
        // to the quit state other than specific transitions derived from the
        // determinizer's configured "quit" bytes.
        //
        // We do put the quit state into 'builder_states' below. This ensures
        // that a proper DFA state ID is allocated for it, and that no other
        // DFA state uses the "location after the DEAD state." That is, it
        // is assumed that the quit state is always the state immediately
        // following the DEAD state.
        cache.insert(dead.clone(), DEAD);
        let runner = Runner {
            config: self.clone(),
            nfa,
            dfa,
            builder_states: alloc::vec![dead, quit],
            cache,
            memory_usage_state: 0,
            sparses: SparseSets::new(nfa.states().len()),
            stack: alloc::vec![],
            scratch_state_builder: StateBuilderEmpty::new(),
        };
        runner.run()
    }

    /// The match semantics to use for determinization.
    ///
    /// MatchKind::All corresponds to the standard textbook construction.
    /// All possible match states are represented in the DFA.
    /// MatchKind::LeftmostFirst permits greediness and otherwise tries to
    /// simulate the match semantics of backtracking regex engines. Namely,
    /// only a subset of match states are built, and dead states are used to
    /// stop searches with an unanchored prefix.
    ///
    /// The default is MatchKind::LeftmostFirst.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
        self.match_kind = kind;
        self
    }

    /// The set of bytes to use that will cause the DFA to enter a quit state,
    /// stop searching and return an error. By default, this is empty.
    pub fn quit(&mut self, set: ByteSet) -> &mut Config {
        self.quit = set;
        self
    }

    /// The limit, in bytes of the heap, that the DFA is permitted to use. This
    /// does not include the auxiliary heap storage used by determinization.
    pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config {
        self.dfa_size_limit = bytes;
        self
    }

    /// The limit, in bytes of the heap, that determinization itself is allowed
    /// to use. This does not include the size of the DFA being built.
    pub fn determinize_size_limit(
        &mut self,
        bytes: Option<usize>,
    ) -> &mut Config {
        self.determinize_size_limit = bytes;
        self
    }
}
/// The actual implementation of determinization that converts an NFA to a DFA
/// through powerset construction.
///
/// This determinizer roughly follows the typical powerset construction, where
/// each DFA state is comprised of one or more NFA states. In the worst case,
/// there is one DFA state for every possible combination of NFA states. In
/// practice, this only happens in certain conditions, typically when there are
/// bounded repetitions.
///
/// The main differences between this implementation and typical determinization
/// are that this implementation delays matches by one state and hackily makes
/// look-around work. Comments below attempt to explain this.
///
/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
/// whichever is shorter.
#[derive(Debug)]
struct Runner<'a> {
    /// The configuration used to initialize determinization.
    config: Config,
    /// The NFA we're converting into a DFA.
    nfa: &'a thompson::NFA,
    /// The DFA we're building.
    dfa: &'a mut dense::OwnedDFA,
    /// Each DFA state being built is defined as an *ordered* set of NFA
    /// states, along with some meta facts about the ordered set of NFA states.
    ///
    /// This is never empty. The first state is always a dummy state such that
    /// a state id == 0 corresponds to a dead state. The second state is always
    /// the quit state.
    ///
    /// Why do we have states in both a `Vec` and in a cache map below?
    /// Well, they serve two different roles based on access patterns.
    /// `builder_states` is the canonical home of each state, and provides
    /// constant random access by a DFA state's ID. The cache map below, on
    /// the other hand, provides a quick way of searching for identical DFA
    /// states by using the DFA state as a key in the map. Of course, we use
    /// reference counting to avoid actually duplicating the state's data
    /// itself. (Although this has never been benchmarked.) Note that the cache
    /// map does not give us full minimization; it just lets us avoid some very
    /// obvious redundant states.
    ///
    /// Note that the index into this Vec isn't quite the DFA's state ID.
    /// Rather, it's just an index. To get the state ID, you have to multiply
    /// it by the DFA's stride. That's done by self.dfa.from_index. And the
    /// inverse is self.dfa.to_index.
    ///
    /// Moreover, DFA states don't usually retain the IDs assigned to them
    /// by their position in this Vec. After determinization completes,
    /// states are shuffled around to support other optimizations. See the
    /// sibling 'special' module for more details on that. (The reason for
    /// mentioning this is that if you print out the DFA for debugging during
    /// determinization, and then print out the final DFA after it is fully
    /// built, then the state IDs likely won't match up.)
    builder_states: Vec<State>,
    /// A cache of DFA states that already exist and can be easily looked up
    /// via ordered sets of NFA states.
    ///
    /// See `builder_states` docs for why we store states in two different
    /// ways.
    cache: StateMap,
    /// The memory usage, in bytes, used by builder_states and cache. We track
    /// this as new states are added since states use a variable amount of
    /// heap. Tracking this as we add states makes it possible to compute the
    /// total amount of memory used by the determinizer in constant time.
    memory_usage_state: usize,
    /// A pair of sparse sets for tracking ordered sets of NFA state IDs.
    /// These are reused throughout determinization. A bounded sparse set
    /// gives us constant time insertion, membership testing and clearing.
    sparses: SparseSets,
    /// Scratch space for a stack of NFA states to visit, for depth first
    /// visiting without recursion.
    stack: Vec<StateID>,
    /// Scratch space for storing an ordered sequence of NFA states, for
    /// amortizing allocation. This is principally useful for when we avoid
    /// adding a new DFA state since it already exists. In order to detect this
    /// case though, we still need an ordered set of NFA state IDs. So we use
    /// this space to stage that ordered set before we know whether we need to
    /// create a new DFA state or not.
    scratch_state_builder: StateBuilderEmpty,
}
/// A map from states to state identifiers. When using std, we use a standard
/// hashmap, since it's a bit faster for this use case. (Other maps, like
/// ones based on FNV, have not yet been benchmarked.)
///
/// The main purpose of this map is to reuse states where possible. This won't
/// fully minimize the DFA, but it works well in a lot of cases.
#[cfg(feature = "std")]
type StateMap = std::collections::HashMap<State, StateID>;
/// In no-std builds, fall back to an ordered map from alloc.
#[cfg(not(feature = "std"))]
type StateMap = BTreeMap<State, StateID>;
impl<'a> Runner<'a> {
    /// Build the DFA. If there was a problem constructing the DFA (e.g., if
    /// the chosen state identifier representation is too small), then an error
    /// is returned.
    fn run(mut self) -> Result<(), BuildError> {
        // Unicode word boundaries can't be handled by a byte-at-a-time DFA
        // unless all non-ASCII bytes are quit bytes.
        if self.nfa.look_set_any().contains_word_unicode()
            && !self.config.quit.contains_range(0x80, 0xFF)
        {
            return Err(BuildError::unsupported_dfa_word_boundary_unicode());
        }
        // A sequence of "representative" bytes drawn from each equivalence
        // class. These representative bytes are fed to the NFA to compute
        // state transitions. This allows us to avoid re-computing state
        // transitions for bytes that are guaranteed to produce identical
        // results. Since computing the representatives needs to do a little
        // work, we do it once here because we'll be iterating over them a lot.
        let representatives: Vec<alphabet::Unit> =
            self.dfa.byte_classes().representatives(..).collect();
        // The set of all DFA state IDs that still need to have their
        // transitions set. We start by seeding this with all starting states.
        let mut uncompiled = alloc::vec![];
        self.add_all_starts(&mut uncompiled)?;
        while let Some(dfa_id) = uncompiled.pop() {
            for &unit in &representatives {
                // Quit bytes keep their default transition (to the quit
                // state), so skip computing one.
                if unit.as_u8().map_or(false, |b| self.config.quit.contains(b))
                {
                    continue;
                }
                // In many cases, the state we transition to has already been
                // computed. 'cached_state' will do the minimal amount of work
                // to check this, and if it exists, immediately return an
                // already existing state ID.
                let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
                self.dfa.set_transition(dfa_id, unit, next_dfa_id);
                // If the state ID we got back is newly created, then we need
                // to compile it, so add it to our uncompiled frontier.
                if is_new {
                    uncompiled.push(next_dfa_id);
                }
            }
        }
        debug!(
            "determinization complete, memory usage: {}, \
             dense DFA size: {}, \
             is reverse? {}",
            self.memory_usage(),
            self.dfa.memory_usage(),
            self.nfa.is_reverse(),
        );
        // A map from DFA state ID to one or more NFA match IDs. Each NFA match
        // ID corresponds to a distinct regex pattern that matches in the state
        // corresponding to the key.
        let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
        self.cache.clear();
        #[cfg(feature = "logging")]
        let mut total_pat_len = 0;
        for (i, state) in self.builder_states.into_iter().enumerate() {
            if let Some(pat_ids) = state.match_pattern_ids() {
                let id = self.dfa.to_state_id(i);
                log! {
                    total_pat_len += pat_ids.len();
                }
                matches.insert(id, pat_ids);
            }
        }
        log! {
            use core::mem::size_of;
            let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
            let pats = total_pat_len * size_of::<PatternID>();
            let mem = (matches.len() * per_elem) + pats;
            log::debug!("matches map built, memory usage: {}", mem);
        }
        // At this point, we shuffle the "special" states in the final DFA.
        // This permits a DFA's match loop to detect a match condition (among
        // other things) by merely inspecting the current state's identifier,
        // and avoids the need for any additional auxiliary storage.
        self.dfa.shuffle(matches)?;
        Ok(())
    }
    /// Return the identifier for the next DFA state given an existing DFA
    /// state and an input byte. If the next DFA state already exists, then
    /// return its identifier from the cache. Otherwise, build the state, cache
    /// it and return its identifier.
    ///
    /// This routine returns a boolean indicating whether a new state was
    /// built. If a new state is built, then the caller needs to add it to its
    /// frontier of uncompiled DFA states to compute transitions for.
    fn cached_state(
        &mut self,
        dfa_id: StateID,
        unit: alphabet::Unit,
    ) -> Result<(StateID, bool), BuildError> {
        // Compute the set of all reachable NFA states, including epsilons.
        let empty_builder = self.get_state_builder();
        let builder = util::determinize::next(
            self.nfa,
            self.config.match_kind,
            &mut self.sparses,
            &mut self.stack,
            &self.builder_states[self.dfa.to_index(dfa_id)],
            unit,
            empty_builder,
        );
        // Either find the equivalent existing state or insert a new one.
        self.maybe_add_state(builder)
    }
    /// Compute the set of DFA start states and add their identifiers in
    /// 'dfa_state_ids' (no duplicates are added).
    fn add_all_starts(
        &mut self,
        dfa_state_ids: &mut Vec<StateID>,
    ) -> Result<(), BuildError> {
        // These should be the first states added.
        assert!(dfa_state_ids.is_empty());
        // We only want to add (un)anchored starting states that are consistent
        // with our DFA's configuration. Unconditionally adding both (although
        // it is the default) can make DFAs quite a bit bigger.
        if self.dfa.start_kind().has_unanchored() {
            self.add_start_group(Anchored::No, dfa_state_ids)?;
        }
        if self.dfa.start_kind().has_anchored() {
            self.add_start_group(Anchored::Yes, dfa_state_ids)?;
        }
        // I previously had an 'assert' here checking that either
        // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it
        // turns out this isn't always true. For example, the NFA might have
        // one or more patterns but where all such patterns are just 'fail'
        // states. These will ultimately just compile down to DFA dead states,
        // and since the dead state was added earlier, no new DFA states are
        // added. And thus, it is valid and okay for 'dfa_state_ids' to be
        // empty even if there are a non-zero number of patterns in the NFA.
        // We only need to compute anchored start states for each pattern if it
        // was requested to do so.
        if self.dfa.starts_for_each_pattern() {
            for pid in self.nfa.patterns() {
                self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?;
            }
        }
        Ok(())
    }
/// Add a group of start states for the given match pattern ID. Any new
/// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are
/// pushed.)
///
/// When pattern_id is None, then this will compile a group of unanchored
/// start states (if the DFA is unanchored). When the pattern_id is
/// present, then this will compile a group of anchored start states that
/// only match the given pattern.
///
/// This panics if `anchored` corresponds to an invalid pattern ID.
fn add_start_group(
    &mut self,
    anchored: Anchored,
    dfa_state_ids: &mut Vec<StateID>,
) -> Result<(), BuildError> {
    let nfa_start = match anchored {
        Anchored::No => self.nfa.start_unanchored(),
        Anchored::Yes => self.nfa.start_anchored(),
        Anchored::Pattern(pid) => {
            self.nfa.start_pattern(pid).expect("valid pattern ID")
        }
    };
    // When compiling start states, we're careful not to build additional
    // states that aren't necessary. For example, if the NFA has no word
    // boundary assertion, then there's no reason to have distinct start
    // states for 'NonWordByte' and 'WordByte' starting configurations.
    // Instead, the 'WordByte' starting configuration can just point
    // directly to the start state for the 'NonWordByte' config.
    //
    // Note though that we only need to care about assertions in the prefix
    // of an NFA since this only concerns the starting states. (Actually,
    // the most precise thing we could do is look at the prefix
    // assertions of each pattern when 'anchored == Anchored::Pattern',
    // and then only compile extra states if the prefix is non-empty.) But
    // we settle for simplicity here instead of absolute minimalism. It is
    // somewhat rare, after all, for multiple patterns in the same regex to
    // have different prefix look-arounds.
    let (id, is_new) =
        self.add_one_start(nfa_start, Start::NonWordByte)?;
    self.dfa.set_start_state(anchored, Start::NonWordByte, id);
    if is_new {
        dfa_state_ids.push(id);
    }
    // If no pattern prefix can contain a word boundary assertion, then
    // the 'WordByte' configuration behaves identically to 'NonWordByte'
    // and can simply reuse its state.
    if !self.nfa.look_set_prefix_any().contains_word() {
        self.dfa.set_start_state(anchored, Start::WordByte, id);
    } else {
        let (id, is_new) =
            self.add_one_start(nfa_start, Start::WordByte)?;
        self.dfa.set_start_state(anchored, Start::WordByte, id);
        if is_new {
            dfa_state_ids.push(id);
        }
    }
    // Likewise, when no pattern prefix contains an anchor assertion, all
    // of the text/line starting configurations can reuse the
    // 'NonWordByte' state computed above ('id' still refers to it here).
    if !self.nfa.look_set_prefix_any().contains_anchor() {
        self.dfa.set_start_state(anchored, Start::Text, id);
        self.dfa.set_start_state(anchored, Start::LineLF, id);
        self.dfa.set_start_state(anchored, Start::LineCR, id);
        self.dfa.set_start_state(
            anchored,
            Start::CustomLineTerminator,
            id,
        );
    } else {
        let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
        self.dfa.set_start_state(anchored, Start::Text, id);
        if is_new {
            dfa_state_ids.push(id);
        }
        let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?;
        self.dfa.set_start_state(anchored, Start::LineLF, id);
        if is_new {
            dfa_state_ids.push(id);
        }
        let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?;
        self.dfa.set_start_state(anchored, Start::LineCR, id);
        if is_new {
            dfa_state_ids.push(id);
        }
        let (id, is_new) =
            self.add_one_start(nfa_start, Start::CustomLineTerminator)?;
        self.dfa.set_start_state(
            anchored,
            Start::CustomLineTerminator,
            id,
        );
        if is_new {
            dfa_state_ids.push(id);
        }
    }
    Ok(())
}
/// Add a new DFA start state corresponding to the given starting NFA
/// state, and the starting search configuration. (The starting search
/// configuration essentially tells us which look-behind assertions are
/// true for this particular state.)
///
/// The boolean returned indicates whether the state ID returned is a newly
/// created state, or a previously cached state.
fn add_one_start(
    &mut self,
    nfa_start: StateID,
    start: Start,
) -> Result<(StateID, bool), BuildError> {
    // Compute the look-behind assertions that are true in this starting
    // configuration, and then determine the epsilon closure. While
    // computing the epsilon closure, we only follow conditional epsilon
    // transitions that satisfy the look-behind assertions in 'look_have'.
    let mut builder_matches = self.get_state_builder().into_matches();
    util::determinize::set_lookbehind_from_start(
        self.nfa,
        &start,
        &mut builder_matches,
    );
    self.sparses.set1.clear();
    util::determinize::epsilon_closure(
        self.nfa,
        nfa_start,
        builder_matches.look_have(),
        &mut self.stack,
        &mut self.sparses.set1,
    );
    let mut builder = builder_matches.into_nfa();
    util::determinize::add_nfa_states(
        &self.nfa,
        &self.sparses.set1,
        &mut builder,
    );
    // De-duplicate against previously built states via the cache.
    self.maybe_add_state(builder)
}
/// Adds the given state to the DFA being built, unless an equivalent
/// state already exists in this determinizer's cache.
///
/// On a cache hit, the memory used by 'builder' is recycled back into
/// the determinizer and the previously created state's ID is returned
/// along with 'false' (no new state was added).
///
/// On a cache miss, the state is added to the DFA being built and a fresh
/// ID is allocated (an error is returned if allocation fails). The new ID
/// is returned along with 'true'.
fn maybe_add_state(
    &mut self,
    builder: StateBuilderNFA,
) -> Result<(StateID, bool), BuildError> {
    // Copy the cached ID out so the borrow of the cache ends before we
    // mutate 'self' below.
    let cached = self.cache.get(builder.as_bytes()).copied();
    if let Some(cached_id) = cached {
        // Cache hit: recycle the builder's allocation for reuse.
        self.put_state_builder(builder);
        Ok((cached_id, false))
    } else {
        let sid = self.add_state(builder)?;
        Ok((sid, true))
    }
}
/// Add the given state to the DFA and make it available in the cache.
///
/// The state initially has no transitions. That is, it transitions to the
/// dead state for all possible inputs, and transitions to the quit state
/// for all quit bytes.
///
/// If adding the state would exceed the maximum value for StateID, then an
/// error is returned.
fn add_state(
    &mut self,
    builder: StateBuilderNFA,
) -> Result<StateID, BuildError> {
    let id = self.dfa.add_empty_state()?;
    // Configured quit bytes send the new state to the quit state.
    if !self.config.quit.is_empty() {
        for b in self.config.quit.iter() {
            self.dfa.set_transition(
                id,
                alphabet::Unit::u8(b),
                self.dfa.quit_id(),
            );
        }
    }
    let state = builder.to_state();
    // States use reference counting internally, so we only need to count
    // their memory usage once.
    self.memory_usage_state += state.memory_usage();
    // The state is stored both positionally (indexed by DFA state ID) and
    // in the cache (keyed by state contents, for de-duplication).
    self.builder_states.push(state.clone());
    self.cache.insert(state, id);
    // Return the builder's allocation for reuse even though a new state
    // was created; 'to_state' above made its own allocation.
    self.put_state_builder(builder);
    // Size limits are checked only after all of the bookkeeping above has
    // completed.
    if let Some(limit) = self.config.dfa_size_limit {
        if self.dfa.memory_usage() > limit {
            return Err(BuildError::dfa_exceeded_size_limit(limit));
        }
    }
    if let Some(limit) = self.config.determinize_size_limit {
        if self.memory_usage() > limit {
            return Err(BuildError::determinize_exceeded_size_limit(
                limit,
            ));
        }
    }
    Ok(id)
}
/// Hands out a state builder that may carry previously allocated
/// capacity. Reusing capacity avoids allocations when a freshly built
/// state turns out to already be in the cache.
///
/// Callers must return the builder via 'put_state_builder', otherwise
/// the allocation reuse won't work.
fn get_state_builder(&mut self) -> StateBuilderEmpty {
    // Swap a brand new (empty, capacity-free) builder into the scratch
    // slot and hand the old one to the caller.
    let fresh = StateBuilderEmpty::new();
    core::mem::replace(&mut self.scratch_state_builder, fresh)
}
/// Returns a state builder to this determinizer so its allocation can be
/// reused by a later 'get_state_builder' call.
///
/// Note that building a 'State' from a builder always creates a new
/// alloc, so callers should always put the builder back.
fn put_state_builder(&mut self, builder: StateBuilderNFA) {
    // 'clear' resets the builder's contents while keeping its capacity;
    // the previous scratch builder is simply dropped by the assignment.
    self.scratch_state_builder = builder.clear();
}
/// Return this determinizer's current memory usage, in bytes. Memory held
/// by the NFA or by the dense DFA itself is not included.
fn memory_usage(&self) -> usize {
    use core::mem::size_of;
    let states = self.builder_states.len() * size_of::<State>();
    // The map almost certainly uses more than this, but it's a close
    // estimate.
    let cache =
        self.cache.len() * (size_of::<State>() + size_of::<StateID>());
    let stack = self.stack.capacity() * size_of::<StateID>();
    let scratch = self.scratch_state_builder.capacity();
    states + cache + self.memory_usage_state + stack + scratch
}
}

View file

@ -0,0 +1,463 @@
use core::{cell::RefCell, fmt, mem};
use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec};
use crate::{
dfa::{automaton::Automaton, dense, DEAD},
util::{
alphabet,
primitives::{PatternID, StateID},
},
};
/// An implementation of Hopcroft's algorithm for minimizing DFAs.
///
/// The algorithm implemented here is mostly taken from Wikipedia:
/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
///
/// This code has had some light optimization attention paid to it,
/// particularly in the form of reducing allocation as much as possible.
/// However, it is still generally slow. Future optimization work should
/// probably focus on the bigger picture rather than micro-optimizations. For
/// example:
///
/// 1. Figure out how to more intelligently create initial partitions. That is,
/// Hopcroft's algorithm starts by creating two partitions of DFA states
/// that are known to NOT be equivalent: match states and non-match states.
/// The algorithm proceeds by progressively refining these partitions into
/// smaller partitions. If we could start with more partitions, then we
/// could reduce the amount of work that Hopcroft's algorithm needs to do.
/// 2. For every partition that we visit, we find all incoming transitions to
/// every state in the partition for *every* element in the alphabet. (This
/// is why using byte classes can significantly decrease minimization times,
/// since byte classes shrink the alphabet.) This is quite costly and there
/// is perhaps some redundant work being performed depending on the specific
/// states in the set. For example, we might be able to only visit some
/// elements of the alphabet based on the transitions.
/// 3. Move parts of minimization into determinization. If minimization has
/// fewer states to deal with, then it should run faster. A prime example
/// of this might be large Unicode classes, which are generated in way that
/// can create a lot of redundant states. (Some work has been done on this
/// point during NFA compilation via the algorithm described in the
/// "Incremental Construction of Minimal Acyclic Finite-State Automata"
/// paper.)
pub(crate) struct Minimizer<'a> {
    /// The DFA being minimized. It is rewritten in place.
    dfa: &'a mut dense::OwnedDFA,
    /// For each state (by index) and each alphabet unit (by index), the
    /// list of states that have a transition into that state on that unit.
    in_transitions: Vec<Vec<Vec<StateID>>>,
    /// The current partitioning of DFA states into equivalence classes.
    partitions: Vec<StateSet>,
    /// The work list of sets still pending for partition refinement.
    waiting: Vec<StateSet>,
}
impl<'a> fmt::Debug for Minimizer<'a> {
    /// Debug-format every component of the minimizer, field by field.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut dbg = f.debug_struct("Minimizer");
        dbg.field("dfa", &self.dfa);
        dbg.field("in_transitions", &self.in_transitions);
        dbg.field("partitions", &self.partitions);
        dbg.field("waiting", &self.waiting);
        dbg.finish()
    }
}
/// A set of states. A state set makes up a single partition in Hopcroft's
/// algorithm.
///
/// It is represented by an ordered set of state identifiers. We use shared
/// ownership so that a single state set can be in both the set of partitions
/// and in the set of waiting sets simultaneously without an additional
/// allocation. Generally, once a state set is built, it becomes immutable.
///
/// We use this representation because it avoids the overhead of more
/// traditional set data structures (HashSet/BTreeSet), and also because
/// computing intersection/subtraction on this representation is especially
/// fast.
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
struct StateSet {
    // The IDs in this set. Expected to be kept sorted and deduplicated
    // (see 'canonicalize'). Rc+RefCell gives cheap shared ownership
    // between 'partitions' and 'waiting'.
    ids: Rc<RefCell<Vec<StateID>>>,
}
impl<'a> Minimizer<'a> {
/// Create a minimizer for the given DFA. The initial partitions (and the
/// waiting set, which starts as a copy of them) are computed here.
pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> {
    let incoming = Minimizer::incoming_transitions(dfa);
    let parts = Minimizer::initial_partitions(dfa);
    Minimizer {
        in_transitions: incoming,
        // Every initial partition is also pending refinement. This is a
        // cheap clone: state sets use shared ownership.
        waiting: parts.clone(),
        partitions: parts,
        dfa,
    }
}
/// Run Hopcroft's partition refinement to completion and then rewrite
/// the underlying DFA in place so that it contains exactly one state per
/// equivalence class.
pub fn run(mut self) {
    let stride2 = self.dfa.stride2();
    // Helpers converting between a state's index (its ordinal position)
    // and its StateID (a pre-multiplied offset, hence the shifts).
    let as_state_id = |index: usize| -> StateID {
        StateID::new(index << stride2).unwrap()
    };
    let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 };
    // Scratch sets reused across iterations to avoid re-allocation.
    let mut incoming = StateSet::empty();
    let mut scratch1 = StateSet::empty();
    let mut scratch2 = StateSet::empty();
    let mut newparts = vec![];
    // This loop is basically Hopcroft's algorithm. Everything else is just
    // shuffling data around to fit our representation.
    while let Some(set) = self.waiting.pop() {
        for b in self.dfa.byte_classes().iter() {
            self.find_incoming_to(b, &set, &mut incoming);
            // If incoming is empty, then the intersection with any other
            // set must also be empty. So 'newparts' just ends up being
            // 'self.partitions'. So there's no need to go through the loop
            // below.
            //
            // This actually turns out to be rather large optimization. On
            // the order of making minimization 4-5x faster. It's likely
            // that the vast majority of all states have very few incoming
            // transitions.
            if incoming.is_empty() {
                continue;
            }
            for p in 0..self.partitions.len() {
                self.partitions[p].intersection(&incoming, &mut scratch1);
                if scratch1.is_empty() {
                    // No overlap with 'incoming': partition unchanged.
                    newparts.push(self.partitions[p].clone());
                    continue;
                }
                self.partitions[p].subtract(&incoming, &mut scratch2);
                if scratch2.is_empty() {
                    // Entirely inside 'incoming': also unchanged.
                    newparts.push(self.partitions[p].clone());
                    continue;
                }
                // Partition 'p' is split into 'x' (the intersection with
                // 'incoming') and 'y' (the remainder).
                let (x, y) =
                    (scratch1.deep_clone(), scratch2.deep_clone());
                newparts.push(x.clone());
                newparts.push(y.clone());
                match self.find_waiting(&self.partitions[p]) {
                    Some(i) => {
                        self.waiting[i] = x;
                        self.waiting.push(y);
                    }
                    None => {
                        // Pushing only the smaller half is sufficient
                        // for Hopcroft's algorithm.
                        if x.len() <= y.len() {
                            self.waiting.push(x);
                        } else {
                            self.waiting.push(y);
                        }
                    }
                }
            }
            // Swap the refined partitioning in and reuse the old vector
            // as next round's scratch.
            newparts = mem::replace(&mut self.partitions, newparts);
            newparts.clear();
        }
    }

    // At this point, we now have a minimal partitioning of states, where
    // each partition is an equivalence class of DFA states. Now we need to
    // use this partitioning to update the DFA to only contain one state for
    // each partition.

    // Create a map from DFA state ID to the representative ID of the
    // equivalence class to which it belongs. The representative ID of an
    // equivalence class of states is the minimum ID in that class.
    let mut state_to_part = vec![DEAD; self.dfa.state_len()];
    for p in &self.partitions {
        p.iter(|id| state_to_part[as_index(id)] = p.min());
    }

    // Generate a new contiguous sequence of IDs for minimal states, and
    // create a map from equivalence IDs to the new IDs. Thus, the new
    // minimal ID of *any* state in the unminimized DFA can be obtained
    // with minimals_ids[state_to_part[old_id]].
    let mut minimal_ids = vec![DEAD; self.dfa.state_len()];
    let mut new_index = 0;
    for state in self.dfa.states() {
        if state_to_part[as_index(state.id())] == state.id() {
            minimal_ids[as_index(state.id())] = as_state_id(new_index);
            new_index += 1;
        }
    }
    // The total number of states in the minimal DFA.
    let minimal_count = new_index;
    // Convenience function for remapping state IDs. This takes an old ID,
    // looks up its Hopcroft partition and then maps that to the new ID
    // range.
    let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])];

    // Re-map this DFA in place such that the only states remaining
    // correspond to the representative states of every equivalence class.
    for id in (0..self.dfa.state_len()).map(as_state_id) {
        // If this state isn't a representative for an equivalence class,
        // then we skip it since it won't appear in the minimal DFA.
        if state_to_part[as_index(id)] != id {
            continue;
        }
        self.dfa.remap_state(id, remap);
        self.dfa.swap_states(id, minimal_ids[as_index(id)]);
    }
    // Trim off all unused states from the pre-minimized DFA. This
    // represents all states that were merged into a non-singleton
    // equivalence class of states, and appeared after the first state
    // in each such class. (Because the state with the smallest ID in each
    // equivalence class is its representative ID.)
    self.dfa.truncate_states(minimal_count);

    // Update the new start states, which is now just the minimal ID of
    // whatever state the old start state was collapsed into. Also, we
    // collect everything before-hand to work around the borrow checker.
    // We're already allocating so much that this is probably fine. If this
    // turns out to be costly, then I guess add a `starts_mut` iterator.
    let starts: Vec<_> = self.dfa.starts().collect();
    for (old_start_id, anchored, start_type) in starts {
        self.dfa.set_start_state(
            anchored,
            start_type,
            remap(old_start_id),
        );
    }

    // Update the match state pattern ID list for multi-regexes. All we
    // need to do is remap the match state IDs. The pattern ID lists are
    // always the same as they were since match states with distinct
    // pattern ID lists are always considered distinct states.
    let mut pmap = BTreeMap::new();
    for (match_id, pattern_ids) in self.dfa.pattern_map() {
        let new_id = remap(match_id);
        pmap.insert(new_id, pattern_ids);
    }
    // This unwrap is OK because minimization never increases the number of
    // match states or patterns in those match states. Since minimization
    // runs after the pattern map has already been set at least once, we
    // know that our match states cannot error.
    self.dfa.set_pattern_map(&pmap).unwrap();

    // In order to update the ID of the maximum match state, we need to
    // find the maximum ID among all of the match states in the minimized
    // DFA. This is not necessarily the new ID of the unminimized maximum
    // match state, since that could have been collapsed with a much
    // earlier match state. Therefore, to find the new max match state,
    // we iterate over all previous match states, find their corresponding
    // new minimal ID, and take the maximum of those.
    let old = self.dfa.special().clone();
    let new = self.dfa.special_mut();
    // ... but only remap if we had match states.
    if old.matches() {
        new.min_match = StateID::MAX;
        new.max_match = StateID::ZERO;
        for i in as_index(old.min_match)..=as_index(old.max_match) {
            let new_id = remap(as_state_id(i));
            if new_id < new.min_match {
                new.min_match = new_id;
            }
            if new_id > new.max_match {
                new.max_match = new_id;
            }
        }
    }
    // ... same, but for start states.
    if old.starts() {
        new.min_start = StateID::MAX;
        new.max_start = StateID::ZERO;
        for i in as_index(old.min_start)..=as_index(old.max_start) {
            let new_id = remap(as_state_id(i));
            if new_id == DEAD {
                continue;
            }
            if new_id < new.min_start {
                new.min_start = new_id;
            }
            if new_id > new.max_start {
                new.max_start = new_id;
            }
        }
        if new.max_start == DEAD {
            new.min_start = DEAD;
        }
    }
    new.quit_id = remap(new.quit_id);
    new.set_max();
}
/// Return the index of 'set' within the waiting list, if present.
fn find_waiting(&self, set: &StateSet) -> Option<usize> {
    for (i, candidate) in self.waiting.iter().enumerate() {
        if candidate == set {
            return Some(i);
        }
    }
    None
}
/// Collect into 'incoming' every state with a transition on unit 'b'
/// into some state of 'set'. Previous contents of 'incoming' are
/// discarded, and the result is canonicalized (sorted + deduplicated).
fn find_incoming_to(
    &self,
    b: alphabet::Unit,
    set: &StateSet,
    incoming: &mut StateSet,
) {
    incoming.clear();
    set.iter(|id| {
        let sources =
            &self.in_transitions[self.dfa.to_index(id)][b.as_usize()];
        for &src in sources.iter() {
            incoming.add(src);
        }
    });
    // Sort and dedup so that set operations work on the result.
    incoming.canonicalize();
}
/// Compute the initial partitioning for Hopcroft's algorithm: one set
/// per distinct pattern ID list among match states (two match states
/// with different pattern ID lists can never be equivalent), one set of
/// quit states and one set of all remaining (non-match, non-quit) states.
fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> {
    // For match states, we know that two match states with different
    // pattern ID lists will *always* be distinct, so we can partition them
    // initially based on that.
    let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new();
    let mut is_quit = StateSet::empty();
    let mut no_match = StateSet::empty();
    for state in dfa.states() {
        if dfa.is_match_state(state.id()) {
            let pids: Vec<PatternID> = (0..dfa.match_len(state.id()))
                .map(|i| dfa.match_pattern(state.id(), i))
                .collect();
            // 'or_insert_with' only allocates a fresh StateSet when the
            // pattern ID list hasn't been seen yet. ('or_insert' would
            // eagerly build its argument for every match state.)
            matching
                .entry(pids)
                .or_insert_with(StateSet::empty)
                .add(state.id());
        } else if dfa.is_quit_state(state.id()) {
            is_quit.add(state.id());
        } else {
            no_match.add(state.id());
        }
    }
    let mut sets: Vec<StateSet> =
        matching.into_iter().map(|(_, set)| set).collect();
    sets.push(no_match);
    sets.push(is_quit);
    sets
}
/// Build the reverse transition table: for each state index and each
/// alphabet unit, the list of states transitioning into that state on
/// that unit.
fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> {
    // One row per state, one bucket per alphabet unit.
    let mut incoming: Vec<Vec<Vec<StateID>>> = dfa
        .states()
        .map(|_| vec![vec![]; dfa.alphabet_len()])
        .collect();
    for state in dfa.states() {
        for (unit, to) in state.transitions() {
            incoming[dfa.to_index(to)][unit.as_usize()].push(state.id());
        }
    }
    incoming
}
}
impl StateSet {
/// Create a new set containing no states.
fn empty() -> StateSet {
    let ids = Rc::new(RefCell::new(Vec::new()));
    StateSet { ids }
}
/// Append 'id' to this set. Callers are responsible for calling
/// 'canonicalize' before relying on set operations.
fn add(&mut self, id: StateID) {
    let mut ids = self.ids.borrow_mut();
    ids.push(id);
}
/// Return the first ID in this set. For a canonicalized (sorted) set,
/// this is the minimum. Panics if the set is empty.
fn min(&self) -> StateID {
    let ids = self.ids.borrow();
    ids[0]
}
/// Sort and deduplicate this set, establishing the invariant that the
/// merge-based set operations ('intersection'/'subtract') and 'min'
/// rely on.
fn canonicalize(&mut self) {
    // Borrow once instead of twice, and use an unstable sort: state IDs
    // are plain values, so sort stability buys nothing.
    let mut ids = self.ids.borrow_mut();
    ids.sort_unstable();
    ids.dedup();
}
/// Remove all IDs from this set, keeping its allocation for reuse.
fn clear(&mut self) {
    let mut ids = self.ids.borrow_mut();
    ids.clear();
}
/// The number of IDs currently stored in this set.
fn len(&self) -> usize {
    let ids = self.ids.borrow();
    ids.len()
}
/// True when this set contains no IDs.
fn is_empty(&self) -> bool {
    self.ids.borrow().is_empty()
}
fn deep_clone(&self) -> StateSet {
let ids = self.ids.borrow().iter().cloned().collect();
StateSet { ids: Rc::new(RefCell::new(ids)) }
}
/// Invoke 'f' on each ID in this set, in stored order. (An internal
/// iteration API avoids exposing the RefCell borrow to callers.)
fn iter<F: FnMut(StateID)>(&self, mut f: F) {
    self.ids.borrow().iter().for_each(|&id| f(id));
}
/// Compute the intersection of 'self' and 'other' (both assumed to be
/// canonicalized, i.e., sorted and deduplicated) and write the result
/// into 'dest'. Any previous contents of 'dest' are cleared.
fn intersection(&self, other: &StateSet, dest: &mut StateSet) {
    dest.clear();
    if self.is_empty() || other.is_empty() {
        return;
    }
    // Standard linear merge over two sorted sequences: advance the side
    // with the smaller element, emit on equality.
    let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
    let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
    // Neither set is empty, so both iterators yield a first element.
    let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
    loop {
        if a == b {
            dest.add(a);
            a = match ita.next() {
                None => break,
                Some(a) => a,
            };
            b = match itb.next() {
                None => break,
                Some(b) => b,
            };
        } else if a < b {
            a = match ita.next() {
                None => break,
                Some(a) => a,
            };
        } else {
            b = match itb.next() {
                None => break,
                Some(b) => b,
            };
        }
    }
}
/// Compute the set difference 'self - other' (both assumed to be
/// canonicalized, i.e., sorted and deduplicated) and write the result
/// into 'dest'. Any previous contents of 'dest' are cleared.
fn subtract(&self, other: &StateSet, dest: &mut StateSet) {
    dest.clear();
    // With nothing to subtract (or nothing to subtract from), the result
    // is just a copy of 'self'.
    if self.is_empty() || other.is_empty() {
        self.iter(|s| dest.add(s));
        return;
    }
    // Linear merge over two sorted sequences: emit every element of
    // 'self' that never matches an element of 'other'.
    let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
    let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
    let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
    loop {
        if a == b {
            // 'a' is excluded from the result.
            a = match ita.next() {
                None => break,
                Some(a) => a,
            };
            b = match itb.next() {
                None => {
                    // 'other' is exhausted: the new 'a' survives, and the
                    // trailing loop below drains the rest of 'self'.
                    dest.add(a);
                    break;
                }
                Some(b) => b,
            };
        } else if a < b {
            // 'a' cannot appear in 'other', so it survives.
            dest.add(a);
            a = match ita.next() {
                None => break,
                Some(a) => a,
            };
        } else {
            b = match itb.next() {
                None => {
                    // 'other' is exhausted: 'a' survives.
                    dest.add(a);
                    break;
                }
                Some(b) => b,
            };
        }
    }
    // Drain whatever remains of 'self' after the merge loop ends.
    for a in ita {
        dest.add(a);
    }
}
}

View file

@ -0,0 +1,360 @@
/*!
A module for building and searching with deterministic finite automata (DFAs).
Like other modules in this crate, DFAs support a rich regex syntax with Unicode
features. DFAs also have extensive options for configuring the best space vs
time trade-off for your use case and provide support for cheap deserialization
of automata for use in `no_std` environments.
If you're looking for lazy DFAs that build themselves incrementally during
search, then please see the top-level [`hybrid` module](crate::hybrid).
# Overview
This section gives a brief overview of the primary types in this module:
* A [`regex::Regex`] provides a way to search for matches of a regular
expression using DFAs. This includes iterating over matches with both the start
and end positions of each match.
* A [`dense::DFA`] provides low level access to a DFA that uses a dense
representation (uses lots of space, but fast searching).
* A [`sparse::DFA`] provides the same API as a `dense::DFA`, but uses a sparse
representation (uses less space, but slower searching).
* An [`Automaton`] trait that defines an interface that both dense and sparse
DFAs implement. (A `regex::Regex` is generic over this trait.)
* Both dense DFAs and sparse DFAs support serialization to raw bytes (e.g.,
[`dense::DFA::to_bytes_little_endian`]) and cheap deserialization (e.g.,
[`dense::DFA::from_bytes`]).
There is also a [`onepass`] module that provides a [one-pass
DFA](onepass::DFA). The unique advantage of this DFA is that, for the class
of regexes it can be built with, it supports reporting the spans of matching
capturing groups. It is the only DFA in this crate capable of such a thing.
# Example: basic regex searching
This example shows how to compile a regex using the default configuration
and then use it to find matches in a byte string:
```
use regex_automata::{Match, dfa::regex::Regex};
let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
let text = b"2018-12-24 2016-10-08";
let matches: Vec<Match> = re.find_iter(text).collect();
assert_eq!(matches, vec![
Match::must(0, 0..10),
Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
# Example: searching with regex sets
The DFAs in this module all fully support searching with multiple regexes
simultaneously. You can use this support with standard leftmost-first style
searching to find non-overlapping matches:
```
# if cfg!(miri) { return Ok(()); } // miri takes too long
use regex_automata::{Match, dfa::regex::Regex};
let re = Regex::new_many(&[r"\w+", r"\S+"])?;
let text = b"@foo bar";
let matches: Vec<Match> = re.find_iter(text).collect();
assert_eq!(matches, vec![
Match::must(1, 0..4),
Match::must(0, 5..8),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
# Example: use sparse DFAs
By default, compiling a regex will use dense DFAs internally. This uses more
memory, but executes searches more quickly. If you can abide slower searches
(somewhere around 3-5x), then sparse DFAs might make more sense since they can
use significantly less space.
Using sparse DFAs is as easy as using `Regex::new_sparse` instead of
`Regex::new`:
```
use regex_automata::{Match, dfa::regex::Regex};
let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
let text = b"2018-12-24 2016-10-08";
let matches: Vec<Match> = re.find_iter(text).collect();
assert_eq!(matches, vec![
Match::must(0, 0..10),
Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
If you already have dense DFAs for some reason, they can be converted to sparse
DFAs and used to build a new `Regex`. For example:
```
use regex_automata::{Match, dfa::regex::Regex};
let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
let sparse_re = Regex::builder().build_from_dfas(
dense_re.forward().to_sparse()?,
dense_re.reverse().to_sparse()?,
);
let text = b"2018-12-24 2016-10-08";
let matches: Vec<Match> = sparse_re.find_iter(text).collect();
assert_eq!(matches, vec![
Match::must(0, 0..10),
Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
# Example: deserialize a DFA
This shows how to first serialize a DFA into raw bytes, and then deserialize
those raw bytes back into a DFA. While this particular example is a
bit contrived, this same technique can be used in your program to
deserialize a DFA at start up time or by memory mapping a file.
```
use regex_automata::{Match, dfa::{dense, regex::Regex}};
let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
// serialize both the forward and reverse DFAs, see note below
let (fwd_bytes, fwd_pad) = re1.forward().to_bytes_native_endian();
let (rev_bytes, rev_pad) = re1.reverse().to_bytes_native_endian();
// now deserialize both---we need to specify the correct type!
let fwd: dense::DFA<&[u32]> = dense::DFA::from_bytes(&fwd_bytes[fwd_pad..])?.0;
let rev: dense::DFA<&[u32]> = dense::DFA::from_bytes(&rev_bytes[rev_pad..])?.0;
// finally, reconstruct our regex
let re2 = Regex::builder().build_from_dfas(fwd, rev);
// we can use it like normal
let text = b"2018-12-24 2016-10-08";
let matches: Vec<Match> = re2.find_iter(text).collect();
assert_eq!(matches, vec![
Match::must(0, 0..10),
Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
There are a few points worth noting here:
* We need to extract the raw DFAs used by the regex and serialize those. You
can build the DFAs manually yourself using [`dense::Builder`], but using
the DFAs from a `Regex` guarantees that the DFAs are built correctly. (In
particular, a `Regex` constructs a reverse DFA for finding the starting
location of matches.)
* To convert the DFA to raw bytes, we use the `to_bytes_native_endian` method.
In practice, you'll want to use either [`dense::DFA::to_bytes_little_endian`]
or [`dense::DFA::to_bytes_big_endian`], depending on which platform you're
deserializing your DFA from. If you intend to deserialize on either platform,
then you'll need to serialize both and deserialize the right one depending on
your target's endianness.
* Safely deserializing a DFA requires verifying the raw bytes, particularly if
they are untrusted, since an invalid DFA could cause logical errors, panics
or even undefined behavior. This verification step requires visiting all of
the transitions in the DFA, which can be costly. If cheaper verification is
desired, then [`dense::DFA::from_bytes_unchecked`] is available that only does
verification that can be performed in constant time. However, one can only use
this routine if the caller can guarantee that the bytes provided encoded a
valid DFA.
The same process can be achieved with sparse DFAs as well:
```
use regex_automata::{Match, dfa::{sparse, regex::Regex}};
let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
// serialize both
let fwd_bytes = re1.forward().to_sparse()?.to_bytes_native_endian();
let rev_bytes = re1.reverse().to_sparse()?.to_bytes_native_endian();
// now deserialize both---we need to specify the correct type!
let fwd: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&fwd_bytes)?.0;
let rev: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&rev_bytes)?.0;
// finally, reconstruct our regex
let re2 = Regex::builder().build_from_dfas(fwd, rev);
// we can use it like normal
let text = b"2018-12-24 2016-10-08";
let matches: Vec<Match> = re2.find_iter(text).collect();
assert_eq!(matches, vec![
Match::must(0, 0..10),
Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```
Note that unlike dense DFAs, sparse DFAs have no alignment requirements.
Conversely, dense DFAs must be aligned to the same alignment as a
[`StateID`](crate::util::primitives::StateID).
# Support for `no_std` and `alloc`-only
This crate comes with `alloc` and `std` features that are enabled by default.
When the `alloc` or `std` features are enabled, the API of this module will
include the facilities necessary for compiling, serializing, deserializing
and searching with DFAs. When only the `alloc` feature is enabled, then
implementations of the `std::error::Error` trait are dropped, but everything
else generally remains the same. When both the `alloc` and `std` features are
disabled, the API of this module will shrink such that it only includes the
facilities necessary for deserializing and searching with DFAs.
The intended workflow for `no_std` environments is thus as follows:
* Write a program with the `alloc` or `std` features that compiles and
serializes a regular expression. You may need to serialize both little and big
endian versions of each DFA. (So that's 4 DFAs in total for each regex.)
* In your `no_std` environment, follow the examples above for deserializing
your previously serialized DFAs into regexes. You can then search with them as
you would any regex.
Deserialization can happen anywhere. For example, with bytes embedded into a
binary or with a file memory mapped at runtime.
The `regex-cli` command (found in the same repository as this crate) can be
used to serialize DFAs to files and generate Rust code to read them.
# Syntax
This module supports the same syntax as the `regex` crate, since they share the
same parser. You can find an exhaustive list of supported syntax in the
[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax).
There are two things that are not supported by the DFAs in this module:
* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top
of them) can only find the offsets of an entire match, but cannot resolve
the offsets of each capturing group. This is because DFAs do not have the
expressive power necessary.
* Unicode word boundaries. These present particularly difficult challenges for
DFA construction and would result in an explosion in the number of states.
One can enable [`dense::Config::unicode_word_boundary`] though, which provides
heuristic support for Unicode word boundaries that only works on ASCII text.
Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work
on any input.
There are no plans to lift either of these limitations.
Note that these restrictions are identical to the restrictions on lazy DFAs.
# Differences with general purpose regexes
The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
general purpose regular expression engine. It aims to automatically balance low
compile times, fast search times and low memory usage, while also providing
a convenient API for users. In contrast, this module provides a lower level
regular expression interface based exclusively on DFAs that is a bit less
convenient while providing more explicit control over memory usage and search
times.
Here are some specific negative differences:
* **Compilation can take an exponential amount of time and space** in the size
of the regex pattern. While most patterns do not exhibit worst case exponential
time, such patterns do exist. For example, `[01]*1[01]{N}` will build a DFA
with approximately `2^(N+2)` states. For this reason, untrusted patterns should
not be compiled with this module. (In the future, the API may expose an option
to return an error if the DFA gets too big.)
* This module does not support sub-match extraction via capturing groups, which
can be achieved with the regex crate's "captures" API.
* While the regex crate doesn't necessarily sport fast compilation times,
the regexes in this module are almost universally slow to compile, especially
when they contain large Unicode character classes. For example, on my system,
compiling `\w{50}` takes about 1 second and almost 15MB of memory! (Compiling
a sparse regex takes about the same time but only uses about 1.2MB of
memory.) Conversely, compiling the same regex without Unicode support, e.g.,
`(?-u)\w{50}`, takes under 1 millisecond and about 15KB of memory. For this
reason, you should only use Unicode character classes if you absolutely need
them! (They are enabled by default though.)
* This module does not support Unicode word boundaries. ASCII word boundaries
may be used though by disabling Unicode or selectively doing so in the syntax,
e.g., `(?-u:\b)`. There is also an option to
[heuristically enable Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary),
where the corresponding DFA will give up if any non-ASCII byte is seen.
* As a lower level API, this module does not do literal optimizations
automatically. Although it does provide hooks in its API to make use of the
[`Prefilter`](crate::util::prefilter::Prefilter) trait. Missing literal
optimizations means that searches may run much slower than what you're
accustomed to, although, it does provide more predictable and consistent
performance.
* There is no `&str` API like in the regex crate. In this module, all APIs
operate on `&[u8]`. By default, match indices are
guaranteed to fall on UTF-8 boundaries, unless either of
[`syntax::Config::utf8`](crate::util::syntax::Config::utf8) or
[`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) are disabled.
With some of the downsides out of the way, here are some positive differences:
* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
deserialized. Deserialization can be done in constant time with the unchecked
APIs, since searching can be performed directly on the raw serialized bytes of
a DFA.
* This module was specifically designed so that the searching phase of a
DFA has minimal runtime requirements, and can therefore be used in `no_std`
environments. While `no_std` environments cannot compile regexes, they can
deserialize pre-compiled regexes.
* Since this module builds DFAs ahead of time, it will generally out-perform
the `regex` crate on equivalent tasks. The performance difference is likely
not large. However, because of a complex set of optimizations in the regex
crate (like literal optimizations), an accurate performance comparison may be
difficult to do.
* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
performance a small amount, but uses much less storage space. Potentially even
less than what the regex crate uses.
* This module exposes DFAs directly, such as [`dense::DFA`] and
[`sparse::DFA`], which enables one to do less work in some cases. For example,
if you only need the end of a match and not the start of a match, then you can
use a DFA directly without building a `Regex`, which always requires a second
DFA to find the start of a match.
* This module provides more control over memory usage. Aside from choosing
between dense and sparse DFAs, one can also choose a smaller state identifier
representation to use less space. Also, one can enable DFA minimization
via [`dense::Config::minimize`], but it can increase compilation times
dramatically.
*/
#[cfg(feature = "dfa-search")]
pub use crate::dfa::{
automaton::{Automaton, OverlappingState, StartError},
start::StartKind,
};
/// The identifier of the special "dead" state.
///
/// This is an alias for a state ID of zero. It has special significance
/// because it always corresponds to the first state in a DFA, and the first
/// state in a DFA is always "dead." That is, the dead state always has all
/// of its transitions set to itself. Moreover, the dead state is used as a
/// sentinel for various things. e.g., In search, reaching a dead state means
/// that the search must stop.
const DEAD: crate::util::primitives::StateID =
    crate::util::primitives::StateID::ZERO;
// The public sub-modules of this module, each gated on the feature that
// enables it.
#[cfg(feature = "dfa-search")]
pub mod dense;
#[cfg(feature = "dfa-onepass")]
pub mod onepass;
#[cfg(feature = "dfa-search")]
pub mod regex;
#[cfg(feature = "dfa-search")]
pub mod sparse;
// Crate-internal: DFA state "acceleration" support (see the comments at the
// top of this file).
#[cfg(feature = "dfa-search")]
pub(crate) mod accel;
// Private implementation modules.
#[cfg(feature = "dfa-search")]
mod automaton;
#[cfg(feature = "dfa-build")]
mod determinize;
#[cfg(feature = "dfa-build")]
mod minimize;
#[cfg(any(feature = "dfa-build", feature = "dfa-onepass"))]
mod remapper;
#[cfg(feature = "dfa-search")]
mod search;
#[cfg(feature = "dfa-search")]
mod special;
#[cfg(feature = "dfa-search")]
mod start;

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,871 @@
/*!
A DFA-backed `Regex`.
This module provides [`Regex`], which is defined generically over the
[`Automaton`] trait. A `Regex` implements convenience routines you might have
come to expect, such as finding the start/end of a match and iterating over
all non-overlapping matches. This `Regex` type is limited in its capabilities
to what a DFA can provide. Therefore, APIs involving capturing groups, for
example, are not provided.
Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
finds the end offset of a match, whereas the other is a "reverse" DFA that
finds the start offset of a match.
See the [parent module](crate::dfa) for examples.
*/
#[cfg(feature = "alloc")]
use alloc::vec::Vec;
#[cfg(feature = "dfa-build")]
use crate::dfa::dense::BuildError;
use crate::{
dfa::{automaton::Automaton, dense},
util::{iter, search::Input},
Anchored, Match, MatchError,
};
#[cfg(feature = "alloc")]
use crate::{
dfa::{sparse, StartKind},
util::search::MatchKind,
};
// When the alloc feature is enabled, the regex type sets its A type parameter
// to default to an owned dense DFA. But without alloc, we set no default. This
// makes things a lot more convenient in the common case, since writing out the
// DFA types is pretty annoying.
//
// Since we have two different definitions but only want to write one doc
// string, we use a macro to capture the doc and other attributes once and then
// repeat them for each definition.
macro_rules! define_regex_type {
    ($(#[$attr:meta])*) => {
        // Without `alloc`, there is no owned dense DFA to use as a default,
        // so the `A` type parameter gets no default at all.
        #[cfg(not(feature = "alloc"))]
        $(#[$attr])*
        pub struct Regex<A> {
            forward: A,
            reverse: A,
        }
        // With `alloc`, default `A` to an owned dense DFA for convenience.
        #[cfg(feature = "alloc")]
        $(#[$attr])*
        pub struct Regex<A = dense::OwnedDFA> {
            forward: A,
            reverse: A,
        }
    };
}
define_regex_type!(
    /// A regular expression that uses deterministic finite automata for fast
    /// searching.
    ///
    /// A regular expression is composed of two DFAs, a "forward" DFA and a
    /// "reverse" DFA. The forward DFA is responsible for detecting the end of
    /// a match while the reverse DFA is responsible for detecting the start
    /// of a match. Thus, in order to find the bounds of any given match, a
    /// forward search must first be run followed by a reverse search. A match
    /// found by the forward DFA guarantees that the reverse DFA will also find
    /// a match.
    ///
    /// The type of the DFA used by a `Regex` corresponds to the `A` type
    /// parameter, which must satisfy the [`Automaton`] trait. Typically,
    /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a
    /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more
    /// memory but search faster, while sparse DFAs use less memory but search
    /// more slowly.
    ///
    /// # Crate features
    ///
    /// Note that despite what the documentation auto-generates, the _only_
    /// crate feature needed to use this type is `dfa-search`. You do _not_
    /// need to enable the `alloc` feature.
    ///
    /// By default, a regex's automaton type parameter is set to
    /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
    /// in-memory work loads, this is the most convenient type that gives the
    /// best search performance. When the `alloc` feature is disabled, no
    /// default type is used.
    ///
    /// # When should I use this?
    ///
    /// Generally speaking, if you can afford the overhead of building a full
    /// DFA for your regex, and you don't need things like capturing groups,
    /// then this is a good choice if you're looking to optimize for matching
    /// speed. Note however that its speed may be worse than a general purpose
    /// regex engine if you don't provide a [`dense::Config::prefilter`] to the
    /// underlying DFA.
    ///
    /// # Sparse DFAs
    ///
    /// Since a `Regex` is generic over the [`Automaton`] trait, it can be
    /// used with any kind of DFA. While this crate constructs dense DFAs by
    /// default, it is easy enough to build corresponding sparse DFAs, and then
    /// build a regex from them:
    ///
    /// ```
    /// use regex_automata::dfa::regex::Regex;
    ///
    /// // First, build a regex that uses dense DFAs.
    /// let dense_re = Regex::new("foo[0-9]+")?;
    ///
    /// // Second, build sparse DFAs from the forward and reverse dense DFAs.
    /// let fwd = dense_re.forward().to_sparse()?;
    /// let rev = dense_re.reverse().to_sparse()?;
    ///
    /// // Third, build a new regex from the constituent sparse DFAs.
    /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
    ///
    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
    /// assert_eq!(true, sparse_re.is_match(b"foo123"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// Alternatively, one can use a [`Builder`] to construct a sparse DFA
    /// more succinctly. (Note though that dense DFAs are still constructed
    /// first internally, and then converted to sparse DFAs, as in the example
    /// above.)
    ///
    /// ```
    /// use regex_automata::dfa::regex::Regex;
    ///
    /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
    /// assert!(sparse_re.is_match(b"foo123"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Fallibility
    ///
    /// Most of the search routines defined on this type will _panic_ when the
    /// underlying search fails. This might be because the DFA gave up because
    /// it saw a quit byte, whether configured explicitly or via heuristic
    /// Unicode word boundary support, although neither are enabled by default.
    /// Or it might fail because an invalid `Input` configuration is given,
    /// for example, with an unsupported [`Anchored`] mode.
    ///
    /// If you need to handle these error cases instead of allowing them to
    /// trigger a panic, then the lower level [`Regex::try_search`] provides
    /// a fallible API that never panics.
    ///
    /// # Example
    ///
    /// This example shows how to cause a search to terminate if it sees a
    /// `\n` byte, and handle the error returned. This could be useful if, for
    /// example, you wanted to prevent a user supplied pattern from matching
    /// across a line boundary.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError};
    ///
    /// let re = Regex::builder()
    ///     .dense(dfa::dense::Config::new().quit(b'\n', true))
    ///     .build(r"foo\p{any}+bar")?;
    ///
    /// let input = Input::new("foo\nbar");
    /// // Normally this would produce a match, since \p{any} contains '\n'.
    /// // But since we instructed the automaton to enter a quit state if a
    /// // '\n' is observed, this produces a match error instead.
    /// let expected = MatchError::quit(b'\n', 3);
    /// let got = re.try_search(&input).unwrap_err();
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[derive(Clone, Debug)]
);
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
impl Regex {
    /// Compile the given pattern into a regex using the default
    /// configuration.
    ///
    /// To pick a non-default configuration, build the regex through a
    /// [`Builder`] instead.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Match, dfa::regex::Regex};
    ///
    /// let re = Regex::new("foo[0-9]+bar")?;
    /// assert_eq!(
    ///     Some(Match::must(0, 3..14)),
    ///     re.find(b"zzzfoo12345barzzz"),
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn new(pattern: &str) -> Result<Regex, BuildError> {
        let builder = Builder::new();
        builder.build(pattern)
    }

    /// Like `new`, but compiles several patterns at once into a single
    /// "regex set", again using the default configuration.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Match, dfa::regex::Regex};
    ///
    /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
    ///
    /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
    /// assert_eq!(None, it.next());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn new_many<P: AsRef<str>>(
        patterns: &[P],
    ) -> Result<Regex, BuildError> {
        let builder = Builder::new();
        builder.build_many(patterns)
    }
}
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
impl Regex<sparse::DFA<Vec<u8>>> {
    /// Compile the given pattern into a regex backed by sparse DFAs, using
    /// the default configuration otherwise.
    ///
    /// To pick a non-default configuration, build the regex through a
    /// [`Builder`] instead.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Match, dfa::regex::Regex};
    ///
    /// let re = Regex::new_sparse("foo[0-9]+bar")?;
    /// assert_eq!(
    ///     Some(Match::must(0, 3..14)),
    ///     re.find(b"zzzfoo12345barzzz"),
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn new_sparse(
        pattern: &str,
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
        let builder = Builder::new();
        builder.build_sparse(pattern)
    }

    /// Like `new`, but compiles several patterns at once into a single
    /// "regex set" backed by sparse DFAs. The default configuration is used
    /// otherwise.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Match, dfa::regex::Regex};
    ///
    /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
    ///
    /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
    /// assert_eq!(None, it.next());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn new_many_sparse<P: AsRef<str>>(
        patterns: &[P],
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
        let builder = Builder::new();
        builder.build_many_sparse(patterns)
    }
}
/// Convenience routines for regex construction.
impl Regex<dense::DFA<&'static [u32]>> {
    /// Returns a [`Builder`] for configuring the construction of a `Regex`.
    ///
    /// This is purely a convenience so that callers don't need to import the
    /// [`Builder`] type themselves in the common case.
    ///
    /// # Example
    ///
    /// This example shows how to use the builder to disable UTF-8 mode
    /// everywhere.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     dfa::regex::Regex, nfa::thompson, util::syntax, Match,
    /// };
    ///
    /// let re = Regex::builder()
    ///     .syntax(syntax::Config::new().utf8(false))
    ///     .thompson(thompson::Config::new().utf8(false))
    ///     .build(r"foo(?-u:[^b])ar.*")?;
    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
    /// let expected = Some(Match::must(0, 1..9));
    /// let got = re.find(haystack);
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn builder() -> Builder {
        Builder::new()
    }
}
/// Standard search routines for finding and iterating over matches.
impl<A: Automaton> Regex<A> {
    /// Returns true if and only if this regex matches the given haystack.
    ///
    /// This routine may stop scanning early once the outcome is decided. In
    /// particular, as soon as the underlying DFA reaches a match state or a
    /// dead state, this returns `true` or `false`, respectively, without
    /// looking at any more of the haystack.
    ///
    /// # Panics
    ///
    /// This routine panics if the search could not complete. That can happen
    /// in the following circumstances:
    ///
    /// * The configuration of the DFA may permit it to "quit" the search.
    /// For example, setting quit bytes or enabling heuristic support for
    /// Unicode word boundaries. The default configuration does not enable any
    /// option that could result in the DFA quitting.
    /// * When the provided `Input` configuration is not supported. For
    /// example, by providing an unsupported anchor mode.
    ///
    /// When a search panics, callers cannot know whether a match exists or
    /// not.
    ///
    /// Use [`Regex::try_search`] if you want to handle these error conditions.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::dfa::regex::Regex;
    ///
    /// let re = Regex::new("foo[0-9]+bar")?;
    /// assert_eq!(true, re.is_match("foo12345bar"));
    /// assert_eq!(false, re.is_match("foobar"));
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
        // An "earliest" forward search suffices here: only the existence of
        // a match matters, so the reverse scan can be skipped entirely.
        let search = input.into().earliest(true);
        self.forward().try_search_fwd(&search).unwrap().is_some()
    }

    /// Returns the start and end offset of the leftmost match. If no match
    /// exists, then `None` is returned.
    ///
    /// # Panics
    ///
    /// This routine panics if the search could not complete. That can happen
    /// in the following circumstances:
    ///
    /// * The configuration of the DFA may permit it to "quit" the search.
    /// For example, setting quit bytes or enabling heuristic support for
    /// Unicode word boundaries. The default configuration does not enable any
    /// option that could result in the DFA quitting.
    /// * When the provided `Input` configuration is not supported. For
    /// example, by providing an unsupported anchor mode.
    ///
    /// When a search panics, callers cannot know whether a match exists or
    /// not.
    ///
    /// Use [`Regex::try_search`] if you want to handle these error conditions.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Match, dfa::regex::Regex};
    ///
    /// // Greediness is applied appropriately.
    /// let re = Regex::new("foo[0-9]+")?;
    /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz"));
    ///
    /// // Even though a match is found after reading the first byte (`a`),
    /// // the default leftmost-first match semantics demand that we find the
    /// // earliest match that prefers earlier parts of the pattern over latter
    /// // parts.
    /// let re = Regex::new("abc|a")?;
    /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc"));
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
        let search = input.into();
        self.try_search(&search).unwrap()
    }

    /// Returns an iterator over all non-overlapping leftmost matches in the
    /// given bytes. If no match exists, then the iterator yields no elements.
    ///
    /// This corresponds to the "standard" regex search iterator.
    ///
    /// # Panics
    ///
    /// If a search returns an error during iteration, then iteration panics.
    /// See [`Regex::find`] for the panic conditions.
    ///
    /// Use [`Regex::try_search`] with
    /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to
    /// handle these error conditions.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Match, dfa::regex::Regex};
    ///
    /// let re = Regex::new("foo[0-9]+")?;
    /// let text = "foo1 foo12 foo123";
    /// let matches: Vec<Match> = re.find_iter(text).collect();
    /// assert_eq!(matches, vec![
    ///     Match::must(0, 0..4),
    ///     Match::must(0, 5..10),
    ///     Match::must(0, 11..17),
    /// ]);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(
        &'r self,
        input: I,
    ) -> FindMatches<'r, 'h, A> {
        FindMatches { re: self, it: iter::Searcher::new(input.into()) }
    }
}
/// Lower level fallible search routines that permit controlling where the
/// search starts and ends in a particular sequence.
impl<A: Automaton> Regex<A> {
    /// Returns the start and end offset of the leftmost match. If no match
    /// exists, then `None` is returned.
    ///
    /// This is like [`Regex::find`] but with two differences:
    ///
    /// 1. It is not generic over `Into<Input>` and instead accepts a
    /// `&Input`. This permits reusing the same `Input` for multiple searches
    /// without needing to create a new one. This _may_ help with latency.
    /// 2. It returns an error if the search could not complete, whereas
    /// [`Regex::find`] will panic.
    ///
    /// # Errors
    ///
    /// This routine errors if the search could not complete. This can occur
    /// in the following circumstances:
    ///
    /// * The configuration of the DFA may permit it to "quit" the search.
    /// For example, setting quit bytes or enabling heuristic support for
    /// Unicode word boundaries. The default configuration does not enable any
    /// option that could result in the DFA quitting.
    /// * When the provided `Input` configuration is not supported. For
    /// example, by providing an unsupported anchor mode.
    ///
    /// When a search returns an error, callers cannot know whether a match
    /// exists or not.
    #[inline]
    pub fn try_search(
        &self,
        input: &Input<'_>,
    ) -> Result<Option<Match>, MatchError> {
        let (fwd, rev) = (self.forward(), self.reverse());
        // Run the forward DFA to find where the match ends (if there is one
        // at all).
        let end = match fwd.try_search_fwd(input)? {
            None => return Ok(None),
            Some(end) => end,
        };
        // This special cases an empty match at the beginning of the search. If
        // our end matches our start, then since a reverse DFA can't match past
        // the start, it must follow that our starting position is also our end
        // position. So short circuit and skip the reverse search.
        if input.start() == end.offset() {
            return Ok(Some(Match::new(
                end.pattern(),
                end.offset()..end.offset(),
            )));
        }
        // We can also skip the reverse search if we know our search was
        // anchored. This occurs either when the input config is anchored or
        // when we know the regex itself is anchored. In this case, we know the
        // start of the match, if one is found, must be the start of the
        // search.
        if self.is_anchored(input) {
            return Ok(Some(Match::new(
                end.pattern(),
                input.start()..end.offset(),
            )));
        }
        // N.B. I have tentatively convinced myself that it isn't necessary
        // to specify the specific pattern for the reverse search since the
        // reverse search will always find the same pattern to match as the
        // forward search. But I lack a rigorous proof. Why not just provide
        // the pattern anyway? Well, if it is needed, then leaving it out
        // gives us a chance to find a witness. (Also, if we don't need to
        // specify the pattern, then we don't need to build the reverse DFA
        // with 'starts_for_each_pattern' enabled.)
        //
        // We also need to be careful to disable 'earliest' for the reverse
        // search, since it could be enabled for the forward search. In the
        // reverse case, to satisfy "leftmost" criteria, we need to match
        // as much as we can. We also need to be careful to make the search
        // anchored. We don't want the reverse search to report any matches
        // other than the one beginning at the end of our forward search.
        let revsearch = input
            .clone()
            .span(input.start()..end.offset())
            .anchored(Anchored::Yes)
            .earliest(false);
        let start = rev
            .try_search_rev(&revsearch)?
            .expect("reverse search must match if forward search does");
        assert_eq!(
            start.pattern(),
            end.pattern(),
            "forward and reverse search must match same pattern",
        );
        assert!(start.offset() <= end.offset());
        Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
    }

    /// Returns true if either the given input specifies an anchored search
    /// or if the underlying DFA is always anchored.
    fn is_anchored(&self, input: &Input<'_>) -> bool {
        match input.get_anchored() {
            Anchored::No => self.forward().is_always_start_anchored(),
            Anchored::Yes | Anchored::Pattern(_) => true,
        }
    }
}
/// Non-search APIs for querying information about the regex.
impl<A: Automaton> Regex<A> {
    /// Returns a reference to the underlying DFA used for forward matching,
    /// i.e., finding the end offset of a match.
    ///
    /// This is useful for accessing the underlying DFA and converting it to
    /// some other format or size. See the [`Builder::build_from_dfas`] docs
    /// for an example of where this might be useful.
    pub fn forward(&self) -> &A {
        &self.forward
    }

    /// Returns a reference to the underlying DFA used for reverse matching,
    /// i.e., finding the start offset of a match.
    ///
    /// This is useful for accessing the underlying DFA and converting it to
    /// some other format or size. See the [`Builder::build_from_dfas`] docs
    /// for an example of where this might be useful.
    pub fn reverse(&self) -> &A {
        &self.reverse
    }

    /// Returns the total number of patterns matched by this regex.
    ///
    /// # Example
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::dfa::regex::Regex;
    ///
    /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
    /// assert_eq!(3, re.pattern_len());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn pattern_len(&self) -> usize {
        let len = self.forward().pattern_len();
        // Both constituent DFAs were compiled from the same patterns, so
        // they must agree on the count.
        assert_eq!(len, self.reverse().pattern_len());
        len
    }
}
/// An iterator over all non-overlapping matches for an infallible search.
///
/// The iterator yields a [`Match`] value until no more matches could be found.
/// If the underlying regex engine returns an error, then a panic occurs.
///
/// The type parameters are as follows:
///
/// * `A` represents the type of the underlying DFA that implements the
/// [`Automaton`] trait.
///
/// The lifetime parameters are as follows:
///
/// * `'h` represents the lifetime of the haystack being searched.
/// * `'r` represents the lifetime of the regex object itself.
///
/// This iterator can be created with the [`Regex::find_iter`] method.
#[derive(Debug)]
pub struct FindMatches<'r, 'h, A> {
    /// The regex used to execute each search.
    re: &'r Regex<A>,
    /// The underlying iterator state. See [`iter::Searcher`].
    it: iter::Searcher<'h>,
}
impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> {
    type Item = Match;

    #[inline]
    fn next(&mut self) -> Option<Match> {
        // Delegate to the generic searcher, which handles details like
        // advancing past empty matches. Copy the regex reference out first
        // so the closure doesn't borrow all of `self`.
        let re = self.re;
        self.it.advance(|input| re.try_search(input))
    }
}
/// A builder for a regex based on deterministic finite automatons.
///
/// This builder permits configuring options for the syntax of a pattern, the
/// NFA construction, the DFA construction and finally the regex searching
/// itself. This builder is different from a general purpose regex builder in
/// that it permits fine-grained configuration of the construction process. The
/// trade off for this is complexity, and the possibility of setting a
/// configuration that might not make sense. For example, there are two
/// different UTF-8 modes:
///
/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
/// whether the pattern itself can contain sub-expressions that match invalid
/// UTF-8.
/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls
/// how the regex iterators themselves advance the starting position of the
/// next search when a match with zero length is found.
///
/// Generally speaking, callers will want to either enable all of these or
/// disable all of these.
///
/// Internally, building a regex requires building two DFAs, where one is
/// responsible for finding the end of a match and the other is responsible
/// for finding the start of a match. If you only need to detect whether
/// something matched, or only the end of a match, then you should use a
/// [`dense::Builder`] to construct a single DFA, which is cheaper than
/// building two DFAs.
///
/// # Build methods
///
/// This builder has a few "build" methods. In general, it's the result of
/// combining the following parameters:
///
/// * Building one or many regexes.
/// * Building a regex with dense or sparse DFAs.
///
/// The simplest "build" method is [`Builder::build`]. It accepts a single
/// pattern and builds a dense DFA using `usize` for the state identifier
/// representation.
///
/// The most general "build" method is [`Builder::build_many`], which permits
/// building a regex that searches for multiple patterns simultaneously while
/// using a specific state identifier representation.
///
/// The most flexible "build" method, but hardest to use, is
/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
/// just a pair of DFAs, and this method allows you to specify those DFAs
/// exactly.
///
/// # Example
///
/// This example shows how to disable UTF-8 mode in the syntax and the regex
/// itself. This is generally what you want for matching on arbitrary bytes.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
///     dfa::regex::Regex, nfa::thompson, util::syntax, Match,
/// };
///
/// let re = Regex::builder()
///     .syntax(syntax::Config::new().utf8(false))
///     .thompson(thompson::Config::new().utf8(false))
///     .build(r"foo(?-u:[^b])ar.*")?;
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
/// let expected = Some(Match::must(0, 1..9));
/// let got = re.find(haystack);
/// assert_eq!(expected, got);
/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
/// // but the subsequent `.*` does not! Disabling UTF-8
/// // on the syntax permits this.
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct Builder {
    /// The underlying dense DFA builder, which carries the syntax, NFA and
    /// DFA configuration used for both the forward and reverse DFAs.
    #[cfg(feature = "dfa-build")]
    dfa: dense::Builder,
}
impl Builder {
    /// Create a new regex builder with the default configuration.
    pub fn new() -> Builder {
        Builder {
            #[cfg(feature = "dfa-build")]
            dfa: dense::Builder::new(),
        }
    }

    /// Build a regex from the given pattern.
    ///
    /// If there was a problem parsing or compiling the pattern, then an error
    /// is returned.
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
    pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
        self.build_many(&[pattern])
    }

    /// Build a regex from the given pattern using sparse DFAs.
    ///
    /// If there was a problem parsing or compiling the pattern, then an error
    /// is returned.
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
    pub fn build_sparse(
        &self,
        pattern: &str,
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
        self.build_many_sparse(&[pattern])
    }

    /// Build a regex from the given patterns.
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
    pub fn build_many<P: AsRef<str>>(
        &self,
        patterns: &[P],
    ) -> Result<Regex, BuildError> {
        let forward = self.dfa.build_many(patterns)?;
        // The reverse DFA must be anchored, use "all" match semantics and
        // match in reverse, per the requirements documented on
        // `build_from_dfas`. Prefilters and specialized start states only
        // help forward scanning, so they are disabled here.
        let reverse = self
            .dfa
            .clone()
            .configure(
                dense::Config::new()
                    .prefilter(None)
                    .specialize_start_states(false)
                    .start_kind(StartKind::Anchored)
                    .match_kind(MatchKind::All),
            )
            .thompson(crate::nfa::thompson::Config::new().reverse(true))
            .build_many(patterns)?;
        Ok(self.build_from_dfas(forward, reverse))
    }

    /// Build a sparse regex from the given patterns.
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
    pub fn build_many_sparse<P: AsRef<str>>(
        &self,
        patterns: &[P],
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
        // Build dense DFAs first, then convert each to its sparse form.
        let re = self.build_many(patterns)?;
        let forward = re.forward().to_sparse()?;
        let reverse = re.reverse().to_sparse()?;
        Ok(self.build_from_dfas(forward, reverse))
    }

    /// Build a regex from its component forward and reverse DFAs.
    ///
    /// This is useful when deserializing a regex from some arbitrary
    /// memory region. This is also useful for building regexes from other
    /// types of DFAs.
    ///
    /// If you're building the DFAs from scratch instead of building new DFAs
    /// from other DFAs, then you'll need to make sure that the reverse DFA is
    /// configured correctly to match the intended semantics. Namely:
    ///
    /// * It should be anchored.
    /// * It should use [`MatchKind::All`] semantics.
    /// * It should match in reverse.
    /// * Otherwise, its configuration should match the forward DFA.
    ///
    /// If these conditions aren't satisfied, then the behavior of searches is
    /// unspecified.
    ///
    /// Note that when using this constructor, no configuration is applied.
    /// Since this routine provides the DFAs to the builder, there is no
    /// opportunity to apply other configuration options.
    ///
    /// # Example
    ///
    /// This example is a bit contrived. The usual use of these methods
    /// would involve serializing `initial_re` somewhere and then deserializing
    /// it later to build a regex. But in this case, we do everything in
    /// memory.
    ///
    /// ```
    /// use regex_automata::dfa::regex::Regex;
    ///
    /// let initial_re = Regex::new("foo[0-9]+")?;
    /// assert_eq!(true, initial_re.is_match(b"foo123"));
    ///
    /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
    /// let re = Regex::builder().build_from_dfas(fwd, rev);
    /// assert_eq!(true, re.is_match(b"foo123"));
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// This example shows how to build a `Regex` that uses sparse DFAs instead
    /// of dense DFAs without using one of the convenience `build_sparse`
    /// routines:
    ///
    /// ```
    /// use regex_automata::dfa::regex::Regex;
    ///
    /// let initial_re = Regex::new("foo[0-9]+")?;
    /// assert_eq!(true, initial_re.is_match(b"foo123"));
    ///
    /// let fwd = initial_re.forward().to_sparse()?;
    /// let rev = initial_re.reverse().to_sparse()?;
    /// let re = Regex::builder().build_from_dfas(fwd, rev);
    /// assert_eq!(true, re.is_match(b"foo123"));
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_from_dfas<A: Automaton>(
        &self,
        forward: A,
        reverse: A,
    ) -> Regex<A> {
        Regex { forward, reverse }
    }

    /// Set the syntax configuration for this builder using
    /// [`syntax::Config`](crate::util::syntax::Config).
    ///
    /// This permits setting things like case insensitivity, Unicode and multi
    /// line mode.
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
    pub fn syntax(
        &mut self,
        config: crate::util::syntax::Config,
    ) -> &mut Builder {
        self.dfa.syntax(config);
        self
    }

    /// Set the Thompson NFA configuration for this builder using
    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
    ///
    /// This permits setting things like whether additional time should be
    /// spent shrinking the size of the NFA.
    #[cfg(all(feature = "syntax", feature = "dfa-build"))]
    pub fn thompson(
        &mut self,
        config: crate::nfa::thompson::Config,
    ) -> &mut Builder {
        self.dfa.thompson(config);
        self
    }

    /// Set the dense DFA compilation configuration for this builder using
    /// [`dense::Config`].
    ///
    /// This permits setting things like whether the underlying DFAs should
    /// be minimized.
    #[cfg(feature = "dfa-build")]
    pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
        self.dfa.configure(config);
        self
    }
}
impl Default for Builder {
fn default() -> Builder {
Builder::new()
}
}

View file

@ -0,0 +1,242 @@
use alloc::vec::Vec;
use crate::util::primitives::StateID;
/// Remappable is a tightly coupled abstraction that facilitates remapping
/// state identifiers in DFAs.
///
/// The main idea behind remapping state IDs is that DFAs often need to check
/// if a certain state is a "special" state of some kind (like a match state)
/// during a search. Since this is extremely perf critical code, we want this
/// check to be as fast as possible. Partitioning state IDs, for example,
/// into "non-match" and "match" states means one can tell if a state is a
/// match state via a simple comparison of the state ID.
///
/// The issue is that during the DFA construction process, it's not
/// particularly easy to partition the states. Instead, the simplest thing is
/// to often just do a pass over all of the states and shuffle them into their
/// desired partitionings. To do that, we need a mechanism for swapping states.
/// Hence, this abstraction.
///
/// Normally, for such little code, I would just duplicate it. But this is a
/// key optimization and the implementation is a bit subtle. So the abstraction
/// is basically a ham-fisted attempt at DRY. The only place we use this is in
/// the dense and one-pass DFAs.
///
/// See also src/dfa/special.rs for a more detailed explanation of how dense
/// DFAs are partitioned.
pub(super) trait Remappable: core::fmt::Debug {
    /// Return the total number of states.
    fn state_len(&self) -> usize;
    /// Return the power-of-2 exponent that yields the stride. The pertinent
    /// laws here are, where N=stride2: 2^N=stride and len(alphabet) <= stride.
    fn stride2(&self) -> usize;
    /// Swap the states pointed to by the given IDs. The underlying finite
    /// state machine should be mutated such that all of the transitions in
    /// `id1` are now in the memory region where the transitions for `id2`
    /// were, and all of the transitions in `id2` are now in the memory region
    /// where the transitions for `id1` were.
    ///
    /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`.
    ///
    /// It is expected that, after calling this, the underlying value will be
    /// left in an inconsistent state, since any other transitions pointing to,
    /// e.g., `id1` need to be updated to point to `id2`, since that's where
    /// `id1` moved to.
    ///
    /// In order to "fix" the underlying inconsistent state, a `Remapper`
    /// should be used to guarantee that `remap` is called at the appropriate
    /// time.
    fn swap_states(&mut self, id1: StateID, id2: StateID);
    /// This must remap every single state ID in the underlying value according
    /// to the function given. For example, in a DFA, this should remap every
    /// transition and every starting state ID.
    fn remap(&mut self, map: impl Fn(StateID) -> StateID);
}
/// Remapper is an abstraction that manages the remapping of state IDs in a
/// finite state machine. This is useful when one wants to shuffle states into
/// different positions in the machine.
///
/// One of the key complexities this manages is the ability to correctly move
/// one state multiple times.
///
/// Once shuffling is complete, `remap` must be called, which will rewrite
/// all pertinent transitions to updated state IDs. Neglecting to call `remap`
/// will almost certainly result in a corrupt machine.
#[derive(Debug)]
pub(super) struct Remapper {
    /// A map from the index of a state to its pre-multiplied identifier.
    ///
    /// When a state is swapped with another, then their corresponding
    /// locations in this map are also swapped. Thus, its new position will
    /// still point to its old pre-multiplied StateID.
    ///
    /// While there is a bit more to it, this then allows us to rewrite the
    /// state IDs in a DFA's transition table in a single pass. This is done
    /// by iterating over every ID in this map, then iterating over each
    /// transition for the state at that ID and re-mapping the transition from
    /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
    /// in this map where `old_id` *started*, and set it to where it ended up
    /// after all swaps have been completed.
    map: Vec<StateID>,
    /// A mapper from state index to state ID (and back).
    idxmap: IndexMapper,
}
impl Remapper {
    /// Create a new remapper from the given remappable implementation. The
    /// remapper can then be used to swap states. The remappable value given
    /// here must be the same one given to `swap` and `remap`.
    pub(super) fn new(r: &impl Remappable) -> Remapper {
        let idxmap = IndexMapper { stride2: r.stride2() };
        // Initially every state maps to itself (in pre-multiplied form).
        let map = (0..r.state_len()).map(|i| idxmap.to_state_id(i)).collect();
        Remapper { map, idxmap }
    }
    /// Swap two states. Once this is called, callers must follow through to
    /// call `remap`, or else it's possible for the underlying remappable
    /// value to be in a corrupt state.
    pub(super) fn swap(
        &mut self,
        r: &mut impl Remappable,
        id1: StateID,
        id2: StateID,
    ) {
        // Swapping a state with itself is a no-op.
        if id1 == id2 {
            return;
        }
        r.swap_states(id1, id2);
        self.map.swap(self.idxmap.to_index(id1), self.idxmap.to_index(id2));
    }
    /// Complete the remapping process by rewriting all state IDs in the
    /// remappable value according to the swaps performed.
    pub(super) fn remap(mut self, r: &mut impl Remappable) {
        // Update the map to account for states that have been swapped
        // multiple times. For example, if (A, C) and (C, G) are swapped, then
        // transitions previously pointing to A should now point to G. But if
        // we don't update our map, they will erroneously be set to C. All we
        // do is follow the swaps in our map until we see our original state
        // ID.
        //
        // The intuition here is to think about how changes are made to the
        // map: only through pairwise swaps. That means that starting at any
        // given state, it is always possible to find the loop back to that
        // state by following the swaps represented in the map (which might be
        // 0 swaps).
        //
        // We are also careful to clone the map before starting in order to
        // freeze it. We use the frozen map to find our loops, since we need to
        // update our map as well. Without freezing it, our updates could break
        // the loops referenced above and produce incorrect results.
        let oldmap = self.map.clone();
        for i in 0..r.state_len() {
            let cur_id = self.idxmap.to_state_id(i);
            let mut new_id = oldmap[i];
            if cur_id == new_id {
                // This state never moved, so there is no loop to follow.
                continue;
            }
            loop {
                let id = oldmap[self.idxmap.to_index(new_id)];
                if cur_id == id {
                    self.map[i] = new_id;
                    break;
                }
                new_id = id;
            }
        }
        r.remap(|next| self.map[self.idxmap.to_index(next)]);
    }
}
/// A simple type for mapping between state indices and state IDs.
///
/// The reason why this exists is because state IDs are "premultiplied." That
/// is, in order to get to the transitions for a particular state, one need
/// only use the state ID as-is, instead of having to multiply it by the
/// transition table's stride.
///
/// The downside of this is that it's inconvenient to map between state IDs
/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like
/// `0`, `0+stride`, `0+2*stride`, `0+3*stride`, etc., instead of `0`, `1`,
/// `2`, `3`, etc.
///
/// Since our state IDs are premultiplied, we can convert back-and-forth
/// between IDs and indices by simply unmultiplying the IDs and multiplying the
/// indices.
#[derive(Debug)]
struct IndexMapper {
    /// The power of 2 corresponding to the stride of the corresponding
    /// transition table. 'id >> stride2' de-multiplies an ID while 'index <<
    /// stride2' pre-multiplies an index to an ID.
    stride2: usize,
}
impl IndexMapper {
    /// Convert a state ID to a state index.
    fn to_index(&self, id: StateID) -> usize {
        // De-multiply the premultiplied ID. Since stride is a power of 2,
        // this is a simple right shift.
        id.as_usize() >> self.stride2
    }
    /// Convert a state index to a state ID.
    fn to_state_id(&self, index: usize) -> StateID {
        // CORRECTNESS: If the given index is not valid, then it is not
        // required for this to panic or return a valid state ID. We'll "just"
        // wind up with panics or silent logic errors at some other point.
        StateID::new_unchecked(index << self.stride2)
    }
}
#[cfg(feature = "dfa-build")]
mod dense {
    use crate::{dfa::dense::OwnedDFA, util::primitives::StateID};
    use super::Remappable;
    // Remapping a dense DFA simply delegates to the inherent methods of the
    // same names on the DFA itself.
    impl Remappable for OwnedDFA {
        fn state_len(&self) -> usize {
            OwnedDFA::state_len(self)
        }
        fn stride2(&self) -> usize {
            OwnedDFA::stride2(self)
        }
        fn swap_states(&mut self, id1: StateID, id2: StateID) {
            OwnedDFA::swap_states(self, id1, id2)
        }
        fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
            OwnedDFA::remap(self, map)
        }
    }
}
#[cfg(feature = "dfa-onepass")]
mod onepass {
    use crate::{dfa::onepass::DFA, util::primitives::StateID};
    use super::Remappable;
    // Remapping a one-pass DFA simply delegates to the inherent methods of
    // the same names on the DFA itself.
    impl Remappable for DFA {
        fn state_len(&self) -> usize {
            DFA::state_len(self)
        }
        fn stride2(&self) -> usize {
            // We don't do pre-multiplication for the one-pass DFA, so
            // returning 0 has the effect of making state IDs and state indices
            // equivalent.
            0
        }
        fn swap_states(&mut self, id1: StateID, id2: StateID) {
            DFA::swap_states(self, id1, id2)
        }
        fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
            DFA::remap(self, map)
        }
    }
}

View file

@ -0,0 +1,644 @@
use crate::{
dfa::{
accel,
automaton::{Automaton, OverlappingState},
},
util::{
prefilter::Prefilter,
primitives::StateID,
search::{Anchored, HalfMatch, Input, Span},
},
MatchError,
};
#[inline(never)]
pub fn find_fwd<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
    if input.is_done() {
        return Ok(None);
    }
    // Anchored searches (which includes searching for a specific pattern ID)
    // never use a prefilter.
    let pre = if input.get_anchored().is_anchored() {
        None
    } else {
        dfa.get_prefilter()
    };
    // Dispatch to one copy of the search loop per combination of "has a
    // prefilter" and "earliest", so that each copy can treat those settings
    // as constants.
    match (pre, input.get_earliest()) {
        (Some(pre), true) => find_fwd_imp(dfa, input, Some(pre), true),
        (Some(pre), false) => find_fwd_imp(dfa, input, Some(pre), false),
        (None, true) => find_fwd_imp(dfa, input, None, true),
        (None, false) => find_fwd_imp(dfa, input, None, false),
    }
}
/// The implementation of forward search, specialized on the presence of a
/// prefilter and on the 'earliest' setting.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_fwd_imp<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
    pre: Option<&'_ Prefilter>,
    earliest: bool,
) -> Result<Option<HalfMatch>, MatchError> {
    // See 'prefilter_restart' docs for explanation.
    let universal_start = dfa.universal_start_state(Anchored::No).is_some();
    let mut mat = None;
    let mut sid = init_fwd(dfa, input)?;
    let mut at = input.start();
    // This could just be a closure, but then I think it would be unsound
    // because it would need to be safe to invoke. This way, the lack of safety
    // is clearer in the code below.
    macro_rules! next_unchecked {
        ($sid:expr, $at:expr) => {{
            let byte = *input.haystack().get_unchecked($at);
            dfa.next_state_unchecked($sid, byte)
        }};
    }
    if let Some(ref pre) = pre {
        let span = Span::from(at..input.end());
        // If a prefilter doesn't report false positives, then we don't need to
        // touch the DFA at all. However, since all matches include the pattern
        // ID, and the prefilter infrastructure doesn't report pattern IDs, we
        // limit this optimization to cases where there is exactly one pattern.
        // In that case, any match must be the 0th pattern.
        match pre.find(input.haystack(), span) {
            None => return Ok(mat),
            Some(ref span) => {
                at = span.start;
                if !universal_start {
                    sid = prefilter_restart(dfa, &input, at)?;
                }
            }
        }
    }
    while at < input.end() {
        // SAFETY: There are two safety invariants we need to uphold here in
        // the loops below: that 'sid' and 'prev_sid' are valid state IDs
        // for this DFA, and that 'at' is a valid index into 'haystack'.
        // For the former, we rely on the invariant that next_state* and
        // start_state_forward always returns a valid state ID (given a valid
        // state ID in the former case). For the latter safety invariant, we
        // always guard unchecked access with a check that 'at' is less than
        // 'end', where 'end <= haystack.len()'. In the unrolled loop below, we
        // ensure that 'at' is always in bounds.
        //
        // PERF: See a similar comment in src/hybrid/search.rs that justifies
        // this extra work to make the search loop fast. The same reasoning and
        // benchmarks apply here.
        let mut prev_sid;
        while at < input.end() {
            prev_sid = unsafe { next_unchecked!(sid, at) };
            if dfa.is_special_state(prev_sid) || at + 3 >= input.end() {
                core::mem::swap(&mut prev_sid, &mut sid);
                break;
            }
            at += 1;
            sid = unsafe { next_unchecked!(prev_sid, at) };
            if dfa.is_special_state(sid) {
                break;
            }
            at += 1;
            prev_sid = unsafe { next_unchecked!(sid, at) };
            if dfa.is_special_state(prev_sid) {
                core::mem::swap(&mut prev_sid, &mut sid);
                break;
            }
            at += 1;
            sid = unsafe { next_unchecked!(prev_sid, at) };
            if dfa.is_special_state(sid) {
                break;
            }
            at += 1;
        }
        if dfa.is_special_state(sid) {
            if dfa.is_start_state(sid) {
                if let Some(ref pre) = pre {
                    let span = Span::from(at..input.end());
                    match pre.find(input.haystack(), span) {
                        None => return Ok(mat),
                        Some(ref span) => {
                            // We want to skip any update to 'at' below
                            // at the end of this iteration and just
                            // jump immediately back to the next state
                            // transition at the leading position of the
                            // candidate match.
                            //
                            // ... but only if we actually made progress
                            // with our prefilter, otherwise if the start
                            // state has a self-loop, we can get stuck.
                            if span.start > at {
                                at = span.start;
                                if !universal_start {
                                    sid = prefilter_restart(dfa, &input, at)?;
                                }
                                continue;
                            }
                        }
                    }
                } else if dfa.is_accel_state(sid) {
                    let needles = dfa.accelerator(sid);
                    at = accel::find_fwd(needles, input.haystack(), at + 1)
                        .unwrap_or(input.end());
                    continue;
                }
            } else if dfa.is_match_state(sid) {
                let pattern = dfa.match_pattern(sid, 0);
                mat = Some(HalfMatch::new(pattern, at));
                if earliest {
                    return Ok(mat);
                }
                if dfa.is_accel_state(sid) {
                    let needles = dfa.accelerator(sid);
                    at = accel::find_fwd(needles, input.haystack(), at + 1)
                        .unwrap_or(input.end());
                    continue;
                }
            } else if dfa.is_accel_state(sid) {
                // If the accelerator finds nothing, we don't give up: there
                // may still be an EOI transition out of this state, so we set
                // 'at' to the end of the haystack, which stops this loop and
                // falls through to the EOI handling below.
                let needles = dfa.accelerator(sid);
                at = accel::find_fwd(needles, input.haystack(), at + 1)
                    .unwrap_or(input.end());
                continue;
            } else if dfa.is_dead_state(sid) {
                return Ok(mat);
            } else {
                // It's important that this is a debug_assert, since this can
                // actually be tripped even if DFA::from_bytes succeeds and
                // returns a supposedly valid DFA.
                debug_assert!(dfa.is_quit_state(sid));
                return Err(MatchError::quit(input.haystack()[at], at));
            }
        }
        at += 1;
    }
    eoi_fwd(dfa, input, &mut sid, &mut mat)?;
    Ok(mat)
}
#[inline(never)]
pub fn find_rev<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
    if input.is_done() {
        return Ok(None);
    }
    // Dispatch to one copy of the search loop per 'earliest' setting, so
    // that each copy can treat the setting as a constant.
    match input.get_earliest() {
        true => find_rev_imp(dfa, input, true),
        false => find_rev_imp(dfa, input, false),
    }
}
/// The implementation of reverse search, specialized on the 'earliest'
/// setting.
///
/// This mirrors 'find_fwd_imp', but walks the haystack backwards and reports
/// the *start* of a match instead of the end. Note that there is no prefilter
/// handling here, unlike the forward case.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_rev_imp<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
    earliest: bool,
) -> Result<Option<HalfMatch>, MatchError> {
    let mut mat = None;
    let mut sid = init_rev(dfa, input)?;
    // In reverse search, the loop below can't handle the case of searching an
    // empty slice. Ideally we could write something congruent to the forward
    // search, i.e., 'while at >= start', but 'start' might be 0. Since we use
    // an unsigned offset, 'at >= 0' is trivially always true. We could avoid
    // this extra case handling by using a signed offset, but Rust makes it
    // annoying to do. So... We just handle the empty case separately.
    if input.start() == input.end() {
        eoi_rev(dfa, input, &mut sid, &mut mat)?;
        return Ok(mat);
    }
    let mut at = input.end() - 1;
    // See the comment on the equivalent macro in 'find_fwd_imp' for why this
    // is a macro and not a closure.
    macro_rules! next_unchecked {
        ($sid:expr, $at:expr) => {{
            let byte = *input.haystack().get_unchecked($at);
            dfa.next_state_unchecked($sid, byte)
        }};
    }
    loop {
        // SAFETY: See comments in 'find_fwd' for a safety argument.
        let mut prev_sid;
        while at >= input.start() {
            prev_sid = unsafe { next_unchecked!(sid, at) };
            if dfa.is_special_state(prev_sid)
                || at <= input.start().saturating_add(3)
            {
                core::mem::swap(&mut prev_sid, &mut sid);
                break;
            }
            at -= 1;
            sid = unsafe { next_unchecked!(prev_sid, at) };
            if dfa.is_special_state(sid) {
                break;
            }
            at -= 1;
            prev_sid = unsafe { next_unchecked!(sid, at) };
            if dfa.is_special_state(prev_sid) {
                core::mem::swap(&mut prev_sid, &mut sid);
                break;
            }
            at -= 1;
            sid = unsafe { next_unchecked!(prev_sid, at) };
            if dfa.is_special_state(sid) {
                break;
            }
            at -= 1;
        }
        if dfa.is_special_state(sid) {
            if dfa.is_start_state(sid) {
                if dfa.is_accel_state(sid) {
                    let needles = dfa.accelerator(sid);
                    at = accel::find_rev(needles, input.haystack(), at)
                        .map(|i| i + 1)
                        .unwrap_or(input.start());
                }
            } else if dfa.is_match_state(sid) {
                let pattern = dfa.match_pattern(sid, 0);
                // Since reverse searches report the beginning of a match
                // and the beginning is inclusive (not exclusive like the
                // end of a match), we add 1 to make it inclusive.
                mat = Some(HalfMatch::new(pattern, at + 1));
                if earliest {
                    return Ok(mat);
                }
                if dfa.is_accel_state(sid) {
                    let needles = dfa.accelerator(sid);
                    at = accel::find_rev(needles, input.haystack(), at)
                        .map(|i| i + 1)
                        .unwrap_or(input.start());
                }
            } else if dfa.is_accel_state(sid) {
                let needles = dfa.accelerator(sid);
                // If the accelerator returns nothing, why don't we quit the
                // search? Well, if the accelerator doesn't find anything, that
                // doesn't mean we don't have a match. It just means that we
                // can't leave the current state given one of the 255 possible
                // byte values. However, there might be an EOI transition. So
                // we set 'at' to the end of the haystack, which will cause
                // this loop to stop and fall down into the EOI transition.
                at = accel::find_rev(needles, input.haystack(), at)
                    .map(|i| i + 1)
                    .unwrap_or(input.start());
            } else if dfa.is_dead_state(sid) {
                return Ok(mat);
            } else {
                // Not dead, start, match or accelerated, so the only special
                // state left is the quit state.
                return Err(MatchError::quit(input.haystack()[at], at));
            }
        }
        if at == input.start() {
            break;
        }
        at -= 1;
    }
    eoi_rev(dfa, input, &mut sid, &mut mat)?;
    Ok(mat)
}
#[inline(never)]
pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
    state: &mut OverlappingState,
) -> Result<(), MatchError> {
    state.mat = None;
    if input.is_done() {
        return Ok(());
    }
    // As with non-overlapping search, anchored searches never use a
    // prefilter.
    let pre = if input.get_anchored().is_anchored() {
        None
    } else {
        dfa.get_prefilter()
    };
    // Dispatch to one copy of the search loop per "has a prefilter" setting,
    // so that each copy can treat the setting as a constant.
    match pre {
        Some(pre) => find_overlapping_fwd_imp(dfa, input, Some(pre), state),
        None => find_overlapping_fwd_imp(dfa, input, None, state),
    }
}
/// The implementation of overlapping forward search, specialized on the
/// presence of a prefilter.
///
/// The caller-provided 'OverlappingState' makes the search resumable: each
/// call reports at most one match and records enough in 'state' to pick up
/// where it left off on the next call.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_overlapping_fwd_imp<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
    pre: Option<&'_ Prefilter>,
    state: &mut OverlappingState,
) -> Result<(), MatchError> {
    // See 'prefilter_restart' docs for explanation.
    let universal_start = dfa.universal_start_state(Anchored::No).is_some();
    let mut sid = match state.id {
        None => {
            state.at = input.start();
            init_fwd(dfa, input)?
        }
        Some(sid) => {
            if let Some(match_index) = state.next_match_index {
                let match_len = dfa.match_len(sid);
                if match_index < match_len {
                    state.next_match_index = Some(match_index + 1);
                    let pattern = dfa.match_pattern(sid, match_index);
                    state.mat = Some(HalfMatch::new(pattern, state.at));
                    return Ok(());
                }
            }
            // Once we've reported all matches at a given position, we need to
            // advance the search to the next position.
            state.at += 1;
            if state.at > input.end() {
                return Ok(());
            }
            sid
        }
    };
    // NOTE: We don't optimize the crap out of this routine primarily because
    // it seems like most find_overlapping searches will have higher match
    // counts, and thus, throughput is perhaps not as important. But if you
    // have a use case for something faster, feel free to file an issue.
    while state.at < input.end() {
        sid = dfa.next_state(sid, input.haystack()[state.at]);
        if dfa.is_special_state(sid) {
            state.id = Some(sid);
            if dfa.is_start_state(sid) {
                if let Some(ref pre) = pre {
                    let span = Span::from(state.at..input.end());
                    match pre.find(input.haystack(), span) {
                        None => return Ok(()),
                        Some(ref span) => {
                            // Only restart when the prefilter made progress;
                            // otherwise a start state with a self-loop could
                            // get us stuck.
                            if span.start > state.at {
                                state.at = span.start;
                                if !universal_start {
                                    sid = prefilter_restart(
                                        dfa, &input, state.at,
                                    )?;
                                }
                                continue;
                            }
                        }
                    }
                } else if dfa.is_accel_state(sid) {
                    let needles = dfa.accelerator(sid);
                    state.at = accel::find_fwd(
                        needles,
                        input.haystack(),
                        state.at + 1,
                    )
                    .unwrap_or(input.end());
                    continue;
                }
            } else if dfa.is_match_state(sid) {
                state.next_match_index = Some(1);
                let pattern = dfa.match_pattern(sid, 0);
                state.mat = Some(HalfMatch::new(pattern, state.at));
                return Ok(());
            } else if dfa.is_accel_state(sid) {
                let needles = dfa.accelerator(sid);
                // If the accelerator returns nothing, why don't we quit the
                // search? Well, if the accelerator doesn't find anything, that
                // doesn't mean we don't have a match. It just means that we
                // can't leave the current state given one of the 255 possible
                // byte values. However, there might be an EOI transition. So
                // we set 'at' to the end of the haystack, which will cause
                // this loop to stop and fall down into the EOI transition.
                state.at =
                    accel::find_fwd(needles, input.haystack(), state.at + 1)
                        .unwrap_or(input.end());
                continue;
            } else if dfa.is_dead_state(sid) {
                return Ok(());
            } else {
                // Not dead, start, match or accelerated, so the only special
                // state left is the quit state.
                debug_assert!(dfa.is_quit_state(sid));
                return Err(MatchError::quit(
                    input.haystack()[state.at],
                    state.at,
                ));
            }
        }
        state.at += 1;
    }
    let result = eoi_fwd(dfa, input, &mut sid, &mut state.mat);
    state.id = Some(sid);
    if state.mat.is_some() {
        // '1' is always correct here since if we get to this point, this
        // always corresponds to the first (index '0') match discovered at
        // this position. So the next match to report at this position (if
        // it exists) is at index '1'.
        state.next_match_index = Some(1);
    }
    result
}
/// Implementation of overlapping reverse search.
///
/// Like the overlapping forward search, each call reports at most one match
/// and records enough in the caller-provided 'OverlappingState' to resume
/// where it left off on the next call.
#[inline(never)]
pub(crate) fn find_overlapping_rev<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
    state: &mut OverlappingState,
) -> Result<(), MatchError> {
    state.mat = None;
    if input.is_done() {
        return Ok(());
    }
    let mut sid = match state.id {
        None => {
            // First call for this search: compute the start state and the
            // initial position.
            let sid = init_rev(dfa, input)?;
            state.id = Some(sid);
            if input.start() == input.end() {
                // An empty span means the only transition left to take is
                // the EOI transition.
                state.rev_eoi = true;
            } else {
                state.at = input.end() - 1;
            }
            sid
        }
        Some(sid) => {
            if let Some(match_index) = state.next_match_index {
                let match_len = dfa.match_len(sid);
                if match_index < match_len {
                    state.next_match_index = Some(match_index + 1);
                    let pattern = dfa.match_pattern(sid, match_index);
                    state.mat = Some(HalfMatch::new(pattern, state.at));
                    return Ok(());
                }
            }
            // Once we've reported all matches at a given position, we need
            // to advance the search to the next position. However, if we've
            // already followed the EOI transition, then we know we're done
            // with the search and there cannot be any more matches to report.
            if state.rev_eoi {
                return Ok(());
            } else if state.at == input.start() {
                // At this point, we should follow the EOI transition. This
                // will cause us to skip the main loop below and fall through
                // to the final 'eoi_rev' transition.
                state.rev_eoi = true;
            } else {
                // We haven't hit the end of the search yet, so move on.
                state.at -= 1;
            }
            sid
        }
    };
    while !state.rev_eoi {
        sid = dfa.next_state(sid, input.haystack()[state.at]);
        if dfa.is_special_state(sid) {
            state.id = Some(sid);
            if dfa.is_start_state(sid) {
                if dfa.is_accel_state(sid) {
                    let needles = dfa.accelerator(sid);
                    state.at =
                        accel::find_rev(needles, input.haystack(), state.at)
                            .map(|i| i + 1)
                            .unwrap_or(input.start());
                }
            } else if dfa.is_match_state(sid) {
                state.next_match_index = Some(1);
                let pattern = dfa.match_pattern(sid, 0);
                state.mat = Some(HalfMatch::new(pattern, state.at + 1));
                return Ok(());
            } else if dfa.is_accel_state(sid) {
                let needles = dfa.accelerator(sid);
                // If the accelerator returns nothing, why don't we quit the
                // search? Well, if the accelerator doesn't find anything, that
                // doesn't mean we don't have a match. It just means that we
                // can't leave the current state given one of the 255 possible
                // byte values. However, there might be an EOI transition. So
                // we set 'at' to the end of the haystack, which will cause
                // this loop to stop and fall down into the EOI transition.
                state.at =
                    accel::find_rev(needles, input.haystack(), state.at)
                        .map(|i| i + 1)
                        .unwrap_or(input.start());
            } else if dfa.is_dead_state(sid) {
                return Ok(());
            } else {
                // Not dead, start, match or accelerated, so the only special
                // state left is the quit state.
                return Err(MatchError::quit(
                    input.haystack()[state.at],
                    state.at,
                ));
            }
        }
        if state.at == input.start() {
            break;
        }
        state.at -= 1;
    }
    let result = eoi_rev(dfa, input, &mut sid, &mut state.mat);
    state.rev_eoi = true;
    state.id = Some(sid);
    if state.mat.is_some() {
        // '1' is always correct here since if we get to this point, this
        // always corresponds to the first (index '0') match discovered at
        // this position. So the next match to report at this position (if
        // it exists) is at index '1'.
        state.next_match_index = Some(1);
    }
    result
}
/// Compute the starting state for a forward search.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_fwd<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
) -> Result<StateID, MatchError> {
    dfa.start_state_forward(input).map(|sid| {
        // Start states can never be match states, since all matches are
        // delayed by 1 byte.
        debug_assert!(!dfa.is_match_state(sid));
        sid
    })
}
/// Compute the starting state for a reverse search.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_rev<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
) -> Result<StateID, MatchError> {
    dfa.start_state_reverse(input).map(|sid| {
        // Start states can never be match states, since all matches are
        // delayed by 1 byte.
        debug_assert!(!dfa.is_match_state(sid));
        sid
    })
}
/// Take the transition out of the span for a forward search: either the
/// transition on the byte just past the end of the span, or the true EOI
/// transition when the span ends at the end of the haystack.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn eoi_fwd<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
    sid: &mut StateID,
    mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
    let sp = input.get_span();
    if let Some(&byte) = input.haystack().get(sp.end) {
        // The span ends before the haystack does, so "EOI" here is just the
        // transition on the byte at the end of the span.
        *sid = dfa.next_state(*sid, byte);
        if dfa.is_match_state(*sid) {
            let pattern = dfa.match_pattern(*sid, 0);
            *mat = Some(HalfMatch::new(pattern, sp.end));
        } else if dfa.is_quit_state(*sid) {
            return Err(MatchError::quit(byte, sp.end));
        }
    } else {
        // The span ends at the end of the haystack, so take the actual EOI
        // transition.
        *sid = dfa.next_eoi_state(*sid);
        if dfa.is_match_state(*sid) {
            let pattern = dfa.match_pattern(*sid, 0);
            *mat = Some(HalfMatch::new(pattern, input.haystack().len()));
        }
    }
    Ok(())
}
/// Take the transition out of the span for a reverse search: either the
/// transition on the byte just before the start of the span, or the true EOI
/// transition when the span starts at the beginning of the haystack.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn eoi_rev<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
    sid: &mut StateID,
    mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
    let sp = input.get_span();
    match sp.start.checked_sub(1) {
        Some(prev) => {
            // The span starts after the beginning of the haystack, so "EOI"
            // here is just the transition on the byte preceding the span.
            let byte = input.haystack()[prev];
            *sid = dfa.next_state(*sid, byte);
            if dfa.is_match_state(*sid) {
                let pattern = dfa.match_pattern(*sid, 0);
                *mat = Some(HalfMatch::new(pattern, sp.start));
            } else if dfa.is_quit_state(*sid) {
                return Err(MatchError::quit(byte, prev));
            }
        }
        None => {
            // The span starts at the beginning of the haystack, so take the
            // actual EOI transition.
            *sid = dfa.next_eoi_state(*sid);
            if dfa.is_match_state(*sid) {
                let pattern = dfa.match_pattern(*sid, 0);
                *mat = Some(HalfMatch::new(pattern, 0));
            }
        }
    }
    Ok(())
}
/// Re-compute the starting state that a DFA should be in after finding a
/// prefilter candidate match at the position `at`.
///
/// The function with the same name has a bit more docs in hybrid/search.rs.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn prefilter_restart<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
    at: usize,
) -> Result<StateID, MatchError> {
    // Recompute the start state with the search start shifted to 'at' —
    // presumably so the start state reflects the context just before 'at'
    // (see hybrid/search.rs for the full explanation; confirm there).
    let mut input = input.clone();
    input.set_start(at);
    init_fwd(dfa, &input)
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,494 @@
use crate::{
dfa::DEAD,
util::{
primitives::StateID,
wire::{self, DeserializeError, Endian, SerializeError},
},
};
// Convenience macro for bailing out with a generic deserialization error.
// Note that this expands to a 'return', so it can only be used inside
// functions returning 'Result<_, DeserializeError>'.
macro_rules! err {
    ($msg:expr) => {
        return Err(DeserializeError::generic($msg));
    };
}
// Special represents the identifiers in a DFA that correspond to "special"
// states. If a state is one or more of the following, then it is considered
// special:
//
// * dead - A non-matching state where all outgoing transitions lead back to
// itself. There is only one of these, regardless of whether minimization
// has run. The dead state always has an ID of 0. i.e., It is always the
// first state in a DFA.
// * quit - A state that is entered whenever a byte is seen that should cause
// a DFA to give up and stop searching. This results in a MatchError::quit
// error being returned at search time. The default configuration for a DFA
// has no quit bytes, which means this state is unreachable by default,
// although it is always present for reasons of implementation simplicity.
// This state is only reachable when the caller configures the DFA to quit
// on certain bytes. There is always exactly one of these states and it
// is always the second state. (Its actual ID depends on the size of the
// alphabet in dense DFAs, since state IDs are premultiplied in order to
// allow them to be used directly as indices into the transition table.)
// * match - An accepting state, i.e., indicative of a match. There may be
// zero or more of these states.
// * accelerated - A state where all of its outgoing transitions, except a
// few, loop back to itself. These states are candidates for acceleration
// via memchr during search. There may be zero or more of these states.
// * start - A non-matching state that indicates where the automaton should
// start during a search. There is always at least one starting state and
// all are guaranteed to be non-match states. (A start state cannot be a
// match state because the DFAs in this crate delay all matches by one byte.
// So every search that finds a match must move through one transition to
// some other match state, even when searching an empty string.)
//
// These are not mutually exclusive categories. Namely, the following
// overlappings can occur:
//
// * {dead, start} - If a DFA can never lead to a match and it is minimized,
// then it will typically compile to something where all starting IDs point
// to the DFA's dead state.
// * {match, accelerated} - It is possible for a match state to have the
// majority of its transitions loop back to itself, which means it's
// possible for a match state to be accelerated.
// * {start, accelerated} - Similarly, it is possible for a start state to be
// accelerated. Note that it is possible for an accelerated state to be
// neither a match or a start state. Also note that just because both match
// and start states overlap with accelerated states does not mean that
// match and start states overlap with each other. In fact, they are
// guaranteed not to overlap.
//
// As a special mention, every DFA always has a dead and a quit state, even
// though from the perspective of the DFA, they are equivalent. (Indeed,
// minimization special cases them to ensure they don't get merged.) The
// purpose of keeping them distinct is to use the quit state as a sentinel to
// distinguish between whether a search finished successfully without finding
// anything or whether it gave up before finishing.
//
// So the main problem we want to solve here is the *fast* detection of whether
// a state is special or not. And we also want to do this while storing as
// little extra data as possible. AND we want to be able to quickly determine
// which categories a state falls into above if it is special.
//
// We achieve this by essentially shuffling all special states to the beginning
// of a DFA. That is, all special states appear before every other non-special
// state. By representing special states this way, we can determine whether a
// state is special or not by a single comparison, where special.max is the
// identifier of the last special state in the DFA:
//
// if current_state <= special.max:
// ... do something with special state
//
// The only thing left to do is to determine what kind of special state
// it is. Because what we do next depends on that. Since special states
// are typically rare, we can afford to do a bit more extra work, but we'd
// still like this to be as fast as possible. The trick we employ here is to
// continue shuffling states even within the special state range. Such that
// one contiguous region corresponds to match states, another for start states
// and then an overlapping range for accelerated states. At a high level, our
// special state detection might look like this (for leftmost searching, where
// we continue searching even after seeing a match):
//
// byte = input[offset]
// current_state = next_state(current_state, byte)
// offset += 1
// if current_state <= special.max:
// if current_state == 0:
// # We can never leave a dead state, so this always marks the
// # end of our search.
// return last_match
// if current_state == special.quit_id:
//         # A quit state means we give up. If the DFA has no quit state,
// # then special.quit_id == 0 == dead, which is handled by the
// # conditional above.
// return Err(MatchError::quit { byte, offset: offset - 1 })
// if special.min_match <= current_state <= special.max_match:
// last_match = Some(offset)
// if special.min_accel <= current_state <= special.max_accel:
// offset = accelerate(input, offset)
// last_match = Some(offset)
// elif special.min_start <= current_state <= special.max_start:
// offset = prefilter.find(input, offset)
// if special.min_accel <= current_state <= special.max_accel:
// offset = accelerate(input, offset)
// elif special.min_accel <= current_state <= special.max_accel:
// offset = accelerate(input, offset)
//
// There are some small details left out of the logic above. For example,
// in order to accelerate a state, we need to know which bytes to search for.
// This in turn implies some extra data we need to store in the DFA. To keep
// things compact, we would ideally only store
//
// N = special.max_accel - special.min_accel + 1
//
// items. But state IDs are premultiplied, which means they are not contiguous.
// So in order to take a state ID and index an array of accelerated structures,
// we need to do:
//
// i = (state_id - special.min_accel) / stride
//
// (N.B. 'stride' is always a power of 2, so the above can be implemented via
// '(state_id - special.min_accel) >> stride2', where 'stride2' is x in
// 2^x=stride.)
//
// Moreover, some of these specialty categories may be empty. For example,
// DFAs are not required to have any match states or any accelerated states.
// In that case, the lower and upper bounds are both set to 0 (the dead state
// ID) and the first `current_state == 0` check subsumes cases where the
// ranges are empty.
//
// Loop unrolling, if applicable, has also been left out of the logic above.
//
// Graphically, the ranges look like this, where asterisks indicate ranges
// that can be empty. Each 'x' is a state.
//
// quit
// dead|
// ||
// xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
// | | | | start | |
// | |-------------| |-------| |
// | match* | | | |
// | | | | |
// | |----------| | |
// | accel* | |
// | | |
// | | |
// |----------------------------|------------------------
// special non-special*
#[derive(Clone, Copy, Debug)]
pub(crate) struct Special {
    /// The identifier of the last special state in a DFA. A state is special
    /// if and only if its identifier is less than or equal to `max`.
    ///
    /// Note that the ranges below are laid out in a fixed relative order
    /// (quit, then match, then accel, then start); `validate` checks that
    /// ordering on deserialization.
    pub(crate) max: StateID,
    /// The identifier of the quit state in a DFA. (There is no analogous field
    /// for the dead state since the dead state's ID is always zero, regardless
    /// of state ID size.)
    pub(crate) quit_id: StateID,
    /// The identifier of the first match state.
    pub(crate) min_match: StateID,
    /// The identifier of the last match state. When there are no match
    /// states, both endpoints are set to the dead state ID (an empty range).
    pub(crate) max_match: StateID,
    /// The identifier of the first accelerated state.
    pub(crate) min_accel: StateID,
    /// The identifier of the last accelerated state. When there are no
    /// accelerated states, both endpoints are the dead state ID.
    pub(crate) max_accel: StateID,
    /// The identifier of the first start state.
    pub(crate) min_start: StateID,
    /// The identifier of the last start state. When start states are not
    /// treated as special, both endpoints are the dead state ID.
    pub(crate) max_start: StateID,
}
impl Special {
    /// Creates a new set of special ranges for a DFA. All ranges are initially
    /// set to only contain the dead state. This is interpreted as an empty
    /// range.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn new() -> Special {
        Special {
            max: DEAD,
            quit_id: DEAD,
            min_match: DEAD,
            max_match: DEAD,
            min_accel: DEAD,
            max_accel: DEAD,
            min_start: DEAD,
            max_start: DEAD,
        }
    }
    /// Remaps all of the special state identifiers using the function given.
    ///
    /// Every range endpoint (and the quit ID) is rewritten via `map`, so this
    /// is appropriate whenever state IDs are reassigned wholesale.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special {
        Special {
            max: map(self.max),
            quit_id: map(self.quit_id),
            min_match: map(self.min_match),
            max_match: map(self.max_match),
            min_accel: map(self.min_accel),
            max_accel: map(self.max_accel),
            min_start: map(self.min_start),
            max_start: map(self.max_start),
        }
    }
    /// Deserialize the given bytes into special state ranges. If the slice
    /// given is not big enough, then this returns an error. Similarly, if
    /// any of the expected invariants around special state ranges aren't
    /// upheld, an error is returned. Note that this does not guarantee that
    /// the information returned is correct.
    ///
    /// Upon success, this returns the number of bytes read in addition to the
    /// special state IDs themselves.
    pub(crate) fn from_bytes(
        mut slice: &[u8],
    ) -> Result<(Special, usize), DeserializeError> {
        // Check the total length up front so each read below can't run out
        // of bytes. There are exactly 8 state IDs in the encoding.
        wire::check_slice_len(slice, 8 * StateID::SIZE, "special states")?;
        let mut nread = 0;
        // Reads one state ID, advances 'slice' past it and accumulates the
        // number of bytes consumed into 'nread'.
        let mut read_id = |what| -> Result<StateID, DeserializeError> {
            let (id, nr) = wire::try_read_state_id(slice, what)?;
            nread += nr;
            slice = &slice[StateID::SIZE..];
            Ok(id)
        };
        // The read order here must match the write order in 'write_to'.
        let max = read_id("special max id")?;
        let quit_id = read_id("special quit id")?;
        let min_match = read_id("special min match id")?;
        let max_match = read_id("special max match id")?;
        let min_accel = read_id("special min accel id")?;
        let max_accel = read_id("special max accel id")?;
        let min_start = read_id("special min start id")?;
        let max_start = read_id("special max start id")?;
        let special = Special {
            max,
            quit_id,
            min_match,
            max_match,
            min_accel,
            max_accel,
            min_start,
            max_start,
        };
        special.validate()?;
        assert_eq!(nread, special.write_to_len());
        Ok((special, nread))
    }
    /// Validate that the information describing special states satisfies
    /// all known invariants.
    ///
    /// This checks structural invariants only (range emptiness, ordering and
    /// that 'max' bounds everything); it cannot verify that the IDs actually
    /// correspond to states of the claimed kind in the DFA.
    pub(crate) fn validate(&self) -> Result<(), DeserializeError> {
        // Check that both ends of the range are DEAD or neither are.
        if self.min_match == DEAD && self.max_match != DEAD {
            err!("min_match is DEAD, but max_match is not");
        }
        if self.min_match != DEAD && self.max_match == DEAD {
            err!("max_match is DEAD, but min_match is not");
        }
        if self.min_accel == DEAD && self.max_accel != DEAD {
            err!("min_accel is DEAD, but max_accel is not");
        }
        if self.min_accel != DEAD && self.max_accel == DEAD {
            err!("max_accel is DEAD, but min_accel is not");
        }
        if self.min_start == DEAD && self.max_start != DEAD {
            err!("min_start is DEAD, but max_start is not");
        }
        if self.min_start != DEAD && self.max_start == DEAD {
            err!("max_start is DEAD, but min_start is not");
        }
        // Check that ranges are well formed.
        if self.min_match > self.max_match {
            err!("min_match should not be greater than max_match");
        }
        if self.min_accel > self.max_accel {
            err!("min_accel should not be greater than max_accel");
        }
        if self.min_start > self.max_start {
            err!("min_start should not be greater than max_start");
        }
        // Check that ranges are ordered with respect to one another. The
        // expected layout is: quit, then matches, then accels, then starts.
        // (Each range check is guarded since an empty range is encoded with
        // DEAD endpoints and would otherwise trip these comparisons.)
        if self.matches() && self.quit_id >= self.min_match {
            err!("quit_id should not be greater than min_match");
        }
        if self.accels() && self.quit_id >= self.min_accel {
            err!("quit_id should not be greater than min_accel");
        }
        if self.starts() && self.quit_id >= self.min_start {
            err!("quit_id should not be greater than min_start");
        }
        if self.matches() && self.accels() && self.min_accel < self.min_match {
            err!("min_match should not be greater than min_accel");
        }
        if self.matches() && self.starts() && self.min_start < self.min_match {
            err!("min_match should not be greater than min_start");
        }
        if self.accels() && self.starts() && self.min_start < self.min_accel {
            err!("min_accel should not be greater than min_start");
        }
        // Check that max is at least as big as everything else.
        if self.max < self.quit_id {
            err!("quit_id should not be greater than max");
        }
        if self.max < self.max_match {
            err!("max_match should not be greater than max");
        }
        if self.max < self.max_accel {
            err!("max_accel should not be greater than max");
        }
        if self.max < self.max_start {
            err!("max_start should not be greater than max");
        }
        Ok(())
    }
    /// Validate that the special state information is compatible with the
    /// given state len.
    pub(crate) fn validate_state_len(
        &self,
        len: usize,
        stride2: usize,
    ) -> Result<(), DeserializeError> {
        // We assume that 'validate' has already passed, so we know that 'max'
        // is truly the max. So all we need to check is that the max state ID
        // is less than the state ID len. The max legal value here is len-1,
        // which occurs when there are no non-special states.
        //
        // State IDs are premultiplied by the stride, so shifting right by
        // 'stride2' (where stride = 2^stride2) recovers the state's index.
        if (self.max.as_usize() >> stride2) >= len {
            err!("max should not be greater than or equal to state length");
        }
        Ok(())
    }
    /// Write the IDs and ranges for special states to the given byte buffer.
    /// The buffer given must have enough room to store all data, otherwise
    /// this will return an error. The number of bytes written is returned
    /// on success. The number of bytes written is guaranteed to be a multiple
    /// of 8.
    pub(crate) fn write_to<E: Endian>(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        use crate::util::wire::write_state_id as write;
        if dst.len() < self.write_to_len() {
            return Err(SerializeError::buffer_too_small("special state ids"));
        }
        // The write order here must match the read order in 'from_bytes'.
        let mut nwrite = 0;
        nwrite += write::<E>(self.max, &mut dst[nwrite..]);
        nwrite += write::<E>(self.quit_id, &mut dst[nwrite..]);
        nwrite += write::<E>(self.min_match, &mut dst[nwrite..]);
        nwrite += write::<E>(self.max_match, &mut dst[nwrite..]);
        nwrite += write::<E>(self.min_accel, &mut dst[nwrite..]);
        nwrite += write::<E>(self.max_accel, &mut dst[nwrite..]);
        nwrite += write::<E>(self.min_start, &mut dst[nwrite..]);
        nwrite += write::<E>(self.max_start, &mut dst[nwrite..]);
        assert_eq!(
            self.write_to_len(),
            nwrite,
            "expected to write certain number of bytes",
        );
        assert_eq!(
            nwrite % 8,
            0,
            "expected to write multiple of 8 bytes for special states",
        );
        Ok(nwrite)
    }
    /// Returns the total number of bytes written by `write_to`.
    pub(crate) fn write_to_len(&self) -> usize {
        // 8 fields, each a single serialized state ID.
        8 * StateID::SIZE
    }
    /// Sets the maximum special state ID based on the current values. This
    /// should be used once all possible state IDs are set.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn set_max(&mut self) {
        use core::cmp::max;
        self.max = max(
            self.quit_id,
            max(self.max_match, max(self.max_accel, self.max_start)),
        );
    }
    /// Sets the maximum special state ID such that starting states are not
    /// considered "special." This also marks the min/max starting states as
    /// DEAD such that 'is_start_state' always returns false, even if the state
    /// is actually a starting state.
    ///
    /// This is useful when there is no prefilter set. It will avoid
    /// ping-ponging between the hot path in the DFA search code and the start
    /// state handling code, which is typically only useful for executing a
    /// prefilter.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn set_no_special_start_states(&mut self) {
        use core::cmp::max;
        // Same as 'set_max', but with 'max_start' deliberately excluded.
        self.max = max(self.quit_id, max(self.max_match, self.max_accel));
        self.min_start = DEAD;
        self.max_start = DEAD;
    }
    /// Returns true if and only if the given state ID is a special state.
    #[inline]
    pub(crate) fn is_special_state(&self, id: StateID) -> bool {
        // Special states are shuffled to the low end of the ID space, so a
        // single comparison suffices. (This is the hot-path check.)
        id <= self.max
    }
    /// Returns true if and only if the given state ID is a dead state.
    #[inline]
    pub(crate) fn is_dead_state(&self, id: StateID) -> bool {
        id == DEAD
    }
    /// Returns true if and only if the given state ID is a quit state.
    #[inline]
    pub(crate) fn is_quit_state(&self, id: StateID) -> bool {
        // 'quit_id' is DEAD when the DFA has no quit state, so the dead
        // state must be excluded explicitly.
        !self.is_dead_state(id) && self.quit_id == id
    }
    /// Returns true if and only if the given state ID is a match state.
    #[inline]
    pub(crate) fn is_match_state(&self, id: StateID) -> bool {
        // An empty range has DEAD endpoints, so the dead-state check keeps
        // DEAD from matching the degenerate [DEAD, DEAD] range.
        !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match
    }
    /// Returns true if and only if the given state ID is an accel state.
    #[inline]
    pub(crate) fn is_accel_state(&self, id: StateID) -> bool {
        !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel
    }
    /// Returns true if and only if the given state ID is a start state.
    #[inline]
    pub(crate) fn is_start_state(&self, id: StateID) -> bool {
        !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start
    }
    /// Returns the total number of match states for a dense table based DFA.
    #[inline]
    pub(crate) fn match_len(&self, stride: usize) -> usize {
        if self.matches() {
            // IDs are premultiplied by the stride, so the inclusive range
            // [min_match, max_match] spans (max - min)/stride + 1 states,
            // which is what this expression computes.
            (self.max_match.as_usize() - self.min_match.as_usize() + stride)
                / stride
        } else {
            0
        }
    }
    /// Returns true if and only if there is at least one match state.
    #[inline]
    pub(crate) fn matches(&self) -> bool {
        // An empty match range is encoded with DEAD endpoints.
        self.min_match != DEAD
    }
    /// Returns the total number of accel states.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn accel_len(&self, stride: usize) -> usize {
        if self.accels() {
            // Same premultiplied-ID arithmetic as in 'match_len'.
            (self.max_accel.as_usize() - self.min_accel.as_usize() + stride)
                / stride
        } else {
            0
        }
    }
    /// Returns true if and only if there is at least one accel state.
    #[inline]
    pub(crate) fn accels(&self) -> bool {
        self.min_accel != DEAD
    }
    /// Returns true if and only if there is at least one start state.
    #[inline]
    pub(crate) fn starts(&self) -> bool {
        self.min_start != DEAD
    }
}

View file

@ -0,0 +1,74 @@
use core::mem::size_of;
use crate::util::wire::{self, DeserializeError, Endian, SerializeError};
/// The kind of anchored starting configurations to support in a DFA.
///
/// Fully compiled DFAs need to be explicitly configured as to which anchored
/// starting configurations to support. The reason for not just supporting
/// everything unconditionally is that it can use more resources (such as
/// memory and build time). The downside of this is that if you try to execute
/// a search using an [`Anchored`](crate::Anchored) mode that is not supported
/// by the DFA, then the search will return an error.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum StartKind {
    /// Support both anchored and unanchored searches.
    Both,
    /// Support only unanchored searches. Requesting an anchored search will
    /// return an error.
    ///
    /// Note that even if an unanchored search is requested, the pattern itself
    /// may still be anchored. For example, `^abc` will only match `abc` at the
    /// start of a haystack. This will remain true, even if the regex engine
    /// only supported unanchored searches.
    Unanchored,
    /// Support only anchored searches. Requesting an unanchored search will
    /// return an error.
    Anchored,
}
impl StartKind {
    /// Deserialize a `StartKind` from the beginning of the given slice.
    ///
    /// On success, returns the kind along with the number of bytes read.
    /// Returns an error if the slice is too short or if the encoded integer
    /// does not correspond to a known variant.
    pub(crate) fn from_bytes(
        slice: &[u8],
    ) -> Result<(StartKind, usize), DeserializeError> {
        wire::check_slice_len(slice, size_of::<u32>(), "start kind bytes")?;
        let (repr, nread) = wire::try_read_u32(slice, "start kind integer")?;
        // This decoding must mirror the encoding in 'write_to'.
        let kind = match repr {
            0 => StartKind::Both,
            1 => StartKind::Unanchored,
            2 => StartKind::Anchored,
            _ => {
                return Err(DeserializeError::generic(
                    "unrecognized start kind",
                ))
            }
        };
        Ok((kind, nread))
    }
    /// Serialize this `StartKind` into the given buffer as a `u32`.
    ///
    /// Returns the number of bytes written, or an error if the buffer is
    /// too small to hold the encoding.
    pub(crate) fn write_to<E: Endian>(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        let nwrite = self.write_to_len();
        if dst.len() < nwrite {
            return Err(SerializeError::buffer_too_small("start kind"));
        }
        // This encoding must mirror the decoding in 'from_bytes'.
        let repr = match *self {
            StartKind::Both => 0,
            StartKind::Unanchored => 1,
            StartKind::Anchored => 2,
        };
        E::write_u32(repr, dst);
        Ok(nwrite)
    }
    /// Returns the number of bytes written by `write_to`.
    pub(crate) fn write_to_len(&self) -> usize {
        size_of::<u32>()
    }
    /// Returns true if this configuration supports unanchored searches.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn has_unanchored(&self) -> bool {
        match *self {
            StartKind::Both | StartKind::Unanchored => true,
            StartKind::Anchored => false,
        }
    }
    /// Returns true if this configuration supports anchored searches.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn has_anchored(&self) -> bool {
        match *self {
            StartKind::Both | StartKind::Anchored => true,
            StartKind::Unanchored => false,
        }
    }
}