Vendor things
This commit is contained in:
parent
5deceec006
commit
977e3c17e5
19434 changed files with 10682014 additions and 0 deletions
517
third-party/vendor/regex-automata/src/dfa/accel.rs
vendored
Normal file
517
third-party/vendor/regex-automata/src/dfa/accel.rs
vendored
Normal file
|
|
@ -0,0 +1,517 @@
|
|||
// This module defines some core types for dealing with accelerated DFA states.
|
||||
// Briefly, a DFA state can be "accelerated" if all of its transitions except
|
||||
// for a few loop back to itself. This directly implies that the only way out
|
||||
// of such a state is if a byte corresponding to one of those non-loopback
|
||||
// transitions is found. Such states are often found in simple repetitions in
|
||||
// non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its
|
||||
// DFA with regex-cli:
|
||||
//
|
||||
// $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table
|
||||
// D 000000:
|
||||
// Q 000001:
|
||||
// *000002:
|
||||
// A 000003: \x00-` => 3, a => 8, b-\xFF => 3
|
||||
// A 000004: \x00-` => 4, a => 7, b-\xFF => 4
|
||||
// 000005: \x00-` => 4, b-\xFF => 4
|
||||
// 000006: \x00-` => 3, a => 6, b-\xFF => 3
|
||||
// 000007: \x00-\xFF => 2, EOI => 2
|
||||
// 000008: \x00-\xFF => 2, EOI => 2
|
||||
//
|
||||
// In particular, state 3 is accelerated (shown via the 'A' indicator) since
|
||||
// the only way to leave that state once entered is to see an 'a' byte. If
|
||||
// there is a long run of non-'a' bytes, then using something like 'memchr'
|
||||
// to find the next 'a' byte can be significantly faster than just using the
|
||||
// standard byte-at-a-time state machine.
|
||||
//
|
||||
// Unfortunately, this optimization rarely applies when Unicode is enabled.
|
||||
// For example, patterns like '[^a]' don't actually match any byte that isn't
|
||||
// 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't
|
||||
// 'a'. This makes the state machine much more complex---far beyond a single
|
||||
// state---and removes the ability to easily accelerate it. (Because if the
|
||||
// machine sees a non-UTF-8 sequence, then the machine won't match through it.)
|
||||
//
|
||||
// In practice, we only consider accelerating states that have 3 or fewer
// non-loop transitions. We stop there partly because of diminishing returns,
// but also because that's what the memchr crate supports. The structures below
|
||||
// hard-code this assumption and provide (de)serialization APIs for use inside
|
||||
// a DFA.
|
||||
//
|
||||
// And finally, note that there is some trickery involved in making it very
|
||||
// fast to not only check whether a state is accelerated at search time, but
|
||||
// also to access the bytes to search for to implement the acceleration itself.
|
||||
// dfa/special.rs provides more detail, but the short story is that all
|
||||
// accelerated states appear contiguously in a DFA. This means we can represent
|
||||
// the ID space of all accelerated DFA states with a single range. So given
|
||||
// a state ID, we can determine whether it's accelerated via
|
||||
//
|
||||
// min_accel_id <= id <= max_accel_id
|
||||
//
|
||||
// And find its corresponding accelerator with:
|
||||
//
|
||||
// accels.get((id - min_accel_id) / dfa_stride)
|
||||
|
||||
#[cfg(feature = "dfa-build")]
|
||||
use alloc::{vec, vec::Vec};
|
||||
|
||||
use crate::util::{
|
||||
int::Pointer,
|
||||
memchr,
|
||||
wire::{self, DeserializeError, Endian, SerializeError},
|
||||
};
|
||||
|
||||
/// The base type used to represent a collection of accelerators.
///
/// While an `Accel` is represented as a fixed size array of bytes, a
/// *collection* of `Accel`s (called `Accels`) is represented internally as a
/// slice of u32. While it's a bit unnatural to do this and costs us a bit of
/// fairly low-risk not-safe code, it lets us remove the need for a second type
/// parameter in the definition of dense::DFA. (Which really wants everything
/// to be a slice of u32.)
type AccelTy = u32;

/// The size of the unit of representation for accelerators.
///
/// ACCEL_CAP *must* be a multiple of this size.
const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>();

/// The maximum length in bytes that a single Accel can be. This is distinct
/// from the capacity of an accelerator in that the length represents only the
/// bytes that should be read.
///
/// (One length byte followed by up to 3 needle bytes.)
const ACCEL_LEN: usize = 4;

/// The capacity of each accelerator, in bytes. We set this to 8 since it's a
/// multiple of 4 (our ID size) and because it gives us a little wiggle room
/// if we want to support more accel bytes in the future without a breaking
/// change.
///
/// This MUST be a multiple of ACCEL_TY_SIZE.
const ACCEL_CAP: usize = 8;
|
||||
|
||||
/// Search for between 1 and 3 needle bytes in the given haystack, starting the
|
||||
/// search at the given position. If `needles` has a length other than 1-3,
|
||||
/// then this panics.
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn find_fwd(
|
||||
needles: &[u8],
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
) -> Option<usize> {
|
||||
let bs = needles;
|
||||
let i = match needles.len() {
|
||||
1 => memchr::memchr(bs[0], &haystack[at..])?,
|
||||
2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?,
|
||||
3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?,
|
||||
0 => panic!("cannot find with empty needles"),
|
||||
n => panic!("invalid needles length: {}", n),
|
||||
};
|
||||
Some(at + i)
|
||||
}
|
||||
|
||||
/// Search for between 1 and 3 needle bytes in the given haystack in reverse,
|
||||
/// starting the search at the given position. If `needles` has a length other
|
||||
/// than 1-3, then this panics.
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn find_rev(
|
||||
needles: &[u8],
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
) -> Option<usize> {
|
||||
let bs = needles;
|
||||
match needles.len() {
|
||||
1 => memchr::memrchr(bs[0], &haystack[..at]),
|
||||
2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]),
|
||||
3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]),
|
||||
0 => panic!("cannot find with empty needles"),
|
||||
n => panic!("invalid needles length: {}", n),
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents the accelerators for all accelerated states in a dense DFA.
///
/// The `A` type parameter represents the type of the underlying bytes.
/// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`.
#[derive(Clone)]
pub(crate) struct Accels<A> {
    /// A length prefixed slice of contiguous accelerators. See the top comment
    /// in this module for more details on how we can jump from a DFA's state
    /// ID to an accelerator in this list.
    ///
    /// The first 4 bytes always correspond to the number of accelerators
    /// that follow.
    accels: A,
}
|
||||
|
||||
#[cfg(feature = "dfa-build")]
impl Accels<Vec<AccelTy>> {
    /// Create an empty sequence of accelerators for a DFA.
    pub fn empty() -> Accels<Vec<AccelTy>> {
        // The single leading AccelTy is the accelerator count, which begins
        // at zero.
        Accels { accels: vec![0] }
    }

    /// Add an accelerator to this sequence.
    ///
    /// The accelerator is appended to the end of the sequence, so additions
    /// should be done in correspondence with its state in the DFA.
    ///
    /// This panics if this results in more accelerators than AccelTy::MAX.
    pub fn add(&mut self, accel: Accel) {
        self.accels.extend_from_slice(&accel.as_accel_tys());
        let count = self.len();
        self.set_len(count + 1);
    }

    /// Set the number of accelerators in this sequence, which is encoded in
    /// the first 4 bytes of the underlying bytes.
    fn set_len(&mut self, new_len: usize) {
        // The only way an accelerator gets added is if a state exists for
        // it, and if a state exists, then its index is guaranteed to be
        // representable by a AccelTy by virtue of the guarantees provided by
        // StateID.
        let encoded = AccelTy::try_from(new_len).unwrap();
        self.accels[0] = encoded;
    }
}
|
||||
|
||||
impl<'a> Accels<&'a [AccelTy]> {
    /// Deserialize a sequence of accelerators from the given bytes. If there
    /// was a problem deserializing, then an error is returned.
    ///
    /// This is guaranteed to run in constant time. This does not guarantee
    /// that every accelerator in the returned collection is valid. Thus,
    /// accessing one may panic, or not-safe code that relies on accelerators
    /// being correct may result in UB.
    ///
    /// Callers may check the validity of every accelerator with the `validate`
    /// method.
    pub fn from_bytes_unchecked(
        mut slice: &'a [u8],
    ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> {
        // Remember where the slice started so we can report how many bytes
        // were consumed at the end.
        let slice_start = slice.as_ptr().as_usize();

        let (accel_len, _) =
            wire::try_read_u32_as_usize(slice, "accelerators length")?;
        // The accelerator length is part of the accel_tys slice that
        // we deserialize. This is perhaps a bit idiosyncratic. It would
        // probably be better to split out the length into a real field.

        // Each accelerator occupies 2 AccelTys (ACCEL_CAP = 8 bytes), plus
        // 1 AccelTy for the length prefix itself. Checked arithmetic ensures
        // corrupt input cannot overflow these computations.
        let accel_tys_len = wire::add(
            wire::mul(accel_len, 2, "total number of accelerator accel_tys")?,
            1,
            "total number of accel_tys",
        )?;
        let accel_tys_bytes_len = wire::mul(
            ACCEL_TY_SIZE,
            accel_tys_len,
            "total number of bytes in accelerators",
        )?;
        wire::check_slice_len(slice, accel_tys_bytes_len, "accelerators")?;
        wire::check_alignment::<AccelTy>(slice)?;
        let accel_tys = &slice[..accel_tys_bytes_len];
        slice = &slice[accel_tys_bytes_len..];
        // SAFETY: We've checked the length and alignment above, and since
        // slice is just bytes and AccelTy is just a u32, we can safely cast to
        // a slice of &[AccelTy].
        let accels = unsafe {
            core::slice::from_raw_parts(
                accel_tys.as_ptr().cast::<AccelTy>(),
                accel_tys_len,
            )
        };
        Ok((Accels { accels }, slice.as_ptr().as_usize() - slice_start))
    }
}
|
||||
|
||||
impl<A: AsRef<[AccelTy]>> Accels<A> {
    /// Return an owned version of the accelerators.
    #[cfg(feature = "alloc")]
    pub fn to_owned(&self) -> Accels<alloc::vec::Vec<AccelTy>> {
        Accels { accels: self.accels.as_ref().to_vec() }
    }

    /// Return a borrowed version of the accelerators.
    pub fn as_ref(&self) -> Accels<&[AccelTy]> {
        Accels { accels: self.accels.as_ref() }
    }

    /// Return the bytes representing the serialization of the accelerators.
    pub fn as_bytes(&self) -> &[u8] {
        let accels = self.accels.as_ref();
        // SAFETY: This is safe because accels is a just a slice of AccelTy,
        // and u8 always has a smaller alignment.
        unsafe {
            core::slice::from_raw_parts(
                accels.as_ptr().cast::<u8>(),
                accels.len() * ACCEL_TY_SIZE,
            )
        }
    }

    /// Returns the memory usage, in bytes, of these accelerators.
    ///
    /// The memory usage is computed based on the number of bytes used to
    /// represent all of the accelerators.
    ///
    /// This does **not** include the stack size used by this value.
    pub fn memory_usage(&self) -> usize {
        self.as_bytes().len()
    }

    /// Return the bytes to search for corresponding to the accelerator in this
    /// sequence at index `i`. If no such accelerator exists, then this panics.
    ///
    /// The significance of the index is that it should be in correspondence
    /// with the index of the corresponding DFA. That is, accelerated DFA
    /// states are stored contiguously in the DFA and have an ordering implied
    /// by their respective state IDs. The state's index in that sequence
    /// corresponds to the index of its corresponding accelerator.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn needles(&self, i: usize) -> &[u8] {
        if i >= self.len() {
            panic!("invalid accelerator index {}", i);
        }
        let bytes = self.as_bytes();
        // Skip the 4-byte count prefix, then jump to the i'th fixed-size
        // (ACCEL_CAP) accelerator. Its first byte holds the needle count.
        let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
        let len = usize::from(bytes[offset]);
        &bytes[offset + 1..offset + 1 + len]
    }

    /// Return the total number of accelerators in this sequence.
    pub fn len(&self) -> usize {
        // This should never panic since deserialization checks that the
        // length can fit into a usize.
        usize::try_from(self.accels.as_ref()[0]).unwrap()
    }

    /// Return the accelerator in this sequence at index `i`. If no such
    /// accelerator exists, then this returns None.
    ///
    /// See the docs for `needles` on the significance of the index.
    fn get(&self, i: usize) -> Option<Accel> {
        if i >= self.len() {
            return None;
        }
        let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
        let accel = Accel::from_slice(&self.as_bytes()[offset..])
            .expect("Accels must contain valid accelerators");
        Some(accel)
    }

    /// Returns an iterator of accelerators in this sequence.
    fn iter(&self) -> IterAccels<'_, A> {
        IterAccels { accels: self, i: 0 }
    }

    /// Writes these accelerators to the given byte buffer using the indicated
    /// endianness. If the given buffer is too small, then an error is
    /// returned. Upon success, the total number of bytes written is returned.
    /// The number of bytes written is guaranteed to be a multiple of 8.
    pub fn write_to<E: Endian>(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        let nwrite = self.write_to_len();
        assert_eq!(
            nwrite % ACCEL_TY_SIZE,
            0,
            "expected accelerator bytes written to be a multiple of {}",
            ACCEL_TY_SIZE,
        );
        if dst.len() < nwrite {
            return Err(SerializeError::buffer_too_small("accelerators"));
        }

        // The number of accelerators can never exceed AccelTy::MAX.
        E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst);
        // The actual accelerators are just raw bytes and thus their endianness
        // is irrelevant. So we can copy them as bytes.
        dst[ACCEL_TY_SIZE..nwrite]
            .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]);
        Ok(nwrite)
    }

    /// Validates that every accelerator in this collection can be successfully
    /// deserialized as a valid accelerator.
    pub fn validate(&self) -> Result<(), DeserializeError> {
        for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) {
            let _ = Accel::from_slice(chunk)?;
        }
        Ok(())
    }

    /// Returns the total number of bytes written by `write_to`.
    pub fn write_to_len(&self) -> usize {
        self.as_bytes().len()
    }
}
|
||||
|
||||
impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> {
    /// Render the collection as `Accels([...])`, with one list entry per
    /// accelerator.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(f, "Accels(")?;
        f.debug_list().entries(self.iter()).finish()?;
        write!(f, ")")
    }
}
|
||||
|
||||
/// An iterator over the accelerators in an `Accels` collection.
#[derive(Debug)]
struct IterAccels<'a, A: AsRef<[AccelTy]>> {
    /// The collection being iterated over.
    accels: &'a Accels<A>,
    /// The index of the next accelerator to yield.
    i: usize,
}
|
||||
|
||||
impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> {
|
||||
type Item = Accel;
|
||||
|
||||
fn next(&mut self) -> Option<Accel> {
|
||||
let accel = self.accels.get(self.i)?;
|
||||
self.i += 1;
|
||||
Some(accel)
|
||||
}
|
||||
}
|
||||
|
||||
/// Accel represents a structure for determining how to "accelerate" a DFA
/// state.
///
/// Namely, it contains zero or more bytes that must be seen in order for the
/// DFA to leave the state it is associated with. In practice, the actual range
/// is 1 to 3 bytes.
///
/// The purpose of acceleration is to identify states whose vast majority
/// of transitions are just loops back to the same state. For example,
/// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state
/// (corresponding to `[^a]+`) where all transitions *except* for `a` and
/// `b` loop back to itself. Thus, this state can be "accelerated" by simply
/// looking for the next occurrence of either `a` or `b` instead of explicitly
/// following transitions. (In this case, `b` transitions to the next state
/// where as `a` would transition to the dead state.)
#[derive(Clone)]
pub(crate) struct Accel {
    /// The first byte is the length. Subsequent bytes are the accelerated
    /// bytes.
    ///
    /// Note that we make every accelerator 8 bytes as a slightly wasteful
    /// way of making sure alignment is always correct for state ID sizes of
    /// 1, 2, 4 and 8. This should be okay since accelerated states aren't
    /// particularly common, especially when Unicode is enabled.
    bytes: [u8; ACCEL_CAP],
}
|
||||
|
||||
impl Accel {
|
||||
/// Returns an empty accel, where no bytes are accelerated.
|
||||
#[cfg(feature = "dfa-build")]
|
||||
pub fn new() -> Accel {
|
||||
Accel { bytes: [0; ACCEL_CAP] }
|
||||
}
|
||||
|
||||
/// Returns a verified accelerator derived from the beginning of the given
|
||||
/// slice.
|
||||
///
|
||||
/// If the slice is not long enough or contains invalid bytes for an
|
||||
/// accelerator, then this returns an error.
|
||||
pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> {
|
||||
slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())];
|
||||
let bytes = slice
|
||||
.try_into()
|
||||
.map_err(|_| DeserializeError::buffer_too_small("accelerator"))?;
|
||||
Accel::from_bytes(bytes)
|
||||
}
|
||||
|
||||
/// Returns a verified accelerator derived from raw bytes.
|
||||
///
|
||||
/// If the given bytes are invalid, then this returns an error.
|
||||
fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> {
|
||||
if usize::from(bytes[0]) >= ACCEL_LEN {
|
||||
return Err(DeserializeError::generic(
|
||||
"accelerator bytes cannot have length more than 3",
|
||||
));
|
||||
}
|
||||
Ok(Accel::from_bytes_unchecked(bytes))
|
||||
}
|
||||
|
||||
/// Returns an accelerator derived from raw bytes.
|
||||
///
|
||||
/// This does not check whether the given bytes are valid. Invalid bytes
|
||||
/// cannot sacrifice memory safety, but may result in panics or silent
|
||||
/// logic bugs.
|
||||
fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel {
|
||||
Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] }
|
||||
}
|
||||
|
||||
/// Attempts to add the given byte to this accelerator. If the accelerator
|
||||
/// is already full or thinks the byte is a poor accelerator, then this
|
||||
/// returns false. Otherwise, returns true.
|
||||
///
|
||||
/// If the given byte is already in this accelerator, then it panics.
|
||||
#[cfg(feature = "dfa-build")]
|
||||
pub fn add(&mut self, byte: u8) -> bool {
|
||||
if self.len() >= 3 {
|
||||
return false;
|
||||
}
|
||||
// As a special case, we totally reject trying to accelerate a state
|
||||
// with an ASCII space. In most cases, it occurs very frequently, and
|
||||
// tends to result in worse overall performance.
|
||||
if byte == b' ' {
|
||||
return false;
|
||||
}
|
||||
assert!(
|
||||
!self.contains(byte),
|
||||
"accelerator already contains {:?}",
|
||||
crate::util::escape::DebugByte(byte)
|
||||
);
|
||||
self.bytes[self.len() + 1] = byte;
|
||||
self.bytes[0] += 1;
|
||||
true
|
||||
}
|
||||
|
||||
/// Return the number of bytes in this accelerator.
|
||||
pub fn len(&self) -> usize {
|
||||
usize::from(self.bytes[0])
|
||||
}
|
||||
|
||||
/// Returns true if and only if there are no bytes in this accelerator.
|
||||
#[cfg(feature = "dfa-build")]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
/// Returns the slice of bytes to accelerate.
|
||||
///
|
||||
/// If this accelerator is empty, then this returns an empty slice.
|
||||
fn needles(&self) -> &[u8] {
|
||||
&self.bytes[1..1 + self.len()]
|
||||
}
|
||||
|
||||
/// Returns true if and only if this accelerator will accelerate the given
|
||||
/// byte.
|
||||
#[cfg(feature = "dfa-build")]
|
||||
fn contains(&self, byte: u8) -> bool {
|
||||
self.needles().iter().position(|&b| b == byte).is_some()
|
||||
}
|
||||
|
||||
/// Returns the accelerator bytes as an array of AccelTys.
|
||||
#[cfg(feature = "dfa-build")]
|
||||
fn as_accel_tys(&self) -> [AccelTy; 2] {
|
||||
assert_eq!(ACCEL_CAP, 8);
|
||||
// These unwraps are OK since ACCEL_CAP is set to 8.
|
||||
let first =
|
||||
AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap());
|
||||
let second =
|
||||
AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap());
|
||||
[first, second]
|
||||
}
|
||||
}
|
||||
|
||||
impl core::fmt::Debug for Accel {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
write!(f, "Accel(")?;
|
||||
let mut set = f.debug_set();
|
||||
for &b in self.needles() {
|
||||
set.entry(&crate::util::escape::DebugByte(b));
|
||||
}
|
||||
set.finish()?;
|
||||
write!(f, ")")
|
||||
}
|
||||
}
|
||||
2260
third-party/vendor/regex-automata/src/dfa/automaton.rs
vendored
Normal file
2260
third-party/vendor/regex-automata/src/dfa/automaton.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
5153
third-party/vendor/regex-automata/src/dfa/dense.rs
vendored
Normal file
5153
third-party/vendor/regex-automata/src/dfa/dense.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
599
third-party/vendor/regex-automata/src/dfa/determinize.rs
vendored
Normal file
599
third-party/vendor/regex-automata/src/dfa/determinize.rs
vendored
Normal file
|
|
@ -0,0 +1,599 @@
|
|||
use alloc::{collections::BTreeMap, vec::Vec};
|
||||
|
||||
use crate::{
|
||||
dfa::{
|
||||
dense::{self, BuildError},
|
||||
DEAD,
|
||||
},
|
||||
nfa::thompson,
|
||||
util::{
|
||||
self,
|
||||
alphabet::{self, ByteSet},
|
||||
determinize::{State, StateBuilderEmpty, StateBuilderNFA},
|
||||
primitives::{PatternID, StateID},
|
||||
search::{Anchored, MatchKind},
|
||||
sparse_set::SparseSets,
|
||||
start::Start,
|
||||
},
|
||||
};
|
||||
|
||||
/// A builder for configuring and running a DFA determinizer.
#[derive(Clone, Debug)]
pub(crate) struct Config {
    /// The match semantics to use. Defaults to leftmost-first. See
    /// `Config::match_kind`.
    match_kind: MatchKind,
    /// The set of bytes that cause the DFA to enter a quit state. Empty by
    /// default. See `Config::quit`.
    quit: ByteSet,
    /// An optional limit, in bytes, on the heap used by the DFA itself. See
    /// `Config::dfa_size_limit`.
    dfa_size_limit: Option<usize>,
    /// An optional limit, in bytes, on the auxiliary heap used by
    /// determinization. See `Config::determinize_size_limit`.
    determinize_size_limit: Option<usize>,
}
|
||||
|
||||
impl Config {
    /// Create a new default config for a determinizer. The determinizer may be
    /// configured before calling `run`.
    pub fn new() -> Config {
        Config {
            match_kind: MatchKind::LeftmostFirst,
            quit: ByteSet::empty(),
            dfa_size_limit: None,
            determinize_size_limit: None,
        }
    }

    /// Run determinization on the given NFA and write the resulting DFA into
    /// the one given. The DFA given should be initialized but otherwise empty.
    /// "Initialized" means that it is setup to handle the NFA's byte classes,
    /// number of patterns and whether to build start states for each pattern.
    pub fn run(
        &self,
        nfa: &thompson::NFA,
        dfa: &mut dense::OwnedDFA,
    ) -> Result<(), BuildError> {
        let dead = State::dead();
        let quit = State::dead();
        let mut cache = StateMap::default();
        // We only insert the dead state here since its representation is
        // identical to the quit state. And we never want anything pointing
        // to the quit state other than specific transitions derived from the
        // determinizer's configured "quit" bytes.
        //
        // We do put the quit state into 'builder_states' below. This ensures
        // that a proper DFA state ID is allocated for it, and that no other
        // DFA state uses the "location after the DEAD state." That is, it
        // is assumed that the quit state is always the state immediately
        // following the DEAD state.
        cache.insert(dead.clone(), DEAD);

        let runner = Runner {
            config: self.clone(),
            nfa,
            dfa,
            builder_states: alloc::vec![dead, quit],
            cache,
            memory_usage_state: 0,
            sparses: SparseSets::new(nfa.states().len()),
            stack: alloc::vec![],
            scratch_state_builder: StateBuilderEmpty::new(),
        };
        runner.run()
    }

    /// The match semantics to use for determinization.
    ///
    /// MatchKind::All corresponds to the standard textbook construction.
    /// All possible match states are represented in the DFA.
    /// MatchKind::LeftmostFirst permits greediness and otherwise tries to
    /// simulate the match semantics of backtracking regex engines. Namely,
    /// only a subset of match states are built, and dead states are used to
    /// stop searches with an unanchored prefix.
    ///
    /// The default is MatchKind::LeftmostFirst.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
        self.match_kind = kind;
        self
    }

    /// The set of bytes to use that will cause the DFA to enter a quit state,
    /// stop searching and return an error. By default, this is empty.
    pub fn quit(&mut self, set: ByteSet) -> &mut Config {
        self.quit = set;
        self
    }

    /// The limit, in bytes of the heap, that the DFA is permitted to use. This
    /// does not include the auxiliary heap storage used by determinization.
    pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config {
        self.dfa_size_limit = bytes;
        self
    }

    /// The limit, in bytes of the heap, that determinization itself is allowed
    /// to use. This does not include the size of the DFA being built.
    pub fn determinize_size_limit(
        &mut self,
        bytes: Option<usize>,
    ) -> &mut Config {
        self.determinize_size_limit = bytes;
        self
    }
}
|
||||
|
||||
/// The actual implementation of determinization that converts an NFA to a DFA
/// through powerset construction.
///
/// This determinizer roughly follows the typical powerset construction, where
/// each DFA state is comprised of one or more NFA states. In the worst case,
/// there is one DFA state for every possible combination of NFA states. In
/// practice, this only happens in certain conditions, typically when there are
/// bounded repetitions.
///
/// The main differences between this implementation and typical
/// determinization are that this implementation delays matches by one state
/// and hackily makes look-around work. Comments below attempt to explain this.
///
/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
/// whichever is shorter.
#[derive(Debug)]
struct Runner<'a> {
    /// The configuration used to initialize determinization.
    config: Config,
    /// The NFA we're converting into a DFA.
    nfa: &'a thompson::NFA,
    /// The DFA we're building.
    dfa: &'a mut dense::OwnedDFA,
    /// Each DFA state being built is defined as an *ordered* set of NFA
    /// states, along with some meta facts about the ordered set of NFA states.
    ///
    /// This is never empty. The first state is always a dummy state such that
    /// a state id == 0 corresponds to a dead state. The second state is always
    /// the quit state.
    ///
    /// Why do we have states in both a `Vec` and in a cache map below?
    /// Well, they serve two different roles based on access patterns.
    /// `builder_states` is the canonical home of each state, and provides
    /// constant random access by a DFA state's ID. The cache map below, on
    /// the other hand, provides a quick way of searching for identical DFA
    /// states by using the DFA state as a key in the map. Of course, we use
    /// reference counting to avoid actually duplicating the state's data
    /// itself. (Although this has never been benchmarked.) Note that the cache
    /// map does not give us full minimization; it just lets us avoid some very
    /// obvious redundant states.
    ///
    /// Note that the index into this Vec isn't quite the DFA's state ID.
    /// Rather, it's just an index. To get the state ID, you have to multiply
    /// it by the DFA's stride. That's done by self.dfa.from_index. And the
    /// inverse is self.dfa.to_index.
    ///
    /// Moreover, DFA states don't usually retain the IDs assigned to them
    /// by their position in this Vec. After determinization completes,
    /// states are shuffled around to support other optimizations. See the
    /// sibling 'special' module for more details on that. (The reason for
    /// mentioning this is that if you print out the DFA for debugging during
    /// determinization, and then print out the final DFA after it is fully
    /// built, then the state IDs likely won't match up.)
    builder_states: Vec<State>,
    /// A cache of DFA states that already exist and can be easily looked up
    /// via ordered sets of NFA states.
    ///
    /// See `builder_states` docs for why we store states in two different
    /// ways.
    cache: StateMap,
    /// The memory usage, in bytes, used by builder_states and cache. We track
    /// this as new states are added since states use a variable amount of
    /// heap. Tracking this as we add states makes it possible to compute the
    /// total amount of memory used by the determinizer in constant time.
    memory_usage_state: usize,
    /// A pair of sparse sets for tracking ordered sets of NFA state IDs.
    /// These are reused throughout determinization. A bounded sparse set
    /// gives us constant time insertion, membership testing and clearing.
    sparses: SparseSets,
    /// Scratch space for a stack of NFA states to visit, for depth first
    /// visiting without recursion.
    stack: Vec<StateID>,
    /// Scratch space for storing an ordered sequence of NFA states, for
    /// amortizing allocation. This is principally useful for when we avoid
    /// adding a new DFA state since it already exists. In order to detect this
    /// case though, we still need an ordered set of NFA state IDs. So we use
    /// this space to stage that ordered set before we know whether we need to
    /// create a new DFA state or not.
    scratch_state_builder: StateBuilderEmpty,
}
|
||||
|
||||
/// A map from states to state identifiers. When using std, we use a standard
/// hashmap, since it's a bit faster for this use case. (Other maps, like
/// one's based on FNV, have not yet been benchmarked.)
///
/// The main purpose of this map is to reuse states where possible. This won't
/// fully minimize the DFA, but it works well in a lot of cases.
#[cfg(feature = "std")]
type StateMap = std::collections::HashMap<State, StateID>;
// In no_std builds we fall back to a BTreeMap, which only requires `Ord` on
// the key type.
#[cfg(not(feature = "std"))]
type StateMap = BTreeMap<State, StateID>;
|
||||
|
||||
impl<'a> Runner<'a> {
    /// Build the DFA. If there was a problem constructing the DFA (e.g., if
    /// the chosen state identifier representation is too small), then an error
    /// is returned.
    fn run(mut self) -> Result<(), BuildError> {
        // A DFA cannot implement Unicode word boundaries unless every
        // non-ASCII byte is a quit byte (which lets the search bail out
        // instead of producing wrong answers).
        if self.nfa.look_set_any().contains_word_unicode()
            && !self.config.quit.contains_range(0x80, 0xFF)
        {
            return Err(BuildError::unsupported_dfa_word_boundary_unicode());
        }

        // A sequence of "representative" bytes drawn from each equivalence
        // class. These representative bytes are fed to the NFA to compute
        // state transitions. This allows us to avoid re-computing state
        // transitions for bytes that are guaranteed to produce identical
        // results. Since computing the representatives needs to do a little
        // work, we do it once here because we'll be iterating over them a lot.
        let representatives: Vec<alphabet::Unit> =
            self.dfa.byte_classes().representatives(..).collect();
        // The set of all DFA state IDs that still need to have their
        // transitions set. We start by seeding this with all starting states.
        let mut uncompiled = alloc::vec![];
        self.add_all_starts(&mut uncompiled)?;
        while let Some(dfa_id) = uncompiled.pop() {
            for &unit in &representatives {
                // Quit bytes get no transition here; they were already wired
                // to the quit state when the DFA state was created.
                if unit.as_u8().map_or(false, |b| self.config.quit.contains(b))
                {
                    continue;
                }
                // In many cases, the state we transition to has already been
                // computed. 'cached_state' will do the minimal amount of work
                // to check this, and if it exists, immediately return an
                // already existing state ID.
                let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
                self.dfa.set_transition(dfa_id, unit, next_dfa_id);
                // If the state ID we got back is newly created, then we need
                // to compile it, so add it to our uncompiled frontier.
                if is_new {
                    uncompiled.push(next_dfa_id);
                }
            }
        }
        debug!(
            "determinization complete, memory usage: {}, \
             dense DFA size: {}, \
             is reverse? {}",
            self.memory_usage(),
            self.dfa.memory_usage(),
            self.nfa.is_reverse(),
        );

        // A map from DFA state ID to one or more NFA match IDs. Each NFA match
        // ID corresponds to a distinct regex pattern that matches in the state
        // corresponding to the key.
        let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
        self.cache.clear();
        #[cfg(feature = "logging")]
        let mut total_pat_len = 0;
        for (i, state) in self.builder_states.into_iter().enumerate() {
            if let Some(pat_ids) = state.match_pattern_ids() {
                let id = self.dfa.to_state_id(i);
                log! {
                    total_pat_len += pat_ids.len();
                }
                matches.insert(id, pat_ids);
            }
        }
        log! {
            use core::mem::size_of;
            let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
            let pats = total_pat_len * size_of::<PatternID>();
            let mem = (matches.len() * per_elem) + pats;
            log::debug!("matches map built, memory usage: {}", mem);
        }
        // At this point, we shuffle the "special" states in the final DFA.
        // This permits a DFA's match loop to detect a match condition (among
        // other things) by merely inspecting the current state's identifier,
        // and avoids the need for any additional auxiliary storage.
        self.dfa.shuffle(matches)?;
        Ok(())
    }

    /// Return the identifier for the next DFA state given an existing DFA
    /// state and an input byte. If the next DFA state already exists, then
    /// return its identifier from the cache. Otherwise, build the state, cache
    /// it and return its identifier.
    ///
    /// This routine returns a boolean indicating whether a new state was
    /// built. If a new state is built, then the caller needs to add it to its
    /// frontier of uncompiled DFA states to compute transitions for.
    fn cached_state(
        &mut self,
        dfa_id: StateID,
        unit: alphabet::Unit,
    ) -> Result<(StateID, bool), BuildError> {
        // Compute the set of all reachable NFA states, including epsilons.
        let empty_builder = self.get_state_builder();
        let builder = util::determinize::next(
            self.nfa,
            self.config.match_kind,
            &mut self.sparses,
            &mut self.stack,
            &self.builder_states[self.dfa.to_index(dfa_id)],
            unit,
            empty_builder,
        );
        self.maybe_add_state(builder)
    }

    /// Compute the set of DFA start states and add their identifiers in
    /// 'dfa_state_ids' (no duplicates are added).
    fn add_all_starts(
        &mut self,
        dfa_state_ids: &mut Vec<StateID>,
    ) -> Result<(), BuildError> {
        // These should be the first states added.
        assert!(dfa_state_ids.is_empty());
        // We only want to add (un)anchored starting states that is consistent
        // with our DFA's configuration. Unconditionally adding both (although
        // it is the default) can make DFAs quite a bit bigger.
        if self.dfa.start_kind().has_unanchored() {
            self.add_start_group(Anchored::No, dfa_state_ids)?;
        }
        if self.dfa.start_kind().has_anchored() {
            self.add_start_group(Anchored::Yes, dfa_state_ids)?;
        }
        // I previously had an 'assert' here checking that either
        // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it
        // turns out this isn't always true. For example, the NFA might have
        // one or more patterns but where all such patterns are just 'fail'
        // states. These will ultimately just compile down to DFA dead states,
        // and since the dead state was added earlier, no new DFA states are
        // added. And thus, it is valid and okay for 'dfa_state_ids' to be
        // empty even if there are a non-zero number of patterns in the NFA.

        // We only need to compute anchored start states for each pattern if it
        // was requested to do so.
        if self.dfa.starts_for_each_pattern() {
            for pid in self.nfa.patterns() {
                self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?;
            }
        }
        Ok(())
    }

    /// Add a group of start states for the given match pattern ID. Any new
    /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are
    /// pushed.)
    ///
    /// When pattern_id is None, then this will compile a group of unanchored
    /// start states (if the DFA is unanchored). When the pattern_id is
    /// present, then this will compile a group of anchored start states that
    /// only match the given pattern.
    ///
    /// This panics if `anchored` corresponds to an invalid pattern ID.
    fn add_start_group(
        &mut self,
        anchored: Anchored,
        dfa_state_ids: &mut Vec<StateID>,
    ) -> Result<(), BuildError> {
        let nfa_start = match anchored {
            Anchored::No => self.nfa.start_unanchored(),
            Anchored::Yes => self.nfa.start_anchored(),
            Anchored::Pattern(pid) => {
                self.nfa.start_pattern(pid).expect("valid pattern ID")
            }
        };

        // When compiling start states, we're careful not to build additional
        // states that aren't necessary. For example, if the NFA has no word
        // boundary assertion, then there's no reason to have distinct start
        // states for 'NonWordByte' and 'WordByte' starting configurations.
        // Instead, the 'WordByte' starting configuration can just point
        // directly to the start state for the 'NonWordByte' config.
        //
        // Note though that we only need to care about assertions in the prefix
        // of an NFA since this only concerns the starting states. (Actually,
        // the most precise thing we could do is look at the prefix
        // assertions of each pattern when 'anchored == Anchored::Pattern',
        // and then only compile extra states if the prefix is non-empty.) But
        // we settle for simplicity here instead of absolute minimalism. It is
        // somewhat rare, after all, for multiple patterns in the same regex to
        // have different prefix look-arounds.

        let (id, is_new) =
            self.add_one_start(nfa_start, Start::NonWordByte)?;
        self.dfa.set_start_state(anchored, Start::NonWordByte, id);
        if is_new {
            dfa_state_ids.push(id);
        }

        // No word-boundary assertions in the prefix? Then the 'WordByte'
        // configuration behaves identically to 'NonWordByte' and can share
        // its state.
        if !self.nfa.look_set_prefix_any().contains_word() {
            self.dfa.set_start_state(anchored, Start::WordByte, id);
        } else {
            let (id, is_new) =
                self.add_one_start(nfa_start, Start::WordByte)?;
            self.dfa.set_start_state(anchored, Start::WordByte, id);
            if is_new {
                dfa_state_ids.push(id);
            }
        }
        // Similarly, anchor assertions (^, $, \A, \z and friends) are the
        // only thing distinguishing the remaining start configurations.
        if !self.nfa.look_set_prefix_any().contains_anchor() {
            self.dfa.set_start_state(anchored, Start::Text, id);
            self.dfa.set_start_state(anchored, Start::LineLF, id);
            self.dfa.set_start_state(anchored, Start::LineCR, id);
            self.dfa.set_start_state(
                anchored,
                Start::CustomLineTerminator,
                id,
            );
        } else {
            let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
            self.dfa.set_start_state(anchored, Start::Text, id);
            if is_new {
                dfa_state_ids.push(id);
            }

            let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?;
            self.dfa.set_start_state(anchored, Start::LineLF, id);
            if is_new {
                dfa_state_ids.push(id);
            }

            let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?;
            self.dfa.set_start_state(anchored, Start::LineCR, id);
            if is_new {
                dfa_state_ids.push(id);
            }

            let (id, is_new) =
                self.add_one_start(nfa_start, Start::CustomLineTerminator)?;
            self.dfa.set_start_state(
                anchored,
                Start::CustomLineTerminator,
                id,
            );
            if is_new {
                dfa_state_ids.push(id);
            }
        }

        Ok(())
    }

    /// Add a new DFA start state corresponding to the given starting NFA
    /// state, and the starting search configuration. (The starting search
    /// configuration essentially tells us which look-behind assertions are
    /// true for this particular state.)
    ///
    /// The boolean returned indicates whether the state ID returned is a newly
    /// created state, or a previously cached state.
    fn add_one_start(
        &mut self,
        nfa_start: StateID,
        start: Start,
    ) -> Result<(StateID, bool), BuildError> {
        // Compute the look-behind assertions that are true in this starting
        // configuration, and then determine the epsilon closure. While
        // computing the epsilon closure, we only follow conditional epsilon
        // transitions that satisfy the look-behind assertions in 'look_have'.
        let mut builder_matches = self.get_state_builder().into_matches();
        util::determinize::set_lookbehind_from_start(
            self.nfa,
            &start,
            &mut builder_matches,
        );
        self.sparses.set1.clear();
        util::determinize::epsilon_closure(
            self.nfa,
            nfa_start,
            builder_matches.look_have(),
            &mut self.stack,
            &mut self.sparses.set1,
        );
        let mut builder = builder_matches.into_nfa();
        util::determinize::add_nfa_states(
            &self.nfa,
            &self.sparses.set1,
            &mut builder,
        );
        self.maybe_add_state(builder)
    }

    /// Adds the given state to the DFA being built depending on whether it
    /// already exists in this determinizer's cache.
    ///
    /// If it does exist, then the memory used by 'state' is put back into the
    /// determinizer and the previously created state's ID is returned. (Along
    /// with 'false', indicating that no new state was added.)
    ///
    /// If it does not exist, then the state is added to the DFA being built
    /// and a fresh ID is allocated (if ID allocation fails, then an error is
    /// returned) and returned. (Along with 'true', indicating that a new state
    /// was added.)
    fn maybe_add_state(
        &mut self,
        builder: StateBuilderNFA,
    ) -> Result<(StateID, bool), BuildError> {
        if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
            // Since we have a cached state, put the constructed state's
            // memory back into our scratch space, so that it can be reused.
            self.put_state_builder(builder);
            return Ok((cached_id, false));
        }
        self.add_state(builder).map(|sid| (sid, true))
    }

    /// Add the given state to the DFA and make it available in the cache.
    ///
    /// The state initially has no transitions. That is, it transitions to the
    /// dead state for all possible inputs, and transitions to the quit state
    /// for all quit bytes.
    ///
    /// If adding the state would exceed the maximum value for StateID, then an
    /// error is returned.
    fn add_state(
        &mut self,
        builder: StateBuilderNFA,
    ) -> Result<StateID, BuildError> {
        let id = self.dfa.add_empty_state()?;
        // Wire every configured quit byte to the quit state up front, so
        // 'run' never has to handle them when filling in transitions.
        if !self.config.quit.is_empty() {
            for b in self.config.quit.iter() {
                self.dfa.set_transition(
                    id,
                    alphabet::Unit::u8(b),
                    self.dfa.quit_id(),
                );
            }
        }
        let state = builder.to_state();
        // States use reference counting internally, so we only need to count
        // their memory usage once.
        self.memory_usage_state += state.memory_usage();
        self.builder_states.push(state.clone());
        self.cache.insert(state, id);
        self.put_state_builder(builder);
        // Enforce configured size limits as soon as they are exceeded.
        if let Some(limit) = self.config.dfa_size_limit {
            if self.dfa.memory_usage() > limit {
                return Err(BuildError::dfa_exceeded_size_limit(limit));
            }
        }
        if let Some(limit) = self.config.determinize_size_limit {
            if self.memory_usage() > limit {
                return Err(BuildError::determinize_exceeded_size_limit(
                    limit,
                ));
            }
        }
        Ok(id)
    }

    /// Returns a state builder from this determinizer that might have existing
    /// capacity. This helps avoid allocs in cases where a state is built that
    /// turns out to already be cached.
    ///
    /// Callers must put the state builder back with 'put_state_builder',
    /// otherwise the allocation reuse won't work.
    fn get_state_builder(&mut self) -> StateBuilderEmpty {
        core::mem::replace(
            &mut self.scratch_state_builder,
            StateBuilderEmpty::new(),
        )
    }

    /// Puts the given state builder back into this determinizer for reuse.
    ///
    /// Note that building a 'State' from a builder always creates a new
    /// alloc, so callers should always put the builder back.
    fn put_state_builder(&mut self, builder: StateBuilderNFA) {
        let _ = core::mem::replace(
            &mut self.scratch_state_builder,
            builder.clear(),
        );
    }

    /// Return the memory usage, in bytes, of this determinizer at the current
    /// point in time. This does not include memory used by the NFA or the
    /// dense DFA itself.
    fn memory_usage(&self) -> usize {
        use core::mem::size_of;

        self.builder_states.len() * size_of::<State>()
            // Maps likely use more memory than this, but it's probably close.
            + self.cache.len() * (size_of::<State>() + size_of::<StateID>())
            + self.memory_usage_state
            + self.stack.capacity() * size_of::<StateID>()
            + self.scratch_state_builder.capacity()
    }
}
|
||||
463
third-party/vendor/regex-automata/src/dfa/minimize.rs
vendored
Normal file
463
third-party/vendor/regex-automata/src/dfa/minimize.rs
vendored
Normal file
|
|
@ -0,0 +1,463 @@
|
|||
use core::{cell::RefCell, fmt, mem};
|
||||
|
||||
use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec};
|
||||
|
||||
use crate::{
|
||||
dfa::{automaton::Automaton, dense, DEAD},
|
||||
util::{
|
||||
alphabet,
|
||||
primitives::{PatternID, StateID},
|
||||
},
|
||||
};
|
||||
|
||||
/// An implementation of Hopcroft's algorithm for minimizing DFAs.
///
/// The algorithm implemented here is mostly taken from Wikipedia:
/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
///
/// This code has had some light optimization attention paid to it,
/// particularly in the form of reducing allocation as much as possible.
/// However, it is still generally slow. Future optimization work should
/// probably focus on the bigger picture rather than micro-optimizations. For
/// example:
///
/// 1. Figure out how to more intelligently create initial partitions. That is,
///    Hopcroft's algorithm starts by creating two partitions of DFA states
///    that are known to NOT be equivalent: match states and non-match states.
///    The algorithm proceeds by progressively refining these partitions into
///    smaller partitions. If we could start with more partitions, then we
///    could reduce the amount of work that Hopcroft's algorithm needs to do.
/// 2. For every partition that we visit, we find all incoming transitions to
///    every state in the partition for *every* element in the alphabet. (This
///    is why using byte classes can significantly decrease minimization times,
///    since byte classes shrink the alphabet.) This is quite costly and there
///    is perhaps some redundant work being performed depending on the specific
///    states in the set. For example, we might be able to only visit some
///    elements of the alphabet based on the transitions.
/// 3. Move parts of minimization into determinization. If minimization has
///    fewer states to deal with, then it should run faster. A prime example
///    of this might be large Unicode classes, which are generated in way that
///    can create a lot of redundant states. (Some work has been done on this
///    point during NFA compilation via the algorithm described in the
///    "Incremental Construction of Minimal Acyclic Finite-State Automata"
///    paper.)
pub(crate) struct Minimizer<'a> {
    /// The DFA being minimized, mutated in place.
    dfa: &'a mut dense::OwnedDFA,
    /// For each state (outer index) and each alphabet unit (inner index),
    /// the list of states with a transition into that state on that unit.
    in_transitions: Vec<Vec<Vec<StateID>>>,
    /// The current partitioning of states into equivalence classes.
    partitions: Vec<StateSet>,
    /// The worklist of partitions still to be used as "splitters."
    waiting: Vec<StateSet>,
}
|
||||
|
||||
impl<'a> fmt::Debug for Minimizer<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("Minimizer")
|
||||
.field("dfa", &self.dfa)
|
||||
.field("in_transitions", &self.in_transitions)
|
||||
.field("partitions", &self.partitions)
|
||||
.field("waiting", &self.waiting)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// A set of states. A state set makes up a single partition in Hopcroft's
/// algorithm.
///
/// It is represented by an ordered set of state identifiers. We use shared
/// ownership so that a single state set can be in both the set of partitions
/// and in the set of waiting sets simultaneously without an additional
/// allocation. Generally, once a state set is built, it becomes immutable.
///
/// We use this representation because it avoids the overhead of more
/// traditional set data structures (HashSet/BTreeSet), and also because
/// computing intersection/subtraction on this representation is especially
/// fast.
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
struct StateSet {
    // Sorted, deduplicated state IDs (after 'canonicalize' is called).
    // RefCell gives interior mutability so shared (Rc) sets can be built
    // incrementally; this type is not used across threads.
    ids: Rc<RefCell<Vec<StateID>>>,
}
|
||||
|
||||
impl<'a> Minimizer<'a> {
    /// Create a new minimizer for the given DFA. This precomputes the
    /// reverse transition table and the initial Hopcroft partitions.
    pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> {
        let in_transitions = Minimizer::incoming_transitions(dfa);
        let partitions = Minimizer::initial_partitions(dfa);
        let waiting = partitions.clone();
        Minimizer { dfa, in_transitions, partitions, waiting }
    }

    /// Run Hopcroft's algorithm and rewrite the DFA in place so that it
    /// contains exactly one state per equivalence class.
    pub fn run(mut self) {
        // DFA state IDs are premultiplied by the stride (the padded alphabet
        // length), so converting between a state ID and a dense index is a
        // shift by 'stride2'.
        let stride2 = self.dfa.stride2();
        let as_state_id = |index: usize| -> StateID {
            StateID::new(index << stride2).unwrap()
        };
        let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 };

        let mut incoming = StateSet::empty();
        let mut scratch1 = StateSet::empty();
        let mut scratch2 = StateSet::empty();
        let mut newparts = vec![];

        // This loop is basically Hopcroft's algorithm. Everything else is just
        // shuffling data around to fit our representation.
        while let Some(set) = self.waiting.pop() {
            for b in self.dfa.byte_classes().iter() {
                self.find_incoming_to(b, &set, &mut incoming);
                // If incoming is empty, then the intersection with any other
                // set must also be empty. So 'newparts' just ends up being
                // 'self.partitions'. So there's no need to go through the loop
                // below.
                //
                // This actually turns out to be a rather large optimization.
                // On the order of making minimization 4-5x faster. It's likely
                // that the vast majority of all states have very few incoming
                // transitions.
                if incoming.is_empty() {
                    continue;
                }

                for p in 0..self.partitions.len() {
                    // A partition is only split when it has a non-empty
                    // intersection with 'incoming' AND a non-empty remainder;
                    // otherwise it is carried over unchanged.
                    self.partitions[p].intersection(&incoming, &mut scratch1);
                    if scratch1.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    self.partitions[p].subtract(&incoming, &mut scratch2);
                    if scratch2.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    let (x, y) =
                        (scratch1.deep_clone(), scratch2.deep_clone());
                    newparts.push(x.clone());
                    newparts.push(y.clone());
                    // Standard Hopcroft bookkeeping: replace the split
                    // partition in the waiting set if present, otherwise add
                    // only the smaller half (this is what yields the
                    // O(n log n) bound).
                    match self.find_waiting(&self.partitions[p]) {
                        Some(i) => {
                            self.waiting[i] = x;
                            self.waiting.push(y);
                        }
                        None => {
                            if x.len() <= y.len() {
                                self.waiting.push(x);
                            } else {
                                self.waiting.push(y);
                            }
                        }
                    }
                }
                // Swap in the refined partitioning and reuse the old Vec's
                // allocation for the next round.
                newparts = mem::replace(&mut self.partitions, newparts);
                newparts.clear();
            }
        }

        // At this point, we now have a minimal partitioning of states, where
        // each partition is an equivalence class of DFA states. Now we need to
        // use this partitioning to update the DFA to only contain one state for
        // each partition.

        // Create a map from DFA state ID to the representative ID of the
        // equivalence class to which it belongs. The representative ID of an
        // equivalence class of states is the minimum ID in that class.
        let mut state_to_part = vec![DEAD; self.dfa.state_len()];
        for p in &self.partitions {
            p.iter(|id| state_to_part[as_index(id)] = p.min());
        }

        // Generate a new contiguous sequence of IDs for minimal states, and
        // create a map from equivalence IDs to the new IDs. Thus, the new
        // minimal ID of *any* state in the unminimized DFA can be obtained
        // with minimals_ids[state_to_part[old_id]].
        let mut minimal_ids = vec![DEAD; self.dfa.state_len()];
        let mut new_index = 0;
        for state in self.dfa.states() {
            if state_to_part[as_index(state.id())] == state.id() {
                minimal_ids[as_index(state.id())] = as_state_id(new_index);
                new_index += 1;
            }
        }
        // The total number of states in the minimal DFA.
        let minimal_count = new_index;
        // Convenience function for remapping state IDs. This takes an old ID,
        // looks up its Hopcroft partition and then maps that to the new ID
        // range.
        let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])];

        // Re-map this DFA in place such that the only states remaining
        // correspond to the representative states of every equivalence class.
        for id in (0..self.dfa.state_len()).map(as_state_id) {
            // If this state isn't a representative for an equivalence class,
            // then we skip it since it won't appear in the minimal DFA.
            if state_to_part[as_index(id)] != id {
                continue;
            }
            self.dfa.remap_state(id, remap);
            self.dfa.swap_states(id, minimal_ids[as_index(id)]);
        }
        // Trim off all unused states from the pre-minimized DFA. This
        // represents all states that were merged into a non-singleton
        // equivalence class of states, and appeared after the first state
        // in each such class. (Because the state with the smallest ID in each
        // equivalence class is its representative ID.)
        self.dfa.truncate_states(minimal_count);

        // Update the new start states, which is now just the minimal ID of
        // whatever state the old start state was collapsed into. Also, we
        // collect everything before-hand to work around the borrow checker.
        // We're already allocating so much that this is probably fine. If this
        // turns out to be costly, then I guess add a `starts_mut` iterator.
        let starts: Vec<_> = self.dfa.starts().collect();
        for (old_start_id, anchored, start_type) in starts {
            self.dfa.set_start_state(
                anchored,
                start_type,
                remap(old_start_id),
            );
        }

        // Update the match state pattern ID list for multi-regexes. All we
        // need to do is remap the match state IDs. The pattern ID lists are
        // always the same as they were since match states with distinct
        // pattern ID lists are always considered distinct states.
        let mut pmap = BTreeMap::new();
        for (match_id, pattern_ids) in self.dfa.pattern_map() {
            let new_id = remap(match_id);
            pmap.insert(new_id, pattern_ids);
        }
        // This unwrap is OK because minimization never increases the number of
        // match states or patterns in those match states. Since minimization
        // runs after the pattern map has already been set at least once, we
        // know that our match states cannot error.
        self.dfa.set_pattern_map(&pmap).unwrap();

        // In order to update the ID of the maximum match state, we need to
        // find the maximum ID among all of the match states in the minimized
        // DFA. This is not necessarily the new ID of the unminimized maximum
        // match state, since that could have been collapsed with a much
        // earlier match state. Therefore, to find the new max match state,
        // we iterate over all previous match states, find their corresponding
        // new minimal ID, and take the maximum of those.
        let old = self.dfa.special().clone();
        let new = self.dfa.special_mut();
        // ... but only remap if we had match states.
        if old.matches() {
            new.min_match = StateID::MAX;
            new.max_match = StateID::ZERO;
            for i in as_index(old.min_match)..=as_index(old.max_match) {
                let new_id = remap(as_state_id(i));
                if new_id < new.min_match {
                    new.min_match = new_id;
                }
                if new_id > new.max_match {
                    new.max_match = new_id;
                }
            }
        }
        // ... same, but for start states.
        if old.starts() {
            new.min_start = StateID::MAX;
            new.max_start = StateID::ZERO;
            for i in as_index(old.min_start)..=as_index(old.max_start) {
                let new_id = remap(as_state_id(i));
                // A start state collapsed into the dead state does not count
                // towards the special start-state range.
                if new_id == DEAD {
                    continue;
                }
                if new_id < new.min_start {
                    new.min_start = new_id;
                }
                if new_id > new.max_start {
                    new.max_start = new_id;
                }
            }
            if new.max_start == DEAD {
                new.min_start = DEAD;
            }
        }
        new.quit_id = remap(new.quit_id);
        new.set_max();
    }

    /// Return the position of 'set' in the waiting list, if present.
    fn find_waiting(&self, set: &StateSet) -> Option<usize> {
        self.waiting.iter().position(|s| s == set)
    }

    /// Compute, into 'incoming', the canonicalized set of all states with a
    /// transition on alphabet unit 'b' into any state in 'set'.
    fn find_incoming_to(
        &self,
        b: alphabet::Unit,
        set: &StateSet,
        incoming: &mut StateSet,
    ) {
        incoming.clear();
        set.iter(|id| {
            for &inid in
                &self.in_transitions[self.dfa.to_index(id)][b.as_usize()]
            {
                incoming.add(inid);
            }
        });
        incoming.canonicalize();
    }

    /// Build the initial partitioning of DFA states for Hopcroft's algorithm.
    fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> {
        // For match states, we know that two match states with different
        // pattern ID lists will *always* be distinct, so we can partition them
        // initially based on that.
        let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new();
        let mut is_quit = StateSet::empty();
        let mut no_match = StateSet::empty();
        for state in dfa.states() {
            if dfa.is_match_state(state.id()) {
                let mut pids = vec![];
                for i in 0..dfa.match_len(state.id()) {
                    pids.push(dfa.match_pattern(state.id(), i));
                }
                matching
                    .entry(pids)
                    .or_insert(StateSet::empty())
                    .add(state.id());
            } else if dfa.is_quit_state(state.id()) {
                is_quit.add(state.id());
            } else {
                no_match.add(state.id());
            }
        }

        let mut sets: Vec<StateSet> =
            matching.into_iter().map(|(_, set)| set).collect();
        sets.push(no_match);
        sets.push(is_quit);
        sets
    }

    /// Build the reverse transition table: for each state and alphabet unit,
    /// the list of states with a transition into that state on that unit.
    fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> {
        let mut incoming = vec![];
        for _ in dfa.states() {
            incoming.push(vec![vec![]; dfa.alphabet_len()]);
        }
        for state in dfa.states() {
            for (b, next) in state.transitions() {
                incoming[dfa.to_index(next)][b.as_usize()].push(state.id());
            }
        }
        incoming
    }
}
|
||||
|
||||
impl StateSet {
    /// Create an empty state set.
    fn empty() -> StateSet {
        StateSet { ids: Rc::new(RefCell::new(vec![])) }
    }

    /// Add a state ID. Callers are expected to call 'canonicalize' before
    /// relying on set semantics (sorted, deduplicated order).
    fn add(&mut self, id: StateID) {
        self.ids.borrow_mut().push(id);
    }

    /// Return the minimum state ID in this set.
    ///
    /// Panics if the set is empty. For canonicalized sets, the minimum is
    /// the first element.
    fn min(&self) -> StateID {
        self.ids.borrow()[0]
    }

    /// Sort and deduplicate this set, establishing the invariant that the
    /// merge routines below ('intersection', 'subtract') depend on.
    fn canonicalize(&mut self) {
        self.ids.borrow_mut().sort();
        self.ids.borrow_mut().dedup();
    }

    /// Remove all elements, keeping the allocation for reuse.
    fn clear(&mut self) {
        self.ids.borrow_mut().clear();
    }

    /// Return the number of elements in this set.
    fn len(&self) -> usize {
        self.ids.borrow().len()
    }

    /// Return true if this set has no elements.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Create a copy of this set that does NOT share the underlying Vec.
    /// (Plain 'clone' only bumps the Rc refcount.)
    fn deep_clone(&self) -> StateSet {
        let ids = self.ids.borrow().iter().cloned().collect();
        StateSet { ids: Rc::new(RefCell::new(ids)) }
    }

    /// Call 'f' for each state ID in this set, in order.
    ///
    /// A callback is used instead of returning an iterator so that the
    /// RefCell borrow is scoped to this method.
    fn iter<F: FnMut(StateID)>(&self, mut f: F) {
        for &id in self.ids.borrow().iter() {
            f(id);
        }
    }

    /// Compute the intersection of 'self' and 'other', writing the result
    /// into 'dest' (which is cleared first).
    ///
    /// Both input sets must be canonicalized (sorted and deduplicated); the
    /// merge below relies on that ordering.
    fn intersection(&self, other: &StateSet, dest: &mut StateSet) {
        dest.clear();
        if self.is_empty() || other.is_empty() {
            return;
        }

        // Standard sorted-merge walk: advance whichever side is smaller,
        // emitting elements found in both.
        let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
        // Unwraps are OK: both sets were just checked to be non-empty.
        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
        loop {
            if a == b {
                dest.add(a);
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
                b = match itb.next() {
                    None => break,
                    Some(b) => b,
                };
            } else if a < b {
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
            } else {
                b = match itb.next() {
                    None => break,
                    Some(b) => b,
                };
            }
        }
    }

    /// Compute the set difference 'self - other', writing the result into
    /// 'dest' (which is cleared first).
    ///
    /// Both input sets must be canonicalized (sorted and deduplicated); the
    /// merge below relies on that ordering.
    fn subtract(&self, other: &StateSet, dest: &mut StateSet) {
        dest.clear();
        // Subtracting nothing (or from nothing) is just a copy of 'self'.
        if self.is_empty() || other.is_empty() {
            self.iter(|s| dest.add(s));
            return;
        }

        // Sorted-merge walk emitting elements of 'self' not found in
        // 'other'. When 'other' runs out mid-walk, the current 'a' (which
        // has not yet been emitted or skipped) is added before breaking,
        // and any remaining elements of 'self' are flushed below.
        let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
        // Unwraps are OK: both sets were just checked to be non-empty.
        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
        loop {
            if a == b {
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
                b = match itb.next() {
                    None => {
                        dest.add(a);
                        break;
                    }
                    Some(b) => b,
                };
            } else if a < b {
                dest.add(a);
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
            } else {
                b = match itb.next() {
                    None => {
                        dest.add(a);
                        break;
                    }
                    Some(b) => b,
                };
            }
        }
        // Flush the tail of 'self' that 'other' could not have removed.
        for a in ita {
            dest.add(a);
        }
    }
}
|
||||
360
third-party/vendor/regex-automata/src/dfa/mod.rs
vendored
Normal file
360
third-party/vendor/regex-automata/src/dfa/mod.rs
vendored
Normal file
|
|
@ -0,0 +1,360 @@
|
|||
/*!
|
||||
A module for building and searching with deterministic finite automata (DFAs).
|
||||
|
||||
Like other modules in this crate, DFAs support a rich regex syntax with Unicode
|
||||
features. DFAs also have extensive options for configuring the best space vs
|
||||
time trade off for your use case and provides support for cheap deserialization
|
||||
of automata for use in `no_std` environments.
|
||||
|
||||
If you're looking for lazy DFAs that build themselves incrementally during
|
||||
search, then please see the top-level [`hybrid` module](crate::hybrid).
|
||||
|
||||
# Overview
|
||||
|
||||
This section gives a brief overview of the primary types in this module:
|
||||
|
||||
* A [`regex::Regex`] provides a way to search for matches of a regular
|
||||
expression using DFAs. This includes iterating over matches with both the start
|
||||
and end positions of each match.
|
||||
* A [`dense::DFA`] provides low level access to a DFA that uses a dense
|
||||
representation (uses lots of space, but fast searching).
|
||||
* A [`sparse::DFA`] provides the same API as a `dense::DFA`, but uses a sparse
|
||||
representation (uses less space, but slower searching).
|
||||
* An [`Automaton`] trait that defines an interface that both dense and sparse
|
||||
DFAs implement. (A `regex::Regex` is generic over this trait.)
|
||||
* Both dense DFAs and sparse DFAs support serialization to raw bytes (e.g.,
|
||||
[`dense::DFA::to_bytes_little_endian`]) and cheap deserialization (e.g.,
|
||||
[`dense::DFA::from_bytes`]).
|
||||
|
||||
There is also a [`onepass`] module that provides a [one-pass
|
||||
DFA](onepass::DFA). The unique advantage of this DFA is that, for the class
|
||||
of regexes it can be built with, it supports reporting the spans of matching
|
||||
capturing groups. It is the only DFA in this crate capable of such a thing.
|
||||
|
||||
# Example: basic regex searching
|
||||
|
||||
This example shows how to compile a regex using the default configuration
|
||||
and then use it to find matches in a byte string:
|
||||
|
||||
```
|
||||
use regex_automata::{Match, dfa::regex::Regex};
|
||||
|
||||
let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
|
||||
let text = b"2018-12-24 2016-10-08";
|
||||
let matches: Vec<Match> = re.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![
|
||||
Match::must(0, 0..10),
|
||||
Match::must(0, 11..21),
|
||||
]);
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
# Example: searching with regex sets
|
||||
|
||||
The DFAs in this module all fully support searching with multiple regexes
|
||||
simultaneously. You can use this support with standard leftmost-first style
|
||||
searching to find non-overlapping matches:
|
||||
|
||||
```
|
||||
# if cfg!(miri) { return Ok(()); } // miri takes too long
|
||||
use regex_automata::{Match, dfa::regex::Regex};
|
||||
|
||||
let re = Regex::new_many(&[r"\w+", r"\S+"])?;
|
||||
let text = b"@foo bar";
|
||||
let matches: Vec<Match> = re.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![
|
||||
Match::must(1, 0..4),
|
||||
Match::must(0, 5..8),
|
||||
]);
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
# Example: use sparse DFAs
|
||||
|
||||
By default, compiling a regex will use dense DFAs internally. This uses more
|
||||
memory, but executes searches more quickly. If you can abide slower searches
|
||||
(somewhere around 3-5x), then sparse DFAs might make more sense since they can
|
||||
use significantly less space.
|
||||
|
||||
Using sparse DFAs is as easy as using `Regex::new_sparse` instead of
|
||||
`Regex::new`:
|
||||
|
||||
```
|
||||
use regex_automata::{Match, dfa::regex::Regex};
|
||||
|
||||
let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
|
||||
let text = b"2018-12-24 2016-10-08";
|
||||
let matches: Vec<Match> = re.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![
|
||||
Match::must(0, 0..10),
|
||||
Match::must(0, 11..21),
|
||||
]);
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
If you already have dense DFAs for some reason, they can be converted to sparse
|
||||
DFAs and used to build a new `Regex`. For example:
|
||||
|
||||
```
|
||||
use regex_automata::{Match, dfa::regex::Regex};
|
||||
|
||||
let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
|
||||
let sparse_re = Regex::builder().build_from_dfas(
|
||||
dense_re.forward().to_sparse()?,
|
||||
dense_re.reverse().to_sparse()?,
|
||||
);
|
||||
let text = b"2018-12-24 2016-10-08";
|
||||
let matches: Vec<Match> = sparse_re.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![
|
||||
Match::must(0, 0..10),
|
||||
Match::must(0, 11..21),
|
||||
]);
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
# Example: deserialize a DFA
|
||||
|
||||
This shows how to first serialize a DFA into raw bytes, and then deserialize
|
||||
those raw bytes back into a DFA. While this particular example is a
|
||||
bit contrived, this same technique can be used in your program to
|
||||
deserialize a DFA at start up time or by memory mapping a file.
|
||||
|
||||
```
|
||||
use regex_automata::{Match, dfa::{dense, regex::Regex}};
|
||||
|
||||
let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
|
||||
// serialize both the forward and reverse DFAs, see note below
|
||||
let (fwd_bytes, fwd_pad) = re1.forward().to_bytes_native_endian();
|
||||
let (rev_bytes, rev_pad) = re1.reverse().to_bytes_native_endian();
|
||||
// now deserialize both---we need to specify the correct type!
|
||||
let fwd: dense::DFA<&[u32]> = dense::DFA::from_bytes(&fwd_bytes[fwd_pad..])?.0;
|
||||
let rev: dense::DFA<&[u32]> = dense::DFA::from_bytes(&rev_bytes[rev_pad..])?.0;
|
||||
// finally, reconstruct our regex
|
||||
let re2 = Regex::builder().build_from_dfas(fwd, rev);
|
||||
|
||||
// we can use it like normal
|
||||
let text = b"2018-12-24 2016-10-08";
|
||||
let matches: Vec<Match> = re2.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![
|
||||
Match::must(0, 0..10),
|
||||
Match::must(0, 11..21),
|
||||
]);
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
There are a few points worth noting here:
|
||||
|
||||
* We need to extract the raw DFAs used by the regex and serialize those. You
|
||||
can build the DFAs manually yourself using [`dense::Builder`], but using
|
||||
the DFAs from a `Regex` guarantees that the DFAs are built correctly. (In
|
||||
particular, a `Regex` constructs a reverse DFA for finding the starting
|
||||
location of matches.)
|
||||
* To convert the DFA to raw bytes, we use the `to_bytes_native_endian` method.
|
||||
In practice, you'll want to use either [`dense::DFA::to_bytes_little_endian`]
|
||||
or [`dense::DFA::to_bytes_big_endian`], depending on which platform you're
|
||||
deserializing your DFA from. If you intend to deserialize on either platform,
|
||||
then you'll need to serialize both and deserialize the right one depending on
|
||||
your target's endianness.
|
||||
* Safely deserializing a DFA requires verifying the raw bytes, particularly if
|
||||
they are untrusted, since an invalid DFA could cause logical errors, panics
|
||||
or even undefined behavior. This verification step requires visiting all of
|
||||
the transitions in the DFA, which can be costly. If cheaper verification is
|
||||
desired, then [`dense::DFA::from_bytes_unchecked`] is available that only does
|
||||
verification that can be performed in constant time. However, one can only use
|
||||
this routine if the caller can guarantee that the bytes provided encode a
valid DFA.
|
||||
|
||||
The same process can be achieved with sparse DFAs as well:
|
||||
|
||||
```
|
||||
use regex_automata::{Match, dfa::{sparse, regex::Regex}};
|
||||
|
||||
let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
|
||||
// serialize both
|
||||
let fwd_bytes = re1.forward().to_sparse()?.to_bytes_native_endian();
|
||||
let rev_bytes = re1.reverse().to_sparse()?.to_bytes_native_endian();
|
||||
// now deserialize both---we need to specify the correct type!
|
||||
let fwd: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&fwd_bytes)?.0;
|
||||
let rev: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&rev_bytes)?.0;
|
||||
// finally, reconstruct our regex
|
||||
let re2 = Regex::builder().build_from_dfas(fwd, rev);
|
||||
|
||||
// we can use it like normal
|
||||
let text = b"2018-12-24 2016-10-08";
|
||||
let matches: Vec<Match> = re2.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![
|
||||
Match::must(0, 0..10),
|
||||
Match::must(0, 11..21),
|
||||
]);
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
Note that unlike dense DFAs, sparse DFAs have no alignment requirements.
|
||||
Conversely, dense DFAs must be aligned to the same alignment as a
[`StateID`](crate::util::primitives::StateID).
|
||||
|
||||
# Support for `no_std` and `alloc`-only
|
||||
|
||||
This crate comes with `alloc` and `std` features that are enabled by default.
|
||||
When the `alloc` or `std` features are enabled, the API of this module will
|
||||
include the facilities necessary for compiling, serializing, deserializing
|
||||
and searching with DFAs. When only the `alloc` feature is enabled, then
|
||||
implementations of the `std::error::Error` trait are dropped, but everything
|
||||
else generally remains the same. When both the `alloc` and `std` features are
|
||||
disabled, the API of this module will shrink such that it only includes the
|
||||
facilities necessary for deserializing and searching with DFAs.
|
||||
|
||||
The intended workflow for `no_std` environments is thus as follows:
|
||||
|
||||
* Write a program with the `alloc` or `std` features that compiles and
|
||||
serializes a regular expression. You may need to serialize both little and big
|
||||
endian versions of each DFA. (So that's 4 DFAs in total for each regex.)
|
||||
* In your `no_std` environment, follow the examples above for deserializing
|
||||
your previously serialized DFAs into regexes. You can then search with them as
|
||||
you would any regex.
|
||||
|
||||
Deserialization can happen anywhere. For example, with bytes embedded into a
|
||||
binary or with a file memory mapped at runtime.
|
||||
|
||||
The `regex-cli` command (found in the same repository as this crate) can be
|
||||
used to serialize DFAs to files and generate Rust code to read them.
|
||||
|
||||
# Syntax
|
||||
|
||||
This module supports the same syntax as the `regex` crate, since they share the
|
||||
same parser. You can find an exhaustive list of supported syntax in the
|
||||
[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax).
|
||||
|
||||
There are two things that are not supported by the DFAs in this module:
|
||||
|
||||
* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top
|
||||
of them) can only find the offsets of an entire match, but cannot resolve
|
||||
the offsets of each capturing group. This is because DFAs do not have the
|
||||
expressive power necessary.
|
||||
* Unicode word boundaries. These present particularly difficult challenges for
|
||||
DFA construction and would result in an explosion in the number of states.
|
||||
One can enable [`dense::Config::unicode_word_boundary`] though, which provides
|
||||
heuristic support for Unicode word boundaries that only works on ASCII text.
|
||||
Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work
|
||||
on any input.
|
||||
|
||||
There are no plans to lift either of these limitations.
|
||||
|
||||
Note that these restrictions are identical to the restrictions on lazy DFAs.
|
||||
|
||||
# Differences with general purpose regexes
|
||||
|
||||
The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
|
||||
general purpose regular expression engine. It aims to automatically balance low
|
||||
compile times, fast search times and low memory usage, while also providing
|
||||
a convenient API for users. In contrast, this module provides a lower level
|
||||
regular expression interface based exclusively on DFAs that is a bit less
|
||||
convenient while providing more explicit control over memory usage and search
|
||||
times.
|
||||
|
||||
Here are some specific negative differences:
|
||||
|
||||
* **Compilation can take an exponential amount of time and space** in the size
|
||||
of the regex pattern. While most patterns do not exhibit worst case exponential
|
||||
time, such patterns do exist. For example, `[01]*1[01]{N}` will build a DFA
|
||||
with approximately `2^(N+2)` states. For this reason, untrusted patterns should
|
||||
not be compiled with this module. (In the future, the API may expose an option
|
||||
to return an error if the DFA gets too big.)
|
||||
* This module does not support sub-match extraction via capturing groups, which
|
||||
can be achieved with the regex crate's "captures" API.
|
||||
* While the regex crate doesn't necessarily sport fast compilation times,
|
||||
the regexes in this module are almost universally slow to compile, especially
|
||||
when they contain large Unicode character classes. For example, on my system,
|
||||
compiling `\w{50}` takes about 1 second and almost 15MB of memory! (Compiling
|
||||
a sparse regex takes about the same time but only uses about 1.2MB of
|
||||
memory.) Conversely, compiling the same regex without Unicode support, e.g.,
|
||||
`(?-u)\w{50}`, takes under 1 millisecond and about 15KB of memory. For this
|
||||
reason, you should only use Unicode character classes if you absolutely need
|
||||
them! (They are enabled by default though.)
|
||||
* This module does not support Unicode word boundaries. ASCII word boundaries
|
||||
may be used though by disabling Unicode or selectively doing so in the syntax,
|
||||
e.g., `(?-u:\b)`. There is also an option to
|
||||
[heuristically enable Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary),
|
||||
where the corresponding DFA will give up if any non-ASCII byte is seen.
|
||||
* As a lower level API, this module does not do literal optimizations
|
||||
automatically. Although it does provide hooks in its API to make use of the
|
||||
[`Prefilter`](crate::util::prefilter::Prefilter) trait. Missing literal
|
||||
optimizations means that searches may run much slower than what you're
|
||||
accustomed to, although, it does provide more predictable and consistent
|
||||
performance.
|
||||
* There is no `&str` API like in the regex crate. In this module, all APIs
|
||||
operate on `&[u8]`. By default, match indices are
|
||||
guaranteed to fall on UTF-8 boundaries, unless either of
|
||||
[`syntax::Config::utf8`](crate::util::syntax::Config::utf8) or
|
||||
[`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) are disabled.
|
||||
|
||||
With some of the downsides out of the way, here are some positive differences:
|
||||
|
||||
* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
|
||||
deserialized. Deserialization can be done in constant time with the unchecked
|
||||
APIs, since searching can be performed directly on the raw serialized bytes of
|
||||
a DFA.
|
||||
* This module was specifically designed so that the searching phase of a
|
||||
DFA has minimal runtime requirements, and can therefore be used in `no_std`
|
||||
environments. While `no_std` environments cannot compile regexes, they can
|
||||
deserialize pre-compiled regexes.
|
||||
* Since this module builds DFAs ahead of time, it will generally out-perform
|
||||
the `regex` crate on equivalent tasks. The performance difference is likely
|
||||
not large. However, because of a complex set of optimizations in the regex
|
||||
crate (like literal optimizations), an accurate performance comparison may be
|
||||
difficult to do.
|
||||
* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
|
||||
performance a small amount, but uses much less storage space. Potentially even
|
||||
less than what the regex crate uses.
|
||||
* This module exposes DFAs directly, such as [`dense::DFA`] and
|
||||
[`sparse::DFA`], which enables one to do less work in some cases. For example,
|
||||
if you only need the end of a match and not the start of a match, then you can
|
||||
use a DFA directly without building a `Regex`, which always requires a second
|
||||
DFA to find the start of a match.
|
||||
* This module provides more control over memory usage. Aside from choosing
|
||||
between dense and sparse DFAs, one can also choose a smaller state identifier
|
||||
representation to use less space. Also, one can enable DFA minimization
|
||||
via [`dense::Config::minimize`], but it can increase compilation times
|
||||
dramatically.
|
||||
*/
|
||||
|
||||
#[cfg(feature = "dfa-search")]
|
||||
pub use crate::dfa::{
|
||||
automaton::{Automaton, OverlappingState, StartError},
|
||||
start::StartKind,
|
||||
};
|
||||
|
||||
/// This is an alias for a state ID of zero. It has special significance
|
||||
/// because it always corresponds to the first state in a DFA, and the first
|
||||
/// state in a DFA is always "dead." That is, the dead state always has all
|
||||
/// of its transitions set to itself. Moreover, the dead state is used as a
|
||||
/// sentinel for various things. e.g., In search, reaching a dead state means
|
||||
/// that the search must stop.
|
||||
const DEAD: crate::util::primitives::StateID =
|
||||
crate::util::primitives::StateID::ZERO;
|
||||
|
||||
#[cfg(feature = "dfa-search")]
|
||||
pub mod dense;
|
||||
#[cfg(feature = "dfa-onepass")]
|
||||
pub mod onepass;
|
||||
#[cfg(feature = "dfa-search")]
|
||||
pub mod regex;
|
||||
#[cfg(feature = "dfa-search")]
|
||||
pub mod sparse;
|
||||
|
||||
#[cfg(feature = "dfa-search")]
|
||||
pub(crate) mod accel;
|
||||
#[cfg(feature = "dfa-search")]
|
||||
mod automaton;
|
||||
#[cfg(feature = "dfa-build")]
|
||||
mod determinize;
|
||||
#[cfg(feature = "dfa-build")]
|
||||
mod minimize;
|
||||
#[cfg(any(feature = "dfa-build", feature = "dfa-onepass"))]
|
||||
mod remapper;
|
||||
#[cfg(feature = "dfa-search")]
|
||||
mod search;
|
||||
#[cfg(feature = "dfa-search")]
|
||||
mod special;
|
||||
#[cfg(feature = "dfa-search")]
|
||||
mod start;
|
||||
3192
third-party/vendor/regex-automata/src/dfa/onepass.rs
vendored
Normal file
3192
third-party/vendor/regex-automata/src/dfa/onepass.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
871
third-party/vendor/regex-automata/src/dfa/regex.rs
vendored
Normal file
871
third-party/vendor/regex-automata/src/dfa/regex.rs
vendored
Normal file
|
|
@ -0,0 +1,871 @@
|
|||
/*!
|
||||
A DFA-backed `Regex`.
|
||||
|
||||
This module provides [`Regex`], which is defined generically over the
|
||||
[`Automaton`] trait. A `Regex` implements convenience routines you might have
|
||||
come to expect, such as finding the start/end of a match and iterating over
|
||||
all non-overlapping matches. This `Regex` type is limited in its capabilities
|
||||
to what a DFA can provide. Therefore, APIs involving capturing groups, for
|
||||
example, are not provided.
|
||||
|
||||
Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
|
||||
finds the end offset of a match, whereas the other is a "reverse" DFA that
finds the start offset of a match.
|
||||
|
||||
See the [parent module](crate::dfa) for examples.
|
||||
*/
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
use alloc::vec::Vec;
|
||||
|
||||
#[cfg(feature = "dfa-build")]
|
||||
use crate::dfa::dense::BuildError;
|
||||
use crate::{
|
||||
dfa::{automaton::Automaton, dense},
|
||||
util::{iter, search::Input},
|
||||
Anchored, Match, MatchError,
|
||||
};
|
||||
#[cfg(feature = "alloc")]
|
||||
use crate::{
|
||||
dfa::{sparse, StartKind},
|
||||
util::search::MatchKind,
|
||||
};
|
||||
|
||||
// With the `alloc` feature on, `Regex` defaults its `A` type parameter to an
// owned dense DFA. That spares callers from spelling out the (fairly noisy)
// DFA type in the common case. Without `alloc`, no default is provided.
//
// This gives us two distinct struct definitions but only one doc string we
// want to write, so this macro captures the docs and other attributes once
// and stamps them onto both definitions.
macro_rules! define_regex_type {
    ($(#[$attrs:meta])*) => {
        #[cfg(feature = "alloc")]
        $(#[$attrs])*
        pub struct Regex<A = dense::OwnedDFA> {
            forward: A,
            reverse: A,
        }

        #[cfg(not(feature = "alloc"))]
        $(#[$attrs])*
        pub struct Regex<A> {
            forward: A,
            reverse: A,
        }
    };
}
|
||||
|
||||
define_regex_type!(
|
||||
/// A regular expression that uses deterministic finite automata for fast
|
||||
/// searching.
|
||||
///
|
||||
/// A regular expression is comprised of two DFAs, a "forward" DFA and a
|
||||
/// "reverse" DFA. The forward DFA is responsible for detecting the end of
|
||||
/// a match while the reverse DFA is responsible for detecting the start
|
||||
/// of a match. Thus, in order to find the bounds of any given match, a
|
||||
/// forward search must first be run followed by a reverse search. A match
|
||||
/// found by the forward DFA guarantees that the reverse DFA will also find
|
||||
/// a match.
|
||||
///
|
||||
/// The type of the DFA used by a `Regex` corresponds to the `A` type
|
||||
/// parameter, which must satisfy the [`Automaton`] trait. Typically,
|
||||
/// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a
|
||||
/// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more
|
||||
/// memory but search faster, while sparse DFAs use less memory but search
|
||||
/// more slowly.
|
||||
///
|
||||
/// # Crate features
|
||||
///
|
||||
/// Note that despite what the documentation auto-generates, the _only_
|
||||
/// crate feature needed to use this type is `dfa-search`. You do _not_
|
||||
/// need to enable the `alloc` feature.
|
||||
///
|
||||
/// By default, a regex's automaton type parameter is set to
|
||||
/// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
|
||||
/// in-memory work loads, this is the most convenient type that gives the
|
||||
/// best search performance. When the `alloc` feature is disabled, no
|
||||
/// default type is used.
|
||||
///
|
||||
/// # When should I use this?
|
||||
///
|
||||
/// Generally speaking, if you can afford the overhead of building a full
|
||||
/// DFA for your regex, and you don't need things like capturing groups,
|
||||
/// then this is a good choice if you're looking to optimize for matching
|
||||
/// speed. Note however that its speed may be worse than a general purpose
|
||||
/// regex engine if you don't provide a [`dense::Config::prefilter`] to the
|
||||
/// underlying DFA.
|
||||
///
|
||||
/// # Sparse DFAs
|
||||
///
|
||||
/// Since a `Regex` is generic over the [`Automaton`] trait, it can be
|
||||
/// used with any kind of DFA. While this crate constructs dense DFAs by
|
||||
/// default, it is easy enough to build corresponding sparse DFAs, and then
|
||||
/// build a regex from them:
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::dfa::regex::Regex;
|
||||
///
|
||||
/// // First, build a regex that uses dense DFAs.
|
||||
/// let dense_re = Regex::new("foo[0-9]+")?;
|
||||
///
|
||||
/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
|
||||
/// let fwd = dense_re.forward().to_sparse()?;
|
||||
/// let rev = dense_re.reverse().to_sparse()?;
|
||||
///
|
||||
/// // Third, build a new regex from the constituent sparse DFAs.
|
||||
/// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
|
||||
///
|
||||
/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
|
||||
/// assert_eq!(true, sparse_re.is_match(b"foo123"));
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
///
|
||||
/// Alternatively, one can use a [`Builder`] to construct a sparse DFA
|
||||
/// more succinctly. (Note though that dense DFAs are still constructed
|
||||
/// first internally, and then converted to sparse DFAs, as in the example
|
||||
/// above.)
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::dfa::regex::Regex;
|
||||
///
|
||||
/// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
|
||||
/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
|
||||
/// assert!(sparse_re.is_match(b"foo123"));
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
///
|
||||
/// # Fallibility
|
||||
///
|
||||
/// Most of the search routines defined on this type will _panic_ when the
|
||||
/// underlying search fails. This might be because the DFA gave up because
|
||||
/// it saw a quit byte, whether configured explicitly or via heuristic
|
||||
/// Unicode word boundary support, although neither are enabled by default.
|
||||
/// Or it might fail because an invalid `Input` configuration is given,
|
||||
/// for example, with an unsupported [`Anchored`] mode.
|
||||
///
|
||||
/// If you need to handle these error cases instead of allowing them to
|
||||
/// trigger a panic, then the lower level [`Regex::try_search`] provides
|
||||
/// a fallible API that never panics.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how to cause a search to terminate if it sees a
|
||||
/// `\n` byte, and handle the error returned. This could be useful if, for
|
||||
/// example, you wanted to prevent a user supplied pattern from matching
|
||||
/// across a line boundary.
|
||||
///
|
||||
/// ```
|
||||
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
|
||||
/// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError};
|
||||
///
|
||||
/// let re = Regex::builder()
|
||||
/// .dense(dfa::dense::Config::new().quit(b'\n', true))
|
||||
/// .build(r"foo\p{any}+bar")?;
|
||||
///
|
||||
/// let input = Input::new("foo\nbar");
|
||||
/// // Normally this would produce a match, since \p{any} contains '\n'.
|
||||
/// // But since we instructed the automaton to enter a quit state if a
|
||||
/// // '\n' is observed, this produces a match error instead.
|
||||
/// let expected = MatchError::quit(b'\n', 3);
|
||||
/// let got = re.try_search(&input).unwrap_err();
|
||||
/// assert_eq!(expected, got);
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[derive(Clone, Debug)]
|
||||
);
|
||||
|
||||
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
impl Regex {
    /// Compiles `pattern` into a `Regex` using the default configuration.
    ///
    /// Use a [`Builder`] when a non-default configuration is needed.
    ///
    /// # Errors
    ///
    /// Returns a `BuildError` if the pattern is invalid or if the underlying
    /// DFAs could not be built.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Match, dfa::regex::Regex};
    ///
    /// let re = Regex::new("foo[0-9]+bar")?;
    /// assert_eq!(
    ///     Some(Match::must(0, 3..14)),
    ///     re.find(b"zzzfoo12345barzzz"),
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn new(pattern: &str) -> Result<Regex, BuildError> {
        let builder = Builder::new();
        builder.build(pattern)
    }

    /// Compiles several patterns into a single "regex set" using the default
    /// configuration. A match reports the index of the pattern that matched.
    ///
    /// # Errors
    ///
    /// Returns a `BuildError` if any pattern is invalid or if the underlying
    /// DFAs could not be built.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Match, dfa::regex::Regex};
    ///
    /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
    ///
    /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
    /// assert_eq!(None, it.next());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn new_many<P: AsRef<str>>(
        patterns: &[P],
    ) -> Result<Regex, BuildError> {
        let builder = Builder::new();
        builder.build_many(patterns)
    }
}
|
||||
|
||||
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
impl Regex<sparse::DFA<Vec<u8>>> {
    /// Compiles `pattern` into a `Regex` backed by sparse DFAs, otherwise
    /// using the default configuration.
    ///
    /// Use a [`Builder`] when a non-default configuration is needed.
    ///
    /// # Errors
    ///
    /// Returns a `BuildError` if the pattern is invalid or if the underlying
    /// DFAs could not be built.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Match, dfa::regex::Regex};
    ///
    /// let re = Regex::new_sparse("foo[0-9]+bar")?;
    /// assert_eq!(
    ///     Some(Match::must(0, 3..14)),
    ///     re.find(b"zzzfoo12345barzzz"),
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn new_sparse(
        pattern: &str,
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
        let builder = Builder::new();
        builder.build_sparse(pattern)
    }

    /// Like [`Regex::new_many`], but the resulting "regex set" is backed by
    /// sparse DFAs. Otherwise uses the default configuration.
    ///
    /// # Errors
    ///
    /// Returns a `BuildError` if any pattern is invalid or if the underlying
    /// DFAs could not be built.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Match, dfa::regex::Regex};
    ///
    /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
    ///
    /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
    /// assert_eq!(None, it.next());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn new_many_sparse<P: AsRef<str>>(
        patterns: &[P],
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
        let builder = Builder::new();
        builder.build_many_sparse(patterns)
    }
}
|
||||
|
||||
/// Convenience routines for regex construction.
impl Regex<dense::DFA<&'static [u32]>> {
    /// Return a builder for configuring the construction of a `Regex`.
    ///
    /// This is a convenience routine to avoid needing to import the
    /// [`Builder`] type in common cases.
    ///
    /// # Example
    ///
    /// This example shows how to use the builder to disable UTF-8 mode
    /// everywhere.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     dfa::regex::Regex, nfa::thompson, util::syntax, Match,
    /// };
    ///
    /// let re = Regex::builder()
    ///     .syntax(syntax::Config::new().utf8(false))
    ///     .thompson(thompson::Config::new().utf8(false))
    ///     .build(r"foo(?-u:[^b])ar.*")?;
    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
    /// let expected = Some(Match::must(0, 1..9));
    /// let got = re.find(haystack);
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn builder() -> Builder {
        Builder::new()
    }
}
|
||||
|
||||
/// Standard search routines for finding and iterating over matches.
|
||||
impl<A: Automaton> Regex<A> {
|
||||
/// Returns true if and only if this regex matches the given haystack.
|
||||
///
|
||||
/// This routine may short circuit if it knows that scanning future input
|
||||
/// will never lead to a different result. In particular, if the underlying
|
||||
/// DFA enters a match state or a dead state, then this routine will return
|
||||
/// `true` or `false`, respectively, without inspecting any future input.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// This routine panics if the search could not complete. This can occur
|
||||
/// in a number of circumstances:
|
||||
///
|
||||
/// * The configuration of the DFA may permit it to "quit" the search.
|
||||
/// For example, setting quit bytes or enabling heuristic support for
|
||||
/// Unicode word boundaries. The default configuration does not enable any
|
||||
/// option that could result in the DFA quitting.
|
||||
/// * When the provided `Input` configuration is not supported. For
|
||||
/// example, by providing an unsupported anchor mode.
|
||||
///
|
||||
/// When a search panics, callers cannot know whether a match exists or
|
||||
/// not.
|
||||
///
|
||||
/// Use [`Regex::try_search`] if you want to handle these error conditions.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::dfa::regex::Regex;
|
||||
///
|
||||
/// let re = Regex::new("foo[0-9]+bar")?;
|
||||
/// assert_eq!(true, re.is_match("foo12345bar"));
|
||||
/// assert_eq!(false, re.is_match("foobar"));
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[inline]
|
||||
pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
|
||||
// Not only can we do an "earliest" search, but we can avoid doing a
|
||||
// reverse scan too.
|
||||
let input = input.into().earliest(true);
|
||||
self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap()
|
||||
}
|
||||
|
||||
/// Returns the start and end offset of the leftmost match. If no match
|
||||
/// exists, then `None` is returned.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// This routine panics if the search could not complete. This can occur
|
||||
/// in a number of circumstances:
|
||||
///
|
||||
/// * The configuration of the DFA may permit it to "quit" the search.
|
||||
/// For example, setting quit bytes or enabling heuristic support for
|
||||
/// Unicode word boundaries. The default configuration does not enable any
|
||||
/// option that could result in the DFA quitting.
|
||||
/// * When the provided `Input` configuration is not supported. For
|
||||
/// example, by providing an unsupported anchor mode.
|
||||
///
|
||||
/// When a search panics, callers cannot know whether a match exists or
|
||||
/// not.
|
||||
///
|
||||
/// Use [`Regex::try_search`] if you want to handle these error conditions.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{Match, dfa::regex::Regex};
|
||||
///
|
||||
/// // Greediness is applied appropriately.
|
||||
/// let re = Regex::new("foo[0-9]+")?;
|
||||
/// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz"));
|
||||
///
|
||||
/// // Even though a match is found after reading the first byte (`a`),
|
||||
/// // the default leftmost-first match semantics demand that we find the
|
||||
/// // earliest match that prefers earlier parts of the pattern over latter
|
||||
/// // parts.
|
||||
/// let re = Regex::new("abc|a")?;
|
||||
/// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc"));
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[inline]
|
||||
pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
|
||||
self.try_search(&input.into()).unwrap()
|
||||
}
|
||||
|
||||
/// Returns an iterator over all non-overlapping leftmost matches in the
|
||||
/// given bytes. If no match exists, then the iterator yields no elements.
|
||||
///
|
||||
/// This corresponds to the "standard" regex search iterator.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// If the search returns an error during iteration, then iteration
|
||||
/// panics. See [`Regex::find`] for the panic conditions.
|
||||
///
|
||||
/// Use [`Regex::try_search`] with
|
||||
/// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to
|
||||
/// handle these error conditions.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{Match, dfa::regex::Regex};
|
||||
///
|
||||
/// let re = Regex::new("foo[0-9]+")?;
|
||||
/// let text = "foo1 foo12 foo123";
|
||||
/// let matches: Vec<Match> = re.find_iter(text).collect();
|
||||
/// assert_eq!(matches, vec![
|
||||
/// Match::must(0, 0..4),
|
||||
/// Match::must(0, 5..10),
|
||||
/// Match::must(0, 11..17),
|
||||
/// ]);
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[inline]
|
||||
pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(
|
||||
&'r self,
|
||||
input: I,
|
||||
) -> FindMatches<'r, 'h, A> {
|
||||
let it = iter::Searcher::new(input.into());
|
||||
FindMatches { re: self, it }
|
||||
}
|
||||
}
|
||||
|
||||
/// Lower level fallible search routines that permit controlling where the
|
||||
/// search starts and ends in a particular sequence.
|
||||
impl<A: Automaton> Regex<A> {
|
||||
/// Returns the start and end offset of the leftmost match. If no match
/// exists, then `None` is returned.
///
/// This is like [`Regex::find`] but with two differences:
///
/// 1. It is not generic over `Into<Input>` and instead accepts a
/// `&Input`. This permits reusing the same `Input` for multiple searches
/// without needing to create a new one. This _may_ help with latency.
/// 2. It returns an error if the search could not complete where as
/// [`Regex::find`] will panic.
///
/// # Errors
///
/// This routine errors if the search could not complete. This can occur
/// in the following circumstances:
///
/// * The configuration of the DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the DFA quitting.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search returns an error, callers cannot know whether a match
/// exists or not.
#[inline]
pub fn try_search(
    &self,
    input: &Input<'_>,
) -> Result<Option<Match>, MatchError> {
    let (fwd, rev) = (self.forward(), self.reverse());
    // The forward DFA finds the end of the leftmost match (or bails out
    // with no match at all).
    let end = match fwd.try_search_fwd(input)? {
        None => return Ok(None),
        Some(end) => end,
    };
    // This special cases an empty match at the beginning of the search. If
    // our end matches our start, then since a reverse DFA can't match past
    // the start, it must follow that our starting position is also our end
    // position. So short circuit and skip the reverse search.
    if input.start() == end.offset() {
        return Ok(Some(Match::new(
            end.pattern(),
            end.offset()..end.offset(),
        )));
    }
    // We can also skip the reverse search if we know our search was
    // anchored. This occurs either when the input config is anchored or
    // when we know the regex itself is anchored. In this case, we know the
    // start of the match, if one is found, must be the start of the
    // search.
    if self.is_anchored(input) {
        return Ok(Some(Match::new(
            end.pattern(),
            input.start()..end.offset(),
        )));
    }
    // N.B. I have tentatively convinced myself that it isn't necessary
    // to specify the specific pattern for the reverse search since the
    // reverse search will always find the same pattern to match as the
    // forward search. But I lack a rigorous proof. Why not just provide
    // the pattern anyway? Well, if it is needed, then leaving it out
    // gives us a chance to find a witness. (Also, if we don't need to
    // specify the pattern, then we don't need to build the reverse DFA
    // with 'starts_for_each_pattern' enabled.)
    //
    // We also need to be careful to disable 'earliest' for the reverse
    // search, since it could be enabled for the forward search. In the
    // reverse case, to satisfy "leftmost" criteria, we need to match
    // as much as we can. We also need to be careful to make the search
    // anchored. We don't want the reverse search to report any matches
    // other than the one beginning at the end of our forward search.
    let revsearch = input
        .clone()
        .span(input.start()..end.offset())
        .anchored(Anchored::Yes)
        .earliest(false);
    let start = rev
        .try_search_rev(&revsearch)?
        .expect("reverse search must match if forward search does");
    assert_eq!(
        start.pattern(),
        end.pattern(),
        "forward and reverse search must match same pattern",
    );
    assert!(start.offset() <= end.offset());
    Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
}
|
||||
|
||||
/// Returns true if either the given input specifies an anchored search
|
||||
/// or if the underlying DFA is always anchored.
|
||||
fn is_anchored(&self, input: &Input<'_>) -> bool {
|
||||
match input.get_anchored() {
|
||||
Anchored::No => self.forward().is_always_start_anchored(),
|
||||
Anchored::Yes | Anchored::Pattern(_) => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Non-search APIs for querying information about the regex and setting a
/// prefilter.
impl<A: Automaton> Regex<A> {
    /// Return the underlying DFA responsible for forward matching.
    ///
    /// This is useful for accessing the underlying DFA and converting it to
    /// some other format or size. See the [`Builder::build_from_dfas`] docs
    /// for an example of where this might be useful.
    pub fn forward(&self) -> &A {
        &self.forward
    }

    /// Return the underlying DFA responsible for reverse matching.
    ///
    /// This is useful for accessing the underlying DFA and converting it to
    /// some other format or size. See the [`Builder::build_from_dfas`] docs
    /// for an example of where this might be useful.
    pub fn reverse(&self) -> &A {
        &self.reverse
    }

    /// Returns the total number of patterns matched by this regex.
    ///
    /// # Example
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::dfa::regex::Regex;
    ///
    /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
    /// assert_eq!(3, re.pattern_len());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn pattern_len(&self) -> usize {
        // Both DFAs are compiled from the same patterns, so their pattern
        // counts must always agree.
        assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
        self.forward().pattern_len()
    }
}
|
||||
|
||||
/// An iterator over all non-overlapping matches for an infallible search.
///
/// The iterator yields a [`Match`] value until no more matches could be found.
/// If the underlying regex engine returns an error, then a panic occurs.
///
/// The type parameters are as follows:
///
/// * `A` represents the type of the underlying DFA that implements the
/// [`Automaton`] trait.
///
/// The lifetime parameters are as follows:
///
/// * `'h` represents the lifetime of the haystack being searched.
/// * `'r` represents the lifetime of the regex object itself.
///
/// This iterator can be created with the [`Regex::find_iter`] method.
#[derive(Debug)]
pub struct FindMatches<'r, 'h, A> {
    // The regex that executes each search.
    re: &'r Regex<A>,
    // The low-level iteration protocol driver over the haystack.
    it: iter::Searcher<'h>,
}
|
||||
|
||||
impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> {
|
||||
type Item = Match;
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<Match> {
|
||||
let FindMatches { re, ref mut it } = *self;
|
||||
it.advance(|input| re.try_search(input))
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder for a regex based on deterministic finite automatons.
|
||||
///
|
||||
/// This builder permits configuring options for the syntax of a pattern, the
|
||||
/// NFA construction, the DFA construction and finally the regex searching
|
||||
/// itself. This builder is different from a general purpose regex builder in
|
||||
/// that it permits fine grain configuration of the construction process. The
|
||||
/// trade off for this is complexity, and the possibility of setting a
|
||||
/// configuration that might not make sense. For example, there are two
|
||||
/// different UTF-8 modes:
|
||||
///
|
||||
/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
|
||||
/// whether the pattern itself can contain sub-expressions that match invalid
|
||||
/// UTF-8.
|
||||
/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls
|
||||
/// how the regex iterators themselves advance the starting position of the
|
||||
/// next search when a match with zero length is found.
|
||||
///
|
||||
/// Generally speaking, callers will want to either enable all of these or
|
||||
/// disable all of these.
|
||||
///
|
||||
/// Internally, building a regex requires building two DFAs, where one is
|
||||
/// responsible for finding the end of a match and the other is responsible
|
||||
/// for finding the start of a match. If you only need to detect whether
|
||||
/// something matched, or only the end of a match, then you should use a
|
||||
/// [`dense::Builder`] to construct a single DFA, which is cheaper than
|
||||
/// building two DFAs.
|
||||
///
|
||||
/// # Build methods
|
||||
///
|
||||
/// This builder has a few "build" methods. In general, it's the result of
|
||||
/// combining the following parameters:
|
||||
///
|
||||
/// * Building one or many regexes.
|
||||
/// * Building a regex with dense or sparse DFAs.
|
||||
///
|
||||
/// The simplest "build" method is [`Builder::build`]. It accepts a single
|
||||
/// pattern and builds a dense DFA using `usize` for the state identifier
|
||||
/// representation.
|
||||
///
|
||||
/// The most general "build" method is [`Builder::build_many`], which permits
|
||||
/// building a regex that searches for multiple patterns simultaneously while
|
||||
/// using a specific state identifier representation.
|
||||
///
|
||||
/// The most flexible "build" method, but hardest to use, is
|
||||
/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
|
||||
/// just a pair of DFAs, and this method allows you to specify those DFAs
|
||||
/// exactly.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how to disable UTF-8 mode in the syntax and the regex
|
||||
/// itself. This is generally what you want for matching on arbitrary bytes.
|
||||
///
|
||||
/// ```
|
||||
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
|
||||
/// use regex_automata::{
|
||||
/// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
|
||||
/// };
|
||||
///
|
||||
/// let re = Regex::builder()
|
||||
/// .syntax(syntax::Config::new().utf8(false))
|
||||
/// .thompson(thompson::Config::new().utf8(false))
|
||||
/// .build(r"foo(?-u:[^b])ar.*")?;
|
||||
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
|
||||
/// let expected = Some(Match::must(0, 1..9));
|
||||
/// let got = re.find(haystack);
|
||||
/// assert_eq!(expected, got);
|
||||
/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
|
||||
/// // but the subsequent `.*` does not! Disabling UTF-8
|
||||
/// // on the syntax permits this.
|
||||
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[derive(Clone, Debug)]
pub struct Builder {
    // The underlying dense DFA builder, which also carries the syntax and
    // Thompson NFA configuration applied when compiling patterns.
    #[cfg(feature = "dfa-build")]
    dfa: dense::Builder,
}
|
||||
|
||||
impl Builder {
|
||||
/// Create a new regex builder with the default configuration.
pub fn new() -> Builder {
    Builder {
        #[cfg(feature = "dfa-build")]
        dfa: dense::Builder::new(),
    }
}
|
||||
|
||||
/// Build a regex from the given pattern.
///
/// If there was a problem parsing or compiling the pattern, then an error
/// is returned.
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
    // A single pattern is just the one-element case of the multi-pattern
    // constructor.
    self.build_many(&[pattern])
}
|
||||
|
||||
/// Build a regex from the given pattern using sparse DFAs.
///
/// If there was a problem parsing or compiling the pattern, then an error
/// is returned.
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn build_sparse(
    &self,
    pattern: &str,
) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
    self.build_many_sparse(&[pattern])
}
|
||||
|
||||
/// Build a regex from the given patterns.
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn build_many<P: AsRef<str>>(
    &self,
    patterns: &[P],
) -> Result<Regex, BuildError> {
    let forward = self.dfa.build_many(patterns)?;
    // The reverse DFA gets a distinct configuration: it must be anchored
    // (it only confirms a match ending where the forward search stopped),
    // use "all matches" semantics (so the leftmost start is found) and be
    // compiled from the reversed NFA. Prefilters and specialized start
    // states only help forward scanning, so they are disabled here.
    let reverse = self
        .dfa
        .clone()
        .configure(
            dense::Config::new()
                .prefilter(None)
                .specialize_start_states(false)
                .start_kind(StartKind::Anchored)
                .match_kind(MatchKind::All),
        )
        .thompson(crate::nfa::thompson::Config::new().reverse(true))
        .build_many(patterns)?;
    Ok(self.build_from_dfas(forward, reverse))
}
|
||||
|
||||
/// Build a sparse regex from the given patterns.
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn build_many_sparse<P: AsRef<str>>(
    &self,
    patterns: &[P],
) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
    // Compile dense DFAs first and then convert each to its sparse
    // representation, preserving the forward/reverse pairing.
    let dense_re = self.build_many(patterns)?;
    let fwd = dense_re.forward().to_sparse()?;
    let rev = dense_re.reverse().to_sparse()?;
    Ok(self.build_from_dfas(fwd, rev))
}
|
||||
|
||||
/// Build a regex from its component forward and reverse DFAs.
///
/// This is useful when deserializing a regex from some arbitrary
/// memory region. This is also useful for building regexes from other
/// types of DFAs.
///
/// If you're building the DFAs from scratch instead of building new DFAs
/// from other DFAs, then you'll need to make sure that the reverse DFA is
/// configured correctly to match the intended semantics. Namely:
///
/// * It should be anchored.
/// * It should use [`MatchKind::All`] semantics.
/// * It should match in reverse.
/// * Otherwise, its configuration should match the forward DFA.
///
/// If these conditions aren't satisfied, then the behavior of searches is
/// unspecified.
///
/// Note that when using this constructor, no configuration is applied.
/// Since this routine provides the DFAs to the builder, there is no
/// opportunity to apply other configuration options.
///
/// # Example
///
/// This example is a bit a contrived. The usual use of these methods
/// would involve serializing `initial_re` somewhere and then deserializing
/// it later to build a regex. But in this case, we do everything in
/// memory.
///
/// ```
/// use regex_automata::dfa::regex::Regex;
///
/// let initial_re = Regex::new("foo[0-9]+")?;
/// assert_eq!(true, initial_re.is_match(b"foo123"));
///
/// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
/// let re = Regex::builder().build_from_dfas(fwd, rev);
/// assert_eq!(true, re.is_match(b"foo123"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// This example shows how to build a `Regex` that uses sparse DFAs instead
/// of dense DFAs without using one of the convenience `build_sparse`
/// routines:
///
/// ```
/// use regex_automata::dfa::regex::Regex;
///
/// let initial_re = Regex::new("foo[0-9]+")?;
/// assert_eq!(true, initial_re.is_match(b"foo123"));
///
/// let fwd = initial_re.forward().to_sparse()?;
/// let rev = initial_re.reverse().to_sparse()?;
/// let re = Regex::builder().build_from_dfas(fwd, rev);
/// assert_eq!(true, re.is_match(b"foo123"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn build_from_dfas<A: Automaton>(
    &self,
    forward: A,
    reverse: A,
) -> Regex<A> {
    Regex { forward, reverse }
}
|
||||
|
||||
/// Set the syntax configuration for this builder using
/// [`syntax::Config`](crate::util::syntax::Config).
///
/// This permits setting things like case insensitivity, Unicode and multi
/// line mode.
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn syntax(
    &mut self,
    config: crate::util::syntax::Config,
) -> &mut Builder {
    // Forwarded to the dense builder, which owns all configuration.
    self.dfa.syntax(config);
    self
}
|
||||
|
||||
/// Set the Thompson NFA configuration for this builder using
/// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
///
/// This permits setting things like whether additional time should be
/// spent shrinking the size of the NFA.
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
pub fn thompson(
    &mut self,
    config: crate::nfa::thompson::Config,
) -> &mut Builder {
    // Forwarded to the dense builder, which owns all configuration.
    self.dfa.thompson(config);
    self
}
|
||||
|
||||
/// Set the dense DFA compilation configuration for this builder using
/// [`dense::Config`].
///
/// This permits setting things like whether the underlying DFAs should
/// be minimized.
#[cfg(feature = "dfa-build")]
pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
    self.dfa.configure(config);
    self
}
|
||||
}
|
||||
|
||||
impl Default for Builder {
    // The default builder is just the default configuration.
    fn default() -> Builder {
        Builder::new()
    }
}
|
||||
242
third-party/vendor/regex-automata/src/dfa/remapper.rs
vendored
Normal file
242
third-party/vendor/regex-automata/src/dfa/remapper.rs
vendored
Normal file
|
|
@ -0,0 +1,242 @@
|
|||
use alloc::vec::Vec;
|
||||
|
||||
use crate::util::primitives::StateID;
|
||||
|
||||
/// Remappable is a tightly coupled abstraction that facilitates remapping
/// state identifiers in DFAs.
///
/// The main idea behind remapping state IDs is that DFAs often need to check
/// if a certain state is a "special" state of some kind (like a match state)
/// during a search. Since this is extremely perf critical code, we want this
/// check to be as fast as possible. Partitioning state IDs, for example,
/// into "non-match" and "match" states means one can tell if a state is a
/// match state via a simple comparison of the state ID.
///
/// The issue is that during the DFA construction process, it's not
/// particularly easy to partition the states. Instead, the simplest thing is
/// to often just do a pass over all of the states and shuffle them into their
/// desired partitionings. To do that, we need a mechanism for swapping states.
/// Hence, this abstraction.
///
/// Normally, for such little code, I would just duplicate it. But this is a
/// key optimization and the implementation is a bit subtle. So the abstraction
/// is basically a ham-fisted attempt at DRY. The only place we use this is in
/// the dense and one-pass DFAs.
///
/// See also src/dfa/special.rs for a more detailed explanation of how dense
/// DFAs are partitioned.
pub(super) trait Remappable: core::fmt::Debug {
    /// Return the total number of states.
    fn state_len(&self) -> usize;

    /// Return the power-of-2 exponent that yields the stride. The pertinent
    /// laws here are, where N=stride2: 2^N=stride and len(alphabet) <= stride.
    fn stride2(&self) -> usize;

    /// Swap the states pointed to by the given IDs. The underlying finite
    /// state machine should be mutated such that all of the transitions in
    /// `id1` are now in the memory region where the transitions for `id2`
    /// were, and all of the transitions in `id2` are now in the memory region
    /// where the transitions for `id1` were.
    ///
    /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`.
    ///
    /// It is expected that, after calling this, the underlying value will be
    /// left in an inconsistent state, since any other transitions pointing to,
    /// e.g., `id1` need to be updated to point to `id2`, since that's where
    /// `id1` moved to.
    ///
    /// In order to "fix" the underlying inconsistent state, a `Remapper`
    /// should be used to guarantee that `remap` is called at the appropriate
    /// time.
    fn swap_states(&mut self, id1: StateID, id2: StateID);

    /// This must remap every single state ID in the underlying value according
    /// to the function given. For example, in a DFA, this should remap every
    /// transition and every starting state ID.
    fn remap(&mut self, map: impl Fn(StateID) -> StateID);
}
|
||||
|
||||
/// Remapper is an abstraction that manages the remapping of state IDs in a
/// finite state machine. This is useful when one wants to shuffle states into
/// different positions in the machine.
///
/// One of the key complexities this manages is the ability to correctly move
/// one state multiple times.
///
/// Once shuffling is complete, `remap` must be called, which will rewrite
/// all pertinent transitions to updated state IDs. Neglecting to call `remap`
/// will almost certainly result in a corrupt machine.
#[derive(Debug)]
pub(super) struct Remapper {
    /// A map from the index of a state to its pre-multiplied identifier.
    ///
    /// When a state is swapped with another, then their corresponding
    /// locations in this map are also swapped. Thus, its new position will
    /// still point to its old pre-multiplied StateID.
    ///
    /// While there is a bit more to it, this then allows us to rewrite the
    /// state IDs in a DFA's transition table in a single pass. This is done
    /// by iterating over every ID in this map, then iterating over each
    /// transition for the state at that ID and re-mapping the transition from
    /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
    /// in this map where `old_id` *started*, and set it to where it ended up
    /// after all swaps have been completed.
    map: Vec<StateID>,
    /// A mapper from state index to state ID (and back).
    idxmap: IndexMapper,
}
|
||||
|
||||
impl Remapper {
|
||||
/// Create a new remapper from the given remappable implementation. The
|
||||
/// remapper can then be used to swap states. The remappable value given
|
||||
/// here must the same one given to `swap` and `remap`.
|
||||
pub(super) fn new(r: &impl Remappable) -> Remapper {
|
||||
let idxmap = IndexMapper { stride2: r.stride2() };
|
||||
let map = (0..r.state_len()).map(|i| idxmap.to_state_id(i)).collect();
|
||||
Remapper { map, idxmap }
|
||||
}
|
||||
|
||||
/// Swap two states. Once this is called, callers must follow through to
|
||||
/// call `remap`, or else it's possible for the underlying remappable
|
||||
/// value to be in a corrupt state.
|
||||
pub(super) fn swap(
|
||||
&mut self,
|
||||
r: &mut impl Remappable,
|
||||
id1: StateID,
|
||||
id2: StateID,
|
||||
) {
|
||||
if id1 == id2 {
|
||||
return;
|
||||
}
|
||||
r.swap_states(id1, id2);
|
||||
self.map.swap(self.idxmap.to_index(id1), self.idxmap.to_index(id2));
|
||||
}
|
||||
|
||||
/// Complete the remapping process by rewriting all state IDs in the
/// remappable value according to the swaps performed.
pub(super) fn remap(mut self, r: &mut impl Remappable) {
    // Update the map to account for states that have been swapped
    // multiple times. For example, if (A, C) and (C, G) are swapped, then
    // transitions previously pointing to A should now point to G. But if
    // we don't update our map, they will erroneously be set to C. All we
    // do is follow the swaps in our map until we see our original state
    // ID.
    //
    // The intuition here is to think about how changes are made to the
    // map: only through pairwise swaps. That means that starting at any
    // given state, it is always possible to find the loop back to that
    // state by following the swaps represented in the map (which might be
    // 0 swaps).
    //
    // We are also careful to clone the map before starting in order to
    // freeze it. We use the frozen map to find our loops, since we need to
    // update our map as well. Without freezing it, our updates could break
    // the loops referenced above and produce incorrect results.
    let oldmap = self.map.clone();
    for i in 0..r.state_len() {
        let cur_id = self.idxmap.to_state_id(i);
        let mut new_id = oldmap[i];
        if cur_id == new_id {
            continue;
        }
        loop {
            let id = oldmap[self.idxmap.to_index(new_id)];
            if cur_id == id {
                self.map[i] = new_id;
                break;
            }
            new_id = id;
        }
    }
    // With the map fixed up, a single pass over every transition (and
    // start state) suffices to rewrite all IDs.
    r.remap(|next| self.map[self.idxmap.to_index(next)]);
}
|
||||
}
|
||||
|
||||
/// A simple type for mapping between state indices and state IDs.
///
/// The reason why this exists is because state IDs are "premultiplied." That
/// is, in order to get to the transitions for a particular state, one need
/// only use the state ID as-is, instead of having to multiply it by the
/// transition table's stride.
///
/// The downside of this is that it's inconvenient to map between state IDs
/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like
/// `0`, `0+stride`, `0+2*stride`, `0+3*stride`, etc., instead of `0`, `1`,
/// `2`, `3`, etc.
///
/// Since our state IDs are premultiplied, we can convert back-and-forth
/// between IDs and indices by simply unmultiplying the IDs and multiplying the
/// indices.
#[derive(Debug)]
struct IndexMapper {
    /// The power of 2 corresponding to the stride of the corresponding
    /// transition table. 'id >> stride2' de-multiplies an ID while 'index <<
    /// stride2' pre-multiplies an index to an ID.
    stride2: usize,
}
|
||||
|
||||
impl IndexMapper {
    /// Convert a state ID to a state index.
    ///
    /// Since IDs are premultiplied by the table stride (a power of 2),
    /// a right shift by 'stride2' recovers the dense index.
    fn to_index(&self, id: StateID) -> usize {
        id.as_usize() >> self.stride2
    }

    /// Convert a state index to a state ID.
    fn to_state_id(&self, index: usize) -> StateID {
        // CORRECTNESS: If the given index is not valid, then it is not
        // required for this to panic or return a valid state ID. We'll "just"
        // wind up with panics or silent logic errors at some other point.
        StateID::new_unchecked(index << self.stride2)
    }
}
|
||||
|
||||
#[cfg(feature = "dfa-build")]
mod dense {
    use crate::{dfa::dense::OwnedDFA, util::primitives::StateID};

    use super::Remappable;

    // A dense DFA is remappable: each trait method simply delegates to the
    // identically-named inherent method on OwnedDFA.
    impl Remappable for OwnedDFA {
        fn state_len(&self) -> usize {
            OwnedDFA::state_len(self)
        }

        fn stride2(&self) -> usize {
            OwnedDFA::stride2(self)
        }

        fn swap_states(&mut self, id1: StateID, id2: StateID) {
            OwnedDFA::swap_states(self, id1, id2)
        }

        fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
            OwnedDFA::remap(self, map)
        }
    }
}
|
||||
|
||||
#[cfg(feature = "dfa-onepass")]
mod onepass {
    use crate::{dfa::onepass::DFA, util::primitives::StateID};

    use super::Remappable;

    // A one-pass DFA is remappable too; unlike the dense DFA, its state IDs
    // are not premultiplied (see stride2 below).
    impl Remappable for DFA {
        fn state_len(&self) -> usize {
            DFA::state_len(self)
        }

        fn stride2(&self) -> usize {
            // We don't do pre-multiplication for the one-pass DFA, so
            // returning 0 has the effect of making state IDs and state indices
            // equivalent.
            0
        }

        fn swap_states(&mut self, id1: StateID, id2: StateID) {
            DFA::swap_states(self, id1, id2)
        }

        fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
            DFA::remap(self, map)
        }
    }
}
|
||||
644
third-party/vendor/regex-automata/src/dfa/search.rs
vendored
Normal file
644
third-party/vendor/regex-automata/src/dfa/search.rs
vendored
Normal file
|
|
@ -0,0 +1,644 @@
|
|||
use crate::{
|
||||
dfa::{
|
||||
accel,
|
||||
automaton::{Automaton, OverlappingState},
|
||||
},
|
||||
util::{
|
||||
prefilter::Prefilter,
|
||||
primitives::StateID,
|
||||
search::{Anchored, HalfMatch, Input, Span},
|
||||
},
|
||||
MatchError,
|
||||
};
|
||||
|
||||
#[inline(never)]
|
||||
pub fn find_fwd<A: Automaton + ?Sized>(
|
||||
dfa: &A,
|
||||
input: &Input<'_>,
|
||||
) -> Result<Option<HalfMatch>, MatchError> {
|
||||
if input.is_done() {
|
||||
return Ok(None);
|
||||
}
|
||||
let pre = if input.get_anchored().is_anchored() {
|
||||
None
|
||||
} else {
|
||||
dfa.get_prefilter()
|
||||
};
|
||||
// Searching with a pattern ID is always anchored, so we should never use
|
||||
// a prefilter.
|
||||
if pre.is_some() {
|
||||
if input.get_earliest() {
|
||||
find_fwd_imp(dfa, input, pre, true)
|
||||
} else {
|
||||
find_fwd_imp(dfa, input, pre, false)
|
||||
}
|
||||
} else {
|
||||
if input.get_earliest() {
|
||||
find_fwd_imp(dfa, input, None, true)
|
||||
} else {
|
||||
find_fwd_imp(dfa, input, None, false)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn find_fwd_imp<A: Automaton + ?Sized>(
|
||||
dfa: &A,
|
||||
input: &Input<'_>,
|
||||
pre: Option<&'_ Prefilter>,
|
||||
earliest: bool,
|
||||
) -> Result<Option<HalfMatch>, MatchError> {
|
||||
// See 'prefilter_restart' docs for explanation.
|
||||
let universal_start = dfa.universal_start_state(Anchored::No).is_some();
|
||||
let mut mat = None;
|
||||
let mut sid = init_fwd(dfa, input)?;
|
||||
let mut at = input.start();
|
||||
// This could just be a closure, but then I think it would be unsound
|
||||
// because it would need to be safe to invoke. This way, the lack of safety
|
||||
// is clearer in the code below.
|
||||
macro_rules! next_unchecked {
|
||||
($sid:expr, $at:expr) => {{
|
||||
let byte = *input.haystack().get_unchecked($at);
|
||||
dfa.next_state_unchecked($sid, byte)
|
||||
}};
|
||||
}
|
||||
|
||||
if let Some(ref pre) = pre {
|
||||
let span = Span::from(at..input.end());
|
||||
// If a prefilter doesn't report false positives, then we don't need to
|
||||
// touch the DFA at all. However, since all matches include the pattern
|
||||
// ID, and the prefilter infrastructure doesn't report pattern IDs, we
|
||||
// limit this optimization to cases where there is exactly one pattern.
|
||||
// In that case, any match must be the 0th pattern.
|
||||
match pre.find(input.haystack(), span) {
|
||||
None => return Ok(mat),
|
||||
Some(ref span) => {
|
||||
at = span.start;
|
||||
if !universal_start {
|
||||
sid = prefilter_restart(dfa, &input, at)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
while at < input.end() {
|
||||
// SAFETY: There are two safety invariants we need to uphold here in
|
||||
// the loops below: that 'sid' and 'prev_sid' are valid state IDs
|
||||
// for this DFA, and that 'at' is a valid index into 'haystack'.
|
||||
// For the former, we rely on the invariant that next_state* and
|
||||
// start_state_forward always returns a valid state ID (given a valid
|
||||
// state ID in the former case). For the latter safety invariant, we
|
||||
// always guard unchecked access with a check that 'at' is less than
|
||||
// 'end', where 'end <= haystack.len()'. In the unrolled loop below, we
|
||||
// ensure that 'at' is always in bounds.
|
||||
//
|
||||
// PERF: See a similar comment in src/hybrid/search.rs that justifies
|
||||
// this extra work to make the search loop fast. The same reasoning and
|
||||
// benchmarks apply here.
|
||||
let mut prev_sid;
|
||||
while at < input.end() {
|
||||
prev_sid = unsafe { next_unchecked!(sid, at) };
|
||||
if dfa.is_special_state(prev_sid) || at + 3 >= input.end() {
|
||||
core::mem::swap(&mut prev_sid, &mut sid);
|
||||
break;
|
||||
}
|
||||
at += 1;
|
||||
|
||||
sid = unsafe { next_unchecked!(prev_sid, at) };
|
||||
if dfa.is_special_state(sid) {
|
||||
break;
|
||||
}
|
||||
at += 1;
|
||||
|
||||
prev_sid = unsafe { next_unchecked!(sid, at) };
|
||||
if dfa.is_special_state(prev_sid) {
|
||||
core::mem::swap(&mut prev_sid, &mut sid);
|
||||
break;
|
||||
}
|
||||
at += 1;
|
||||
|
||||
sid = unsafe { next_unchecked!(prev_sid, at) };
|
||||
if dfa.is_special_state(sid) {
|
||||
break;
|
||||
}
|
||||
at += 1;
|
||||
}
|
||||
if dfa.is_special_state(sid) {
|
||||
if dfa.is_start_state(sid) {
|
||||
if let Some(ref pre) = pre {
|
||||
let span = Span::from(at..input.end());
|
||||
match pre.find(input.haystack(), span) {
|
||||
None => return Ok(mat),
|
||||
Some(ref span) => {
|
||||
// We want to skip any update to 'at' below
|
||||
// at the end of this iteration and just
|
||||
// jump immediately back to the next state
|
||||
// transition at the leading position of the
|
||||
// candidate match.
|
||||
//
|
||||
// ... but only if we actually made progress
|
||||
// with our prefilter, otherwise if the start
|
||||
// state has a self-loop, we can get stuck.
|
||||
if span.start > at {
|
||||
at = span.start;
|
||||
if !universal_start {
|
||||
sid = prefilter_restart(dfa, &input, at)?;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if dfa.is_accel_state(sid) {
|
||||
let needles = dfa.accelerator(sid);
|
||||
at = accel::find_fwd(needles, input.haystack(), at + 1)
|
||||
.unwrap_or(input.end());
|
||||
continue;
|
||||
}
|
||||
} else if dfa.is_match_state(sid) {
|
||||
let pattern = dfa.match_pattern(sid, 0);
|
||||
mat = Some(HalfMatch::new(pattern, at));
|
||||
if earliest {
|
||||
return Ok(mat);
|
||||
}
|
||||
if dfa.is_accel_state(sid) {
|
||||
let needles = dfa.accelerator(sid);
|
||||
at = accel::find_fwd(needles, input.haystack(), at + 1)
|
||||
.unwrap_or(input.end());
|
||||
continue;
|
||||
}
|
||||
} else if dfa.is_accel_state(sid) {
|
||||
let needs = dfa.accelerator(sid);
|
||||
at = accel::find_fwd(needs, input.haystack(), at + 1)
|
||||
.unwrap_or(input.end());
|
||||
continue;
|
||||
} else if dfa.is_dead_state(sid) {
|
||||
return Ok(mat);
|
||||
} else {
|
||||
// It's important that this is a debug_assert, since this can
|
||||
// actually be tripped even if DFA::from_bytes succeeds and
|
||||
// returns a supposedly valid DFA.
|
||||
return Err(MatchError::quit(input.haystack()[at], at));
|
||||
}
|
||||
}
|
||||
at += 1;
|
||||
}
|
||||
eoi_fwd(dfa, input, &mut sid, &mut mat)?;
|
||||
Ok(mat)
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
pub fn find_rev<A: Automaton + ?Sized>(
|
||||
dfa: &A,
|
||||
input: &Input<'_>,
|
||||
) -> Result<Option<HalfMatch>, MatchError> {
|
||||
if input.is_done() {
|
||||
return Ok(None);
|
||||
}
|
||||
if input.get_earliest() {
|
||||
find_rev_imp(dfa, input, true)
|
||||
} else {
|
||||
find_rev_imp(dfa, input, false)
|
||||
}
|
||||
}
|
||||
|
||||
/// The reverse search loop, specialized on "earliest" semantics.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_rev_imp<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
    earliest: bool,
) -> Result<Option<HalfMatch>, MatchError> {
    let mut mat = None;
    let mut sid = init_rev(dfa, input)?;
    // In reverse search, the loop below can't handle the case of searching an
    // empty slice. Ideally we could write something congruent to the forward
    // search, i.e., 'while at >= start', but 'start' might be 0. Since we use
    // an unsigned offset, 'at >= 0' is trivially always true. We could avoid
    // this extra case handling by using a signed offset, but Rust makes it
    // annoying to do. So... We just handle the empty case separately.
    if input.start() == input.end() {
        eoi_rev(dfa, input, &mut sid, &mut mat)?;
        return Ok(mat);
    }

    let mut at = input.end() - 1;
    // Not a closure for soundness reasons; see the same macro in 'find_fwd'.
    macro_rules! next_unchecked {
        ($sid:expr, $at:expr) => {{
            let byte = *input.haystack().get_unchecked($at);
            dfa.next_state_unchecked($sid, byte)
        }};
    }
    loop {
        // SAFETY: See comments in 'find_fwd' for a safety argument.
        let mut prev_sid;
        while at >= input.start() {
            prev_sid = unsafe { next_unchecked!(sid, at) };
            if dfa.is_special_state(prev_sid)
                || at <= input.start().saturating_add(3)
            {
                core::mem::swap(&mut prev_sid, &mut sid);
                break;
            }
            at -= 1;

            sid = unsafe { next_unchecked!(prev_sid, at) };
            if dfa.is_special_state(sid) {
                break;
            }
            at -= 1;

            prev_sid = unsafe { next_unchecked!(sid, at) };
            if dfa.is_special_state(prev_sid) {
                core::mem::swap(&mut prev_sid, &mut sid);
                break;
            }
            at -= 1;

            sid = unsafe { next_unchecked!(prev_sid, at) };
            if dfa.is_special_state(sid) {
                break;
            }
            at -= 1;
        }
        if dfa.is_special_state(sid) {
            if dfa.is_start_state(sid) {
                if dfa.is_accel_state(sid) {
                    let needles = dfa.accelerator(sid);
                    at = accel::find_rev(needles, input.haystack(), at)
                        .map(|i| i + 1)
                        .unwrap_or(input.start());
                }
            } else if dfa.is_match_state(sid) {
                let pattern = dfa.match_pattern(sid, 0);
                // Since reverse searches report the beginning of a match
                // and the beginning is inclusive (not exclusive like the
                // end of a match), we add 1 to make it inclusive.
                mat = Some(HalfMatch::new(pattern, at + 1));
                if earliest {
                    return Ok(mat);
                }
                if dfa.is_accel_state(sid) {
                    let needles = dfa.accelerator(sid);
                    at = accel::find_rev(needles, input.haystack(), at)
                        .map(|i| i + 1)
                        .unwrap_or(input.start());
                }
            } else if dfa.is_accel_state(sid) {
                let needles = dfa.accelerator(sid);
                // If the accelerator returns nothing, why don't we quit the
                // search? Well, if the accelerator doesn't find anything, that
                // doesn't mean we don't have a match. It just means that we
                // can't leave the current state given one of the 255 possible
                // byte values. However, there might be an EOI transition. So
                // we set 'at' to the start of the search range, which will
                // cause this loop to stop and fall down into the EOI
                // transition.
                at = accel::find_rev(needles, input.haystack(), at)
                    .map(|i| i + 1)
                    .unwrap_or(input.start());
            } else if dfa.is_dead_state(sid) {
                return Ok(mat);
            } else {
                return Err(MatchError::quit(input.haystack()[at], at));
            }
        }
        if at == input.start() {
            break;
        }
        at -= 1;
    }
    eoi_rev(dfa, input, &mut sid, &mut mat)?;
    Ok(mat)
}
|
||||
|
||||
#[inline(never)]
|
||||
pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
|
||||
dfa: &A,
|
||||
input: &Input<'_>,
|
||||
state: &mut OverlappingState,
|
||||
) -> Result<(), MatchError> {
|
||||
state.mat = None;
|
||||
if input.is_done() {
|
||||
return Ok(());
|
||||
}
|
||||
let pre = if input.get_anchored().is_anchored() {
|
||||
None
|
||||
} else {
|
||||
dfa.get_prefilter()
|
||||
};
|
||||
if pre.is_some() {
|
||||
find_overlapping_fwd_imp(dfa, input, pre, state)
|
||||
} else {
|
||||
find_overlapping_fwd_imp(dfa, input, None, state)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn find_overlapping_fwd_imp<A: Automaton + ?Sized>(
|
||||
dfa: &A,
|
||||
input: &Input<'_>,
|
||||
pre: Option<&'_ Prefilter>,
|
||||
state: &mut OverlappingState,
|
||||
) -> Result<(), MatchError> {
|
||||
// See 'prefilter_restart' docs for explanation.
|
||||
let universal_start = dfa.universal_start_state(Anchored::No).is_some();
|
||||
let mut sid = match state.id {
|
||||
None => {
|
||||
state.at = input.start();
|
||||
init_fwd(dfa, input)?
|
||||
}
|
||||
Some(sid) => {
|
||||
if let Some(match_index) = state.next_match_index {
|
||||
let match_len = dfa.match_len(sid);
|
||||
if match_index < match_len {
|
||||
state.next_match_index = Some(match_index + 1);
|
||||
let pattern = dfa.match_pattern(sid, match_index);
|
||||
state.mat = Some(HalfMatch::new(pattern, state.at));
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
// Once we've reported all matches at a given position, we need to
|
||||
// advance the search to the next position.
|
||||
state.at += 1;
|
||||
if state.at > input.end() {
|
||||
return Ok(());
|
||||
}
|
||||
sid
|
||||
}
|
||||
};
|
||||
|
||||
// NOTE: We don't optimize the crap out of this routine primarily because
|
||||
// it seems like most find_overlapping searches will have higher match
|
||||
// counts, and thus, throughput is perhaps not as important. But if you
|
||||
// have a use case for something faster, feel free to file an issue.
|
||||
while state.at < input.end() {
|
||||
sid = dfa.next_state(sid, input.haystack()[state.at]);
|
||||
if dfa.is_special_state(sid) {
|
||||
state.id = Some(sid);
|
||||
if dfa.is_start_state(sid) {
|
||||
if let Some(ref pre) = pre {
|
||||
let span = Span::from(state.at..input.end());
|
||||
match pre.find(input.haystack(), span) {
|
||||
None => return Ok(()),
|
||||
Some(ref span) => {
|
||||
if span.start > state.at {
|
||||
state.at = span.start;
|
||||
if !universal_start {
|
||||
sid = prefilter_restart(
|
||||
dfa, &input, state.at,
|
||||
)?;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if dfa.is_accel_state(sid) {
|
||||
let needles = dfa.accelerator(sid);
|
||||
state.at = accel::find_fwd(
|
||||
needles,
|
||||
input.haystack(),
|
||||
state.at + 1,
|
||||
)
|
||||
.unwrap_or(input.end());
|
||||
continue;
|
||||
}
|
||||
} else if dfa.is_match_state(sid) {
|
||||
state.next_match_index = Some(1);
|
||||
let pattern = dfa.match_pattern(sid, 0);
|
||||
state.mat = Some(HalfMatch::new(pattern, state.at));
|
||||
return Ok(());
|
||||
} else if dfa.is_accel_state(sid) {
|
||||
let needs = dfa.accelerator(sid);
|
||||
// If the accelerator returns nothing, why don't we quit the
|
||||
// search? Well, if the accelerator doesn't find anything, that
|
||||
// doesn't mean we don't have a match. It just means that we
|
||||
// can't leave the current state given one of the 255 possible
|
||||
// byte values. However, there might be an EOI transition. So
|
||||
// we set 'at' to the end of the haystack, which will cause
|
||||
// this loop to stop and fall down into the EOI transition.
|
||||
state.at =
|
||||
accel::find_fwd(needs, input.haystack(), state.at + 1)
|
||||
.unwrap_or(input.end());
|
||||
continue;
|
||||
} else if dfa.is_dead_state(sid) {
|
||||
return Ok(());
|
||||
} else {
|
||||
return Err(MatchError::quit(
|
||||
input.haystack()[state.at],
|
||||
state.at,
|
||||
));
|
||||
}
|
||||
}
|
||||
state.at += 1;
|
||||
}
|
||||
|
||||
let result = eoi_fwd(dfa, input, &mut sid, &mut state.mat);
|
||||
state.id = Some(sid);
|
||||
if state.mat.is_some() {
|
||||
// '1' is always correct here since if we get to this point, this
|
||||
// always corresponds to the first (index '0') match discovered at
|
||||
// this position. So the next match to report at this position (if
|
||||
// it exists) is at index '1'.
|
||||
state.next_match_index = Some(1);
|
||||
}
|
||||
result
|
||||
}
|
||||
|
||||
/// Advance an overlapping reverse search by one match, recording the result
/// (if any) in `state.mat`.
#[inline(never)]
pub(crate) fn find_overlapping_rev<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
    state: &mut OverlappingState,
) -> Result<(), MatchError> {
    state.mat = None;
    if input.is_done() {
        return Ok(());
    }
    let mut sid = match state.id {
        None => {
            // Fresh search: compute the start state and position the cursor
            // at the last byte (or arrange to take the EOI transition
            // immediately for an empty range).
            let sid = init_rev(dfa, input)?;
            state.id = Some(sid);
            if input.start() == input.end() {
                state.rev_eoi = true;
            } else {
                state.at = input.end() - 1;
            }
            sid
        }
        Some(sid) => {
            // Resuming: if parked on a match state, report the next pattern
            // matched at this position, if any remain.
            if let Some(match_index) = state.next_match_index {
                let match_len = dfa.match_len(sid);
                if match_index < match_len {
                    state.next_match_index = Some(match_index + 1);
                    let pattern = dfa.match_pattern(sid, match_index);
                    state.mat = Some(HalfMatch::new(pattern, state.at));
                    return Ok(());
                }
            }
            // Once we've reported all matches at a given position, we need
            // to advance the search to the next position. However, if we've
            // already followed the EOI transition, then we know we're done
            // with the search and there cannot be any more matches to report.
            if state.rev_eoi {
                return Ok(());
            } else if state.at == input.start() {
                // At this point, we should follow the EOI transition. This
                // will cause us to skip the main loop below and fall through
                // to the final 'eoi_rev' transition.
                state.rev_eoi = true;
            } else {
                // We haven't hit the end of the search yet, so move on.
                state.at -= 1;
            }
            sid
        }
    };
    while !state.rev_eoi {
        sid = dfa.next_state(sid, input.haystack()[state.at]);
        if dfa.is_special_state(sid) {
            state.id = Some(sid);
            if dfa.is_start_state(sid) {
                if dfa.is_accel_state(sid) {
                    let needles = dfa.accelerator(sid);
                    state.at =
                        accel::find_rev(needles, input.haystack(), state.at)
                            .map(|i| i + 1)
                            .unwrap_or(input.start());
                }
            } else if dfa.is_match_state(sid) {
                state.next_match_index = Some(1);
                let pattern = dfa.match_pattern(sid, 0);
                state.mat = Some(HalfMatch::new(pattern, state.at + 1));
                return Ok(());
            } else if dfa.is_accel_state(sid) {
                let needles = dfa.accelerator(sid);
                // If the accelerator returns nothing, why don't we quit the
                // search? Well, if the accelerator doesn't find anything, that
                // doesn't mean we don't have a match. It just means that we
                // can't leave the current state given one of the 255 possible
                // byte values. However, there might be an EOI transition. So
                // we set 'at' to the start of the search range, which will
                // cause this loop to stop and fall down into the EOI
                // transition.
                state.at =
                    accel::find_rev(needles, input.haystack(), state.at)
                        .map(|i| i + 1)
                        .unwrap_or(input.start());
            } else if dfa.is_dead_state(sid) {
                return Ok(());
            } else {
                return Err(MatchError::quit(
                    input.haystack()[state.at],
                    state.at,
                ));
            }
        }
        if state.at == input.start() {
            break;
        }
        state.at -= 1;
    }

    let result = eoi_rev(dfa, input, &mut sid, &mut state.mat);
    state.rev_eoi = true;
    state.id = Some(sid);
    if state.mat.is_some() {
        // '1' is always correct here since if we get to this point, this
        // always corresponds to the first (index '0') match discovered at
        // this position. So the next match to report at this position (if
        // it exists) is at index '1'.
        state.next_match_index = Some(1);
    }
    result
}
|
||||
|
||||
/// Compute the forward start state for the given input configuration.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_fwd<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
) -> Result<StateID, MatchError> {
    let sid = dfa.start_state_forward(input)?;
    // Start states can never be match states, since all matches are delayed
    // by 1 byte.
    debug_assert!(!dfa.is_match_state(sid));
    Ok(sid)
}
|
||||
|
||||
/// Compute the reverse start state for the given input configuration.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_rev<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
) -> Result<StateID, MatchError> {
    let sid = dfa.start_state_reverse(input)?;
    // Start states can never be match states, since all matches are delayed
    // by 1 byte.
    debug_assert!(!dfa.is_match_state(sid));
    Ok(sid)
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn eoi_fwd<A: Automaton + ?Sized>(
|
||||
dfa: &A,
|
||||
input: &Input<'_>,
|
||||
sid: &mut StateID,
|
||||
mat: &mut Option<HalfMatch>,
|
||||
) -> Result<(), MatchError> {
|
||||
let sp = input.get_span();
|
||||
match input.haystack().get(sp.end) {
|
||||
Some(&b) => {
|
||||
*sid = dfa.next_state(*sid, b);
|
||||
if dfa.is_match_state(*sid) {
|
||||
let pattern = dfa.match_pattern(*sid, 0);
|
||||
*mat = Some(HalfMatch::new(pattern, sp.end));
|
||||
} else if dfa.is_quit_state(*sid) {
|
||||
return Err(MatchError::quit(b, sp.end));
|
||||
}
|
||||
}
|
||||
None => {
|
||||
*sid = dfa.next_eoi_state(*sid);
|
||||
if dfa.is_match_state(*sid) {
|
||||
let pattern = dfa.match_pattern(*sid, 0);
|
||||
*mat = Some(HalfMatch::new(pattern, input.haystack().len()));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn eoi_rev<A: Automaton + ?Sized>(
|
||||
dfa: &A,
|
||||
input: &Input<'_>,
|
||||
sid: &mut StateID,
|
||||
mat: &mut Option<HalfMatch>,
|
||||
) -> Result<(), MatchError> {
|
||||
let sp = input.get_span();
|
||||
if sp.start > 0 {
|
||||
let byte = input.haystack()[sp.start - 1];
|
||||
*sid = dfa.next_state(*sid, byte);
|
||||
if dfa.is_match_state(*sid) {
|
||||
let pattern = dfa.match_pattern(*sid, 0);
|
||||
*mat = Some(HalfMatch::new(pattern, sp.start));
|
||||
} else if dfa.is_quit_state(*sid) {
|
||||
return Err(MatchError::quit(byte, sp.start - 1));
|
||||
}
|
||||
} else {
|
||||
*sid = dfa.next_eoi_state(*sid);
|
||||
if dfa.is_match_state(*sid) {
|
||||
let pattern = dfa.match_pattern(*sid, 0);
|
||||
*mat = Some(HalfMatch::new(pattern, 0));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Re-compute the starting state that a DFA should be in after finding a
/// prefilter candidate match at the position `at`.
///
/// The function with the same name has a bit more docs in hybrid/search.rs.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn prefilter_restart<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
    at: usize,
) -> Result<StateID, MatchError> {
    // Clone so we can move the start without disturbing the caller's input.
    let mut input = input.clone();
    input.set_start(at);
    init_fwd(dfa, &input)
}
|
||||
2635
third-party/vendor/regex-automata/src/dfa/sparse.rs
vendored
Normal file
2635
third-party/vendor/regex-automata/src/dfa/sparse.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
494
third-party/vendor/regex-automata/src/dfa/special.rs
vendored
Normal file
494
third-party/vendor/regex-automata/src/dfa/special.rs
vendored
Normal file
|
|
@ -0,0 +1,494 @@
|
|||
use crate::{
|
||||
dfa::DEAD,
|
||||
util::{
|
||||
primitives::StateID,
|
||||
wire::{self, DeserializeError, Endian, SerializeError},
|
||||
},
|
||||
};
|
||||
|
||||
macro_rules! err {
|
||||
($msg:expr) => {
|
||||
return Err(DeserializeError::generic($msg));
|
||||
};
|
||||
}
|
||||
|
||||
// Special represents the identifiers in a DFA that correspond to "special"
|
||||
// states. If a state is one or more of the following, then it is considered
|
||||
// special:
|
||||
//
|
||||
// * dead - A non-matching state where all outgoing transitions lead back to
|
||||
// itself. There is only one of these, regardless of whether minimization
|
||||
// has run. The dead state always has an ID of 0. i.e., It is always the
|
||||
// first state in a DFA.
|
||||
// * quit - A state that is entered whenever a byte is seen that should cause
|
||||
// a DFA to give up and stop searching. This results in a MatchError::quit
|
||||
// error being returned at search time. The default configuration for a DFA
|
||||
// has no quit bytes, which means this state is unreachable by default,
|
||||
// although it is always present for reasons of implementation simplicity.
|
||||
// This state is only reachable when the caller configures the DFA to quit
|
||||
// on certain bytes. There is always exactly one of these states and it
|
||||
// is always the second state. (Its actual ID depends on the size of the
|
||||
// alphabet in dense DFAs, since state IDs are premultiplied in order to
|
||||
// allow them to be used directly as indices into the transition table.)
|
||||
// * match - An accepting state, i.e., indicative of a match. There may be
|
||||
// zero or more of these states.
|
||||
// * accelerated - A state where all of its outgoing transitions, except a
|
||||
// few, loop back to itself. These states are candidates for acceleration
|
||||
// via memchr during search. There may be zero or more of these states.
|
||||
// * start - A non-matching state that indicates where the automaton should
|
||||
// start during a search. There is always at least one starting state and
|
||||
// all are guaranteed to be non-match states. (A start state cannot be a
|
||||
// match state because the DFAs in this crate delay all matches by one byte.
|
||||
// So every search that finds a match must move through one transition to
|
||||
// some other match state, even when searching an empty string.)
|
||||
//
|
||||
// These are not mutually exclusive categories. Namely, the following
|
||||
// overlappings can occur:
|
||||
//
|
||||
// * {dead, start} - If a DFA can never lead to a match and it is minimized,
|
||||
// then it will typically compile to something where all starting IDs point
|
||||
// to the DFA's dead state.
|
||||
// * {match, accelerated} - It is possible for a match state to have the
|
||||
// majority of its transitions loop back to itself, which means it's
|
||||
// possible for a match state to be accelerated.
|
||||
// * {start, accelerated} - Similarly, it is possible for a start state to be
|
||||
// accelerated. Note that it is possible for an accelerated state to be
|
||||
// neither a match or a start state. Also note that just because both match
|
||||
// and start states overlap with accelerated states does not mean that
|
||||
// match and start states overlap with each other. In fact, they are
|
||||
// guaranteed not to overlap.
|
||||
//
|
||||
// As a special mention, every DFA always has a dead and a quit state, even
|
||||
// though from the perspective of the DFA, they are equivalent. (Indeed,
|
||||
// minimization special cases them to ensure they don't get merged.) The
|
||||
// purpose of keeping them distinct is to use the quit state as a sentinel to
|
||||
// distinguish between whether a search finished successfully without finding
|
||||
// anything or whether it gave up before finishing.
|
||||
//
|
||||
// So the main problem we want to solve here is the *fast* detection of whether
|
||||
// a state is special or not. And we also want to do this while storing as
|
||||
// little extra data as possible. AND we want to be able to quickly determine
|
||||
// which categories a state falls into above if it is special.
|
||||
//
|
||||
// We achieve this by essentially shuffling all special states to the beginning
|
||||
// of a DFA. That is, all special states appear before every other non-special
|
||||
// state. By representing special states this way, we can determine whether a
|
||||
// state is special or not by a single comparison, where special.max is the
|
||||
// identifier of the last special state in the DFA:
|
||||
//
|
||||
// if current_state <= special.max:
|
||||
// ... do something with special state
|
||||
//
|
||||
// The only thing left to do is to determine what kind of special state
|
||||
// it is. Because what we do next depends on that. Since special states
|
||||
// are typically rare, we can afford to do a bit more extra work, but we'd
|
||||
// still like this to be as fast as possible. The trick we employ here is to
|
||||
// continue shuffling states even within the special state range. Such that
|
||||
// one contiguous region corresponds to match states, another for start states
|
||||
// and then an overlapping range for accelerated states. At a high level, our
|
||||
// special state detection might look like this (for leftmost searching, where
|
||||
// we continue searching even after seeing a match):
|
||||
//
|
||||
// byte = input[offset]
|
||||
// current_state = next_state(current_state, byte)
|
||||
// offset += 1
|
||||
// if current_state <= special.max:
|
||||
// if current_state == 0:
|
||||
// # We can never leave a dead state, so this always marks the
|
||||
// # end of our search.
|
||||
// return last_match
|
||||
// if current_state == special.quit_id:
|
||||
// # A quit state means we give up. If the DFA has no quit state,
|
||||
// # then special.quit_id == 0 == dead, which is handled by the
|
||||
// # conditional above.
|
||||
// return Err(MatchError::quit { byte, offset: offset - 1 })
|
||||
// if special.min_match <= current_state <= special.max_match:
|
||||
// last_match = Some(offset)
|
||||
// if special.min_accel <= current_state <= special.max_accel:
|
||||
// offset = accelerate(input, offset)
|
||||
// last_match = Some(offset)
|
||||
// elif special.min_start <= current_state <= special.max_start:
|
||||
// offset = prefilter.find(input, offset)
|
||||
// if special.min_accel <= current_state <= special.max_accel:
|
||||
// offset = accelerate(input, offset)
|
||||
// elif special.min_accel <= current_state <= special.max_accel:
|
||||
// offset = accelerate(input, offset)
|
||||
//
|
||||
// There are some small details left out of the logic above. For example,
|
||||
// in order to accelerate a state, we need to know which bytes to search for.
|
||||
// This in turn implies some extra data we need to store in the DFA. To keep
|
||||
// things compact, we would ideally only store
|
||||
//
|
||||
// N = special.max_accel - special.min_accel + 1
|
||||
//
|
||||
// items. But state IDs are premultiplied, which means they are not contiguous.
|
||||
// So in order to take a state ID and index an array of accelerated structures,
|
||||
// we need to do:
|
||||
//
|
||||
// i = (state_id - special.min_accel) / stride
|
||||
//
|
||||
// (N.B. 'stride' is always a power of 2, so the above can be implemented via
|
||||
// '(state_id - special.min_accel) >> stride2', where 'stride2' is x in
|
||||
// 2^x=stride.)
|
||||
//
|
||||
// Moreover, some of these specialty categories may be empty. For example,
|
||||
// DFAs are not required to have any match states or any accelerated states.
|
||||
// In that case, the lower and upper bounds are both set to 0 (the dead state
|
||||
// ID) and the first `current_state == 0` check subsumes cases where the
|
||||
// ranges are empty.
|
||||
//
|
||||
// Loop unrolling, if applicable, has also been left out of the logic above.
|
||||
//
|
||||
// Graphically, the ranges look like this, where asterisks indicate ranges
|
||||
// that can be empty. Each 'x' is a state.
|
||||
//
|
||||
// quit
|
||||
// dead|
|
||||
// ||
|
||||
// xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
||||
// | | | | start | |
|
||||
// | |-------------| |-------| |
|
||||
// | match* | | | |
|
||||
// | | | | |
|
||||
// | |----------| | |
|
||||
// | accel* | |
|
||||
// | | |
|
||||
// | | |
|
||||
// |----------------------------|------------------------
|
||||
// special non-special*
|
||||
/// The identifiers and ranges of all "special" states in a DFA.
///
/// Each category of special state (match, accelerated, start) occupies a
/// contiguous range of state identifiers. An empty category (e.g., a DFA
/// with no match states) is represented by setting both endpoints of its
/// range to `DEAD` (state ID `0`).
#[derive(Clone, Copy, Debug)]
pub(crate) struct Special {
    /// The identifier of the last special state in a DFA. A state is special
    /// if and only if its identifier is less than or equal to `max`.
    pub(crate) max: StateID,
    /// The identifier of the quit state in a DFA. (There is no analogous field
    /// for the dead state since the dead state's ID is always zero, regardless
    /// of state ID size.)
    pub(crate) quit_id: StateID,
    /// The identifier of the first match state.
    pub(crate) min_match: StateID,
    /// The identifier of the last match state.
    pub(crate) max_match: StateID,
    /// The identifier of the first accelerated state.
    pub(crate) min_accel: StateID,
    /// The identifier of the last accelerated state.
    pub(crate) max_accel: StateID,
    /// The identifier of the first start state.
    pub(crate) min_start: StateID,
    /// The identifier of the last start state.
    pub(crate) max_start: StateID,
}
|
||||
|
||||
impl Special {
    /// Creates a new set of special ranges for a DFA. All ranges are initially
    /// set to only contain the dead state. This is interpreted as an empty
    /// range.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn new() -> Special {
        Special {
            max: DEAD,
            quit_id: DEAD,
            min_match: DEAD,
            max_match: DEAD,
            min_accel: DEAD,
            max_accel: DEAD,
            min_start: DEAD,
            max_start: DEAD,
        }
    }

    /// Remaps all of the special state identifiers using the function given.
    ///
    /// Note that this does not re-validate the remapped ranges; callers are
    /// expected to preserve the ordering invariants checked by `validate`.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special {
        Special {
            max: map(self.max),
            quit_id: map(self.quit_id),
            min_match: map(self.min_match),
            max_match: map(self.max_match),
            min_accel: map(self.min_accel),
            max_accel: map(self.max_accel),
            min_start: map(self.min_start),
            max_start: map(self.max_start),
        }
    }

    /// Deserialize the given bytes into special state ranges. If the slice
    /// given is not big enough, then this returns an error. Similarly, if
    /// any of the expected invariants around special state ranges aren't
    /// upheld, an error is returned. Note that this does not guarantee that
    /// the information returned is correct.
    ///
    /// Upon success, this returns the number of bytes read in addition to the
    /// special state IDs themselves.
    ///
    /// The fields are read in the same fixed order that `write_to` writes
    /// them, so the two routines must be kept in sync.
    pub(crate) fn from_bytes(
        mut slice: &[u8],
    ) -> Result<(Special, usize), DeserializeError> {
        // Up-front length check so each individual read below cannot run
        // past the end of the slice.
        wire::check_slice_len(slice, 8 * StateID::SIZE, "special states")?;

        let mut nread = 0;
        // Reads one state ID from the front of `slice` and advances both the
        // slice and the running byte count.
        let mut read_id = |what| -> Result<StateID, DeserializeError> {
            let (id, nr) = wire::try_read_state_id(slice, what)?;
            nread += nr;
            slice = &slice[StateID::SIZE..];
            Ok(id)
        };

        let max = read_id("special max id")?;
        let quit_id = read_id("special quit id")?;
        let min_match = read_id("special min match id")?;
        let max_match = read_id("special max match id")?;
        let min_accel = read_id("special min accel id")?;
        let max_accel = read_id("special max accel id")?;
        let min_start = read_id("special min start id")?;
        let max_start = read_id("special max start id")?;

        let special = Special {
            max,
            quit_id,
            min_match,
            max_match,
            min_accel,
            max_accel,
            min_start,
            max_start,
        };
        special.validate()?;
        assert_eq!(nread, special.write_to_len());
        Ok((special, nread))
    }

    /// Validate that the information describing special states satisfies
    /// all known invariants.
    pub(crate) fn validate(&self) -> Result<(), DeserializeError> {
        // Check that both ends of the range are DEAD or neither are. An
        // empty range is represented by *both* endpoints being DEAD, so a
        // half-DEAD range is malformed.
        if self.min_match == DEAD && self.max_match != DEAD {
            err!("min_match is DEAD, but max_match is not");
        }
        if self.min_match != DEAD && self.max_match == DEAD {
            err!("max_match is DEAD, but min_match is not");
        }
        if self.min_accel == DEAD && self.max_accel != DEAD {
            err!("min_accel is DEAD, but max_accel is not");
        }
        if self.min_accel != DEAD && self.max_accel == DEAD {
            err!("max_accel is DEAD, but min_accel is not");
        }
        if self.min_start == DEAD && self.max_start != DEAD {
            err!("min_start is DEAD, but max_start is not");
        }
        if self.min_start != DEAD && self.max_start == DEAD {
            err!("max_start is DEAD, but min_start is not");
        }

        // Check that ranges are well formed.
        if self.min_match > self.max_match {
            err!("min_match should not be greater than max_match");
        }
        if self.min_accel > self.max_accel {
            err!("min_accel should not be greater than max_accel");
        }
        if self.min_start > self.max_start {
            err!("min_start should not be greater than max_start");
        }

        // Check that ranges are ordered with respect to one another: the
        // quit state precedes every non-empty range, and the non-empty
        // ranges appear in match < accel < start order. Each check is
        // guarded on the range being non-empty, since an empty range's
        // endpoints are DEAD (0) and would trivially violate the ordering.
        if self.matches() && self.quit_id >= self.min_match {
            err!("quit_id should not be greater than min_match");
        }
        if self.accels() && self.quit_id >= self.min_accel {
            err!("quit_id should not be greater than min_accel");
        }
        if self.starts() && self.quit_id >= self.min_start {
            err!("quit_id should not be greater than min_start");
        }
        if self.matches() && self.accels() && self.min_accel < self.min_match {
            err!("min_match should not be greater than min_accel");
        }
        if self.matches() && self.starts() && self.min_start < self.min_match {
            err!("min_match should not be greater than min_start");
        }
        if self.accels() && self.starts() && self.min_start < self.min_accel {
            err!("min_accel should not be greater than min_start");
        }

        // Check that max is at least as big as everything else.
        if self.max < self.quit_id {
            err!("quit_id should not be greater than max");
        }
        if self.max < self.max_match {
            err!("max_match should not be greater than max");
        }
        if self.max < self.max_accel {
            err!("max_accel should not be greater than max");
        }
        if self.max < self.max_start {
            err!("max_start should not be greater than max");
        }

        Ok(())
    }

    /// Validate that the special state information is compatible with the
    /// given state len.
    pub(crate) fn validate_state_len(
        &self,
        len: usize,
        stride2: usize,
    ) -> Result<(), DeserializeError> {
        // We assume that 'validate' has already passed, so we know that 'max'
        // is truly the max. So all we need to check is that the max state ID
        // is less than the state ID len. The max legal value here is len-1,
        // which occurs when there are no non-special states.
        //
        // State IDs are premultiplied by the stride, so shifting right by
        // 'stride2' (where stride == 2^stride2) converts the ID back into a
        // state index.
        if (self.max.as_usize() >> stride2) >= len {
            err!("max should not be greater than or equal to state length");
        }
        Ok(())
    }

    /// Write the IDs and ranges for special states to the given byte buffer.
    /// The buffer given must have enough room to store all data, otherwise
    /// this will return an error. The number of bytes written is returned
    /// on success. The number of bytes written is guaranteed to be a multiple
    /// of 8.
    ///
    /// The write order here must mirror the read order in `from_bytes`.
    pub(crate) fn write_to<E: Endian>(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        use crate::util::wire::write_state_id as write;

        if dst.len() < self.write_to_len() {
            return Err(SerializeError::buffer_too_small("special state ids"));
        }

        let mut nwrite = 0;
        nwrite += write::<E>(self.max, &mut dst[nwrite..]);
        nwrite += write::<E>(self.quit_id, &mut dst[nwrite..]);
        nwrite += write::<E>(self.min_match, &mut dst[nwrite..]);
        nwrite += write::<E>(self.max_match, &mut dst[nwrite..]);
        nwrite += write::<E>(self.min_accel, &mut dst[nwrite..]);
        nwrite += write::<E>(self.max_accel, &mut dst[nwrite..]);
        nwrite += write::<E>(self.min_start, &mut dst[nwrite..]);
        nwrite += write::<E>(self.max_start, &mut dst[nwrite..]);

        assert_eq!(
            self.write_to_len(),
            nwrite,
            "expected to write certain number of bytes",
        );
        assert_eq!(
            nwrite % 8,
            0,
            "expected to write multiple of 8 bytes for special states",
        );
        Ok(nwrite)
    }

    /// Returns the total number of bytes written by `write_to`.
    pub(crate) fn write_to_len(&self) -> usize {
        8 * StateID::SIZE
    }

    /// Sets the maximum special state ID based on the current values. This
    /// should be used once all possible state IDs are set.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn set_max(&mut self) {
        use core::cmp::max;
        // 'max' is the greatest of the quit state and the upper endpoint of
        // every special range. (DEAD range endpoints are 0 and thus never
        // win the comparison.)
        self.max = max(
            self.quit_id,
            max(self.max_match, max(self.max_accel, self.max_start)),
        );
    }

    /// Sets the maximum special state ID such that starting states are not
    /// considered "special." This also marks the min/max starting states as
    /// DEAD such that 'is_start_state' always returns false, even if the state
    /// is actually a starting state.
    ///
    /// This is useful when there is no prefilter set. It will avoid
    /// ping-ponging between the hot path in the DFA search code and the start
    /// state handling code, which is typically only useful for executing a
    /// prefilter.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn set_no_special_start_states(&mut self) {
        use core::cmp::max;
        // Recompute 'max' without considering 'max_start'.
        self.max = max(self.quit_id, max(self.max_match, self.max_accel));
        self.min_start = DEAD;
        self.max_start = DEAD;
    }

    /// Returns true if and only if the given state ID is a special state.
    #[inline]
    pub(crate) fn is_special_state(&self, id: StateID) -> bool {
        id <= self.max
    }

    /// Returns true if and only if the given state ID is a dead state.
    #[inline]
    pub(crate) fn is_dead_state(&self, id: StateID) -> bool {
        id == DEAD
    }

    /// Returns true if and only if the given state ID is a quit state.
    #[inline]
    pub(crate) fn is_quit_state(&self, id: StateID) -> bool {
        // When the DFA has no quit state, quit_id == DEAD, so the dead-state
        // check prevents a false positive on the dead state itself.
        !self.is_dead_state(id) && self.quit_id == id
    }

    /// Returns true if and only if the given state ID is a match state.
    #[inline]
    pub(crate) fn is_match_state(&self, id: StateID) -> bool {
        // An empty match range is [DEAD, DEAD], so excluding the dead state
        // makes this correctly return false for every ID in that case.
        !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match
    }

    /// Returns true if and only if the given state ID is an accel state.
    #[inline]
    pub(crate) fn is_accel_state(&self, id: StateID) -> bool {
        !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel
    }

    /// Returns true if and only if the given state ID is a start state.
    #[inline]
    pub(crate) fn is_start_state(&self, id: StateID) -> bool {
        !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start
    }

    /// Returns the total number of match states for a dense table based DFA.
    #[inline]
    pub(crate) fn match_len(&self, stride: usize) -> usize {
        if self.matches() {
            // State IDs are premultiplied by the stride, so dividing the
            // (inclusive) span of IDs by the stride yields a state count.
            (self.max_match.as_usize() - self.min_match.as_usize() + stride)
                / stride
        } else {
            0
        }
    }

    /// Returns true if and only if there is at least one match state.
    #[inline]
    pub(crate) fn matches(&self) -> bool {
        self.min_match != DEAD
    }

    /// Returns the total number of accel states.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn accel_len(&self, stride: usize) -> usize {
        if self.accels() {
            // Same premultiplied-ID arithmetic as 'match_len'.
            (self.max_accel.as_usize() - self.min_accel.as_usize() + stride)
                / stride
        } else {
            0
        }
    }

    /// Returns true if and only if there is at least one accel state.
    #[inline]
    pub(crate) fn accels(&self) -> bool {
        self.min_accel != DEAD
    }

    /// Returns true if and only if there is at least one start state.
    #[inline]
    pub(crate) fn starts(&self) -> bool {
        self.min_start != DEAD
    }
}
|
||||
74
third-party/vendor/regex-automata/src/dfa/start.rs
vendored
Normal file
74
third-party/vendor/regex-automata/src/dfa/start.rs
vendored
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
use core::mem::size_of;
|
||||
|
||||
use crate::util::wire::{self, DeserializeError, Endian, SerializeError};
|
||||
|
||||
/// The kind of anchored starting configurations to support in a DFA.
///
/// Fully compiled DFAs need to be explicitly configured as to which anchored
/// starting configurations to support. The reason for not just supporting
/// everything unconditionally is that it can use more resources (such as
/// memory and build time). The downside of this is that if you try to execute
/// a search using an [`Anchored`](crate::Anchored) mode that is not supported
/// by the DFA, then the search will return an error.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum StartKind {
    /// Support both anchored and unanchored searches.
    Both,
    /// Support only unanchored searches. Requesting an anchored search will
    /// panic.
    ///
    /// Note that even if an unanchored search is requested, the pattern itself
    /// may still be anchored. For example, `^abc` will only match `abc` at the
    /// start of a haystack. This will remain true, even if the regex engine
    /// only supported unanchored searches.
    Unanchored,
    /// Support only anchored searches. Requesting an unanchored search will
    /// panic.
    Anchored,
}
|
||||
|
||||
impl StartKind {
|
||||
pub(crate) fn from_bytes(
|
||||
slice: &[u8],
|
||||
) -> Result<(StartKind, usize), DeserializeError> {
|
||||
wire::check_slice_len(slice, size_of::<u32>(), "start kind bytes")?;
|
||||
let (n, nr) = wire::try_read_u32(slice, "start kind integer")?;
|
||||
match n {
|
||||
0 => Ok((StartKind::Both, nr)),
|
||||
1 => Ok((StartKind::Unanchored, nr)),
|
||||
2 => Ok((StartKind::Anchored, nr)),
|
||||
_ => Err(DeserializeError::generic("unrecognized start kind")),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn write_to<E: Endian>(
|
||||
&self,
|
||||
dst: &mut [u8],
|
||||
) -> Result<usize, SerializeError> {
|
||||
let nwrite = self.write_to_len();
|
||||
if dst.len() < nwrite {
|
||||
return Err(SerializeError::buffer_too_small("start kind"));
|
||||
}
|
||||
let n = match *self {
|
||||
StartKind::Both => 0,
|
||||
StartKind::Unanchored => 1,
|
||||
StartKind::Anchored => 2,
|
||||
};
|
||||
E::write_u32(n, dst);
|
||||
Ok(nwrite)
|
||||
}
|
||||
|
||||
pub(crate) fn write_to_len(&self) -> usize {
|
||||
size_of::<u32>()
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn has_unanchored(&self) -> bool {
|
||||
matches!(*self, StartKind::Both | StartKind::Unanchored)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn has_anchored(&self) -> bool {
|
||||
matches!(*self, StartKind::Both | StartKind::Anchored)
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue