Vendor things
This commit is contained in:
parent
5deceec006
commit
977e3c17e5
19434 changed files with 10682014 additions and 0 deletions
517
third-party/vendor/regex-automata/src/dfa/accel.rs
vendored
Normal file
517
third-party/vendor/regex-automata/src/dfa/accel.rs
vendored
Normal file
|
|
@ -0,0 +1,517 @@
|
|||
// This module defines some core types for dealing with accelerated DFA states.
|
||||
// Briefly, a DFA state can be "accelerated" if all of its transitions except
|
||||
// for a few loop back to itself. This directly implies that the only way out
|
||||
// of such a state is if a byte corresponding to one of those non-loopback
|
||||
// transitions is found. Such states are often found in simple repetitions in
|
||||
// non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its
|
||||
// DFA with regex-cli:
|
||||
//
|
||||
// $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table
|
||||
// D 000000:
|
||||
// Q 000001:
|
||||
// *000002:
|
||||
// A 000003: \x00-` => 3, a => 8, b-\xFF => 3
|
||||
// A 000004: \x00-` => 4, a => 7, b-\xFF => 4
|
||||
// 000005: \x00-` => 4, b-\xFF => 4
|
||||
// 000006: \x00-` => 3, a => 6, b-\xFF => 3
|
||||
// 000007: \x00-\xFF => 2, EOI => 2
|
||||
// 000008: \x00-\xFF => 2, EOI => 2
|
||||
//
|
||||
// In particular, state 3 is accelerated (shown via the 'A' indicator) since
|
||||
// the only way to leave that state once entered is to see an 'a' byte. If
|
||||
// there is a long run of non-'a' bytes, then using something like 'memchr'
|
||||
// to find the next 'a' byte can be significantly faster than just using the
|
||||
// standard byte-at-a-time state machine.
|
||||
//
|
||||
// Unfortunately, this optimization rarely applies when Unicode is enabled.
|
||||
// For example, patterns like '[^a]' don't actually match any byte that isn't
|
||||
// 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't
|
||||
// 'a'. This makes the state machine much more complex---far beyond a single
|
||||
// state---and removes the ability to easily accelerate it. (Because if the
|
||||
// machine sees a non-UTF-8 sequence, then the machine won't match through it.)
|
||||
//
|
||||
// In practice, we only consider accelerating states that have 3 or fewer
|
||||
// non-loop transitions. At a certain point, you get diminishing returns, but
|
||||
// also because that's what the memchr crate supports. The structures below
|
||||
// hard-code this assumption and provide (de)serialization APIs for use inside
|
||||
// a DFA.
|
||||
//
|
||||
// And finally, note that there is some trickery involved in making it very
|
||||
// fast to not only check whether a state is accelerated at search time, but
|
||||
// also to access the bytes to search for to implement the acceleration itself.
|
||||
// dfa/special.rs provides more detail, but the short story is that all
|
||||
// accelerated states appear contiguously in a DFA. This means we can represent
|
||||
// the ID space of all accelerated DFA states with a single range. So given
|
||||
// a state ID, we can determine whether it's accelerated via
|
||||
//
|
||||
// min_accel_id <= id <= max_accel_id
|
||||
//
|
||||
// And find its corresponding accelerator with:
|
||||
//
|
||||
// accels.get((id - min_accel_id) / dfa_stride)
|
||||
|
||||
#[cfg(feature = "dfa-build")]
|
||||
use alloc::{vec, vec::Vec};
|
||||
|
||||
use crate::util::{
|
||||
int::Pointer,
|
||||
memchr,
|
||||
wire::{self, DeserializeError, Endian, SerializeError},
|
||||
};
|
||||
|
||||
/// The base type used to represent a collection of accelerators.
///
/// While an `Accel` is represented as a fixed size array of bytes, a
/// *collection* of `Accel`s (called `Accels`) is represented internally as a
/// slice of u32. While it's a bit unnatural to do this and costs us a bit of
/// fairly low-risk not-safe code, it lets us remove the need for a second type
/// parameter in the definition of dense::DFA. (Which really wants everything
/// to be a slice of u32.)
type AccelTy = u32;

/// The size of the unit of representation for accelerators.
///
/// ACCEL_CAP *must* be a multiple of this size.
const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>();

/// The maximum length in bytes that a single Accel can be. This is distinct
/// from the capacity of an accelerator in that the length represents only the
/// bytes that should be read.
///
/// That is: 1 length byte followed by at most 3 needle bytes. (See
/// `Accel::from_bytes`, which rejects a length of `ACCEL_LEN` or more.)
const ACCEL_LEN: usize = 4;

/// The capacity of each accelerator, in bytes. We set this to 8 since it's a
/// multiple of 4 (our ID size) and because it gives us a little wiggle room
/// if we want to support more accel bytes in the future without a breaking
/// change.
///
/// This MUST be a multiple of ACCEL_TY_SIZE.
const ACCEL_CAP: usize = 8;
|
||||
|
||||
/// Search for between 1 and 3 needle bytes in the given haystack, starting the
|
||||
/// search at the given position. If `needles` has a length other than 1-3,
|
||||
/// then this panics.
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn find_fwd(
|
||||
needles: &[u8],
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
) -> Option<usize> {
|
||||
let bs = needles;
|
||||
let i = match needles.len() {
|
||||
1 => memchr::memchr(bs[0], &haystack[at..])?,
|
||||
2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?,
|
||||
3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?,
|
||||
0 => panic!("cannot find with empty needles"),
|
||||
n => panic!("invalid needles length: {}", n),
|
||||
};
|
||||
Some(at + i)
|
||||
}
|
||||
|
||||
/// Search for between 1 and 3 needle bytes in the given haystack in reverse,
|
||||
/// starting the search at the given position. If `needles` has a length other
|
||||
/// than 1-3, then this panics.
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn find_rev(
|
||||
needles: &[u8],
|
||||
haystack: &[u8],
|
||||
at: usize,
|
||||
) -> Option<usize> {
|
||||
let bs = needles;
|
||||
match needles.len() {
|
||||
1 => memchr::memrchr(bs[0], &haystack[..at]),
|
||||
2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]),
|
||||
3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]),
|
||||
0 => panic!("cannot find with empty needles"),
|
||||
n => panic!("invalid needles length: {}", n),
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents the accelerators for all accelerated states in a dense DFA.
///
/// The `A` type parameter represents the type of the underlying bytes.
/// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`.
#[derive(Clone)]
pub(crate) struct Accels<A> {
    /// A length prefixed slice of contiguous accelerators. See the top comment
    /// in this module for more details on how we can jump from a DFA's state
    /// ID to an accelerator in this list.
    ///
    /// The first 4 bytes always correspond to the number of accelerators
    /// that follow.
    ///
    /// Each accelerator after the length prefix occupies exactly ACCEL_CAP
    /// (8) bytes, i.e. two AccelTy units.
    accels: A,
}
|
||||
|
||||
#[cfg(feature = "dfa-build")]
impl Accels<Vec<AccelTy>> {
    /// Create an empty sequence of accelerators for a DFA.
    pub fn empty() -> Accels<Vec<AccelTy>> {
        // The lone 0 is the length prefix: zero accelerators follow.
        Accels { accels: vec![0] }
    }

    /// Add an accelerator to this sequence.
    ///
    /// The accelerator is appended at the end of the sequence, so additions
    /// must be done in correspondence with the accelerated states in the DFA.
    ///
    /// This panics if this results in more accelerators than AccelTy::MAX.
    pub fn add(&mut self, accel: Accel) {
        self.accels.extend_from_slice(&accel.as_accel_tys());
        let new_len = self.len() + 1;
        self.set_len(new_len);
    }

    /// Set the number of accelerators in this sequence, which is encoded in
    /// the first 4 bytes of the underlying bytes.
    fn set_len(&mut self, new_len: usize) {
        // The only way an accelerator gets added is if a state exists for
        // it, and if a state exists, then its index is guaranteed to be
        // representable by a AccelTy by virtue of the guarantees provided by
        // StateID. So the conversion cannot fail.
        self.accels[0] = AccelTy::try_from(new_len).unwrap();
    }
}
|
||||
|
||||
impl<'a> Accels<&'a [AccelTy]> {
    /// Deserialize a sequence of accelerators from the given bytes. If there
    /// was a problem deserializing, then an error is returned.
    ///
    /// This is guaranteed to run in constant time. This does not guarantee
    /// that every accelerator in the returned collection is valid. Thus,
    /// accessing one may panic, or not-safe code that relies on accelerators
    /// being correct may result in UB.
    ///
    /// Callers may check the validity of every accelerator with the `validate`
    /// method.
    ///
    /// On success, also returns the number of bytes consumed from `slice`.
    pub fn from_bytes_unchecked(
        mut slice: &'a [u8],
    ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> {
        // Remember where we started so we can report how many bytes we read.
        let slice_start = slice.as_ptr().as_usize();

        let (accel_len, _) =
            wire::try_read_u32_as_usize(slice, "accelerators length")?;
        // The accelerator length is part of the accel_tys slice that
        // we deserialize. This is perhaps a bit idiosyncratic. It would
        // probably be better to split out the length into a real field.

        // Each accelerator is 2 AccelTys (ACCEL_CAP / ACCEL_TY_SIZE), plus
        // 1 AccelTy for the length prefix itself. Checked arithmetic guards
        // against overflow on untrusted input.
        let accel_tys_len = wire::add(
            wire::mul(accel_len, 2, "total number of accelerator accel_tys")?,
            1,
            "total number of accel_tys",
        )?;
        let accel_tys_bytes_len = wire::mul(
            ACCEL_TY_SIZE,
            accel_tys_len,
            "total number of bytes in accelerators",
        )?;
        wire::check_slice_len(slice, accel_tys_bytes_len, "accelerators")?;
        wire::check_alignment::<AccelTy>(slice)?;
        let accel_tys = &slice[..accel_tys_bytes_len];
        slice = &slice[accel_tys_bytes_len..];
        // SAFETY: We've checked the length and alignment above, and since
        // slice is just bytes and AccelTy is just a u32, we can safely cast to
        // a slice of &[AccelTy].
        let accels = unsafe {
            core::slice::from_raw_parts(
                accel_tys.as_ptr().cast::<AccelTy>(),
                accel_tys_len,
            )
        };
        Ok((Accels { accels }, slice.as_ptr().as_usize() - slice_start))
    }
}
|
||||
|
||||
impl<A: AsRef<[AccelTy]>> Accels<A> {
    /// Return an owned version of the accelerators.
    #[cfg(feature = "alloc")]
    pub fn to_owned(&self) -> Accels<alloc::vec::Vec<AccelTy>> {
        Accels { accels: self.accels.as_ref().to_vec() }
    }

    /// Return a borrowed version of the accelerators.
    pub fn as_ref(&self) -> Accels<&[AccelTy]> {
        Accels { accels: self.accels.as_ref() }
    }

    /// Return the bytes representing the serialization of the accelerators.
    pub fn as_bytes(&self) -> &[u8] {
        let accels = self.accels.as_ref();
        // SAFETY: This is safe because accels is just a slice of AccelTy,
        // and u8 always has a smaller alignment.
        unsafe {
            core::slice::from_raw_parts(
                accels.as_ptr().cast::<u8>(),
                accels.len() * ACCEL_TY_SIZE,
            )
        }
    }

    /// Returns the memory usage, in bytes, of these accelerators.
    ///
    /// The memory usage is computed based on the number of bytes used to
    /// represent all of the accelerators.
    ///
    /// This does **not** include the stack size used by this value.
    pub fn memory_usage(&self) -> usize {
        self.as_bytes().len()
    }

    /// Return the bytes to search for corresponding to the accelerator in this
    /// sequence at index `i`. If no such accelerator exists, then this panics.
    ///
    /// The significance of the index is that it should be in correspondence
    /// with the index of the corresponding DFA. That is, accelerated DFA
    /// states are stored contiguously in the DFA and have an ordering implied
    /// by their respective state IDs. The state's index in that sequence
    /// corresponds to the index of its corresponding accelerator.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn needles(&self, i: usize) -> &[u8] {
        if i >= self.len() {
            panic!("invalid accelerator index {}", i);
        }
        let bytes = self.as_bytes();
        // Skip the 4-byte length prefix, then jump to the i'th fixed-size
        // accelerator slot.
        let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
        // The first byte of each accelerator is its needle count; the needle
        // bytes follow immediately after it.
        let len = usize::from(bytes[offset]);
        &bytes[offset + 1..offset + 1 + len]
    }

    /// Return the total number of accelerators in this sequence.
    pub fn len(&self) -> usize {
        // This should never panic since deserialization checks that the
        // length can fit into a usize.
        usize::try_from(self.accels.as_ref()[0]).unwrap()
    }

    /// Return the accelerator in this sequence at index `i`. If no such
    /// accelerator exists, then this returns None.
    ///
    /// See the docs for `needles` on the significance of the index.
    fn get(&self, i: usize) -> Option<Accel> {
        if i >= self.len() {
            return None;
        }
        let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
        let accel = Accel::from_slice(&self.as_bytes()[offset..])
            .expect("Accels must contain valid accelerators");
        Some(accel)
    }

    /// Returns an iterator of accelerators in this sequence.
    fn iter(&self) -> IterAccels<'_, A> {
        IterAccels { accels: self, i: 0 }
    }

    /// Writes these accelerators to the given byte buffer using the indicated
    /// endianness. If the given buffer is too small, then an error is
    /// returned. Upon success, the total number of bytes written is returned.
    /// The number of bytes written is guaranteed to be a multiple of 8.
    pub fn write_to<E: Endian>(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        let nwrite = self.write_to_len();
        assert_eq!(
            nwrite % ACCEL_TY_SIZE,
            0,
            "expected accelerator bytes written to be a multiple of {}",
            ACCEL_TY_SIZE,
        );
        if dst.len() < nwrite {
            return Err(SerializeError::buffer_too_small("accelerators"));
        }

        // The number of accelerators can never exceed AccelTy::MAX.
        E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst);
        // The actual accelerators are just raw bytes and thus their endianness
        // is irrelevant. So we can copy them as bytes.
        dst[ACCEL_TY_SIZE..nwrite]
            .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]);
        Ok(nwrite)
    }

    /// Validates that every accelerator in this collection can be successfully
    /// deserialized as a valid accelerator.
    pub fn validate(&self) -> Result<(), DeserializeError> {
        // Skip the length prefix and check each fixed-size slot in turn.
        for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) {
            let _ = Accel::from_slice(chunk)?;
        }
        Ok(())
    }

    /// Returns the total number of bytes written by `write_to`.
    pub fn write_to_len(&self) -> usize {
        self.as_bytes().len()
    }
}
|
||||
|
||||
impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> {
    /// Renders the collection as `Accels([...])`, one entry per accelerator.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        f.write_str("Accels(")?;
        f.debug_list().entries(self.iter()).finish()?;
        f.write_str(")")
    }
}
|
||||
|
||||
/// An iterator over all accelerators in a collection, in index order.
#[derive(Debug)]
struct IterAccels<'a, A: AsRef<[AccelTy]>> {
    /// The collection of accelerators being iterated over.
    accels: &'a Accels<A>,
    /// The index of the next accelerator to yield.
    i: usize,
}
|
||||
|
||||
impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> {
|
||||
type Item = Accel;
|
||||
|
||||
fn next(&mut self) -> Option<Accel> {
|
||||
let accel = self.accels.get(self.i)?;
|
||||
self.i += 1;
|
||||
Some(accel)
|
||||
}
|
||||
}
|
||||
|
||||
/// Accel represents a structure for determining how to "accelerate" a DFA
/// state.
///
/// Namely, it contains zero or more bytes that must be seen in order for the
/// DFA to leave the state it is associated with. In practice, the actual range
/// is 1 to 3 bytes.
///
/// The purpose of acceleration is to identify states whose vast majority
/// of transitions are just loops back to the same state. For example,
/// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state
/// (corresponding to `[^a]+`) where all transitions *except* for `a` and
/// `b` loop back to itself. Thus, this state can be "accelerated" by simply
/// looking for the next occurrence of either `a` or `b` instead of explicitly
/// following transitions. (In this case, `b` transitions to the next state
/// where as `a` would transition to the dead state.)
#[derive(Clone)]
pub(crate) struct Accel {
    /// The first byte is the length. Subsequent bytes are the accelerated
    /// bytes.
    ///
    /// Note that we make every accelerator 8 bytes as a slightly wasteful
    /// way of making sure alignment is always correct for state ID sizes of
    /// 1, 2, 4 and 8. This should be okay since accelerated states aren't
    /// particularly common, especially when Unicode is enabled.
    ///
    /// Bytes past `1 + len` are always zero.
    bytes: [u8; ACCEL_CAP],
}
|
||||
|
||||
impl Accel {
|
||||
/// Returns an empty accel, where no bytes are accelerated.
|
||||
#[cfg(feature = "dfa-build")]
|
||||
pub fn new() -> Accel {
|
||||
Accel { bytes: [0; ACCEL_CAP] }
|
||||
}
|
||||
|
||||
/// Returns a verified accelerator derived from the beginning of the given
|
||||
/// slice.
|
||||
///
|
||||
/// If the slice is not long enough or contains invalid bytes for an
|
||||
/// accelerator, then this returns an error.
|
||||
pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> {
|
||||
slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())];
|
||||
let bytes = slice
|
||||
.try_into()
|
||||
.map_err(|_| DeserializeError::buffer_too_small("accelerator"))?;
|
||||
Accel::from_bytes(bytes)
|
||||
}
|
||||
|
||||
/// Returns a verified accelerator derived from raw bytes.
|
||||
///
|
||||
/// If the given bytes are invalid, then this returns an error.
|
||||
fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> {
|
||||
if usize::from(bytes[0]) >= ACCEL_LEN {
|
||||
return Err(DeserializeError::generic(
|
||||
"accelerator bytes cannot have length more than 3",
|
||||
));
|
||||
}
|
||||
Ok(Accel::from_bytes_unchecked(bytes))
|
||||
}
|
||||
|
||||
/// Returns an accelerator derived from raw bytes.
|
||||
///
|
||||
/// This does not check whether the given bytes are valid. Invalid bytes
|
||||
/// cannot sacrifice memory safety, but may result in panics or silent
|
||||
/// logic bugs.
|
||||
fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel {
|
||||
Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] }
|
||||
}
|
||||
|
||||
/// Attempts to add the given byte to this accelerator. If the accelerator
|
||||
/// is already full or thinks the byte is a poor accelerator, then this
|
||||
/// returns false. Otherwise, returns true.
|
||||
///
|
||||
/// If the given byte is already in this accelerator, then it panics.
|
||||
#[cfg(feature = "dfa-build")]
|
||||
pub fn add(&mut self, byte: u8) -> bool {
|
||||
if self.len() >= 3 {
|
||||
return false;
|
||||
}
|
||||
// As a special case, we totally reject trying to accelerate a state
|
||||
// with an ASCII space. In most cases, it occurs very frequently, and
|
||||
// tends to result in worse overall performance.
|
||||
if byte == b' ' {
|
||||
return false;
|
||||
}
|
||||
assert!(
|
||||
!self.contains(byte),
|
||||
"accelerator already contains {:?}",
|
||||
crate::util::escape::DebugByte(byte)
|
||||
);
|
||||
self.bytes[self.len() + 1] = byte;
|
||||
self.bytes[0] += 1;
|
||||
true
|
||||
}
|
||||
|
||||
/// Return the number of bytes in this accelerator.
|
||||
pub fn len(&self) -> usize {
|
||||
usize::from(self.bytes[0])
|
||||
}
|
||||
|
||||
/// Returns true if and only if there are no bytes in this accelerator.
|
||||
#[cfg(feature = "dfa-build")]
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
/// Returns the slice of bytes to accelerate.
|
||||
///
|
||||
/// If this accelerator is empty, then this returns an empty slice.
|
||||
fn needles(&self) -> &[u8] {
|
||||
&self.bytes[1..1 + self.len()]
|
||||
}
|
||||
|
||||
/// Returns true if and only if this accelerator will accelerate the given
|
||||
/// byte.
|
||||
#[cfg(feature = "dfa-build")]
|
||||
fn contains(&self, byte: u8) -> bool {
|
||||
self.needles().iter().position(|&b| b == byte).is_some()
|
||||
}
|
||||
|
||||
/// Returns the accelerator bytes as an array of AccelTys.
|
||||
#[cfg(feature = "dfa-build")]
|
||||
fn as_accel_tys(&self) -> [AccelTy; 2] {
|
||||
assert_eq!(ACCEL_CAP, 8);
|
||||
// These unwraps are OK since ACCEL_CAP is set to 8.
|
||||
let first =
|
||||
AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap());
|
||||
let second =
|
||||
AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap());
|
||||
[first, second]
|
||||
}
|
||||
}
|
||||
|
||||
impl core::fmt::Debug for Accel {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
write!(f, "Accel(")?;
|
||||
let mut set = f.debug_set();
|
||||
for &b in self.needles() {
|
||||
set.entry(&crate::util::escape::DebugByte(b));
|
||||
}
|
||||
set.finish()?;
|
||||
write!(f, ")")
|
||||
}
|
||||
}
|
||||
2260
third-party/vendor/regex-automata/src/dfa/automaton.rs
vendored
Normal file
2260
third-party/vendor/regex-automata/src/dfa/automaton.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
5153
third-party/vendor/regex-automata/src/dfa/dense.rs
vendored
Normal file
5153
third-party/vendor/regex-automata/src/dfa/dense.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
599
third-party/vendor/regex-automata/src/dfa/determinize.rs
vendored
Normal file
599
third-party/vendor/regex-automata/src/dfa/determinize.rs
vendored
Normal file
|
|
@ -0,0 +1,599 @@
|
|||
use alloc::{collections::BTreeMap, vec::Vec};
|
||||
|
||||
use crate::{
|
||||
dfa::{
|
||||
dense::{self, BuildError},
|
||||
DEAD,
|
||||
},
|
||||
nfa::thompson,
|
||||
util::{
|
||||
self,
|
||||
alphabet::{self, ByteSet},
|
||||
determinize::{State, StateBuilderEmpty, StateBuilderNFA},
|
||||
primitives::{PatternID, StateID},
|
||||
search::{Anchored, MatchKind},
|
||||
sparse_set::SparseSets,
|
||||
start::Start,
|
||||
},
|
||||
};
|
||||
|
||||
/// A builder for configuring and running a DFA determinizer.
#[derive(Clone, Debug)]
pub(crate) struct Config {
    /// The match semantics to use. Defaults to MatchKind::LeftmostFirst.
    /// See the `match_kind` builder method for details.
    match_kind: MatchKind,
    /// Bytes that force the DFA into a quit state. Defaults to the empty set.
    quit: ByteSet,
    /// Heap limit, in bytes, for the DFA being built. None means no limit.
    dfa_size_limit: Option<usize>,
    /// Heap limit, in bytes, for determinization's own auxiliary storage
    /// (this excludes the DFA itself). None means no limit.
    determinize_size_limit: Option<usize>,
}
|
||||
|
||||
impl Config {
    /// Create a new default config for a determinizer. The determinizer may be
    /// configured before calling `run`.
    pub fn new() -> Config {
        Config {
            match_kind: MatchKind::LeftmostFirst,
            quit: ByteSet::empty(),
            dfa_size_limit: None,
            determinize_size_limit: None,
        }
    }

    /// Run determinization on the given NFA and write the resulting DFA into
    /// the one given. The DFA given should be initialized but otherwise empty.
    /// "Initialized" means that it is setup to handle the NFA's byte classes,
    /// number of patterns and whether to build start states for each pattern.
    pub fn run(
        &self,
        nfa: &thompson::NFA,
        dfa: &mut dense::OwnedDFA,
    ) -> Result<(), BuildError> {
        // The dead and quit states have identical representations; they are
        // distinguished only by their position in 'builder_states'.
        let dead = State::dead();
        let quit = State::dead();
        let mut cache = StateMap::default();
        // We only insert the dead state here since its representation is
        // identical to the quit state. And we never want anything pointing
        // to the quit state other than specific transitions derived from the
        // determinizer's configured "quit" bytes.
        //
        // We do put the quit state into 'builder_states' below. This ensures
        // that a proper DFA state ID is allocated for it, and that no other
        // DFA state uses the "location after the DEAD state." That is, it
        // is assumed that the quit state is always the state immediately
        // following the DEAD state.
        cache.insert(dead.clone(), DEAD);

        let runner = Runner {
            config: self.clone(),
            nfa,
            dfa,
            builder_states: alloc::vec![dead, quit],
            cache,
            memory_usage_state: 0,
            sparses: SparseSets::new(nfa.states().len()),
            stack: alloc::vec![],
            scratch_state_builder: StateBuilderEmpty::new(),
        };
        runner.run()
    }

    /// The match semantics to use for determinization.
    ///
    /// MatchKind::All corresponds to the standard textbook construction.
    /// All possible match states are represented in the DFA.
    /// MatchKind::LeftmostFirst permits greediness and otherwise tries to
    /// simulate the match semantics of backtracking regex engines. Namely,
    /// only a subset of match states are built, and dead states are used to
    /// stop searches with an unanchored prefix.
    ///
    /// The default is MatchKind::LeftmostFirst.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
        self.match_kind = kind;
        self
    }

    /// The set of bytes to use that will cause the DFA to enter a quit state,
    /// stop searching and return an error. By default, this is empty.
    pub fn quit(&mut self, set: ByteSet) -> &mut Config {
        self.quit = set;
        self
    }

    /// The limit, in bytes of the heap, that the DFA is permitted to use. This
    /// does not include the auxiliary heap storage used by determinization.
    pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config {
        self.dfa_size_limit = bytes;
        self
    }

    /// The limit, in bytes of the heap, that determinization itself is allowed
    /// to use. This does not include the size of the DFA being built.
    pub fn determinize_size_limit(
        &mut self,
        bytes: Option<usize>,
    ) -> &mut Config {
        self.determinize_size_limit = bytes;
        self
    }
}
|
||||
|
||||
/// The actual implementation of determinization that converts an NFA to a DFA
/// through powerset construction.
///
/// This determinizer roughly follows the typical powerset construction, where
/// each DFA state is comprised of one or more NFA states. In the worst case,
/// there is one DFA state for every possible combination of NFA states. In
/// practice, this only happens in certain conditions, typically when there are
/// bounded repetitions.
///
/// The main differences between this implementation and typical determinization
/// are that this implementation delays matches by one state and hackily makes
/// look-around work. Comments below attempt to explain this.
///
/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
/// whichever is shorter.
#[derive(Debug)]
struct Runner<'a> {
    /// The configuration used to initialize determinization.
    config: Config,
    /// The NFA we're converting into a DFA.
    nfa: &'a thompson::NFA,
    /// The DFA we're building.
    dfa: &'a mut dense::OwnedDFA,
    /// Each DFA state being built is defined as an *ordered* set of NFA
    /// states, along with some meta facts about the ordered set of NFA states.
    ///
    /// This is never empty. The first state is always a dummy state such that
    /// a state id == 0 corresponds to a dead state. The second state is always
    /// the quit state.
    ///
    /// Why do we have states in both a `Vec` and in a cache map below?
    /// Well, they serve two different roles based on access patterns.
    /// `builder_states` is the canonical home of each state, and provides
    /// constant random access by a DFA state's ID. The cache map below, on
    /// the other hand, provides a quick way of searching for identical DFA
    /// states by using the DFA state as a key in the map. Of course, we use
    /// reference counting to avoid actually duplicating the state's data
    /// itself. (Although this has never been benchmarked.) Note that the cache
    /// map does not give us full minimization; it just lets us avoid some very
    /// obvious redundant states.
    ///
    /// Note that the index into this Vec isn't quite the DFA's state ID.
    /// Rather, it's just an index. To get the state ID, you have to multiply
    /// it by the DFA's stride. That's done by self.dfa.from_index. And the
    /// inverse is self.dfa.to_index.
    ///
    /// Moreover, DFA states don't usually retain the IDs assigned to them
    /// by their position in this Vec. After determinization completes,
    /// states are shuffled around to support other optimizations. See the
    /// sibling 'special' module for more details on that. (The reason for
    /// mentioning this is that if you print out the DFA for debugging during
    /// determinization, and then print out the final DFA after it is fully
    /// built, then the state IDs likely won't match up.)
    builder_states: Vec<State>,
    /// A cache of DFA states that already exist and can be easily looked up
    /// via ordered sets of NFA states.
    ///
    /// See `builder_states` docs for why we store states in two different
    /// ways.
    cache: StateMap,
    /// The memory usage, in bytes, used by builder_states and cache. We track
    /// this as new states are added since states use a variable amount of
    /// heap. Tracking this as we add states makes it possible to compute the
    /// total amount of memory used by the determinizer in constant time.
    memory_usage_state: usize,
    /// A pair of sparse sets for tracking ordered sets of NFA state IDs.
    /// These are reused throughout determinization. A bounded sparse set
    /// gives us constant time insertion, membership testing and clearing.
    sparses: SparseSets,
    /// Scratch space for a stack of NFA states to visit, for depth first
    /// visiting without recursion.
    stack: Vec<StateID>,
    /// Scratch space for storing an ordered sequence of NFA states, for
    /// amortizing allocation. This is principally useful for when we avoid
    /// adding a new DFA state since it already exists. In order to detect this
    /// case though, we still need an ordered set of NFA state IDs. So we use
    /// this space to stage that ordered set before we know whether we need to
    /// create a new DFA state or not.
    scratch_state_builder: StateBuilderEmpty,
}
|
||||
|
||||
/// A map from states to state identifiers. When using std, we use a standard
/// hashmap, since it's a bit faster for this use case. (Other maps, like
/// one's based on FNV, have not yet been benchmarked.)
///
/// The main purpose of this map is to reuse states where possible. This won't
/// fully minimize the DFA, but it works well in a lot of cases.
#[cfg(feature = "std")]
type StateMap = std::collections::HashMap<State, StateID>;
// Without std, fall back to an ordered map from alloc, which only requires
// `State: Ord` rather than `State: Hash`.
#[cfg(not(feature = "std"))]
type StateMap = BTreeMap<State, StateID>;
|
||||
|
||||
impl<'a> Runner<'a> {
    /// Build the DFA. If there was a problem constructing the DFA (e.g., if
    /// the chosen state identifier representation is too small), then an error
    /// is returned.
    fn run(mut self) -> Result<(), BuildError> {
        // A fully compiled DFA cannot support Unicode word boundaries unless
        // every non-ASCII byte is a quit byte, so reject that configuration
        // up front rather than building a DFA that would give wrong answers.
        if self.nfa.look_set_any().contains_word_unicode()
            && !self.config.quit.contains_range(0x80, 0xFF)
        {
            return Err(BuildError::unsupported_dfa_word_boundary_unicode());
        }

        // A sequence of "representative" bytes drawn from each equivalence
        // class. These representative bytes are fed to the NFA to compute
        // state transitions. This allows us to avoid re-computing state
        // transitions for bytes that are guaranteed to produce identical
        // results. Since computing the representatives needs to do a little
        // work, we do it once here because we'll be iterating over them a lot.
        let representatives: Vec<alphabet::Unit> =
            self.dfa.byte_classes().representatives(..).collect();
        // The set of all DFA state IDs that still need to have their
        // transitions set. We start by seeding this with all starting states.
        let mut uncompiled = alloc::vec![];
        self.add_all_starts(&mut uncompiled)?;
        while let Some(dfa_id) = uncompiled.pop() {
            for &unit in &representatives {
                // Quit bytes keep their transition to the quit state (set in
                // 'add_state'), so there's nothing to compute for them here.
                if unit.as_u8().map_or(false, |b| self.config.quit.contains(b))
                {
                    continue;
                }
                // In many cases, the state we transition to has already been
                // computed. 'cached_state' will do the minimal amount of work
                // to check this, and if it exists, immediately return an
                // already existing state ID.
                let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
                self.dfa.set_transition(dfa_id, unit, next_dfa_id);
                // If the state ID we got back is newly created, then we need
                // to compile it, so add it to our uncompiled frontier.
                if is_new {
                    uncompiled.push(next_dfa_id);
                }
            }
        }
        debug!(
            "determinization complete, memory usage: {}, \
             dense DFA size: {}, \
             is reverse? {}",
            self.memory_usage(),
            self.dfa.memory_usage(),
            self.nfa.is_reverse(),
        );

        // A map from DFA state ID to one or more NFA match IDs. Each NFA match
        // ID corresponds to a distinct regex pattern that matches in the state
        // corresponding to the key.
        let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
        self.cache.clear();
        #[cfg(feature = "logging")]
        let mut total_pat_len = 0;
        for (i, state) in self.builder_states.into_iter().enumerate() {
            if let Some(pat_ids) = state.match_pattern_ids() {
                let id = self.dfa.to_state_id(i);
                log! {
                    total_pat_len += pat_ids.len();
                }
                matches.insert(id, pat_ids);
            }
        }
        log! {
            use core::mem::size_of;
            let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
            let pats = total_pat_len * size_of::<PatternID>();
            let mem = (matches.len() * per_elem) + pats;
            log::debug!("matches map built, memory usage: {}", mem);
        }
        // At this point, we shuffle the "special" states in the final DFA.
        // This permits a DFA's match loop to detect a match condition (among
        // other things) by merely inspecting the current state's identifier,
        // and avoids the need for any additional auxiliary storage.
        self.dfa.shuffle(matches)?;
        Ok(())
    }

    /// Return the identifier for the next DFA state given an existing DFA
    /// state and an input byte. If the next DFA state already exists, then
    /// return its identifier from the cache. Otherwise, build the state, cache
    /// it and return its identifier.
    ///
    /// This routine returns a boolean indicating whether a new state was
    /// built. If a new state is built, then the caller needs to add it to its
    /// frontier of uncompiled DFA states to compute transitions for.
    fn cached_state(
        &mut self,
        dfa_id: StateID,
        unit: alphabet::Unit,
    ) -> Result<(StateID, bool), BuildError> {
        // Compute the set of all reachable NFA states, including epsilons.
        let empty_builder = self.get_state_builder();
        let builder = util::determinize::next(
            self.nfa,
            self.config.match_kind,
            &mut self.sparses,
            &mut self.stack,
            &self.builder_states[self.dfa.to_index(dfa_id)],
            unit,
            empty_builder,
        );
        self.maybe_add_state(builder)
    }

    /// Compute the set of DFA start states and add their identifiers in
    /// 'dfa_state_ids' (no duplicates are added).
    fn add_all_starts(
        &mut self,
        dfa_state_ids: &mut Vec<StateID>,
    ) -> Result<(), BuildError> {
        // These should be the first states added.
        assert!(dfa_state_ids.is_empty());
        // We only want to add (un)anchored starting states that are
        // consistent with our DFA's configuration. Unconditionally adding
        // both (although it is the default) can make DFAs quite a bit bigger.
        if self.dfa.start_kind().has_unanchored() {
            self.add_start_group(Anchored::No, dfa_state_ids)?;
        }
        if self.dfa.start_kind().has_anchored() {
            self.add_start_group(Anchored::Yes, dfa_state_ids)?;
        }
        // I previously had an 'assert' here checking that either
        // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it
        // turns out this isn't always true. For example, the NFA might have
        // one or more patterns but where all such patterns are just 'fail'
        // states. These will ultimately just compile down to DFA dead states,
        // and since the dead state was added earlier, no new DFA states are
        // added. And thus, it is valid and okay for 'dfa_state_ids' to be
        // empty even if there are a non-zero number of patterns in the NFA.

        // We only need to compute anchored start states for each pattern if it
        // was requested to do so.
        if self.dfa.starts_for_each_pattern() {
            for pid in self.nfa.patterns() {
                self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?;
            }
        }
        Ok(())
    }

    /// Add a group of start states for the given match pattern ID. Any new
    /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are
    /// pushed.)
    ///
    /// When pattern_id is None, then this will compile a group of unanchored
    /// start states (if the DFA is unanchored). When the pattern_id is
    /// present, then this will compile a group of anchored start states that
    /// only match the given pattern.
    ///
    /// This panics if `anchored` corresponds to an invalid pattern ID.
    fn add_start_group(
        &mut self,
        anchored: Anchored,
        dfa_state_ids: &mut Vec<StateID>,
    ) -> Result<(), BuildError> {
        let nfa_start = match anchored {
            Anchored::No => self.nfa.start_unanchored(),
            Anchored::Yes => self.nfa.start_anchored(),
            Anchored::Pattern(pid) => {
                self.nfa.start_pattern(pid).expect("valid pattern ID")
            }
        };

        // When compiling start states, we're careful not to build additional
        // states that aren't necessary. For example, if the NFA has no word
        // boundary assertion, then there's no reason to have distinct start
        // states for 'NonWordByte' and 'WordByte' starting configurations.
        // Instead, the 'WordByte' starting configuration can just point
        // directly to the start state for the 'NonWordByte' config.
        //
        // Note though that we only need to care about assertions in the prefix
        // of an NFA since this only concerns the starting states. (Actually,
        // the most precise thing we could do is look at the prefix
        // assertions of each pattern when 'anchored == Anchored::Pattern',
        // and then only compile extra states if the prefix is non-empty.) But
        // we settle for simplicity here instead of absolute minimalism. It is
        // somewhat rare, after all, for multiple patterns in the same regex to
        // have different prefix look-arounds.

        // The 'NonWordByte' start state is always built; the other start
        // configurations below may alias it when the corresponding assertion
        // never appears in the NFA's prefix.
        let (id, is_new) =
            self.add_one_start(nfa_start, Start::NonWordByte)?;
        self.dfa.set_start_state(anchored, Start::NonWordByte, id);
        if is_new {
            dfa_state_ids.push(id);
        }

        if !self.nfa.look_set_prefix_any().contains_word() {
            self.dfa.set_start_state(anchored, Start::WordByte, id);
        } else {
            let (id, is_new) =
                self.add_one_start(nfa_start, Start::WordByte)?;
            self.dfa.set_start_state(anchored, Start::WordByte, id);
            if is_new {
                dfa_state_ids.push(id);
            }
        }
        if !self.nfa.look_set_prefix_any().contains_anchor() {
            // No anchor assertions in the prefix, so all the text/line start
            // configurations can share the 'NonWordByte' start state.
            self.dfa.set_start_state(anchored, Start::Text, id);
            self.dfa.set_start_state(anchored, Start::LineLF, id);
            self.dfa.set_start_state(anchored, Start::LineCR, id);
            self.dfa.set_start_state(
                anchored,
                Start::CustomLineTerminator,
                id,
            );
        } else {
            let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
            self.dfa.set_start_state(anchored, Start::Text, id);
            if is_new {
                dfa_state_ids.push(id);
            }

            let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?;
            self.dfa.set_start_state(anchored, Start::LineLF, id);
            if is_new {
                dfa_state_ids.push(id);
            }

            let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?;
            self.dfa.set_start_state(anchored, Start::LineCR, id);
            if is_new {
                dfa_state_ids.push(id);
            }

            let (id, is_new) =
                self.add_one_start(nfa_start, Start::CustomLineTerminator)?;
            self.dfa.set_start_state(
                anchored,
                Start::CustomLineTerminator,
                id,
            );
            if is_new {
                dfa_state_ids.push(id);
            }
        }

        Ok(())
    }

    /// Add a new DFA start state corresponding to the given starting NFA
    /// state, and the starting search configuration. (The starting search
    /// configuration essentially tells us which look-behind assertions are
    /// true for this particular state.)
    ///
    /// The boolean returned indicates whether the state ID returned is a newly
    /// created state, or a previously cached state.
    fn add_one_start(
        &mut self,
        nfa_start: StateID,
        start: Start,
    ) -> Result<(StateID, bool), BuildError> {
        // Compute the look-behind assertions that are true in this starting
        // configuration, and then determine the epsilon closure. While
        // computing the epsilon closure, we only follow conditional epsilon
        // transitions that satisfy the look-behind assertions in 'look_have'.
        let mut builder_matches = self.get_state_builder().into_matches();
        util::determinize::set_lookbehind_from_start(
            self.nfa,
            &start,
            &mut builder_matches,
        );
        self.sparses.set1.clear();
        util::determinize::epsilon_closure(
            self.nfa,
            nfa_start,
            builder_matches.look_have(),
            &mut self.stack,
            &mut self.sparses.set1,
        );
        let mut builder = builder_matches.into_nfa();
        util::determinize::add_nfa_states(
            &self.nfa,
            &self.sparses.set1,
            &mut builder,
        );
        self.maybe_add_state(builder)
    }

    /// Adds the given state to the DFA being built depending on whether it
    /// already exists in this determinizer's cache.
    ///
    /// If it does exist, then the memory used by 'state' is put back into the
    /// determinizer and the previously created state's ID is returned. (Along
    /// with 'false', indicating that no new state was added.)
    ///
    /// If it does not exist, then the state is added to the DFA being built
    /// and a fresh ID is allocated (if ID allocation fails, then an error is
    /// returned) and returned. (Along with 'true', indicating that a new state
    /// was added.)
    fn maybe_add_state(
        &mut self,
        builder: StateBuilderNFA,
    ) -> Result<(StateID, bool), BuildError> {
        if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
            // Since we have a cached state, put the constructed state's
            // memory back into our scratch space, so that it can be reused.
            self.put_state_builder(builder);
            return Ok((cached_id, false));
        }
        self.add_state(builder).map(|sid| (sid, true))
    }

    /// Add the given state to the DFA and make it available in the cache.
    ///
    /// The state initially has no transitions. That is, it transitions to the
    /// dead state for all possible inputs, and transitions to the quit state
    /// for all quit bytes.
    ///
    /// If adding the state would exceed the maximum value for StateID, then an
    /// error is returned.
    fn add_state(
        &mut self,
        builder: StateBuilderNFA,
    ) -> Result<StateID, BuildError> {
        let id = self.dfa.add_empty_state()?;
        if !self.config.quit.is_empty() {
            for b in self.config.quit.iter() {
                self.dfa.set_transition(
                    id,
                    alphabet::Unit::u8(b),
                    self.dfa.quit_id(),
                );
            }
        }
        let state = builder.to_state();
        // States use reference counting internally, so we only need to count
        // their memory usage once.
        self.memory_usage_state += state.memory_usage();
        self.builder_states.push(state.clone());
        self.cache.insert(state, id);
        self.put_state_builder(builder);
        if let Some(limit) = self.config.dfa_size_limit {
            if self.dfa.memory_usage() > limit {
                return Err(BuildError::dfa_exceeded_size_limit(limit));
            }
        }
        if let Some(limit) = self.config.determinize_size_limit {
            if self.memory_usage() > limit {
                return Err(BuildError::determinize_exceeded_size_limit(
                    limit,
                ));
            }
        }
        Ok(id)
    }

    /// Returns a state builder from this determinizer that might have existing
    /// capacity. This helps avoid allocs in cases where a state is built that
    /// turns out to already be cached.
    ///
    /// Callers must put the state builder back with 'put_state_builder',
    /// otherwise the allocation reuse won't work.
    fn get_state_builder(&mut self) -> StateBuilderEmpty {
        core::mem::replace(
            &mut self.scratch_state_builder,
            StateBuilderEmpty::new(),
        )
    }

    /// Puts the given state builder back into this determinizer for reuse.
    ///
    /// Note that building a 'State' from a builder always creates a new
    /// alloc, so callers should always put the builder back.
    fn put_state_builder(&mut self, builder: StateBuilderNFA) {
        let _ = core::mem::replace(
            &mut self.scratch_state_builder,
            builder.clear(),
        );
    }

    /// Return the memory usage, in bytes, of this determinizer at the current
    /// point in time. This does not include memory used by the NFA or the
    /// dense DFA itself.
    fn memory_usage(&self) -> usize {
        use core::mem::size_of;

        self.builder_states.len() * size_of::<State>()
        // Maps likely use more memory than this, but it's probably close.
        + self.cache.len() * (size_of::<State>() + size_of::<StateID>())
        + self.memory_usage_state
        + self.stack.capacity() * size_of::<StateID>()
        + self.scratch_state_builder.capacity()
    }
}
|
||||
463
third-party/vendor/regex-automata/src/dfa/minimize.rs
vendored
Normal file
463
third-party/vendor/regex-automata/src/dfa/minimize.rs
vendored
Normal file
|
|
@ -0,0 +1,463 @@
|
|||
use core::{cell::RefCell, fmt, mem};
|
||||
|
||||
use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec};
|
||||
|
||||
use crate::{
|
||||
dfa::{automaton::Automaton, dense, DEAD},
|
||||
util::{
|
||||
alphabet,
|
||||
primitives::{PatternID, StateID},
|
||||
},
|
||||
};
|
||||
|
||||
/// An implementation of Hopcroft's algorithm for minimizing DFAs.
///
/// The algorithm implemented here is mostly taken from Wikipedia:
/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
///
/// This code has had some light optimization attention paid to it,
/// particularly in the form of reducing allocation as much as possible.
/// However, it is still generally slow. Future optimization work should
/// probably focus on the bigger picture rather than micro-optimizations. For
/// example:
///
/// 1. Figure out how to more intelligently create initial partitions. That is,
///    Hopcroft's algorithm starts by creating two partitions of DFA states
///    that are known to NOT be equivalent: match states and non-match states.
///    The algorithm proceeds by progressively refining these partitions into
///    smaller partitions. If we could start with more partitions, then we
///    could reduce the amount of work that Hopcroft's algorithm needs to do.
/// 2. For every partition that we visit, we find all incoming transitions to
///    every state in the partition for *every* element in the alphabet. (This
///    is why using byte classes can significantly decrease minimization times,
///    since byte classes shrink the alphabet.) This is quite costly and there
///    is perhaps some redundant work being performed depending on the specific
///    states in the set. For example, we might be able to only visit some
///    elements of the alphabet based on the transitions.
/// 3. Move parts of minimization into determinization. If minimization has
///    fewer states to deal with, then it should run faster. A prime example
///    of this might be large Unicode classes, which are generated in a way
///    that can create a lot of redundant states. (Some work has been done on
///    this point during NFA compilation via the algorithm described in the
///    "Incremental Construction of Minimal Acyclic Finite-State Automata"
///    paper.)
pub(crate) struct Minimizer<'a> {
    /// The dense DFA being minimized, modified in place.
    dfa: &'a mut dense::OwnedDFA,
    /// For each DFA state (outer index) and each alphabet unit (middle
    /// index), the list of states that transition into that state on that
    /// unit. Pre-computed once, since Hopcroft's algorithm repeatedly needs
    /// incoming transitions.
    in_transitions: Vec<Vec<Vec<StateID>>>,
    /// The current partitioning of DFA states. Each set is a candidate
    /// equivalence class; the algorithm refines these until fixed point.
    partitions: Vec<StateSet>,
    /// The work list of "splitter" sets still waiting to be processed.
    waiting: Vec<StateSet>,
}
|
||||
|
||||
impl<'a> fmt::Debug for Minimizer<'a> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
f.debug_struct("Minimizer")
|
||||
.field("dfa", &self.dfa)
|
||||
.field("in_transitions", &self.in_transitions)
|
||||
.field("partitions", &self.partitions)
|
||||
.field("waiting", &self.waiting)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// A set of states. A state set makes up a single partition in Hopcroft's
|
||||
/// algorithm.
|
||||
///
|
||||
/// It is represented by an ordered set of state identifiers. We use shared
|
||||
/// ownership so that a single state set can be in both the set of partitions
|
||||
/// and in the set of waiting sets simultaneously without an additional
|
||||
/// allocation. Generally, once a state set is built, it becomes immutable.
|
||||
///
|
||||
/// We use this representation because it avoids the overhead of more
|
||||
/// traditional set data structures (HashSet/BTreeSet), and also because
|
||||
/// computing intersection/subtraction on this representation is especially
|
||||
/// fast.
|
||||
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
struct StateSet {
    // Ordered (after canonicalization) list of state IDs. Rc<RefCell<..>>
    // lets the same set live in both 'partitions' and 'waiting' without a
    // second allocation; Clone on StateSet is therefore a cheap refcount
    // bump, while 'deep_clone' makes an independent copy.
    ids: Rc<RefCell<Vec<StateID>>>,
}
|
||||
|
||||
impl<'a> Minimizer<'a> {
    /// Create a new minimizer for the given DFA. This pre-computes the
    /// incoming-transition table and the initial partitioning; 'run' does
    /// the actual minimization.
    pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> {
        let in_transitions = Minimizer::incoming_transitions(dfa);
        let partitions = Minimizer::initial_partitions(dfa);
        // Every initial partition starts out as a potential splitter.
        // (StateSet clones are cheap refcount bumps, not deep copies.)
        let waiting = partitions.clone();
        Minimizer { dfa, in_transitions, partitions, waiting }
    }

    /// Run Hopcroft's algorithm and then rewrite the DFA in place so that
    /// only one representative state per equivalence class remains.
    pub fn run(mut self) {
        // DFA state IDs are premultiplied by the transition-table stride, so
        // these helpers convert between a dense 0..n index and a StateID.
        let stride2 = self.dfa.stride2();
        let as_state_id = |index: usize| -> StateID {
            StateID::new(index << stride2).unwrap()
        };
        let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 };

        // Scratch sets reused across iterations to avoid reallocating.
        let mut incoming = StateSet::empty();
        let mut scratch1 = StateSet::empty();
        let mut scratch2 = StateSet::empty();
        let mut newparts = vec![];

        // This loop is basically Hopcroft's algorithm. Everything else is just
        // shuffling data around to fit our representation.
        while let Some(set) = self.waiting.pop() {
            for b in self.dfa.byte_classes().iter() {
                self.find_incoming_to(b, &set, &mut incoming);
                // If incoming is empty, then the intersection with any other
                // set must also be empty. So 'newparts' just ends up being
                // 'self.partitions'. So there's no need to go through the loop
                // below.
                //
                // This actually turns out to be a rather large optimization.
                // On the order of making minimization 4-5x faster. It's likely
                // that the vast majority of all states have very few incoming
                // transitions.
                if incoming.is_empty() {
                    continue;
                }

                for p in 0..self.partitions.len() {
                    // Split partition p into (p ∩ incoming) and
                    // (p \ incoming). If either half is empty, p is not
                    // split by this splitter and carries over unchanged.
                    self.partitions[p].intersection(&incoming, &mut scratch1);
                    if scratch1.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    self.partitions[p].subtract(&incoming, &mut scratch2);
                    if scratch2.is_empty() {
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    // Genuine split: both halves replace p in the new
                    // partitioning, and the waiting list is updated per
                    // Hopcroft's rule (replace p if waiting, otherwise add
                    // the smaller half).
                    let (x, y) =
                        (scratch1.deep_clone(), scratch2.deep_clone());
                    newparts.push(x.clone());
                    newparts.push(y.clone());
                    match self.find_waiting(&self.partitions[p]) {
                        Some(i) => {
                            self.waiting[i] = x;
                            self.waiting.push(y);
                        }
                        None => {
                            if x.len() <= y.len() {
                                self.waiting.push(x);
                            } else {
                                self.waiting.push(y);
                            }
                        }
                    }
                }
                // Swap the refined partitioning in and reuse the old Vec's
                // allocation for the next round.
                newparts = mem::replace(&mut self.partitions, newparts);
                newparts.clear();
            }
        }

        // At this point, we now have a minimal partitioning of states, where
        // each partition is an equivalence class of DFA states. Now we need to
        // use this partitioning to update the DFA to only contain one state for
        // each partition.

        // Create a map from DFA state ID to the representative ID of the
        // equivalence class to which it belongs. The representative ID of an
        // equivalence class of states is the minimum ID in that class.
        let mut state_to_part = vec![DEAD; self.dfa.state_len()];
        for p in &self.partitions {
            p.iter(|id| state_to_part[as_index(id)] = p.min());
        }

        // Generate a new contiguous sequence of IDs for minimal states, and
        // create a map from equivalence IDs to the new IDs. Thus, the new
        // minimal ID of *any* state in the unminimized DFA can be obtained
        // with minimals_ids[state_to_part[old_id]].
        let mut minimal_ids = vec![DEAD; self.dfa.state_len()];
        let mut new_index = 0;
        for state in self.dfa.states() {
            if state_to_part[as_index(state.id())] == state.id() {
                minimal_ids[as_index(state.id())] = as_state_id(new_index);
                new_index += 1;
            }
        }
        // The total number of states in the minimal DFA.
        let minimal_count = new_index;
        // Convenience function for remapping state IDs. This takes an old ID,
        // looks up its Hopcroft partition and then maps that to the new ID
        // range.
        let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])];

        // Re-map this DFA in place such that the only states remaining
        // correspond to the representative states of every equivalence class.
        for id in (0..self.dfa.state_len()).map(as_state_id) {
            // If this state isn't a representative for an equivalence class,
            // then we skip it since it won't appear in the minimal DFA.
            if state_to_part[as_index(id)] != id {
                continue;
            }
            self.dfa.remap_state(id, remap);
            self.dfa.swap_states(id, minimal_ids[as_index(id)]);
        }
        // Trim off all unused states from the pre-minimized DFA. This
        // represents all states that were merged into a non-singleton
        // equivalence class of states, and appeared after the first state
        // in each such class. (Because the state with the smallest ID in each
        // equivalence class is its representative ID.)
        self.dfa.truncate_states(minimal_count);

        // Update the new start states, which is now just the minimal ID of
        // whatever state the old start state was collapsed into. Also, we
        // collect everything before-hand to work around the borrow checker.
        // We're already allocating so much that this is probably fine. If this
        // turns out to be costly, then I guess add a `starts_mut` iterator.
        let starts: Vec<_> = self.dfa.starts().collect();
        for (old_start_id, anchored, start_type) in starts {
            self.dfa.set_start_state(
                anchored,
                start_type,
                remap(old_start_id),
            );
        }

        // Update the match state pattern ID list for multi-regexes. All we
        // need to do is remap the match state IDs. The pattern ID lists are
        // always the same as they were since match states with distinct
        // pattern ID lists are always considered distinct states.
        let mut pmap = BTreeMap::new();
        for (match_id, pattern_ids) in self.dfa.pattern_map() {
            let new_id = remap(match_id);
            pmap.insert(new_id, pattern_ids);
        }
        // This unwrap is OK because minimization never increases the number of
        // match states or patterns in those match states. Since minimization
        // runs after the pattern map has already been set at least once, we
        // know that our match states cannot error.
        self.dfa.set_pattern_map(&pmap).unwrap();

        // In order to update the ID of the maximum match state, we need to
        // find the maximum ID among all of the match states in the minimized
        // DFA. This is not necessarily the new ID of the unminimized maximum
        // match state, since that could have been collapsed with a much
        // earlier match state. Therefore, to find the new max match state,
        // we iterate over all previous match states, find their corresponding
        // new minimal ID, and take the maximum of those.
        let old = self.dfa.special().clone();
        let new = self.dfa.special_mut();
        // ... but only remap if we had match states.
        if old.matches() {
            new.min_match = StateID::MAX;
            new.max_match = StateID::ZERO;
            for i in as_index(old.min_match)..=as_index(old.max_match) {
                let new_id = remap(as_state_id(i));
                if new_id < new.min_match {
                    new.min_match = new_id;
                }
                if new_id > new.max_match {
                    new.max_match = new_id;
                }
            }
        }
        // ... same, but for start states.
        if old.starts() {
            new.min_start = StateID::MAX;
            new.max_start = StateID::ZERO;
            for i in as_index(old.min_start)..=as_index(old.max_start) {
                let new_id = remap(as_state_id(i));
                // A start state may have collapsed into the dead state; the
                // dead state never counts toward the special start range.
                if new_id == DEAD {
                    continue;
                }
                if new_id < new.min_start {
                    new.min_start = new_id;
                }
                if new_id > new.max_start {
                    new.max_start = new_id;
                }
            }
            if new.max_start == DEAD {
                new.min_start = DEAD;
            }
        }
        new.quit_id = remap(new.quit_id);
        new.set_max();
    }

    /// Return the position of 'set' in the waiting list, if present.
    /// (Linear scan; the waiting list is typically small.)
    fn find_waiting(&self, set: &StateSet) -> Option<usize> {
        self.waiting.iter().position(|s| s == set)
    }

    /// Collect into 'incoming' the set of all states with a transition on
    /// byte class 'b' into any state of 'set'. The result is canonicalized
    /// (sorted, deduped) so set operations on it are valid.
    fn find_incoming_to(
        &self,
        b: alphabet::Unit,
        set: &StateSet,
        incoming: &mut StateSet,
    ) {
        incoming.clear();
        set.iter(|id| {
            for &inid in
                &self.in_transitions[self.dfa.to_index(id)][b.as_usize()]
            {
                incoming.add(inid);
            }
        });
        incoming.canonicalize();
    }

    /// Build the initial partitioning: one set per distinct pattern-ID list
    /// among match states, plus one set of quit states and one set of all
    /// remaining (non-match, non-quit) states.
    fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec<StateSet> {
        // For match states, we know that two match states with different
        // pattern ID lists will *always* be distinct, so we can partition them
        // initially based on that.
        let mut matching: BTreeMap<Vec<PatternID>, StateSet> = BTreeMap::new();
        let mut is_quit = StateSet::empty();
        let mut no_match = StateSet::empty();
        for state in dfa.states() {
            if dfa.is_match_state(state.id()) {
                let mut pids = vec![];
                for i in 0..dfa.match_len(state.id()) {
                    pids.push(dfa.match_pattern(state.id(), i));
                }
                matching
                    .entry(pids)
                    .or_insert(StateSet::empty())
                    .add(state.id());
            } else if dfa.is_quit_state(state.id()) {
                is_quit.add(state.id());
            } else {
                no_match.add(state.id());
            }
        }

        let mut sets: Vec<StateSet> =
            matching.into_iter().map(|(_, set)| set).collect();
        sets.push(no_match);
        sets.push(is_quit);
        sets
    }

    /// Build the reverse transition table: for each state and each alphabet
    /// unit, the list of states that transition into it on that unit.
    fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec<Vec<Vec<StateID>>> {
        let mut incoming = vec![];
        for _ in dfa.states() {
            incoming.push(vec![vec![]; dfa.alphabet_len()]);
        }
        for state in dfa.states() {
            for (b, next) in state.transitions() {
                incoming[dfa.to_index(next)][b.as_usize()].push(state.id());
            }
        }
        incoming
    }
}
|
||||
|
||||
impl StateSet {
    /// Create a new, empty state set.
    fn empty() -> StateSet {
        StateSet { ids: Rc::new(RefCell::new(vec![])) }
    }

    /// Append a state ID. Callers are responsible for calling 'canonicalize'
    /// before relying on set semantics (sorted, deduped order).
    fn add(&mut self, id: StateID) {
        self.ids.borrow_mut().push(id);
    }

    /// The minimum state ID in this set, used as the equivalence class
    /// representative. Panics if the set is empty; assumes the set is
    /// canonicalized (sorted) so the first element is the minimum.
    fn min(&self) -> StateID {
        self.ids.borrow()[0]
    }

    /// Sort and dedup so this behaves as an ordered set.
    fn canonicalize(&mut self) {
        self.ids.borrow_mut().sort();
        self.ids.borrow_mut().dedup();
    }

    /// Remove all elements (retaining capacity for reuse).
    fn clear(&mut self) {
        self.ids.borrow_mut().clear();
    }

    fn len(&self) -> usize {
        self.ids.borrow().len()
    }

    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Create an independent copy of this set. (Plain 'clone' only bumps the
    /// Rc refcount and shares the underlying Vec.)
    fn deep_clone(&self) -> StateSet {
        let ids = self.ids.borrow().iter().cloned().collect();
        StateSet { ids: Rc::new(RefCell::new(ids)) }
    }

    /// Invoke 'f' for each state ID in this set, in order. (A callback is
    /// used instead of returning an iterator to avoid holding the RefCell
    /// borrow across an iterator's lifetime.)
    fn iter<F: FnMut(StateID)>(&self, mut f: F) {
        for &id in self.ids.borrow().iter() {
            f(id);
        }
    }

    /// Compute 'self ∩ other' into 'dest' via a linear merge of the two
    /// sorted ID lists. Both sets must be canonicalized.
    fn intersection(&self, other: &StateSet, dest: &mut StateSet) {
        dest.clear();
        if self.is_empty() || other.is_empty() {
            return;
        }

        // Standard sorted-merge walk: advance whichever side is smaller,
        // emitting only on equality. Either iterator running out ends the
        // intersection, since no further common elements can exist.
        let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
        loop {
            if a == b {
                dest.add(a);
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
                b = match itb.next() {
                    None => break,
                    Some(b) => b,
                };
            } else if a < b {
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
            } else {
                b = match itb.next() {
                    None => break,
                    Some(b) => b,
                };
            }
        }
    }

    /// Compute 'self \ other' into 'dest' via a linear merge of the two
    /// sorted ID lists. Both sets must be canonicalized.
    fn subtract(&self, other: &StateSet, dest: &mut StateSet) {
        dest.clear();
        if self.is_empty() || other.is_empty() {
            // Nothing to subtract: result is a copy of self.
            self.iter(|s| dest.add(s));
            return;
        }

        // Sorted-merge walk emitting elements of 'self' that never match an
        // element of 'other'. Note the asymmetry with 'intersection': when
        // 'other' runs out, the current (unmatched) 'a' must still be
        // emitted, and any remaining elements of 'self' are flushed after
        // the loop.
        let (seta, setb) = (self.ids.borrow(), other.ids.borrow());
        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
        loop {
            if a == b {
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
                b = match itb.next() {
                    None => {
                        dest.add(a);
                        break;
                    }
                    Some(b) => b,
                };
            } else if a < b {
                dest.add(a);
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
            } else {
                b = match itb.next() {
                    None => {
                        dest.add(a);
                        break;
                    }
                    Some(b) => b,
                };
            }
        }
        // Flush whatever is left of 'self'; none of it can be in 'other'.
        for a in ita {
            dest.add(a);
        }
    }
}
|
||||
360
third-party/vendor/regex-automata/src/dfa/mod.rs
vendored
Normal file
360
third-party/vendor/regex-automata/src/dfa/mod.rs
vendored
Normal file
|
|
@ -0,0 +1,360 @@
|
|||
/*!
|
||||
A module for building and searching with deterministic finite automata (DFAs).
|
||||
|
||||
Like other modules in this crate, DFAs support a rich regex syntax with Unicode
|
||||
features. DFAs also have extensive options for configuring the best space vs
|
||||
time trade off for your use case and provides support for cheap deserialization
|
||||
of automata for use in `no_std` environments.
|
||||
|
||||
If you're looking for lazy DFAs that build themselves incrementally during
|
||||
search, then please see the top-level [`hybrid` module](crate::hybrid).
|
||||
|
||||
# Overview
|
||||
|
||||
This section gives a brief overview of the primary types in this module:
|
||||
|
||||
* A [`regex::Regex`] provides a way to search for matches of a regular
|
||||
expression using DFAs. This includes iterating over matches with both the start
|
||||
and end positions of each match.
|
||||
* A [`dense::DFA`] provides low level access to a DFA that uses a dense
|
||||
representation (uses lots of space, but fast searching).
|
||||
* A [`sparse::DFA`] provides the same API as a `dense::DFA`, but uses a sparse
|
||||
representation (uses less space, but slower searching).
|
||||
* An [`Automaton`] trait that defines an interface that both dense and sparse
|
||||
DFAs implement. (A `regex::Regex` is generic over this trait.)
|
||||
* Both dense DFAs and sparse DFAs support serialization to raw bytes (e.g.,
|
||||
[`dense::DFA::to_bytes_little_endian`]) and cheap deserialization (e.g.,
|
||||
[`dense::DFA::from_bytes`]).
|
||||
|
||||
There is also a [`onepass`] module that provides a [one-pass
|
||||
DFA](onepass::DFA). The unique advantage of this DFA is that, for the class
|
||||
of regexes it can be built with, it supports reporting the spans of matching
|
||||
capturing groups. It is the only DFA in this crate capable of such a thing.
|
||||
|
||||
# Example: basic regex searching
|
||||
|
||||
This example shows how to compile a regex using the default configuration
|
||||
and then use it to find matches in a byte string:
|
||||
|
||||
```
|
||||
use regex_automata::{Match, dfa::regex::Regex};
|
||||
|
||||
let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
|
||||
let text = b"2018-12-24 2016-10-08";
|
||||
let matches: Vec<Match> = re.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![
|
||||
Match::must(0, 0..10),
|
||||
Match::must(0, 11..21),
|
||||
]);
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
# Example: searching with regex sets
|
||||
|
||||
The DFAs in this module all fully support searching with multiple regexes
|
||||
simultaneously. You can use this support with standard leftmost-first style
|
||||
searching to find non-overlapping matches:
|
||||
|
||||
```
|
||||
# if cfg!(miri) { return Ok(()); } // miri takes too long
|
||||
use regex_automata::{Match, dfa::regex::Regex};
|
||||
|
||||
let re = Regex::new_many(&[r"\w+", r"\S+"])?;
|
||||
let text = b"@foo bar";
|
||||
let matches: Vec<Match> = re.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![
|
||||
Match::must(1, 0..4),
|
||||
Match::must(0, 5..8),
|
||||
]);
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
# Example: use sparse DFAs
|
||||
|
||||
By default, compiling a regex will use dense DFAs internally. This uses more
|
||||
memory, but executes searches more quickly. If you can abide slower searches
|
||||
(somewhere around 3-5x), then sparse DFAs might make more sense since they can
|
||||
use significantly less space.
|
||||
|
||||
Using sparse DFAs is as easy as using `Regex::new_sparse` instead of
|
||||
`Regex::new`:
|
||||
|
||||
```
|
||||
use regex_automata::{Match, dfa::regex::Regex};
|
||||
|
||||
let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
|
||||
let text = b"2018-12-24 2016-10-08";
|
||||
let matches: Vec<Match> = re.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![
|
||||
Match::must(0, 0..10),
|
||||
Match::must(0, 11..21),
|
||||
]);
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
If you already have dense DFAs for some reason, they can be converted to sparse
|
||||
DFAs and used to build a new `Regex`. For example:
|
||||
|
||||
```
|
||||
use regex_automata::{Match, dfa::regex::Regex};
|
||||
|
||||
let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
|
||||
let sparse_re = Regex::builder().build_from_dfas(
|
||||
dense_re.forward().to_sparse()?,
|
||||
dense_re.reverse().to_sparse()?,
|
||||
);
|
||||
let text = b"2018-12-24 2016-10-08";
|
||||
let matches: Vec<Match> = sparse_re.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![
|
||||
Match::must(0, 0..10),
|
||||
Match::must(0, 11..21),
|
||||
]);
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
# Example: deserialize a DFA
|
||||
|
||||
This shows how to first serialize a DFA into raw bytes, and then deserialize
|
||||
those raw bytes back into a DFA. While this particular example is a
|
||||
bit contrived, this same technique can be used in your program to
|
||||
deserialize a DFA at start up time or by memory mapping a file.
|
||||
|
||||
```
|
||||
use regex_automata::{Match, dfa::{dense, regex::Regex}};
|
||||
|
||||
let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
|
||||
// serialize both the forward and reverse DFAs, see note below
|
||||
let (fwd_bytes, fwd_pad) = re1.forward().to_bytes_native_endian();
|
||||
let (rev_bytes, rev_pad) = re1.reverse().to_bytes_native_endian();
|
||||
// now deserialize both---we need to specify the correct type!
|
||||
let fwd: dense::DFA<&[u32]> = dense::DFA::from_bytes(&fwd_bytes[fwd_pad..])?.0;
|
||||
let rev: dense::DFA<&[u32]> = dense::DFA::from_bytes(&rev_bytes[rev_pad..])?.0;
|
||||
// finally, reconstruct our regex
|
||||
let re2 = Regex::builder().build_from_dfas(fwd, rev);
|
||||
|
||||
// we can use it like normal
|
||||
let text = b"2018-12-24 2016-10-08";
|
||||
let matches: Vec<Match> = re2.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![
|
||||
Match::must(0, 0..10),
|
||||
Match::must(0, 11..21),
|
||||
]);
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
There are a few points worth noting here:
|
||||
|
||||
* We need to extract the raw DFAs used by the regex and serialize those. You
|
||||
can build the DFAs manually yourself using [`dense::Builder`], but using
|
||||
the DFAs from a `Regex` guarantees that the DFAs are built correctly. (In
|
||||
particular, a `Regex` constructs a reverse DFA for finding the starting
|
||||
location of matches.)
|
||||
* To convert the DFA to raw bytes, we use the `to_bytes_native_endian` method.
|
||||
In practice, you'll want to use either [`dense::DFA::to_bytes_little_endian`]
|
||||
or [`dense::DFA::to_bytes_big_endian`], depending on which platform you're
|
||||
deserializing your DFA from. If you intend to deserialize on either platform,
|
||||
then you'll need to serialize both and deserialize the right one depending on
|
||||
your target's endianness.
|
||||
* Safely deserializing a DFA requires verifying the raw bytes, particularly if
|
||||
they are untrusted, since an invalid DFA could cause logical errors, panics
|
||||
or even undefined behavior. This verification step requires visiting all of
|
||||
the transitions in the DFA, which can be costly. If cheaper verification is
|
||||
desired, then [`dense::DFA::from_bytes_unchecked`] is available that only does
|
||||
verification that can be performed in constant time. However, one can only use
|
||||
this routine if the caller can guarantee that the bytes provided encoded a
|
||||
valid DFA.
|
||||
|
||||
The same process can be achieved with sparse DFAs as well:
|
||||
|
||||
```
|
||||
use regex_automata::{Match, dfa::{sparse, regex::Regex}};
|
||||
|
||||
let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
|
||||
// serialize both
|
||||
let fwd_bytes = re1.forward().to_sparse()?.to_bytes_native_endian();
|
||||
let rev_bytes = re1.reverse().to_sparse()?.to_bytes_native_endian();
|
||||
// now deserialize both---we need to specify the correct type!
|
||||
let fwd: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&fwd_bytes)?.0;
|
||||
let rev: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&rev_bytes)?.0;
|
||||
// finally, reconstruct our regex
|
||||
let re2 = Regex::builder().build_from_dfas(fwd, rev);
|
||||
|
||||
// we can use it like normal
|
||||
let text = b"2018-12-24 2016-10-08";
|
||||
let matches: Vec<Match> = re2.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![
|
||||
Match::must(0, 0..10),
|
||||
Match::must(0, 11..21),
|
||||
]);
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
Note that unlike dense DFAs, sparse DFAs have no alignment requirements.
|
||||
Conversely, dense DFAs must be aligned to the same alignment as a
|
||||
[`StateID`](crate::util::primitives::StateID).
|
||||
|
||||
# Support for `no_std` and `alloc`-only
|
||||
|
||||
This crate comes with `alloc` and `std` features that are enabled by default.
|
||||
When the `alloc` or `std` features are enabled, the API of this module will
|
||||
include the facilities necessary for compiling, serializing, deserializing
|
||||
and searching with DFAs. When only the `alloc` feature is enabled, then
|
||||
implementations of the `std::error::Error` trait are dropped, but everything
|
||||
else generally remains the same. When both the `alloc` and `std` features are
|
||||
disabled, the API of this module will shrink such that it only includes the
|
||||
facilities necessary for deserializing and searching with DFAs.
|
||||
|
||||
The intended workflow for `no_std` environments is thus as follows:
|
||||
|
||||
* Write a program with the `alloc` or `std` features that compiles and
|
||||
serializes a regular expression. You may need to serialize both little and big
|
||||
endian versions of each DFA. (So that's 4 DFAs in total for each regex.)
|
||||
* In your `no_std` environment, follow the examples above for deserializing
|
||||
your previously serialized DFAs into regexes. You can then search with them as
|
||||
you would any regex.
|
||||
|
||||
Deserialization can happen anywhere. For example, with bytes embedded into a
|
||||
binary or with a file memory mapped at runtime.
|
||||
|
||||
The `regex-cli` command (found in the same repository as this crate) can be
|
||||
used to serialize DFAs to files and generate Rust code to read them.
|
||||
|
||||
# Syntax
|
||||
|
||||
This module supports the same syntax as the `regex` crate, since they share the
|
||||
same parser. You can find an exhaustive list of supported syntax in the
|
||||
[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax).
|
||||
|
||||
There are two things that are not supported by the DFAs in this module:
|
||||
|
||||
* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top
|
||||
of them) can only find the offsets of an entire match, but cannot resolve
|
||||
the offsets of each capturing group. This is because DFAs do not have the
|
||||
expressive power necessary.
|
||||
* Unicode word boundaries. These present particularly difficult challenges for
|
||||
DFA construction and would result in an explosion in the number of states.
|
||||
One can enable [`dense::Config::unicode_word_boundary`] though, which provides
|
||||
heuristic support for Unicode word boundaries that only works on ASCII text.
|
||||
Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work
|
||||
on any input.
|
||||
|
||||
There are no plans to lift either of these limitations.
|
||||
|
||||
Note that these restrictions are identical to the restrictions on lazy DFAs.
|
||||
|
||||
# Differences with general purpose regexes
|
||||
|
||||
The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
|
||||
general purpose regular expression engine. It aims to automatically balance low
|
||||
compile times, fast search times and low memory usage, while also providing
|
||||
a convenient API for users. In contrast, this module provides a lower level
|
||||
regular expression interface based exclusively on DFAs that is a bit less
|
||||
convenient while providing more explicit control over memory usage and search
|
||||
times.
|
||||
|
||||
Here are some specific negative differences:
|
||||
|
||||
* **Compilation can take an exponential amount of time and space** in the size
|
||||
of the regex pattern. While most patterns do not exhibit worst case exponential
|
||||
time, such patterns do exist. For example, `[01]*1[01]{N}` will build a DFA
|
||||
with approximately `2^(N+2)` states. For this reason, untrusted patterns should
|
||||
not be compiled with this module. (In the future, the API may expose an option
|
||||
to return an error if the DFA gets too big.)
|
||||
* This module does not support sub-match extraction via capturing groups, which
|
||||
can be achieved with the regex crate's "captures" API.
|
||||
* While the regex crate doesn't necessarily sport fast compilation times,
|
||||
the regexes in this module are almost universally slow to compile, especially
|
||||
when they contain large Unicode character classes. For example, on my system,
|
||||
compiling `\w{50}` takes about 1 second and almost 15MB of memory! (Compiling
|
||||
a sparse regex takes about the same time but only uses about 1.2MB of
|
||||
memory.) Conversely, compiling the same regex without Unicode support, e.g.,
|
||||
`(?-u)\w{50}`, takes under 1 millisecond and about 15KB of memory. For this
|
||||
reason, you should only use Unicode character classes if you absolutely need
|
||||
them! (They are enabled by default though.)
|
||||
* This module does not support Unicode word boundaries. ASCII word boundaries
|
||||
may be used though by disabling Unicode or selectively doing so in the syntax,
|
||||
e.g., `(?-u:\b)`. There is also an option to
|
||||
[heuristically enable Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary),
|
||||
where the corresponding DFA will give up if any non-ASCII byte is seen.
|
||||
* As a lower level API, this module does not do literal optimizations
|
||||
automatically. Although it does provide hooks in its API to make use of the
|
||||
[`Prefilter`](crate::util::prefilter::Prefilter) trait. Missing literal
|
||||
optimizations means that searches may run much slower than what you're
|
||||
accustomed to, although, it does provide more predictable and consistent
|
||||
performance.
|
||||
* There is no `&str` API like in the regex crate. In this module, all APIs
|
||||
operate on `&[u8]`. By default, match indices are
|
||||
guaranteed to fall on UTF-8 boundaries, unless either of
|
||||
[`syntax::Config::utf8`](crate::util::syntax::Config::utf8) or
|
||||
[`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) are disabled.
|
||||
|
||||
With some of the downsides out of the way, here are some positive differences:
|
||||
|
||||
* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
|
||||
deserialized. Deserialization can be done in constant time with the unchecked
|
||||
APIs, since searching can be performed directly on the raw serialized bytes of
|
||||
a DFA.
|
||||
* This module was specifically designed so that the searching phase of a
|
||||
DFA has minimal runtime requirements, and can therefore be used in `no_std`
|
||||
environments. While `no_std` environments cannot compile regexes, they can
|
||||
deserialize pre-compiled regexes.
|
||||
* Since this module builds DFAs ahead of time, it will generally out-perform
|
||||
the `regex` crate on equivalent tasks. The performance difference is likely
|
||||
not large. However, because of a complex set of optimizations in the regex
|
||||
crate (like literal optimizations), an accurate performance comparison may be
|
||||
difficult to do.
|
||||
* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
|
||||
performance a small amount, but uses much less storage space. Potentially even
|
||||
less than what the regex crate uses.
|
||||
* This module exposes DFAs directly, such as [`dense::DFA`] and
|
||||
[`sparse::DFA`], which enables one to do less work in some cases. For example,
|
||||
if you only need the end of a match and not the start of a match, then you can
|
||||
use a DFA directly without building a `Regex`, which always requires a second
|
||||
DFA to find the start of a match.
|
||||
* This module provides more control over memory usage. Aside from choosing
|
||||
between dense and sparse DFAs, one can also choose a smaller state identifier
|
||||
representation to use less space. Also, one can enable DFA minimization
|
||||
via [`dense::Config::minimize`], but it can increase compilation times
|
||||
dramatically.
|
||||
*/
|
||||
|
||||
#[cfg(feature = "dfa-search")]
|
||||
pub use crate::dfa::{
|
||||
automaton::{Automaton, OverlappingState, StartError},
|
||||
start::StartKind,
|
||||
};
|
||||
|
||||
/// This is an alias for a state ID of zero. It has special significance
/// because it always corresponds to the first state in a DFA, and the first
/// state in a DFA is always "dead." That is, the dead state always has all
/// of its transitions set to itself. Moreover, the dead state is used as a
/// sentinel for various things. e.g., in search, reaching a dead state means
/// that the search must stop.
const DEAD: crate::util::primitives::StateID =
    crate::util::primitives::StateID::ZERO;
|
||||
|
||||
#[cfg(feature = "dfa-search")]
|
||||
pub mod dense;
|
||||
#[cfg(feature = "dfa-onepass")]
|
||||
pub mod onepass;
|
||||
#[cfg(feature = "dfa-search")]
|
||||
pub mod regex;
|
||||
#[cfg(feature = "dfa-search")]
|
||||
pub mod sparse;
|
||||
|
||||
#[cfg(feature = "dfa-search")]
|
||||
pub(crate) mod accel;
|
||||
#[cfg(feature = "dfa-search")]
|
||||
mod automaton;
|
||||
#[cfg(feature = "dfa-build")]
|
||||
mod determinize;
|
||||
#[cfg(feature = "dfa-build")]
|
||||
mod minimize;
|
||||
#[cfg(any(feature = "dfa-build", feature = "dfa-onepass"))]
|
||||
mod remapper;
|
||||
#[cfg(feature = "dfa-search")]
|
||||
mod search;
|
||||
#[cfg(feature = "dfa-search")]
|
||||
mod special;
|
||||
#[cfg(feature = "dfa-search")]
|
||||
mod start;
|
||||
3192
third-party/vendor/regex-automata/src/dfa/onepass.rs
vendored
Normal file
3192
third-party/vendor/regex-automata/src/dfa/onepass.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
871
third-party/vendor/regex-automata/src/dfa/regex.rs
vendored
Normal file
871
third-party/vendor/regex-automata/src/dfa/regex.rs
vendored
Normal file
|
|
@ -0,0 +1,871 @@
|
|||
/*!
|
||||
A DFA-backed `Regex`.
|
||||
|
||||
This module provides [`Regex`], which is defined generically over the
|
||||
[`Automaton`] trait. A `Regex` implements convenience routines you might have
|
||||
come to expect, such as finding the start/end of a match and iterating over
|
||||
all non-overlapping matches. This `Regex` type is limited in its capabilities
|
||||
to what a DFA can provide. Therefore, APIs involving capturing groups, for
|
||||
example, are not provided.
|
||||
|
||||
Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
|
||||
finds the end offset of a match, whereas the other is a "reverse" DFA that
|
||||
finds the start offset of a match.
|
||||
|
||||
See the [parent module](crate::dfa) for examples.
|
||||
*/
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
use alloc::vec::Vec;
|
||||
|
||||
#[cfg(feature = "dfa-build")]
|
||||
use crate::dfa::dense::BuildError;
|
||||
use crate::{
|
||||
dfa::{automaton::Automaton, dense},
|
||||
util::{iter, search::Input},
|
||||
Anchored, Match, MatchError,
|
||||
};
|
||||
#[cfg(feature = "alloc")]
|
||||
use crate::{
|
||||
dfa::{sparse, StartKind},
|
||||
util::search::MatchKind,
|
||||
};
|
||||
|
||||
// When the alloc feature is enabled, the regex type sets its A type parameter
|
||||
// to default to an owned dense DFA. But without alloc, we set no default. This
|
||||
// makes things a lot more convenient in the common case, since writing out the
|
||||
// DFA types is pretty annoying.
|
||||
//
|
||||
// Since we have two different definitions but only want to write one doc
|
||||
// string, we use a macro to capture the doc and other attributes once and then
|
||||
// repeat them for each definition.
|
||||
macro_rules! define_regex_type {
    ($(#[$doc:meta])*) => {
        // With `alloc` enabled, default the automaton type parameter to an
        // owned dense DFA, which is by far the most common configuration.
        #[cfg(feature = "alloc")]
        $(#[$doc])*
        pub struct Regex<A = dense::OwnedDFA> {
            forward: A,
            reverse: A,
        }

        // Without `alloc`, there is no sensible owned default, so callers
        // must name the automaton type explicitly.
        #[cfg(not(feature = "alloc"))]
        $(#[$doc])*
        pub struct Regex<A> {
            forward: A,
            reverse: A,
        }
    };
}
|
||||
|
||||
define_regex_type!(
    /// A regular expression that uses deterministic finite automata for fast
    /// searching.
    ///
    /// A regular expression is comprised of two DFAs, a "forward" DFA and a
    /// "reverse" DFA. The forward DFA is responsible for detecting the end of
    /// a match while the reverse DFA is responsible for detecting the start
    /// of a match. Thus, in order to find the bounds of any given match, a
    /// forward search must first be run followed by a reverse search. A match
    /// found by the forward DFA guarantees that the reverse DFA will also find
    /// a match.
    ///
    /// The type of the DFA used by a `Regex` corresponds to the `A` type
    /// parameter, which must satisfy the [`Automaton`] trait. Typically,
    /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a
    /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more
    /// memory but search faster, while sparse DFAs use less memory but search
    /// more slowly.
    ///
    /// # Crate features
    ///
    /// Note that despite what the documentation auto-generates, the _only_
    /// crate feature needed to use this type is `dfa-search`. You do _not_
    /// need to enable the `alloc` feature.
    ///
    /// By default, a regex's automaton type parameter is set to
    /// `dense::DFA<Vec<u32>>` when the `alloc` feature is enabled. For most
    /// in-memory workloads, this is the most convenient type that gives the
    /// best search performance. When the `alloc` feature is disabled, no
    /// default type is used.
    ///
    /// # When should I use this?
    ///
    /// Generally speaking, if you can afford the overhead of building a full
    /// DFA for your regex, and you don't need things like capturing groups,
    /// then this is a good choice if you're looking to optimize for matching
    /// speed. Note however that its speed may be worse than a general purpose
    /// regex engine if you don't provide a [`dense::Config::prefilter`] to the
    /// underlying DFA.
    ///
    /// # Sparse DFAs
    ///
    /// Since a `Regex` is generic over the [`Automaton`] trait, it can be
    /// used with any kind of DFA. While this crate constructs dense DFAs by
    /// default, it is easy enough to build corresponding sparse DFAs, and then
    /// build a regex from them:
    ///
    /// ```
    /// use regex_automata::dfa::regex::Regex;
    ///
    /// // First, build a regex that uses dense DFAs.
    /// let dense_re = Regex::new("foo[0-9]+")?;
    ///
    /// // Second, build sparse DFAs from the forward and reverse dense DFAs.
    /// let fwd = dense_re.forward().to_sparse()?;
    /// let rev = dense_re.reverse().to_sparse()?;
    ///
    /// // Third, build a new regex from the constituent sparse DFAs.
    /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev);
    ///
    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
    /// assert_eq!(true, sparse_re.is_match(b"foo123"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// Alternatively, one can use a [`Builder`] to construct a sparse DFA
    /// more succinctly. (Note though that dense DFAs are still constructed
    /// first internally, and then converted to sparse DFAs, as in the example
    /// above.)
    ///
    /// ```
    /// use regex_automata::dfa::regex::Regex;
    ///
    /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?;
    /// // A regex that uses sparse DFAs can be used just like with dense DFAs.
    /// assert!(sparse_re.is_match(b"foo123"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Fallibility
    ///
    /// Most of the search routines defined on this type will _panic_ when the
    /// underlying search fails. This might be because the DFA gave up because
    /// it saw a quit byte, whether configured explicitly or via heuristic
    /// Unicode word boundary support, although neither are enabled by default.
    /// Or it might fail because an invalid `Input` configuration is given,
    /// for example, with an unsupported [`Anchored`] mode.
    ///
    /// If you need to handle these error cases instead of allowing them to
    /// trigger a panic, then the lower level [`Regex::try_search`] provides
    /// a fallible API that never panics.
    ///
    /// # Example
    ///
    /// This example shows how to cause a search to terminate if it sees a
    /// `\n` byte, and handle the error returned. This could be useful if, for
    /// example, you wanted to prevent a user supplied pattern from matching
    /// across a line boundary.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError};
    ///
    /// let re = Regex::builder()
    ///     .dense(dfa::dense::Config::new().quit(b'\n', true))
    ///     .build(r"foo\p{any}+bar")?;
    ///
    /// let input = Input::new("foo\nbar");
    /// // Normally this would produce a match, since \p{any} contains '\n'.
    /// // But since we instructed the automaton to enter a quit state if a
    /// // '\n' is observed, this produces a match error instead.
    /// let expected = MatchError::quit(b'\n', 3);
    /// let got = re.try_search(&input).unwrap_err();
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[derive(Clone, Debug)]
);
|
||||
|
||||
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
impl Regex {
    /// Compile the given pattern into a `Regex` using this crate's default
    /// configuration.
    ///
    /// To customize the configuration (syntax options, DFA options and so
    /// on), use a [`Builder`] instead.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Match, dfa::regex::Regex};
    ///
    /// let re = Regex::new("foo[0-9]+bar")?;
    /// assert_eq!(
    ///     Some(Match::must(0, 3..14)),
    ///     re.find(b"zzzfoo12345barzzz"),
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn new(pattern: &str) -> Result<Regex, BuildError> {
        let builder = Builder::new();
        builder.build(pattern)
    }

    /// Compile several patterns at once into a single "regex set," using the
    /// default configuration. Reported matches identify which pattern
    /// matched.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Match, dfa::regex::Regex};
    ///
    /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
    ///
    /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
    /// assert_eq!(None, it.next());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn new_many<P: AsRef<str>>(
        patterns: &[P],
    ) -> Result<Regex, BuildError> {
        let builder = Builder::new();
        builder.build_many(patterns)
    }
}
|
||||
|
||||
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
impl Regex<sparse::DFA<Vec<u8>>> {
    /// Compile the given pattern into a `Regex` using the default
    /// configuration, except that sparse DFAs are used internally instead of
    /// dense ones.
    ///
    /// To customize the configuration, use a [`Builder`] and its
    /// sparse-specific build routines.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Match, dfa::regex::Regex};
    ///
    /// let re = Regex::new_sparse("foo[0-9]+bar")?;
    /// assert_eq!(
    ///     Some(Match::must(0, 3..14)),
    ///     re.find(b"zzzfoo12345barzzz"),
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn new_sparse(
        pattern: &str,
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
        let builder = Builder::new();
        builder.build_sparse(pattern)
    }

    /// Compile several patterns at once into a single "regex set" backed by
    /// sparse DFAs, otherwise using the default configuration. Reported
    /// matches identify which pattern matched.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Match, dfa::regex::Regex};
    ///
    /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?;
    ///
    /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux");
    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
    /// assert_eq!(None, it.next());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn new_many_sparse<P: AsRef<str>>(
        patterns: &[P],
    ) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
        let builder = Builder::new();
        builder.build_many_sparse(patterns)
    }
}
|
||||
|
||||
/// Convenience routines for regex construction.
// NOTE(review): the `&'static [u32]` type parameter appears to serve only to
// give `builder` a nameable impl block; the returned `Builder` is not tied to
// any particular automaton type — confirm against upstream if this matters.
impl Regex<dense::DFA<&'static [u32]>> {
    /// Return a builder for configuring the construction of a `Regex`.
    ///
    /// This is a convenience routine to avoid needing to import the
    /// [`Builder`] type in common cases.
    ///
    /// # Example
    ///
    /// This example shows how to use the builder to disable UTF-8 mode
    /// everywhere.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     dfa::regex::Regex, nfa::thompson, util::syntax, Match,
    /// };
    ///
    /// let re = Regex::builder()
    ///     .syntax(syntax::Config::new().utf8(false))
    ///     .thompson(thompson::Config::new().utf8(false))
    ///     .build(r"foo(?-u:[^b])ar.*")?;
    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
    /// let expected = Some(Match::must(0, 1..9));
    /// let got = re.find(haystack);
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn builder() -> Builder {
        Builder::new()
    }
}
|
||||
|
||||
/// Standard search routines for finding and iterating over matches.
|
||||
impl<A: Automaton> Regex<A> {
|
||||
/// Returns true if and only if this regex matches the given haystack.
|
||||
///
|
||||
/// This routine may short circuit if it knows that scanning future input
|
||||
/// will never lead to a different result. In particular, if the underlying
|
||||
/// DFA enters a match state or a dead state, then this routine will return
|
||||
/// `true` or `false`, respectively, without inspecting any future input.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// This routine panics if the search could not complete. This can occur
|
||||
/// in a number of circumstances:
|
||||
///
|
||||
/// * The configuration of the DFA may permit it to "quit" the search.
|
||||
/// For example, setting quit bytes or enabling heuristic support for
|
||||
/// Unicode word boundaries. The default configuration does not enable any
|
||||
/// option that could result in the DFA quitting.
|
||||
/// * When the provided `Input` configuration is not supported. For
|
||||
/// example, by providing an unsupported anchor mode.
|
||||
///
|
||||
/// When a search panics, callers cannot know whether a match exists or
|
||||
/// not.
|
||||
///
|
||||
/// Use [`Regex::try_search`] if you want to handle these error conditions.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::dfa::regex::Regex;
|
||||
///
|
||||
/// let re = Regex::new("foo[0-9]+bar")?;
|
||||
/// assert_eq!(true, re.is_match("foo12345bar"));
|
||||
/// assert_eq!(false, re.is_match("foobar"));
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[inline]
|
||||
pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
|
||||
// Not only can we do an "earliest" search, but we can avoid doing a
|
||||
// reverse scan too.
|
||||
let input = input.into().earliest(true);
|
||||
self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap()
|
||||
}
|
||||
|
||||
/// Returns the start and end offset of the leftmost match. If no match
|
||||
/// exists, then `None` is returned.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// This routine panics if the search could not complete. This can occur
|
||||
/// in a number of circumstances:
|
||||
///
|
||||
/// * The configuration of the DFA may permit it to "quit" the search.
|
||||
/// For example, setting quit bytes or enabling heuristic support for
|
||||
/// Unicode word boundaries. The default configuration does not enable any
|
||||
/// option that could result in the DFA quitting.
|
||||
/// * When the provided `Input` configuration is not supported. For
|
||||
/// example, by providing an unsupported anchor mode.
|
||||
///
|
||||
/// When a search panics, callers cannot know whether a match exists or
|
||||
/// not.
|
||||
///
|
||||
/// Use [`Regex::try_search`] if you want to handle these error conditions.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{Match, dfa::regex::Regex};
|
||||
///
|
||||
/// // Greediness is applied appropriately.
|
||||
/// let re = Regex::new("foo[0-9]+")?;
|
||||
/// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz"));
|
||||
///
|
||||
/// // Even though a match is found after reading the first byte (`a`),
|
||||
/// // the default leftmost-first match semantics demand that we find the
|
||||
/// // earliest match that prefers earlier parts of the pattern over latter
|
||||
/// // parts.
|
||||
/// let re = Regex::new("abc|a")?;
|
||||
/// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc"));
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[inline]
|
||||
pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
|
||||
self.try_search(&input.into()).unwrap()
|
||||
}
|
||||
|
||||
/// Returns an iterator over all non-overlapping leftmost matches in the
|
||||
/// given bytes. If no match exists, then the iterator yields no elements.
|
||||
///
|
||||
/// This corresponds to the "standard" regex search iterator.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// If the search returns an error during iteration, then iteration
|
||||
/// panics. See [`Regex::find`] for the panic conditions.
|
||||
///
|
||||
/// Use [`Regex::try_search`] with
|
||||
/// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to
|
||||
/// handle these error conditions.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{Match, dfa::regex::Regex};
|
||||
///
|
||||
/// let re = Regex::new("foo[0-9]+")?;
|
||||
/// let text = "foo1 foo12 foo123";
|
||||
/// let matches: Vec<Match> = re.find_iter(text).collect();
|
||||
/// assert_eq!(matches, vec![
|
||||
/// Match::must(0, 0..4),
|
||||
/// Match::must(0, 5..10),
|
||||
/// Match::must(0, 11..17),
|
||||
/// ]);
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[inline]
|
||||
pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(
|
||||
&'r self,
|
||||
input: I,
|
||||
) -> FindMatches<'r, 'h, A> {
|
||||
let it = iter::Searcher::new(input.into());
|
||||
FindMatches { re: self, it }
|
||||
}
|
||||
}
|
||||
|
||||
/// Lower level fallible search routines that permit controlling where the
|
||||
/// search starts and ends in a particular sequence.
|
||||
impl<A: Automaton> Regex<A> {
|
||||
/// Returns the start and end offset of the leftmost match. If no match
|
||||
/// exists, then `None` is returned.
|
||||
///
|
||||
/// This is like [`Regex::find`] but with two differences:
|
||||
///
|
||||
/// 1. It is not generic over `Into<Input>` and instead accepts a
|
||||
/// `&Input`. This permits reusing the same `Input` for multiple searches
|
||||
/// without needing to create a new one. This _may_ help with latency.
|
||||
/// 2. It returns an error if the search could not complete where as
|
||||
/// [`Regex::find`] will panic.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// This routine errors if the search could not complete. This can occur
|
||||
/// in the following circumstances:
|
||||
///
|
||||
/// * The configuration of the DFA may permit it to "quit" the search.
|
||||
/// For example, setting quit bytes or enabling heuristic support for
|
||||
/// Unicode word boundaries. The default configuration does not enable any
|
||||
/// option that could result in the DFA quitting.
|
||||
/// * When the provided `Input` configuration is not supported. For
|
||||
/// example, by providing an unsupported anchor mode.
|
||||
///
|
||||
/// When a search returns an error, callers cannot know whether a match
|
||||
/// exists or not.
|
||||
#[inline]
|
||||
pub fn try_search(
|
||||
&self,
|
||||
input: &Input<'_>,
|
||||
) -> Result<Option<Match>, MatchError> {
|
||||
let (fwd, rev) = (self.forward(), self.reverse());
|
||||
let end = match fwd.try_search_fwd(input)? {
|
||||
None => return Ok(None),
|
||||
Some(end) => end,
|
||||
};
|
||||
// This special cases an empty match at the beginning of the search. If
|
||||
// our end matches our start, then since a reverse DFA can't match past
|
||||
// the start, it must follow that our starting position is also our end
|
||||
// position. So short circuit and skip the reverse search.
|
||||
if input.start() == end.offset() {
|
||||
return Ok(Some(Match::new(
|
||||
end.pattern(),
|
||||
end.offset()..end.offset(),
|
||||
)));
|
||||
}
|
||||
// We can also skip the reverse search if we know our search was
|
||||
// anchored. This occurs either when the input config is anchored or
|
||||
// when we know the regex itself is anchored. In this case, we know the
|
||||
// start of the match, if one is found, must be the start of the
|
||||
// search.
|
||||
if self.is_anchored(input) {
|
||||
return Ok(Some(Match::new(
|
||||
end.pattern(),
|
||||
input.start()..end.offset(),
|
||||
)));
|
||||
}
|
||||
// N.B. I have tentatively convinced myself that it isn't necessary
|
||||
// to specify the specific pattern for the reverse search since the
|
||||
// reverse search will always find the same pattern to match as the
|
||||
// forward search. But I lack a rigorous proof. Why not just provide
|
||||
// the pattern anyway? Well, if it is needed, then leaving it out
|
||||
// gives us a chance to find a witness. (Also, if we don't need to
|
||||
// specify the pattern, then we don't need to build the reverse DFA
|
||||
// with 'starts_for_each_pattern' enabled.)
|
||||
//
|
||||
// We also need to be careful to disable 'earliest' for the reverse
|
||||
// search, since it could be enabled for the forward search. In the
|
||||
// reverse case, to satisfy "leftmost" criteria, we need to match
|
||||
// as much as we can. We also need to be careful to make the search
|
||||
// anchored. We don't want the reverse search to report any matches
|
||||
// other than the one beginning at the end of our forward search.
|
||||
let revsearch = input
|
||||
.clone()
|
||||
.span(input.start()..end.offset())
|
||||
.anchored(Anchored::Yes)
|
||||
.earliest(false);
|
||||
let start = rev
|
||||
.try_search_rev(&revsearch)?
|
||||
.expect("reverse search must match if forward search does");
|
||||
assert_eq!(
|
||||
start.pattern(),
|
||||
end.pattern(),
|
||||
"forward and reverse search must match same pattern",
|
||||
);
|
||||
assert!(start.offset() <= end.offset());
|
||||
Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
|
||||
}
|
||||
|
||||
/// Returns true if either the given input specifies an anchored search
|
||||
/// or if the underlying DFA is always anchored.
|
||||
fn is_anchored(&self, input: &Input<'_>) -> bool {
|
||||
match input.get_anchored() {
|
||||
Anchored::No => self.forward().is_always_start_anchored(),
|
||||
Anchored::Yes | Anchored::Pattern(_) => true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Non-search APIs for querying information about the regex and setting a
|
||||
/// prefilter.
|
||||
impl<A: Automaton> Regex<A> {
|
||||
/// Return the underlying DFA responsible for forward matching.
|
||||
///
|
||||
/// This is useful for accessing the underlying DFA and converting it to
|
||||
/// some other format or size. See the [`Builder::build_from_dfas`] docs
|
||||
/// for an example of where this might be useful.
|
||||
pub fn forward(&self) -> &A {
|
||||
&self.forward
|
||||
}
|
||||
|
||||
/// Return the underlying DFA responsible for reverse matching.
|
||||
///
|
||||
/// This is useful for accessing the underlying DFA and converting it to
|
||||
/// some other format or size. See the [`Builder::build_from_dfas`] docs
|
||||
/// for an example of where this might be useful.
|
||||
pub fn reverse(&self) -> &A {
|
||||
&self.reverse
|
||||
}
|
||||
|
||||
/// Returns the total number of patterns matched by this regex.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
|
||||
/// use regex_automata::dfa::regex::Regex;
|
||||
///
|
||||
/// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
|
||||
/// assert_eq!(3, re.pattern_len());
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
pub fn pattern_len(&self) -> usize {
|
||||
assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
|
||||
self.forward().pattern_len()
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over all non-overlapping matches for an infallible search.
|
||||
///
|
||||
/// The iterator yields a [`Match`] value until no more matches could be found.
|
||||
/// If the underlying regex engine returns an error, then a panic occurs.
|
||||
///
|
||||
/// The type parameters are as follows:
|
||||
///
|
||||
/// * `A` represents the type of the underlying DFA that implements the
|
||||
/// [`Automaton`] trait.
|
||||
///
|
||||
/// The lifetime parameters are as follows:
|
||||
///
|
||||
/// * `'h` represents the lifetime of the haystack being searched.
|
||||
/// * `'r` represents the lifetime of the regex object itself.
|
||||
///
|
||||
/// This iterator can be created with the [`Regex::find_iter`] method.
|
||||
#[derive(Debug)]
|
||||
pub struct FindMatches<'r, 'h, A> {
|
||||
re: &'r Regex<A>,
|
||||
it: iter::Searcher<'h>,
|
||||
}
|
||||
|
||||
impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> {
|
||||
type Item = Match;
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<Match> {
|
||||
let FindMatches { re, ref mut it } = *self;
|
||||
it.advance(|input| re.try_search(input))
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder for a regex based on deterministic finite automatons.
|
||||
///
|
||||
/// This builder permits configuring options for the syntax of a pattern, the
|
||||
/// NFA construction, the DFA construction and finally the regex searching
|
||||
/// itself. This builder is different from a general purpose regex builder in
|
||||
/// that it permits fine grain configuration of the construction process. The
|
||||
/// trade off for this is complexity, and the possibility of setting a
|
||||
/// configuration that might not make sense. For example, there are two
|
||||
/// different UTF-8 modes:
|
||||
///
|
||||
/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
|
||||
/// whether the pattern itself can contain sub-expressions that match invalid
|
||||
/// UTF-8.
|
||||
/// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls
|
||||
/// how the regex iterators themselves advance the starting position of the
|
||||
/// next search when a match with zero length is found.
|
||||
///
|
||||
/// Generally speaking, callers will want to either enable all of these or
|
||||
/// disable all of these.
|
||||
///
|
||||
/// Internally, building a regex requires building two DFAs, where one is
|
||||
/// responsible for finding the end of a match and the other is responsible
|
||||
/// for finding the start of a match. If you only need to detect whether
|
||||
/// something matched, or only the end of a match, then you should use a
|
||||
/// [`dense::Builder`] to construct a single DFA, which is cheaper than
|
||||
/// building two DFAs.
|
||||
///
|
||||
/// # Build methods
|
||||
///
|
||||
/// This builder has a few "build" methods. In general, it's the result of
|
||||
/// combining the following parameters:
|
||||
///
|
||||
/// * Building one or many regexes.
|
||||
/// * Building a regex with dense or sparse DFAs.
|
||||
///
|
||||
/// The simplest "build" method is [`Builder::build`]. It accepts a single
|
||||
/// pattern and builds a dense DFA using `usize` for the state identifier
|
||||
/// representation.
|
||||
///
|
||||
/// The most general "build" method is [`Builder::build_many`], which permits
|
||||
/// building a regex that searches for multiple patterns simultaneously while
|
||||
/// using a specific state identifier representation.
|
||||
///
|
||||
/// The most flexible "build" method, but hardest to use, is
|
||||
/// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is
|
||||
/// just a pair of DFAs, and this method allows you to specify those DFAs
|
||||
/// exactly.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how to disable UTF-8 mode in the syntax and the regex
|
||||
/// itself. This is generally what you want for matching on arbitrary bytes.
|
||||
///
|
||||
/// ```
|
||||
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
|
||||
/// use regex_automata::{
|
||||
/// dfa::regex::Regex, nfa::thompson, util::syntax, Match,
|
||||
/// };
|
||||
///
|
||||
/// let re = Regex::builder()
|
||||
/// .syntax(syntax::Config::new().utf8(false))
|
||||
/// .thompson(thompson::Config::new().utf8(false))
|
||||
/// .build(r"foo(?-u:[^b])ar.*")?;
|
||||
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
|
||||
/// let expected = Some(Match::must(0, 1..9));
|
||||
/// let got = re.find(haystack);
|
||||
/// assert_eq!(expected, got);
|
||||
/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
|
||||
/// // but the subsequent `.*` does not! Disabling UTF-8
|
||||
/// // on the syntax permits this.
|
||||
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Builder {
|
||||
#[cfg(feature = "dfa-build")]
|
||||
dfa: dense::Builder,
|
||||
}
|
||||
|
||||
impl Builder {
|
||||
/// Create a new regex builder with the default configuration.
|
||||
pub fn new() -> Builder {
|
||||
Builder {
|
||||
#[cfg(feature = "dfa-build")]
|
||||
dfa: dense::Builder::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a regex from the given pattern.
|
||||
///
|
||||
/// If there was a problem parsing or compiling the pattern, then an error
|
||||
/// is returned.
|
||||
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
|
||||
pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
|
||||
self.build_many(&[pattern])
|
||||
}
|
||||
|
||||
/// Build a regex from the given pattern using sparse DFAs.
|
||||
///
|
||||
/// If there was a problem parsing or compiling the pattern, then an error
|
||||
/// is returned.
|
||||
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
|
||||
pub fn build_sparse(
|
||||
&self,
|
||||
pattern: &str,
|
||||
) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
|
||||
self.build_many_sparse(&[pattern])
|
||||
}
|
||||
|
||||
/// Build a regex from the given patterns.
|
||||
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
|
||||
pub fn build_many<P: AsRef<str>>(
|
||||
&self,
|
||||
patterns: &[P],
|
||||
) -> Result<Regex, BuildError> {
|
||||
let forward = self.dfa.build_many(patterns)?;
|
||||
let reverse = self
|
||||
.dfa
|
||||
.clone()
|
||||
.configure(
|
||||
dense::Config::new()
|
||||
.prefilter(None)
|
||||
.specialize_start_states(false)
|
||||
.start_kind(StartKind::Anchored)
|
||||
.match_kind(MatchKind::All),
|
||||
)
|
||||
.thompson(crate::nfa::thompson::Config::new().reverse(true))
|
||||
.build_many(patterns)?;
|
||||
Ok(self.build_from_dfas(forward, reverse))
|
||||
}
|
||||
|
||||
/// Build a sparse regex from the given patterns.
|
||||
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
|
||||
pub fn build_many_sparse<P: AsRef<str>>(
|
||||
&self,
|
||||
patterns: &[P],
|
||||
) -> Result<Regex<sparse::DFA<Vec<u8>>>, BuildError> {
|
||||
let re = self.build_many(patterns)?;
|
||||
let forward = re.forward().to_sparse()?;
|
||||
let reverse = re.reverse().to_sparse()?;
|
||||
Ok(self.build_from_dfas(forward, reverse))
|
||||
}
|
||||
|
||||
/// Build a regex from its component forward and reverse DFAs.
|
||||
///
|
||||
/// This is useful when deserializing a regex from some arbitrary
|
||||
/// memory region. This is also useful for building regexes from other
|
||||
/// types of DFAs.
|
||||
///
|
||||
/// If you're building the DFAs from scratch instead of building new DFAs
|
||||
/// from other DFAs, then you'll need to make sure that the reverse DFA is
|
||||
/// configured correctly to match the intended semantics. Namely:
|
||||
///
|
||||
/// * It should be anchored.
|
||||
/// * It should use [`MatchKind::All`] semantics.
|
||||
/// * It should match in reverse.
|
||||
/// * Otherwise, its configuration should match the forward DFA.
|
||||
///
|
||||
/// If these conditions aren't satisfied, then the behavior of searches is
|
||||
/// unspecified.
|
||||
///
|
||||
/// Note that when using this constructor, no configuration is applied.
|
||||
/// Since this routine provides the DFAs to the builder, there is no
|
||||
/// opportunity to apply other configuration options.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example is a bit a contrived. The usual use of these methods
|
||||
/// would involve serializing `initial_re` somewhere and then deserializing
|
||||
/// it later to build a regex. But in this case, we do everything in
|
||||
/// memory.
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::dfa::regex::Regex;
|
||||
///
|
||||
/// let initial_re = Regex::new("foo[0-9]+")?;
|
||||
/// assert_eq!(true, initial_re.is_match(b"foo123"));
|
||||
///
|
||||
/// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
|
||||
/// let re = Regex::builder().build_from_dfas(fwd, rev);
|
||||
/// assert_eq!(true, re.is_match(b"foo123"));
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
///
|
||||
/// This example shows how to build a `Regex` that uses sparse DFAs instead
|
||||
/// of dense DFAs without using one of the convenience `build_sparse`
|
||||
/// routines:
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::dfa::regex::Regex;
|
||||
///
|
||||
/// let initial_re = Regex::new("foo[0-9]+")?;
|
||||
/// assert_eq!(true, initial_re.is_match(b"foo123"));
|
||||
///
|
||||
/// let fwd = initial_re.forward().to_sparse()?;
|
||||
/// let rev = initial_re.reverse().to_sparse()?;
|
||||
/// let re = Regex::builder().build_from_dfas(fwd, rev);
|
||||
/// assert_eq!(true, re.is_match(b"foo123"));
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
pub fn build_from_dfas<A: Automaton>(
|
||||
&self,
|
||||
forward: A,
|
||||
reverse: A,
|
||||
) -> Regex<A> {
|
||||
Regex { forward, reverse }
|
||||
}
|
||||
|
||||
/// Set the syntax configuration for this builder using
|
||||
/// [`syntax::Config`](crate::util::syntax::Config).
|
||||
///
|
||||
/// This permits setting things like case insensitivity, Unicode and multi
|
||||
/// line mode.
|
||||
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
|
||||
pub fn syntax(
|
||||
&mut self,
|
||||
config: crate::util::syntax::Config,
|
||||
) -> &mut Builder {
|
||||
self.dfa.syntax(config);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the Thompson NFA configuration for this builder using
|
||||
/// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
|
||||
///
|
||||
/// This permits setting things like whether additional time should be
|
||||
/// spent shrinking the size of the NFA.
|
||||
#[cfg(all(feature = "syntax", feature = "dfa-build"))]
|
||||
pub fn thompson(
|
||||
&mut self,
|
||||
config: crate::nfa::thompson::Config,
|
||||
) -> &mut Builder {
|
||||
self.dfa.thompson(config);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the dense DFA compilation configuration for this builder using
|
||||
/// [`dense::Config`].
|
||||
///
|
||||
/// This permits setting things like whether the underlying DFAs should
|
||||
/// be minimized.
|
||||
#[cfg(feature = "dfa-build")]
|
||||
pub fn dense(&mut self, config: dense::Config) -> &mut Builder {
|
||||
self.dfa.configure(config);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Builder {
|
||||
fn default() -> Builder {
|
||||
Builder::new()
|
||||
}
|
||||
}
|
||||
242
third-party/vendor/regex-automata/src/dfa/remapper.rs
vendored
Normal file
242
third-party/vendor/regex-automata/src/dfa/remapper.rs
vendored
Normal file
|
|
@ -0,0 +1,242 @@
|
|||
use alloc::vec::Vec;
|
||||
|
||||
use crate::util::primitives::StateID;
|
||||
|
||||
/// Remappable is a tightly coupled abstraction that facilitates remapping
|
||||
/// state identifiers in DFAs.
|
||||
///
|
||||
/// The main idea behind remapping state IDs is that DFAs often need to check
|
||||
/// if a certain state is a "special" state of some kind (like a match state)
|
||||
/// during a search. Since this is extremely perf critical code, we want this
|
||||
/// check to be as fast as possible. Partitioning state IDs into, for example,
|
||||
/// into "non-match" and "match" states means one can tell if a state is a
|
||||
/// match state via a simple comparison of the state ID.
|
||||
///
|
||||
/// The issue is that during the DFA construction process, it's not
|
||||
/// particularly easy to partition the states. Instead, the simplest thing is
|
||||
/// to often just do a pass over all of the states and shuffle them into their
|
||||
/// desired partitionings. To do that, we need a mechanism for swapping states.
|
||||
/// Hence, this abstraction.
|
||||
///
|
||||
/// Normally, for such little code, I would just duplicate it. But this is a
|
||||
/// key optimization and the implementation is a bit subtle. So the abstraction
|
||||
/// is basically a ham-fisted attempt at DRY. The only place we use this is in
|
||||
/// the dense and one-pass DFAs.
|
||||
///
|
||||
/// See also src/dfa/special.rs for a more detailed explanation of how dense
|
||||
/// DFAs are partitioned.
|
||||
pub(super) trait Remappable: core::fmt::Debug {
|
||||
/// Return the total number of states.
|
||||
fn state_len(&self) -> usize;
|
||||
/// Return the power-of-2 exponent that yields the stride. The pertinent
|
||||
/// laws here are, where N=stride2: 2^N=stride and len(alphabet) <= stride.
|
||||
fn stride2(&self) -> usize;
|
||||
/// Swap the states pointed to by the given IDs. The underlying finite
|
||||
/// state machine should be mutated such that all of the transitions in
|
||||
/// `id1` are now in the memory region where the transitions for `id2`
|
||||
/// were, and all of the transitions in `id2` are now in the memory region
|
||||
/// where the transitions for `id1` were.
|
||||
///
|
||||
/// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`.
|
||||
///
|
||||
/// It is expected that, after calling this, the underlying value will be
|
||||
/// left in an inconsistent state, since any other transitions pointing to,
|
||||
/// e.g., `id1` need to be updated to point to `id2`, since that's where
|
||||
/// `id1` moved to.
|
||||
///
|
||||
/// In order to "fix" the underlying inconsistent state, a `Remapper`
|
||||
/// should be used to guarantee that `remap` is called at the appropriate
|
||||
/// time.
|
||||
fn swap_states(&mut self, id1: StateID, id2: StateID);
|
||||
/// This must remap every single state ID in the underlying value according
|
||||
/// to the function given. For example, in a DFA, this should remap every
|
||||
/// transition and every starting state ID.
|
||||
fn remap(&mut self, map: impl Fn(StateID) -> StateID);
|
||||
}
|
||||
|
||||
/// Remapper is an abstraction the manages the remapping of state IDs in a
|
||||
/// finite state machine. This is useful when one wants to shuffle states into
|
||||
/// different positions in the machine.
|
||||
///
|
||||
/// One of the key complexities this manages is the ability to correctly move
|
||||
/// one state multiple times.
|
||||
///
|
||||
/// Once shuffling is complete, `remap` must be called, which will rewrite
|
||||
/// all pertinent transitions to updated state IDs. Neglecting to call `remap`
|
||||
/// will almost certainly result in a corrupt machine.
|
||||
#[derive(Debug)]
|
||||
pub(super) struct Remapper {
|
||||
/// A map from the index of a state to its pre-multiplied identifier.
|
||||
///
|
||||
/// When a state is swapped with another, then their corresponding
|
||||
/// locations in this map are also swapped. Thus, its new position will
|
||||
/// still point to its old pre-multiplied StateID.
|
||||
///
|
||||
/// While there is a bit more to it, this then allows us to rewrite the
|
||||
/// state IDs in a DFA's transition table in a single pass. This is done
|
||||
/// by iterating over every ID in this map, then iterating over each
|
||||
/// transition for the state at that ID and re-mapping the transition from
|
||||
/// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position
|
||||
/// in this map where `old_id` *started*, and set it to where it ended up
|
||||
/// after all swaps have been completed.
|
||||
map: Vec<StateID>,
|
||||
/// A mapper from state index to state ID (and back).
|
||||
idxmap: IndexMapper,
|
||||
}
|
||||
|
||||
impl Remapper {
|
||||
/// Create a new remapper from the given remappable implementation. The
|
||||
/// remapper can then be used to swap states. The remappable value given
|
||||
/// here must the same one given to `swap` and `remap`.
|
||||
pub(super) fn new(r: &impl Remappable) -> Remapper {
|
||||
let idxmap = IndexMapper { stride2: r.stride2() };
|
||||
let map = (0..r.state_len()).map(|i| idxmap.to_state_id(i)).collect();
|
||||
Remapper { map, idxmap }
|
||||
}
|
||||
|
||||
/// Swap two states. Once this is called, callers must follow through to
|
||||
/// call `remap`, or else it's possible for the underlying remappable
|
||||
/// value to be in a corrupt state.
|
||||
pub(super) fn swap(
|
||||
&mut self,
|
||||
r: &mut impl Remappable,
|
||||
id1: StateID,
|
||||
id2: StateID,
|
||||
) {
|
||||
if id1 == id2 {
|
||||
return;
|
||||
}
|
||||
r.swap_states(id1, id2);
|
||||
self.map.swap(self.idxmap.to_index(id1), self.idxmap.to_index(id2));
|
||||
}
|
||||
|
||||
/// Complete the remapping process by rewriting all state IDs in the
|
||||
/// remappable value according to the swaps performed.
|
||||
pub(super) fn remap(mut self, r: &mut impl Remappable) {
|
||||
// Update the map to account for states that have been swapped
|
||||
// multiple times. For example, if (A, C) and (C, G) are swapped, then
|
||||
// transitions previously pointing to A should now point to G. But if
|
||||
// we don't update our map, they will erroneously be set to C. All we
|
||||
// do is follow the swaps in our map until we see our original state
|
||||
// ID.
|
||||
//
|
||||
// The intuition here is to think about how changes are made to the
|
||||
// map: only through pairwise swaps. That means that starting at any
|
||||
// given state, it is always possible to find the loop back to that
|
||||
// state by following the swaps represented in the map (which might be
|
||||
// 0 swaps).
|
||||
//
|
||||
// We are also careful to clone the map before starting in order to
|
||||
// freeze it. We use the frozen map to find our loops, since we need to
|
||||
// update our map as well. Without freezing it, our updates could break
|
||||
// the loops referenced above and produce incorrect results.
|
||||
let oldmap = self.map.clone();
|
||||
for i in 0..r.state_len() {
|
||||
let cur_id = self.idxmap.to_state_id(i);
|
||||
let mut new_id = oldmap[i];
|
||||
if cur_id == new_id {
|
||||
continue;
|
||||
}
|
||||
loop {
|
||||
let id = oldmap[self.idxmap.to_index(new_id)];
|
||||
if cur_id == id {
|
||||
self.map[i] = new_id;
|
||||
break;
|
||||
}
|
||||
new_id = id;
|
||||
}
|
||||
}
|
||||
r.remap(|next| self.map[self.idxmap.to_index(next)]);
|
||||
}
|
||||
}
|
||||
|
||||
/// A simple type for mapping between state indices and state IDs.
///
/// The reason why this exists is because state IDs are "premultiplied." That
/// is, in order to get to the transitions for a particular state, one need
/// only use the state ID as-is, instead of having to multiply it by the
/// transition table's stride.
///
/// The downside of this is that it's inconvenient to map between state IDs
/// using a dense map, e.g., Vec<StateID>. That's because state IDs look like
/// `0`, `0+stride`, `0+2*stride`, `0+3*stride`, etc., instead of `0`, `1`,
/// `2`, `3`, etc.
///
/// Since our state IDs are premultiplied, we can convert back-and-forth
/// between IDs and indices by simply unmultiplying the IDs and multiplying the
/// indices.
#[derive(Debug)]
struct IndexMapper {
    /// The power of 2 corresponding to the stride of the corresponding
    /// transition table. 'id >> stride2' de-multiplies an ID while 'index <<
    /// stride2' pre-multiplies an index to an ID.
    stride2: usize,
}
|
||||
|
||||
impl IndexMapper {
|
||||
/// Convert a state ID to a state index.
|
||||
fn to_index(&self, id: StateID) -> usize {
|
||||
id.as_usize() >> self.stride2
|
||||
}
|
||||
|
||||
/// Convert a state index to a state ID.
|
||||
fn to_state_id(&self, index: usize) -> StateID {
|
||||
// CORRECTNESS: If the given index is not valid, then it is not
|
||||
// required for this to panic or return a valid state ID. We'll "just"
|
||||
// wind up with panics or silent logic errors at some other point.
|
||||
StateID::new_unchecked(index << self.stride2)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "dfa-build")]
mod dense {
    use crate::{dfa::dense::OwnedDFA, util::primitives::StateID};

    use super::Remappable;

    // The dense DFA already exposes inherent methods with these exact
    // signatures, so each trait method simply forwards to it. Fully
    // qualified syntax ('OwnedDFA::...') is used to guarantee the inherent
    // method is called rather than recursing into the trait method.
    impl Remappable for OwnedDFA {
        fn state_len(&self) -> usize {
            OwnedDFA::state_len(self)
        }

        fn stride2(&self) -> usize {
            OwnedDFA::stride2(self)
        }

        fn swap_states(&mut self, id1: StateID, id2: StateID) {
            OwnedDFA::swap_states(self, id1, id2)
        }

        fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
            OwnedDFA::remap(self, map)
        }
    }
}
|
||||
|
||||
#[cfg(feature = "dfa-onepass")]
mod onepass {
    use crate::{dfa::onepass::DFA, util::primitives::StateID};

    use super::Remappable;

    // Like the dense impl above, each trait method forwards to the one-pass
    // DFA's inherent method of the same name, except for 'stride2' (the
    // one-pass DFA does not premultiply its state IDs).
    impl Remappable for DFA {
        fn state_len(&self) -> usize {
            DFA::state_len(self)
        }

        fn stride2(&self) -> usize {
            // We don't do pre-multiplication for the one-pass DFA, so
            // returning 0 has the effect of making state IDs and state indices
            // equivalent.
            0
        }

        fn swap_states(&mut self, id1: StateID, id2: StateID) {
            DFA::swap_states(self, id1, id2)
        }

        fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
            DFA::remap(self, map)
        }
    }
}
|
||||
644
third-party/vendor/regex-automata/src/dfa/search.rs
vendored
Normal file
644
third-party/vendor/regex-automata/src/dfa/search.rs
vendored
Normal file
|
|
@ -0,0 +1,644 @@
|
|||
use crate::{
|
||||
dfa::{
|
||||
accel,
|
||||
automaton::{Automaton, OverlappingState},
|
||||
},
|
||||
util::{
|
||||
prefilter::Prefilter,
|
||||
primitives::StateID,
|
||||
search::{Anchored, HalfMatch, Input, Span},
|
||||
},
|
||||
MatchError,
|
||||
};
|
||||
|
||||
/// Run a forward DFA search over `input`, returning the end of the first
/// match found (or `None`).
///
/// This is a thin dispatcher: the real work is in `find_fwd_imp`. The
/// branches below pass literal values for the prefilter and `earliest`
/// arguments, presumably so each monomorphized/inlined path can be
/// specialized on those constants — TODO confirm against upstream intent.
#[inline(never)]
pub fn find_fwd<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
) -> Result<Option<HalfMatch>, MatchError> {
    if input.is_done() {
        return Ok(None);
    }
    // Anchored searches never use a prefilter, since a prefilter reports
    // candidate positions anywhere in the span.
    let pre = if input.get_anchored().is_anchored() {
        None
    } else {
        dfa.get_prefilter()
    };
    // Searching with a pattern ID is always anchored, so we should never use
    // a prefilter.
    if pre.is_some() {
        if input.get_earliest() {
            find_fwd_imp(dfa, input, pre, true)
        } else {
            find_fwd_imp(dfa, input, pre, false)
        }
    } else {
        if input.get_earliest() {
            find_fwd_imp(dfa, input, None, true)
        } else {
            find_fwd_imp(dfa, input, None, false)
        }
    }
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn find_fwd_imp<A: Automaton + ?Sized>(
|
||||
dfa: &A,
|
||||
input: &Input<'_>,
|
||||
pre: Option<&'_ Prefilter>,
|
||||
earliest: bool,
|
||||
) -> Result<Option<HalfMatch>, MatchError> {
|
||||
// See 'prefilter_restart' docs for explanation.
|
||||
let universal_start = dfa.universal_start_state(Anchored::No).is_some();
|
||||
let mut mat = None;
|
||||
let mut sid = init_fwd(dfa, input)?;
|
||||
let mut at = input.start();
|
||||
// This could just be a closure, but then I think it would be unsound
|
||||
// because it would need to be safe to invoke. This way, the lack of safety
|
||||
// is clearer in the code below.
|
||||
macro_rules! next_unchecked {
|
||||
($sid:expr, $at:expr) => {{
|
||||
let byte = *input.haystack().get_unchecked($at);
|
||||
dfa.next_state_unchecked($sid, byte)
|
||||
}};
|
||||
}
|
||||
|
||||
if let Some(ref pre) = pre {
|
||||
let span = Span::from(at..input.end());
|
||||
// If a prefilter doesn't report false positives, then we don't need to
|
||||
// touch the DFA at all. However, since all matches include the pattern
|
||||
// ID, and the prefilter infrastructure doesn't report pattern IDs, we
|
||||
// limit this optimization to cases where there is exactly one pattern.
|
||||
// In that case, any match must be the 0th pattern.
|
||||
match pre.find(input.haystack(), span) {
|
||||
None => return Ok(mat),
|
||||
Some(ref span) => {
|
||||
at = span.start;
|
||||
if !universal_start {
|
||||
sid = prefilter_restart(dfa, &input, at)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
while at < input.end() {
|
||||
// SAFETY: There are two safety invariants we need to uphold here in
|
||||
// the loops below: that 'sid' and 'prev_sid' are valid state IDs
|
||||
// for this DFA, and that 'at' is a valid index into 'haystack'.
|
||||
// For the former, we rely on the invariant that next_state* and
|
||||
// start_state_forward always returns a valid state ID (given a valid
|
||||
// state ID in the former case). For the latter safety invariant, we
|
||||
// always guard unchecked access with a check that 'at' is less than
|
||||
// 'end', where 'end <= haystack.len()'. In the unrolled loop below, we
|
||||
// ensure that 'at' is always in bounds.
|
||||
//
|
||||
// PERF: See a similar comment in src/hybrid/search.rs that justifies
|
||||
// this extra work to make the search loop fast. The same reasoning and
|
||||
// benchmarks apply here.
|
||||
let mut prev_sid;
|
||||
while at < input.end() {
|
||||
prev_sid = unsafe { next_unchecked!(sid, at) };
|
||||
if dfa.is_special_state(prev_sid) || at + 3 >= input.end() {
|
||||
core::mem::swap(&mut prev_sid, &mut sid);
|
||||
break;
|
||||
}
|
||||
at += 1;
|
||||
|
||||
sid = unsafe { next_unchecked!(prev_sid, at) };
|
||||
if dfa.is_special_state(sid) {
|
||||
break;
|
||||
}
|
||||
at += 1;
|
||||
|
||||
prev_sid = unsafe { next_unchecked!(sid, at) };
|
||||
if dfa.is_special_state(prev_sid) {
|
||||
core::mem::swap(&mut prev_sid, &mut sid);
|
||||
break;
|
||||
}
|
||||
at += 1;
|
||||
|
||||
sid = unsafe { next_unchecked!(prev_sid, at) };
|
||||
if dfa.is_special_state(sid) {
|
||||
break;
|
||||
}
|
||||
at += 1;
|
||||
}
|
||||
if dfa.is_special_state(sid) {
|
||||
if dfa.is_start_state(sid) {
|
||||
if let Some(ref pre) = pre {
|
||||
let span = Span::from(at..input.end());
|
||||
match pre.find(input.haystack(), span) {
|
||||
None => return Ok(mat),
|
||||
Some(ref span) => {
|
||||
// We want to skip any update to 'at' below
|
||||
// at the end of this iteration and just
|
||||
// jump immediately back to the next state
|
||||
// transition at the leading position of the
|
||||
// candidate match.
|
||||
//
|
||||
// ... but only if we actually made progress
|
||||
// with our prefilter, otherwise if the start
|
||||
// state has a self-loop, we can get stuck.
|
||||
if span.start > at {
|
||||
at = span.start;
|
||||
if !universal_start {
|
||||
sid = prefilter_restart(dfa, &input, at)?;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if dfa.is_accel_state(sid) {
|
||||
let needles = dfa.accelerator(sid);
|
||||
at = accel::find_fwd(needles, input.haystack(), at + 1)
|
||||
.unwrap_or(input.end());
|
||||
continue;
|
||||
}
|
||||
} else if dfa.is_match_state(sid) {
|
||||
let pattern = dfa.match_pattern(sid, 0);
|
||||
mat = Some(HalfMatch::new(pattern, at));
|
||||
if earliest {
|
||||
return Ok(mat);
|
||||
}
|
||||
if dfa.is_accel_state(sid) {
|
||||
let needles = dfa.accelerator(sid);
|
||||
at = accel::find_fwd(needles, input.haystack(), at + 1)
|
||||
.unwrap_or(input.end());
|
||||
continue;
|
||||
}
|
||||
} else if dfa.is_accel_state(sid) {
|
||||
let needs = dfa.accelerator(sid);
|
||||
at = accel::find_fwd(needs, input.haystack(), at + 1)
|
||||
.unwrap_or(input.end());
|
||||
continue;
|
||||
} else if dfa.is_dead_state(sid) {
|
||||
return Ok(mat);
|
||||
} else {
|
||||
// It's important that this is a debug_assert, since this can
|
||||
// actually be tripped even if DFA::from_bytes succeeds and
|
||||
// returns a supposedly valid DFA.
|
||||
return Err(MatchError::quit(input.haystack()[at], at));
|
||||
}
|
||||
}
|
||||
at += 1;
|
||||
}
|
||||
eoi_fwd(dfa, input, &mut sid, &mut mat)?;
|
||||
Ok(mat)
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
pub fn find_rev<A: Automaton + ?Sized>(
|
||||
dfa: &A,
|
||||
input: &Input<'_>,
|
||||
) -> Result<Option<HalfMatch>, MatchError> {
|
||||
if input.is_done() {
|
||||
return Ok(None);
|
||||
}
|
||||
if input.get_earliest() {
|
||||
find_rev_imp(dfa, input, true)
|
||||
} else {
|
||||
find_rev_imp(dfa, input, false)
|
||||
}
|
||||
}
|
||||
|
||||
/// The reverse search loop: scans the haystack from 'end - 1' down to
/// 'start', mirroring `find_fwd_imp` (including the unrolled inner loop),
/// and applies the EOI transition when the front of the span is reached.
/// There is no prefilter handling here; prefilters only apply to forward
/// searches.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_rev_imp<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
    earliest: bool,
) -> Result<Option<HalfMatch>, MatchError> {
    let mut mat = None;
    let mut sid = init_rev(dfa, input)?;
    // In reverse search, the loop below can't handle the case of searching an
    // empty slice. Ideally we could write something congruent to the forward
    // search, i.e., 'while at >= start', but 'start' might be 0. Since we use
    // an unsigned offset, 'at >= 0' is trivially always true. We could avoid
    // this extra case handling by using a signed offset, but Rust makes it
    // annoying to do. So... We just handle the empty case separately.
    if input.start() == input.end() {
        eoi_rev(dfa, input, &mut sid, &mut mat)?;
        return Ok(mat);
    }

    let mut at = input.end() - 1;
    macro_rules! next_unchecked {
        ($sid:expr, $at:expr) => {{
            let byte = *input.haystack().get_unchecked($at);
            dfa.next_state_unchecked($sid, byte)
        }};
    }
    loop {
        // SAFETY: See comments in 'find_fwd' for a safety argument.
        let mut prev_sid;
        while at >= input.start() {
            prev_sid = unsafe { next_unchecked!(sid, at) };
            // The 'saturating_add(3)' guard keeps the three extra unrolled
            // steps below in bounds, symmetric to 'at + 3 >= end' in the
            // forward search.
            if dfa.is_special_state(prev_sid)
                || at <= input.start().saturating_add(3)
            {
                core::mem::swap(&mut prev_sid, &mut sid);
                break;
            }
            at -= 1;

            sid = unsafe { next_unchecked!(prev_sid, at) };
            if dfa.is_special_state(sid) {
                break;
            }
            at -= 1;

            prev_sid = unsafe { next_unchecked!(sid, at) };
            if dfa.is_special_state(prev_sid) {
                core::mem::swap(&mut prev_sid, &mut sid);
                break;
            }
            at -= 1;

            sid = unsafe { next_unchecked!(prev_sid, at) };
            if dfa.is_special_state(sid) {
                break;
            }
            at -= 1;
        }
        if dfa.is_special_state(sid) {
            if dfa.is_start_state(sid) {
                if dfa.is_accel_state(sid) {
                    let needles = dfa.accelerator(sid);
                    at = accel::find_rev(needles, input.haystack(), at)
                        .map(|i| i + 1)
                        .unwrap_or(input.start());
                }
            } else if dfa.is_match_state(sid) {
                let pattern = dfa.match_pattern(sid, 0);
                // Since reverse searches report the beginning of a match
                // and the beginning is inclusive (not exclusive like the
                // end of a match), we add 1 to make it inclusive.
                mat = Some(HalfMatch::new(pattern, at + 1));
                if earliest {
                    return Ok(mat);
                }
                if dfa.is_accel_state(sid) {
                    let needles = dfa.accelerator(sid);
                    at = accel::find_rev(needles, input.haystack(), at)
                        .map(|i| i + 1)
                        .unwrap_or(input.start());
                }
            } else if dfa.is_accel_state(sid) {
                let needles = dfa.accelerator(sid);
                // If the accelerator returns nothing, why don't we quit the
                // search? Well, if the accelerator doesn't find anything, that
                // doesn't mean we don't have a match. It just means that we
                // can't leave the current state given one of the 255 possible
                // byte values. However, there might be an EOI transition. So
                // we set 'at' to the end of the haystack, which will cause
                // this loop to stop and fall down into the EOI transition.
                at = accel::find_rev(needles, input.haystack(), at)
                    .map(|i| i + 1)
                    .unwrap_or(input.start());
            } else if dfa.is_dead_state(sid) {
                return Ok(mat);
            } else {
                return Err(MatchError::quit(input.haystack()[at], at));
            }
        }
        if at == input.start() {
            break;
        }
        at -= 1;
    }
    eoi_rev(dfa, input, &mut sid, &mut mat)?;
    Ok(mat)
}
|
||||
|
||||
/// Resume (or begin) an overlapping forward search. On return, any newly
/// found match is stored in `state.mat`; `state` carries the position and
/// DFA state needed to continue on the next call.
#[inline(never)]
pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
    state: &mut OverlappingState,
) -> Result<(), MatchError> {
    // Clear any match reported by a previous call before searching again.
    state.mat = None;
    if input.is_done() {
        return Ok(());
    }
    // Anchored searches never use a prefilter (same as 'find_fwd').
    let pre = if input.get_anchored().is_anchored() {
        None
    } else {
        dfa.get_prefilter()
    };
    if pre.is_some() {
        find_overlapping_fwd_imp(dfa, input, pre, state)
    } else {
        find_overlapping_fwd_imp(dfa, input, None, state)
    }
}
|
||||
|
||||
/// The overlapping forward search loop. Unlike 'find_fwd_imp', this returns
/// after *every* match so that all overlapping matches are reported, and it
/// persists enough in 'state' ('id', 'at', 'next_match_index') to pick up
/// exactly where it left off on the next call.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_overlapping_fwd_imp<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
    pre: Option<&'_ Prefilter>,
    state: &mut OverlappingState,
) -> Result<(), MatchError> {
    // See 'prefilter_restart' docs for explanation.
    let universal_start = dfa.universal_start_state(Anchored::No).is_some();
    let mut sid = match state.id {
        None => {
            // Fresh search: start at the beginning of the span.
            state.at = input.start();
            init_fwd(dfa, input)?
        }
        Some(sid) => {
            // Resuming: if the previous position still has unreported
            // matches (multi-pattern match states), report the next one
            // without advancing.
            if let Some(match_index) = state.next_match_index {
                let match_len = dfa.match_len(sid);
                if match_index < match_len {
                    state.next_match_index = Some(match_index + 1);
                    let pattern = dfa.match_pattern(sid, match_index);
                    state.mat = Some(HalfMatch::new(pattern, state.at));
                    return Ok(());
                }
            }
            // Once we've reported all matches at a given position, we need to
            // advance the search to the next position.
            state.at += 1;
            if state.at > input.end() {
                return Ok(());
            }
            sid
        }
    };

    // NOTE: We don't optimize the crap out of this routine primarily because
    // it seems like most find_overlapping searches will have higher match
    // counts, and thus, throughput is perhaps not as important. But if you
    // have a use case for something faster, feel free to file an issue.
    while state.at < input.end() {
        sid = dfa.next_state(sid, input.haystack()[state.at]);
        if dfa.is_special_state(sid) {
            state.id = Some(sid);
            if dfa.is_start_state(sid) {
                if let Some(ref pre) = pre {
                    let span = Span::from(state.at..input.end());
                    match pre.find(input.haystack(), span) {
                        None => return Ok(()),
                        Some(ref span) => {
                            // Only jump if the prefilter made progress;
                            // otherwise a self-looping start state could
                            // spin forever.
                            if span.start > state.at {
                                state.at = span.start;
                                if !universal_start {
                                    sid = prefilter_restart(
                                        dfa, &input, state.at,
                                    )?;
                                }
                                continue;
                            }
                        }
                    }
                } else if dfa.is_accel_state(sid) {
                    let needles = dfa.accelerator(sid);
                    state.at = accel::find_fwd(
                        needles,
                        input.haystack(),
                        state.at + 1,
                    )
                    .unwrap_or(input.end());
                    continue;
                }
            } else if dfa.is_match_state(sid) {
                state.next_match_index = Some(1);
                let pattern = dfa.match_pattern(sid, 0);
                state.mat = Some(HalfMatch::new(pattern, state.at));
                return Ok(());
            } else if dfa.is_accel_state(sid) {
                let needs = dfa.accelerator(sid);
                // If the accelerator returns nothing, why don't we quit the
                // search? Well, if the accelerator doesn't find anything, that
                // doesn't mean we don't have a match. It just means that we
                // can't leave the current state given one of the 255 possible
                // byte values. However, there might be an EOI transition. So
                // we set 'at' to the end of the haystack, which will cause
                // this loop to stop and fall down into the EOI transition.
                state.at =
                    accel::find_fwd(needs, input.haystack(), state.at + 1)
                        .unwrap_or(input.end());
                continue;
            } else if dfa.is_dead_state(sid) {
                return Ok(());
            } else {
                return Err(MatchError::quit(
                    input.haystack()[state.at],
                    state.at,
                ));
            }
        }
        state.at += 1;
    }

    let result = eoi_fwd(dfa, input, &mut sid, &mut state.mat);
    state.id = Some(sid);
    if state.mat.is_some() {
        // '1' is always correct here since if we get to this point, this
        // always corresponds to the first (index '0') match discovered at
        // this position. So the next match to report at this position (if
        // it exists) is at index '1'.
        state.next_match_index = Some(1);
    }
    result
}
|
||||
|
||||
/// Resume (or begin) an overlapping reverse search. Mirrors
/// 'find_overlapping_fwd', but walks the haystack backwards and uses
/// 'state.rev_eoi' to remember whether the final EOI transition at the
/// front of the span has already been taken.
#[inline(never)]
pub(crate) fn find_overlapping_rev<A: Automaton + ?Sized>(
    dfa: &A,
    input: &Input<'_>,
    state: &mut OverlappingState,
) -> Result<(), MatchError> {
    // Clear any match reported by a previous call before searching again.
    state.mat = None;
    if input.is_done() {
        return Ok(());
    }
    let mut sid = match state.id {
        None => {
            // Fresh search: position at the last byte of the span, or go
            // straight to the EOI transition for an empty span.
            let sid = init_rev(dfa, input)?;
            state.id = Some(sid);
            if input.start() == input.end() {
                state.rev_eoi = true;
            } else {
                state.at = input.end() - 1;
            }
            sid
        }
        Some(sid) => {
            // Resuming: report any remaining matches at the current
            // position before moving.
            if let Some(match_index) = state.next_match_index {
                let match_len = dfa.match_len(sid);
                if match_index < match_len {
                    state.next_match_index = Some(match_index + 1);
                    let pattern = dfa.match_pattern(sid, match_index);
                    state.mat = Some(HalfMatch::new(pattern, state.at));
                    return Ok(());
                }
            }
            // Once we've reported all matches at a given position, we need
            // to advance the search to the next position. However, if we've
            // already followed the EOI transition, then we know we're done
            // with the search and there cannot be any more matches to report.
            if state.rev_eoi {
                return Ok(());
            } else if state.at == input.start() {
                // At this point, we should follow the EOI transition. This
                // will cause us to skip the main loop below and fall through
                // to the final 'eoi_rev' transition.
                state.rev_eoi = true;
            } else {
                // We haven't hit the end of the search yet, so move on.
                state.at -= 1;
            }
            sid
        }
    };
    while !state.rev_eoi {
        sid = dfa.next_state(sid, input.haystack()[state.at]);
        if dfa.is_special_state(sid) {
            state.id = Some(sid);
            if dfa.is_start_state(sid) {
                if dfa.is_accel_state(sid) {
                    let needles = dfa.accelerator(sid);
                    state.at =
                        accel::find_rev(needles, input.haystack(), state.at)
                            .map(|i| i + 1)
                            .unwrap_or(input.start());
                }
            } else if dfa.is_match_state(sid) {
                state.next_match_index = Some(1);
                let pattern = dfa.match_pattern(sid, 0);
                state.mat = Some(HalfMatch::new(pattern, state.at + 1));
                return Ok(());
            } else if dfa.is_accel_state(sid) {
                let needles = dfa.accelerator(sid);
                // If the accelerator returns nothing, why don't we quit the
                // search? Well, if the accelerator doesn't find anything, that
                // doesn't mean we don't have a match. It just means that we
                // can't leave the current state given one of the 255 possible
                // byte values. However, there might be an EOI transition. So
                // we set 'at' to the end of the haystack, which will cause
                // this loop to stop and fall down into the EOI transition.
                state.at =
                    accel::find_rev(needles, input.haystack(), state.at)
                        .map(|i| i + 1)
                        .unwrap_or(input.start());
            } else if dfa.is_dead_state(sid) {
                return Ok(());
            } else {
                return Err(MatchError::quit(
                    input.haystack()[state.at],
                    state.at,
                ));
            }
        }
        if state.at == input.start() {
            break;
        }
        state.at -= 1;
    }

    let result = eoi_rev(dfa, input, &mut sid, &mut state.mat);
    state.rev_eoi = true;
    state.id = Some(sid);
    if state.mat.is_some() {
        // '1' is always correct here since if we get to this point, this
        // always corresponds to the first (index '0') match discovered at
        // this position. So the next match to report at this position (if
        // it exists) is at index '1'.
        state.next_match_index = Some(1);
    }
    result
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn init_fwd<A: Automaton + ?Sized>(
|
||||
dfa: &A,
|
||||
input: &Input<'_>,
|
||||
) -> Result<StateID, MatchError> {
|
||||
let sid = dfa.start_state_forward(input)?;
|
||||
// Start states can never be match states, since all matches are delayed
|
||||
// by 1 byte.
|
||||
debug_assert!(!dfa.is_match_state(sid));
|
||||
Ok(sid)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn init_rev<A: Automaton + ?Sized>(
|
||||
dfa: &A,
|
||||
input: &Input<'_>,
|
||||
) -> Result<StateID, MatchError> {
|
||||
let sid = dfa.start_state_reverse(input)?;
|
||||
// Start states can never be match states, since all matches are delayed
|
||||
// by 1 byte.
|
||||
debug_assert!(!dfa.is_match_state(sid));
|
||||
Ok(sid)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn eoi_fwd<A: Automaton + ?Sized>(
|
||||
dfa: &A,
|
||||
input: &Input<'_>,
|
||||
sid: &mut StateID,
|
||||
mat: &mut Option<HalfMatch>,
|
||||
) -> Result<(), MatchError> {
|
||||
let sp = input.get_span();
|
||||
match input.haystack().get(sp.end) {
|
||||
Some(&b) => {
|
||||
*sid = dfa.next_state(*sid, b);
|
||||
if dfa.is_match_state(*sid) {
|
||||
let pattern = dfa.match_pattern(*sid, 0);
|
||||
*mat = Some(HalfMatch::new(pattern, sp.end));
|
||||
} else if dfa.is_quit_state(*sid) {
|
||||
return Err(MatchError::quit(b, sp.end));
|
||||
}
|
||||
}
|
||||
None => {
|
||||
*sid = dfa.next_eoi_state(*sid);
|
||||
if dfa.is_match_state(*sid) {
|
||||
let pattern = dfa.match_pattern(*sid, 0);
|
||||
*mat = Some(HalfMatch::new(pattern, input.haystack().len()));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn eoi_rev<A: Automaton + ?Sized>(
|
||||
dfa: &A,
|
||||
input: &Input<'_>,
|
||||
sid: &mut StateID,
|
||||
mat: &mut Option<HalfMatch>,
|
||||
) -> Result<(), MatchError> {
|
||||
let sp = input.get_span();
|
||||
if sp.start > 0 {
|
||||
let byte = input.haystack()[sp.start - 1];
|
||||
*sid = dfa.next_state(*sid, byte);
|
||||
if dfa.is_match_state(*sid) {
|
||||
let pattern = dfa.match_pattern(*sid, 0);
|
||||
*mat = Some(HalfMatch::new(pattern, sp.start));
|
||||
} else if dfa.is_quit_state(*sid) {
|
||||
return Err(MatchError::quit(byte, sp.start - 1));
|
||||
}
|
||||
} else {
|
||||
*sid = dfa.next_eoi_state(*sid);
|
||||
if dfa.is_match_state(*sid) {
|
||||
let pattern = dfa.match_pattern(*sid, 0);
|
||||
*mat = Some(HalfMatch::new(pattern, 0));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Re-compute the starting state that a DFA should be in after finding a
|
||||
/// prefilter candidate match at the position `at`.
|
||||
///
|
||||
/// The function with the same name has a bit more docs in hybrid/search.rs.
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn prefilter_restart<A: Automaton + ?Sized>(
|
||||
dfa: &A,
|
||||
input: &Input<'_>,
|
||||
at: usize,
|
||||
) -> Result<StateID, MatchError> {
|
||||
let mut input = input.clone();
|
||||
input.set_start(at);
|
||||
init_fwd(dfa, &input)
|
||||
}
|
||||
2635
third-party/vendor/regex-automata/src/dfa/sparse.rs
vendored
Normal file
2635
third-party/vendor/regex-automata/src/dfa/sparse.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
494
third-party/vendor/regex-automata/src/dfa/special.rs
vendored
Normal file
494
third-party/vendor/regex-automata/src/dfa/special.rs
vendored
Normal file
|
|
@ -0,0 +1,494 @@
|
|||
use crate::{
|
||||
dfa::DEAD,
|
||||
util::{
|
||||
primitives::StateID,
|
||||
wire::{self, DeserializeError, Endian, SerializeError},
|
||||
},
|
||||
};
|
||||
|
||||
// Shorthand for bailing out of a deserialization routine with a generic
// 'DeserializeError'. Note: it expands to a 'return', so it may only be
// used inside a function returning 'Result<_, DeserializeError>'.
macro_rules! err {
    ($msg:expr) => {
        return Err(DeserializeError::generic($msg));
    };
}
|
||||
|
||||
// Special represents the identifiers in a DFA that correspond to "special"
|
||||
// states. If a state is one or more of the following, then it is considered
|
||||
// special:
|
||||
//
|
||||
// * dead - A non-matching state where all outgoing transitions lead back to
|
||||
// itself. There is only one of these, regardless of whether minimization
|
||||
// has run. The dead state always has an ID of 0. i.e., It is always the
|
||||
// first state in a DFA.
|
||||
// * quit - A state that is entered whenever a byte is seen that should cause
|
||||
// a DFA to give up and stop searching. This results in a MatchError::quit
|
||||
// error being returned at search time. The default configuration for a DFA
|
||||
// has no quit bytes, which means this state is unreachable by default,
|
||||
// although it is always present for reasons of implementation simplicity.
|
||||
// This state is only reachable when the caller configures the DFA to quit
|
||||
// on certain bytes. There is always exactly one of these states and it
|
||||
// is always the second state. (Its actual ID depends on the size of the
|
||||
// alphabet in dense DFAs, since state IDs are premultiplied in order to
|
||||
// allow them to be used directly as indices into the transition table.)
|
||||
// * match - An accepting state, i.e., indicative of a match. There may be
|
||||
// zero or more of these states.
|
||||
// * accelerated - A state where all of its outgoing transitions, except a
|
||||
// few, loop back to itself. These states are candidates for acceleration
|
||||
// via memchr during search. There may be zero or more of these states.
|
||||
// * start - A non-matching state that indicates where the automaton should
|
||||
// start during a search. There is always at least one starting state and
|
||||
// all are guaranteed to be non-match states. (A start state cannot be a
|
||||
// match state because the DFAs in this crate delay all matches by one byte.
|
||||
// So every search that finds a match must move through one transition to
|
||||
// some other match state, even when searching an empty string.)
|
||||
//
|
||||
// These are not mutually exclusive categories. Namely, the following
|
||||
// overlappings can occur:
|
||||
//
|
||||
// * {dead, start} - If a DFA can never lead to a match and it is minimized,
|
||||
// then it will typically compile to something where all starting IDs point
|
||||
// to the DFA's dead state.
|
||||
// * {match, accelerated} - It is possible for a match state to have the
|
||||
// majority of its transitions loop back to itself, which means it's
|
||||
// possible for a match state to be accelerated.
|
||||
// * {start, accelerated} - Similarly, it is possible for a start state to be
|
||||
// accelerated. Note that it is possible for an accelerated state to be
|
||||
// neither a match nor a start state. Also note that just because both match
|
||||
// and start states overlap with accelerated states does not mean that
|
||||
// match and start states overlap with each other. In fact, they are
|
||||
// guaranteed not to overlap.
|
||||
//
|
||||
// As a special mention, every DFA always has a dead and a quit state, even
|
||||
// though from the perspective of the DFA, they are equivalent. (Indeed,
|
||||
// minimization special cases them to ensure they don't get merged.) The
|
||||
// purpose of keeping them distinct is to use the quit state as a sentinel to
|
||||
// distinguish between whether a search finished successfully without finding
|
||||
// anything or whether it gave up before finishing.
|
||||
//
|
||||
// So the main problem we want to solve here is the *fast* detection of whether
|
||||
// a state is special or not. And we also want to do this while storing as
|
||||
// little extra data as possible. AND we want to be able to quickly determine
|
||||
// which categories a state falls into above if it is special.
|
||||
//
|
||||
// We achieve this by essentially shuffling all special states to the beginning
|
||||
// of a DFA. That is, all special states appear before every other non-special
|
||||
// state. By representing special states this way, we can determine whether a
|
||||
// state is special or not by a single comparison, where special.max is the
|
||||
// identifier of the last special state in the DFA:
|
||||
//
|
||||
// if current_state <= special.max:
|
||||
// ... do something with special state
|
||||
//
|
||||
// The only thing left to do is to determine what kind of special state
|
||||
// it is. Because what we do next depends on that. Since special states
|
||||
// are typically rare, we can afford to do a bit more extra work, but we'd
|
||||
// still like this to be as fast as possible. The trick we employ here is to
|
||||
// continue shuffling states even within the special state range. Such that
|
||||
// one contiguous region corresponds to match states, another for start states
|
||||
// and then an overlapping range for accelerated states. At a high level, our
|
||||
// special state detection might look like this (for leftmost searching, where
|
||||
// we continue searching even after seeing a match):
|
||||
//
|
||||
// byte = input[offset]
|
||||
// current_state = next_state(current_state, byte)
|
||||
// offset += 1
|
||||
// if current_state <= special.max:
|
||||
// if current_state == 0:
|
||||
// # We can never leave a dead state, so this always marks the
|
||||
// # end of our search.
|
||||
// return last_match
|
||||
// if current_state == special.quit_id:
|
||||
// # A quit state means we give up. If the DFA has no quit state,
|
||||
// # then special.quit_id == 0 == dead, which is handled by the
|
||||
// # conditional above.
|
||||
// return Err(MatchError::quit { byte, offset: offset - 1 })
|
||||
// if special.min_match <= current_state <= special.max_match:
|
||||
// last_match = Some(offset)
|
||||
// if special.min_accel <= current_state <= special.max_accel:
|
||||
// offset = accelerate(input, offset)
|
||||
// last_match = Some(offset)
|
||||
// elif special.min_start <= current_state <= special.max_start:
|
||||
// offset = prefilter.find(input, offset)
|
||||
// if special.min_accel <= current_state <= special.max_accel:
|
||||
// offset = accelerate(input, offset)
|
||||
// elif special.min_accel <= current_state <= special.max_accel:
|
||||
// offset = accelerate(input, offset)
|
||||
//
|
||||
// There are some small details left out of the logic above. For example,
|
||||
// in order to accelerate a state, we need to know which bytes to search for.
|
||||
// This in turn implies some extra data we need to store in the DFA. To keep
|
||||
// things compact, we would ideally only store
|
||||
//
|
||||
// N = special.max_accel - special.min_accel + 1
|
||||
//
|
||||
// items. But state IDs are premultiplied, which means they are not contiguous.
|
||||
// So in order to take a state ID and index an array of accelerated structures,
|
||||
// we need to do:
|
||||
//
|
||||
// i = (state_id - special.min_accel) / stride
|
||||
//
|
||||
// (N.B. 'stride' is always a power of 2, so the above can be implemented via
|
||||
// '(state_id - special.min_accel) >> stride2', where 'stride2' is x in
|
||||
// 2^x=stride.)
|
||||
//
|
||||
// Moreover, some of these specialty categories may be empty. For example,
|
||||
// DFAs are not required to have any match states or any accelerated states.
|
||||
// In that case, the lower and upper bounds are both set to 0 (the dead state
|
||||
// ID) and the first `current_state == 0` check subsumes cases where the
|
||||
// ranges are empty.
|
||||
//
|
||||
// Loop unrolling, if applicable, has also been left out of the logic above.
|
||||
//
|
||||
// Graphically, the ranges look like this, where asterisks indicate ranges
|
||||
// that can be empty. Each 'x' is a state.
|
||||
//
|
||||
// quit
|
||||
// dead|
|
||||
// ||
|
||||
// xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
|
||||
// | | | | start | |
|
||||
// | |-------------| |-------| |
|
||||
// | match* | | | |
|
||||
// | | | | |
|
||||
// | |----------| | |
|
||||
// | accel* | |
|
||||
// | | |
|
||||
// | | |
|
||||
// |----------------------------|------------------------
|
||||
// special non-special*
|
||||
/// The set of "special" state ID ranges for a DFA.
///
/// All ranges are inclusive on both ends. An empty range is represented by
/// setting both of its endpoints to the dead state ID (which is always 0).
/// See the commentary above for how states are shuffled so that each special
/// category occupies a contiguous block of low state IDs.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Special {
    /// The identifier of the last special state in a DFA. A state is special
    /// if and only if its identifier is less than or equal to `max`.
    pub(crate) max: StateID,
    /// The identifier of the quit state in a DFA. (There is no analogous field
    /// for the dead state since the dead state's ID is always zero, regardless
    /// of state ID size.)
    pub(crate) quit_id: StateID,
    /// The identifier of the first match state.
    pub(crate) min_match: StateID,
    /// The identifier of the last match state.
    pub(crate) max_match: StateID,
    /// The identifier of the first accelerated state.
    pub(crate) min_accel: StateID,
    /// The identifier of the last accelerated state.
    pub(crate) max_accel: StateID,
    /// The identifier of the first start state.
    pub(crate) min_start: StateID,
    /// The identifier of the last start state.
    pub(crate) max_start: StateID,
}
|
||||
|
||||
impl Special {
    /// Creates a new set of special ranges for a DFA. All ranges are initially
    /// set to only contain the dead state. This is interpreted as an empty
    /// range.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn new() -> Special {
        Special {
            max: DEAD,
            quit_id: DEAD,
            min_match: DEAD,
            max_match: DEAD,
            min_accel: DEAD,
            max_accel: DEAD,
            min_start: DEAD,
            max_start: DEAD,
        }
    }

    /// Remaps all of the special state identifiers using the function given.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special {
        Special {
            max: map(self.max),
            quit_id: map(self.quit_id),
            min_match: map(self.min_match),
            max_match: map(self.max_match),
            min_accel: map(self.min_accel),
            max_accel: map(self.max_accel),
            min_start: map(self.min_start),
            max_start: map(self.max_start),
        }
    }

    /// Deserialize the given bytes into special state ranges. If the slice
    /// given is not big enough, then this returns an error. Similarly, if
    /// any of the expected invariants around special state ranges aren't
    /// upheld, an error is returned. Note that this does not guarantee that
    /// the information returned is correct.
    ///
    /// Upon success, this returns the number of bytes read in addition to the
    /// special state IDs themselves.
    pub(crate) fn from_bytes(
        mut slice: &[u8],
    ) -> Result<(Special, usize), DeserializeError> {
        wire::check_slice_len(slice, 8 * StateID::SIZE, "special states")?;

        let mut nread = 0;
        // This closure advances 'slice' and accumulates 'nread' so that each
        // call below reads the next serialized state ID in order.
        let mut read_id = |what| -> Result<StateID, DeserializeError> {
            let (id, nr) = wire::try_read_state_id(slice, what)?;
            nread += nr;
            slice = &slice[StateID::SIZE..];
            Ok(id)
        };

        // NOTE: these must be read in exactly the order 'write_to' writes
        // them.
        let max = read_id("special max id")?;
        let quit_id = read_id("special quit id")?;
        let min_match = read_id("special min match id")?;
        let max_match = read_id("special max match id")?;
        let min_accel = read_id("special min accel id")?;
        let max_accel = read_id("special max accel id")?;
        let min_start = read_id("special min start id")?;
        let max_start = read_id("special max start id")?;

        let special = Special {
            max,
            quit_id,
            min_match,
            max_match,
            min_accel,
            max_accel,
            min_start,
            max_start,
        };
        special.validate()?;
        assert_eq!(nread, special.write_to_len());
        Ok((special, nread))
    }

    /// Validate that the information describing special states satisfies
    /// all known invariants.
    pub(crate) fn validate(&self) -> Result<(), DeserializeError> {
        // Check that both ends of the range are DEAD or neither are.
        if self.min_match == DEAD && self.max_match != DEAD {
            err!("min_match is DEAD, but max_match is not");
        }
        if self.min_match != DEAD && self.max_match == DEAD {
            err!("max_match is DEAD, but min_match is not");
        }
        if self.min_accel == DEAD && self.max_accel != DEAD {
            err!("min_accel is DEAD, but max_accel is not");
        }
        if self.min_accel != DEAD && self.max_accel == DEAD {
            err!("max_accel is DEAD, but min_accel is not");
        }
        if self.min_start == DEAD && self.max_start != DEAD {
            err!("min_start is DEAD, but max_start is not");
        }
        if self.min_start != DEAD && self.max_start == DEAD {
            err!("max_start is DEAD, but min_start is not");
        }

        // Check that ranges are well formed.
        if self.min_match > self.max_match {
            err!("min_match should not be greater than max_match");
        }
        if self.min_accel > self.max_accel {
            err!("min_accel should not be greater than max_accel");
        }
        if self.min_start > self.max_start {
            err!("min_start should not be greater than max_start");
        }

        // Check that ranges are ordered with respect to one another. The
        // expected layout is: quit, then matches, then accels, then starts
        // (see the diagram in the commentary above). Each check is guarded
        // by the corresponding non-emptiness predicate, since an empty
        // range's endpoints are DEAD and would otherwise trip the check.
        if self.matches() && self.quit_id >= self.min_match {
            err!("quit_id should not be greater than min_match");
        }
        if self.accels() && self.quit_id >= self.min_accel {
            err!("quit_id should not be greater than min_accel");
        }
        if self.starts() && self.quit_id >= self.min_start {
            err!("quit_id should not be greater than min_start");
        }
        if self.matches() && self.accels() && self.min_accel < self.min_match {
            err!("min_match should not be greater than min_accel");
        }
        if self.matches() && self.starts() && self.min_start < self.min_match {
            err!("min_match should not be greater than min_start");
        }
        if self.accels() && self.starts() && self.min_start < self.min_accel {
            err!("min_accel should not be greater than min_start");
        }

        // Check that max is at least as big as everything else.
        if self.max < self.quit_id {
            err!("quit_id should not be greater than max");
        }
        if self.max < self.max_match {
            err!("max_match should not be greater than max");
        }
        if self.max < self.max_accel {
            err!("max_accel should not be greater than max");
        }
        if self.max < self.max_start {
            err!("max_start should not be greater than max");
        }

        Ok(())
    }

    /// Validate that the special state information is compatible with the
    /// given state len.
    pub(crate) fn validate_state_len(
        &self,
        len: usize,
        stride2: usize,
    ) -> Result<(), DeserializeError> {
        // We assume that 'validate' has already passed, so we know that 'max'
        // is truly the max. So all we need to check is that the max state ID
        // is less than the state ID len. The max legal value here is len-1,
        // which occurs when there are no non-special states. (State IDs are
        // premultiplied, so the shift by 'stride2' recovers the state index.)
        if (self.max.as_usize() >> stride2) >= len {
            err!("max should not be greater than or equal to state length");
        }
        Ok(())
    }

    /// Write the IDs and ranges for special states to the given byte buffer.
    /// The buffer given must have enough room to store all data, otherwise
    /// this will return an error. The number of bytes written is returned
    /// on success. The number of bytes written is guaranteed to be a multiple
    /// of 8.
    pub(crate) fn write_to<E: Endian>(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        use crate::util::wire::write_state_id as write;

        if dst.len() < self.write_to_len() {
            return Err(SerializeError::buffer_too_small("special state ids"));
        }

        // NOTE: the order here must match the read order in 'from_bytes'.
        let mut nwrite = 0;
        nwrite += write::<E>(self.max, &mut dst[nwrite..]);
        nwrite += write::<E>(self.quit_id, &mut dst[nwrite..]);
        nwrite += write::<E>(self.min_match, &mut dst[nwrite..]);
        nwrite += write::<E>(self.max_match, &mut dst[nwrite..]);
        nwrite += write::<E>(self.min_accel, &mut dst[nwrite..]);
        nwrite += write::<E>(self.max_accel, &mut dst[nwrite..]);
        nwrite += write::<E>(self.min_start, &mut dst[nwrite..]);
        nwrite += write::<E>(self.max_start, &mut dst[nwrite..]);

        assert_eq!(
            self.write_to_len(),
            nwrite,
            "expected to write certain number of bytes",
        );
        assert_eq!(
            nwrite % 8,
            0,
            "expected to write multiple of 8 bytes for special states",
        );
        Ok(nwrite)
    }

    /// Returns the total number of bytes written by `write_to`.
    pub(crate) fn write_to_len(&self) -> usize {
        8 * StateID::SIZE
    }

    /// Sets the maximum special state ID based on the current values. This
    /// should be used once all possible state IDs are set.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn set_max(&mut self) {
        use core::cmp::max;
        self.max = max(
            self.quit_id,
            max(self.max_match, max(self.max_accel, self.max_start)),
        );
    }

    /// Sets the maximum special state ID such that starting states are not
    /// considered "special." This also marks the min/max starting states as
    /// DEAD such that 'is_start_state' always returns false, even if the state
    /// is actually a starting state.
    ///
    /// This is useful when there is no prefilter set. It will avoid
    /// ping-ponging between the hot path in the DFA search code and the start
    /// state handling code, which is typically only useful for executing a
    /// prefilter.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn set_no_special_start_states(&mut self) {
        use core::cmp::max;
        // Like 'set_max', but intentionally excludes 'max_start'.
        self.max = max(self.quit_id, max(self.max_match, self.max_accel));
        self.min_start = DEAD;
        self.max_start = DEAD;
    }

    /// Returns true if and only if the given state ID is a special state.
    #[inline]
    pub(crate) fn is_special_state(&self, id: StateID) -> bool {
        id <= self.max
    }

    /// Returns true if and only if the given state ID is a dead state.
    #[inline]
    pub(crate) fn is_dead_state(&self, id: StateID) -> bool {
        id == DEAD
    }

    /// Returns true if and only if the given state ID is a quit state.
    #[inline]
    pub(crate) fn is_quit_state(&self, id: StateID) -> bool {
        // The dead check matters: when the DFA has no quit state, 'quit_id'
        // is DEAD, and the dead state must not be misreported as quit.
        !self.is_dead_state(id) && self.quit_id == id
    }

    /// Returns true if and only if the given state ID is a match state.
    #[inline]
    pub(crate) fn is_match_state(&self, id: StateID) -> bool {
        !self.is_dead_state(id) && self.min_match <= id && id <= self.max_match
    }

    /// Returns true if and only if the given state ID is an accel state.
    #[inline]
    pub(crate) fn is_accel_state(&self, id: StateID) -> bool {
        !self.is_dead_state(id) && self.min_accel <= id && id <= self.max_accel
    }

    /// Returns true if and only if the given state ID is a start state.
    #[inline]
    pub(crate) fn is_start_state(&self, id: StateID) -> bool {
        !self.is_dead_state(id) && self.min_start <= id && id <= self.max_start
    }

    /// Returns the total number of match states for a dense table based DFA.
    #[inline]
    pub(crate) fn match_len(&self, stride: usize) -> usize {
        if self.matches() {
            // IDs are premultiplied by 'stride', so the inclusive count of
            // states in the range is (max - min + stride) / stride.
            (self.max_match.as_usize() - self.min_match.as_usize() + stride)
                / stride
        } else {
            0
        }
    }

    /// Returns true if and only if there is at least one match state.
    #[inline]
    pub(crate) fn matches(&self) -> bool {
        self.min_match != DEAD
    }

    /// Returns the total number of accel states.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn accel_len(&self, stride: usize) -> usize {
        if self.accels() {
            // Same premultiplied-ID arithmetic as in 'match_len'.
            (self.max_accel.as_usize() - self.min_accel.as_usize() + stride)
                / stride
        } else {
            0
        }
    }

    /// Returns true if and only if there is at least one accel state.
    #[inline]
    pub(crate) fn accels(&self) -> bool {
        self.min_accel != DEAD
    }

    /// Returns true if and only if there is at least one start state.
    #[inline]
    pub(crate) fn starts(&self) -> bool {
        self.min_start != DEAD
    }
}
|
||||
74
third-party/vendor/regex-automata/src/dfa/start.rs
vendored
Normal file
74
third-party/vendor/regex-automata/src/dfa/start.rs
vendored
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
use core::mem::size_of;
|
||||
|
||||
use crate::util::wire::{self, DeserializeError, Endian, SerializeError};
|
||||
|
||||
/// The kind of anchored starting configurations to support in a DFA.
///
/// Fully compiled DFAs need to be explicitly configured as to which anchored
/// starting configurations to support. The reason for not just supporting
/// everything unconditionally is that it can use more resources (such as
/// memory and build time). The downside of this is that if you try to execute
/// a search using an [`Anchored`](crate::Anchored) mode that is not supported
/// by the DFA, then the search will return an error.
///
/// NOTE(review): the variant docs below say an unsupported search mode will
/// *panic*, while the paragraph above says the search *returns an error*.
/// Confirm which is accurate and make the two consistent.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum StartKind {
    /// Support both anchored and unanchored searches.
    Both,
    /// Support only unanchored searches. Requesting an anchored search will
    /// panic.
    ///
    /// Note that even if an unanchored search is requested, the pattern itself
    /// may still be anchored. For example, `^abc` will only match `abc` at the
    /// start of a haystack. This will remain true, even if the regex engine
    /// only supported unanchored searches.
    Unanchored,
    /// Support only anchored searches. Requesting an unanchored search will
    /// panic.
    Anchored,
}
|
||||
|
||||
impl StartKind {
|
||||
pub(crate) fn from_bytes(
|
||||
slice: &[u8],
|
||||
) -> Result<(StartKind, usize), DeserializeError> {
|
||||
wire::check_slice_len(slice, size_of::<u32>(), "start kind bytes")?;
|
||||
let (n, nr) = wire::try_read_u32(slice, "start kind integer")?;
|
||||
match n {
|
||||
0 => Ok((StartKind::Both, nr)),
|
||||
1 => Ok((StartKind::Unanchored, nr)),
|
||||
2 => Ok((StartKind::Anchored, nr)),
|
||||
_ => Err(DeserializeError::generic("unrecognized start kind")),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn write_to<E: Endian>(
|
||||
&self,
|
||||
dst: &mut [u8],
|
||||
) -> Result<usize, SerializeError> {
|
||||
let nwrite = self.write_to_len();
|
||||
if dst.len() < nwrite {
|
||||
return Err(SerializeError::buffer_too_small("start kind"));
|
||||
}
|
||||
let n = match *self {
|
||||
StartKind::Both => 0,
|
||||
StartKind::Unanchored => 1,
|
||||
StartKind::Anchored => 2,
|
||||
};
|
||||
E::write_u32(n, dst);
|
||||
Ok(nwrite)
|
||||
}
|
||||
|
||||
pub(crate) fn write_to_len(&self) -> usize {
|
||||
size_of::<u32>()
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn has_unanchored(&self) -> bool {
|
||||
matches!(*self, StartKind::Both | StartKind::Unanchored)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn has_anchored(&self) -> bool {
|
||||
matches!(*self, StartKind::Both | StartKind::Anchored)
|
||||
}
|
||||
}
|
||||
4418
third-party/vendor/regex-automata/src/hybrid/dfa.rs
vendored
Normal file
4418
third-party/vendor/regex-automata/src/hybrid/dfa.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
242
third-party/vendor/regex-automata/src/hybrid/error.rs
vendored
Normal file
242
third-party/vendor/regex-automata/src/hybrid/error.rs
vendored
Normal file
|
|
@ -0,0 +1,242 @@
|
|||
use crate::{hybrid::id::LazyStateIDError, nfa, util::search::Anchored};
|
||||
|
||||
/// An error that occurs when initial construction of a lazy DFA fails.
///
/// A build error can occur when insufficient cache capacity is configured or
/// if something about the NFA is unsupported. (For example, if one attempts
/// to build a lazy DFA without heuristic Unicode support but with an NFA that
/// contains a Unicode word boundary.)
///
/// This error does not provide many introspection capabilities. There are
/// generally only two things you can do with it:
///
/// * Obtain a human readable message via its `std::fmt::Display` impl.
/// * Access an underlying
/// [`nfa::thompson::BuildError`](crate::nfa::thompson::BuildError)
/// type from its `source` method via the `std::error::Error` trait. This error
/// only occurs when using convenience routines for building a lazy DFA
/// directly from a pattern string.
///
/// When the `std` feature is enabled, this implements the `std::error::Error`
/// trait.
#[derive(Clone, Debug)]
pub struct BuildError {
    // The underlying cause. Kept private so the set of causes can evolve
    // without breaking the public API.
    kind: BuildErrorKind,
}
|
||||
|
||||
/// The private cause of a lazy DFA `BuildError`.
#[derive(Clone, Debug)]
enum BuildErrorKind {
    /// Construction of the underlying Thompson NFA failed.
    NFA(nfa::thompson::BuildError),
    /// The configured cache capacity (`given` bytes) is smaller than the
    /// `minimum` number of bytes required.
    InsufficientCacheCapacity { minimum: usize, given: usize },
    /// The lazy state ID representation cannot address enough states.
    InsufficientStateIDCapacity { err: LazyStateIDError },
    /// A regex feature unsupported by lazy DFAs was used; the payload is a
    /// human readable message describing the feature.
    Unsupported(&'static str),
}
|
||||
|
||||
impl BuildError {
|
||||
pub(crate) fn nfa(err: nfa::thompson::BuildError) -> BuildError {
|
||||
BuildError { kind: BuildErrorKind::NFA(err) }
|
||||
}
|
||||
|
||||
pub(crate) fn insufficient_cache_capacity(
|
||||
minimum: usize,
|
||||
given: usize,
|
||||
) -> BuildError {
|
||||
BuildError {
|
||||
kind: BuildErrorKind::InsufficientCacheCapacity { minimum, given },
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn insufficient_state_id_capacity(
|
||||
err: LazyStateIDError,
|
||||
) -> BuildError {
|
||||
BuildError {
|
||||
kind: BuildErrorKind::InsufficientStateIDCapacity { err },
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError {
|
||||
let msg = "cannot build lazy DFAs for regexes with Unicode word \
|
||||
boundaries; switch to ASCII word boundaries, or \
|
||||
heuristically enable Unicode word boundaries or use a \
|
||||
different regex engine";
|
||||
BuildError { kind: BuildErrorKind::Unsupported(msg) }
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "std")]
impl std::error::Error for BuildError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        // Only the NFA variant wraps another error; everything else is a
        // leaf cause with no source.
        if let BuildErrorKind::NFA(ref err) = self.kind {
            Some(err)
        } else {
            None
        }
    }
}
|
||||
|
||||
impl core::fmt::Display for BuildError {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
match self.kind {
|
||||
BuildErrorKind::NFA(_) => write!(f, "error building NFA"),
|
||||
BuildErrorKind::InsufficientCacheCapacity { minimum, given } => {
|
||||
write!(
|
||||
f,
|
||||
"given cache capacity ({}) is smaller than \
|
||||
minimum required ({})",
|
||||
given, minimum,
|
||||
)
|
||||
}
|
||||
BuildErrorKind::InsufficientStateIDCapacity { ref err } => {
|
||||
err.fmt(f)
|
||||
}
|
||||
BuildErrorKind::Unsupported(ref msg) => {
|
||||
write!(f, "unsupported regex feature for DFAs: {}", msg)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An error that can occur when computing the start state for a search.
///
/// Computing a start state can fail for a few reasons, either
/// based on incorrect configuration or even based on whether
/// the look-behind byte triggers a quit state. Typically
/// one does not need to handle this error if you're using
/// [`DFA::start_state_forward`](crate::hybrid::dfa::DFA::start_state_forward)
/// (or its reverse counterpart), as that routine automatically converts
/// `StartError` to a [`MatchError`](crate::MatchError) for you.
///
/// This error may be returned by the
/// [`DFA::start_state`](crate::hybrid::dfa::DFA::start_state) routine.
///
/// This error implements the `std::error::Error` trait when the `std` feature
/// is enabled.
///
/// This error is marked as non-exhaustive. New variants may be added in a
/// semver compatible release.
#[non_exhaustive]
#[derive(Clone, Debug)]
pub enum StartError {
    /// An error that occurs when cache inefficiency has dropped below the
    /// configured heuristic thresholds.
    ///
    /// See [`CacheError`] for the conditions that can produce it.
    Cache {
        /// The underlying cache error that occurred.
        err: CacheError,
    },
    /// An error that occurs when a starting configuration's look-behind byte
    /// is in this DFA's quit set.
    Quit {
        /// The quit byte that was found.
        byte: u8,
    },
    /// An error that occurs when the caller requests an anchored mode that
    /// isn't supported by the DFA.
    UnsupportedAnchored {
        /// The anchored mode given that is unsupported.
        mode: Anchored,
    },
}
|
||||
|
||||
impl StartError {
|
||||
pub(crate) fn cache(err: CacheError) -> StartError {
|
||||
StartError::Cache { err }
|
||||
}
|
||||
|
||||
pub(crate) fn quit(byte: u8) -> StartError {
|
||||
StartError::Quit { byte }
|
||||
}
|
||||
|
||||
pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError {
|
||||
StartError::UnsupportedAnchored { mode }
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "std")]
impl std::error::Error for StartError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        // Only the 'Cache' variant wraps an underlying error.
        if let StartError::Cache { ref err } = *self {
            Some(err)
        } else {
            None
        }
    }
}
|
||||
|
||||
impl core::fmt::Display for StartError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Each 'UnsupportedAnchored' mode gets its own arm so the message
        // can name the specific anchored mode that was requested.
        match *self {
            StartError::Cache { .. } => write!(
                f,
                "error computing start state because of cache inefficiency"
            ),
            StartError::Quit { byte } => write!(
                f,
                "error computing start state because the look-behind byte \
                 {:?} triggered a quit state",
                crate::util::escape::DebugByte(byte),
            ),
            StartError::UnsupportedAnchored { mode: Anchored::Yes } => {
                write!(
                    f,
                    "error computing start state because \
                     anchored searches are not supported or enabled"
                )
            }
            StartError::UnsupportedAnchored { mode: Anchored::No } => {
                write!(
                    f,
                    "error computing start state because \
                     unanchored searches are not supported or enabled"
                )
            }
            StartError::UnsupportedAnchored {
                mode: Anchored::Pattern(pid),
            } => {
                write!(
                    f,
                    "error computing start state because \
                     anchored searches for a specific pattern ({}) \
                     are not supported or enabled",
                    pid.as_usize(),
                )
            }
        }
    }
}
|
||||
|
||||
/// An error that occurs when cache usage has become inefficient.
///
/// One of the weaknesses of a lazy DFA is that it may need to clear its
/// cache repeatedly if it's not big enough. If this happens too much, then it
/// can slow searching down significantly. A mitigation to this is to use
/// heuristics to detect whether the cache is being used efficiently or not.
/// If not, then a lazy DFA can return a `CacheError`.
///
/// The default configuration of a lazy DFA in this crate is
/// set such that a `CacheError` will never occur. Instead,
/// callers must opt into this behavior with settings like
/// [`dfa::Config::minimum_cache_clear_count`](crate::hybrid::dfa::Config::minimum_cache_clear_count)
/// and
/// [`dfa::Config::minimum_bytes_per_state`](crate::hybrid::dfa::Config::minimum_bytes_per_state).
///
/// When the `std` feature is enabled, this implements the `std::error::Error`
/// trait.
#[derive(Clone, Debug)]
// The private zero-sized field prevents construction outside this crate and
// carries no information about which heuristic tripped.
pub struct CacheError(());
|
||||
|
||||
impl CacheError {
|
||||
pub(crate) fn too_many_cache_clears() -> CacheError {
|
||||
CacheError(())
|
||||
}
|
||||
|
||||
pub(crate) fn bad_efficiency() -> CacheError {
|
||||
CacheError(())
|
||||
}
|
||||
}
|
||||
|
||||
// CacheError has no underlying source, so the default trait methods suffice.
#[cfg(feature = "std")]
impl std::error::Error for CacheError {}
|
||||
|
||||
impl core::fmt::Display for CacheError {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
write!(f, "lazy DFA cache has been cleared too many times")
|
||||
}
|
||||
}
|
||||
354
third-party/vendor/regex-automata/src/hybrid/id.rs
vendored
Normal file
354
third-party/vendor/regex-automata/src/hybrid/id.rs
vendored
Normal file
|
|
@ -0,0 +1,354 @@
|
|||
/// A state identifier specifically tailored for lazy DFAs.
|
||||
///
|
||||
/// A lazy state ID logically represents a pointer to a DFA state. In practice,
|
||||
/// by limiting the number of DFA states it can address, it reserves some
|
||||
/// bits of its representation to encode some additional information. That
|
||||
/// additional information is called a "tag." That tag is used to record
|
||||
/// whether the state it points to is an unknown, dead, quit, start or match
|
||||
/// state.
|
||||
///
|
||||
/// When implementing a low level search routine with a lazy DFA, it is
|
||||
/// necessary to query the type of the current state to know what to do:
|
||||
///
|
||||
/// * **Unknown** - The state has not yet been computed. The
|
||||
/// parameters used to get this state ID must be re-passed to
|
||||
/// [`DFA::next_state`](crate::hybrid::dfa::DFA::next_state), which will never
|
||||
/// return an unknown state ID.
|
||||
/// * **Dead** - A dead state only has transitions to itself. It indicates that
|
||||
/// the search cannot do anything else and should stop with whatever result it
|
||||
/// has.
|
||||
/// * **Quit** - A quit state indicates that the automaton could not answer
|
||||
/// whether a match exists or not. Correct search implementations must return a
|
||||
/// [`MatchError::quit`](crate::MatchError::quit) when a DFA enters a quit
|
||||
/// state.
|
||||
/// * **Start** - A start state is a state in which a search can begin.
|
||||
/// Lazy DFAs usually have more than one start state. Branching on
|
||||
/// this isn't required for correctness, but a common optimization is
|
||||
/// to run a prefilter when a search enters a start state. Note that
|
||||
/// start states are *not* tagged automatically, and one must enable the
|
||||
/// [`Config::specialize_start_states`](crate::hybrid::dfa::Config::specialize_start_states)
|
||||
/// setting for start states to be tagged. The reason for this is
|
||||
/// that a DFA search loop is usually written to execute a prefilter once it
|
||||
/// enters a start state. But if there is no prefilter, this handling can be
|
||||
/// quite disastrous as the DFA may ping-pong between the special handling code
|
||||
/// and a possible optimized hot path for handling untagged states. When start
|
||||
/// states aren't specialized, then they are untagged and remain in the hot
|
||||
/// path.
|
||||
/// * **Match** - A match state indicates that a match has been found.
|
||||
/// Depending on the semantics of your search implementation, it may either
|
||||
/// continue until the end of the haystack or a dead state, or it might quit
|
||||
/// and return the match immediately.
|
||||
///
|
||||
/// As an optimization, the [`is_tagged`](LazyStateID::is_tagged) predicate
|
||||
/// can be used to determine if a tag exists at all. This is useful to avoid
|
||||
/// branching on all of the above types for every byte searched.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how `LazyStateID` can be used to implement a correct
|
||||
/// search routine with minimal branching. In particular, this search routine
|
||||
/// implements "leftmost" matching, which means that it doesn't immediately
|
||||
/// stop once a match is found. Instead, it continues until it reaches a dead
|
||||
/// state.
|
||||
///
|
||||
/// Notice also how a correct search implementation deals with
|
||||
/// [`CacheError`](crate::hybrid::CacheError)s returned by some of
|
||||
/// the lazy DFA routines. When a `CacheError` occurs, it returns
|
||||
/// [`MatchError::gave_up`](crate::MatchError::gave_up).
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{
|
||||
/// hybrid::dfa::{Cache, DFA},
|
||||
/// HalfMatch, MatchError, Input,
|
||||
/// };
|
||||
///
|
||||
/// fn find_leftmost_first(
|
||||
/// dfa: &DFA,
|
||||
/// cache: &mut Cache,
|
||||
/// haystack: &[u8],
|
||||
/// ) -> Result<Option<HalfMatch>, MatchError> {
|
||||
/// // The start state is determined by inspecting the position and the
|
||||
/// // initial bytes of the haystack. Note that start states can never
|
||||
/// // be match states (since DFAs in this crate delay matches by 1
|
||||
/// // byte), so we don't need to check if the start state is a match.
|
||||
/// let mut sid = dfa.start_state_forward(
|
||||
/// cache,
|
||||
/// &Input::new(haystack),
|
||||
/// )?;
|
||||
/// let mut last_match = None;
|
||||
/// // Walk all the bytes in the haystack. We can quit early if we see
|
||||
/// // a dead or a quit state. The former means the automaton will
|
||||
/// // never transition to any other state. The latter means that the
|
||||
/// // automaton entered a condition in which its search failed.
|
||||
/// for (i, &b) in haystack.iter().enumerate() {
|
||||
/// sid = dfa
|
||||
/// .next_state(cache, sid, b)
|
||||
/// .map_err(|_| MatchError::gave_up(i))?;
|
||||
/// if sid.is_tagged() {
|
||||
/// if sid.is_match() {
|
||||
/// last_match = Some(HalfMatch::new(
|
||||
/// dfa.match_pattern(cache, sid, 0),
|
||||
/// i,
|
||||
/// ));
|
||||
/// } else if sid.is_dead() {
|
||||
/// return Ok(last_match);
|
||||
/// } else if sid.is_quit() {
|
||||
/// // It is possible to enter into a quit state after
|
||||
/// // observing a match has occurred. In that case, we
|
||||
/// // should return the match instead of an error.
|
||||
/// if last_match.is_some() {
|
||||
/// return Ok(last_match);
|
||||
/// }
|
||||
/// return Err(MatchError::quit(b, i));
|
||||
/// }
|
||||
/// // Implementors may also want to check for start states and
|
||||
/// // handle them differently for performance reasons. But it is
|
||||
/// // not necessary for correctness. Note that in order to check
|
||||
/// // for start states, you'll need to enable the
|
||||
/// // 'specialize_start_states' config knob, otherwise start
|
||||
/// // states will not be tagged.
|
||||
/// }
|
||||
/// }
|
||||
/// // Matches are always delayed by 1 byte, so we must explicitly walk
|
||||
/// // the special "EOI" transition at the end of the search.
|
||||
/// sid = dfa
|
||||
/// .next_eoi_state(cache, sid)
|
||||
/// .map_err(|_| MatchError::gave_up(haystack.len()))?;
|
||||
/// if sid.is_match() {
|
||||
/// last_match = Some(HalfMatch::new(
|
||||
/// dfa.match_pattern(cache, sid, 0),
|
||||
/// haystack.len(),
|
||||
/// ));
|
||||
/// }
|
||||
/// Ok(last_match)
|
||||
/// }
|
||||
///
|
||||
/// // We use a greedy '+' operator to show how the search doesn't just stop
|
||||
/// // once a match is detected. It continues extending the match. Using
|
||||
/// // '[a-z]+?' would also work as expected and stop the search early.
|
||||
/// // Greediness is built into the automaton.
|
||||
/// let dfa = DFA::new(r"[a-z]+")?;
|
||||
/// let mut cache = dfa.create_cache();
|
||||
/// let haystack = "123 foobar 4567".as_bytes();
|
||||
/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
|
||||
/// assert_eq!(mat.pattern().as_usize(), 0);
|
||||
/// assert_eq!(mat.offset(), 10);
|
||||
///
|
||||
/// // Here's another example that tests our handling of the special
|
||||
/// // EOI transition. This will fail to find a match if we don't call
|
||||
/// // 'next_eoi_state' at the end of the search since the match isn't found
|
||||
/// // until the final byte in the haystack.
|
||||
/// let dfa = DFA::new(r"[0-9]{4}")?;
|
||||
/// let mut cache = dfa.create_cache();
|
||||
/// let haystack = "123 foobar 4567".as_bytes();
|
||||
/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
|
||||
/// assert_eq!(mat.pattern().as_usize(), 0);
|
||||
/// assert_eq!(mat.offset(), 15);
|
||||
///
|
||||
/// // And note that our search implementation above automatically works
|
||||
/// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects
|
||||
/// // the appropriate pattern ID for us.
|
||||
/// let dfa = DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
|
||||
/// let mut cache = dfa.create_cache();
|
||||
/// let haystack = "123 foobar 4567".as_bytes();
|
||||
/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
|
||||
/// assert_eq!(mat.pattern().as_usize(), 1);
|
||||
/// assert_eq!(mat.offset(), 3);
|
||||
/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[3..])?.unwrap();
|
||||
/// assert_eq!(mat.pattern().as_usize(), 0);
|
||||
/// assert_eq!(mat.offset(), 7);
|
||||
/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[10..])?.unwrap();
|
||||
/// assert_eq!(mat.pattern().as_usize(), 1);
|
||||
/// assert_eq!(mat.offset(), 5);
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
// The ID is stored as a `u32` whose upper (most significant) bits are
// reserved for tags. See the `MASK_*` constants in the `impl` block below
// for how each tag bit is assigned.
#[derive(
    Clone, Copy, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd,
)]
pub struct LazyStateID(u32);
|
||||
|
||||
impl LazyStateID {
|
||||
#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
|
||||
const MAX_BIT: usize = 31;
|
||||
|
||||
#[cfg(target_pointer_width = "16")]
|
||||
const MAX_BIT: usize = 15;
|
||||
|
||||
const MASK_UNKNOWN: usize = 1 << (LazyStateID::MAX_BIT);
|
||||
const MASK_DEAD: usize = 1 << (LazyStateID::MAX_BIT - 1);
|
||||
const MASK_QUIT: usize = 1 << (LazyStateID::MAX_BIT - 2);
|
||||
const MASK_START: usize = 1 << (LazyStateID::MAX_BIT - 3);
|
||||
const MASK_MATCH: usize = 1 << (LazyStateID::MAX_BIT - 4);
|
||||
const MAX: usize = LazyStateID::MASK_MATCH - 1;
|
||||
|
||||
/// Create a new lazy state ID.
|
||||
///
|
||||
/// If the given identifier exceeds [`LazyStateID::MAX`], then this returns
|
||||
/// an error.
|
||||
#[inline]
|
||||
pub(crate) fn new(id: usize) -> Result<LazyStateID, LazyStateIDError> {
|
||||
if id > LazyStateID::MAX {
|
||||
let attempted = u64::try_from(id).unwrap();
|
||||
return Err(LazyStateIDError { attempted });
|
||||
}
|
||||
Ok(LazyStateID::new_unchecked(id))
|
||||
}
|
||||
|
||||
/// Create a new lazy state ID without checking whether the given value
|
||||
/// exceeds [`LazyStateID::MAX`].
|
||||
///
|
||||
/// While this is unchecked, providing an incorrect value must never
|
||||
/// sacrifice memory safety.
|
||||
#[inline]
|
||||
const fn new_unchecked(id: usize) -> LazyStateID {
|
||||
// FIXME: Use as_u32() once const functions in traits are stable.
|
||||
LazyStateID(id as u32)
|
||||
}
|
||||
|
||||
/// Return this lazy state ID as an untagged `usize`.
|
||||
///
|
||||
/// If this lazy state ID is tagged, then the usize returned is the state
|
||||
/// ID without the tag. If the ID was not tagged, then the usize returned
|
||||
/// is equivalent to the state ID.
|
||||
#[inline]
|
||||
pub(crate) fn as_usize_untagged(&self) -> usize {
|
||||
self.as_usize_unchecked() & LazyStateID::MAX
|
||||
}
|
||||
|
||||
/// Return this lazy state ID as its raw internal `usize` value, which may
|
||||
/// be tagged (and thus greater than LazyStateID::MAX).
|
||||
#[inline]
|
||||
pub(crate) const fn as_usize_unchecked(&self) -> usize {
|
||||
// FIXME: Use as_usize() once const functions in traits are stable.
|
||||
self.0 as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) const fn to_unknown(&self) -> LazyStateID {
|
||||
LazyStateID::new_unchecked(
|
||||
self.as_usize_unchecked() | LazyStateID::MASK_UNKNOWN,
|
||||
)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) const fn to_dead(&self) -> LazyStateID {
|
||||
LazyStateID::new_unchecked(
|
||||
self.as_usize_unchecked() | LazyStateID::MASK_DEAD,
|
||||
)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) const fn to_quit(&self) -> LazyStateID {
|
||||
LazyStateID::new_unchecked(
|
||||
self.as_usize_unchecked() | LazyStateID::MASK_QUIT,
|
||||
)
|
||||
}
|
||||
|
||||
/// Return this lazy state ID as a state ID that is tagged as a start
|
||||
/// state.
|
||||
#[inline]
|
||||
pub(crate) const fn to_start(&self) -> LazyStateID {
|
||||
LazyStateID::new_unchecked(
|
||||
self.as_usize_unchecked() | LazyStateID::MASK_START,
|
||||
)
|
||||
}
|
||||
|
||||
/// Return this lazy state ID as a lazy state ID that is tagged as a match
|
||||
/// state.
|
||||
#[inline]
|
||||
pub(crate) const fn to_match(&self) -> LazyStateID {
|
||||
LazyStateID::new_unchecked(
|
||||
self.as_usize_unchecked() | LazyStateID::MASK_MATCH,
|
||||
)
|
||||
}
|
||||
|
||||
/// Return true if and only if this lazy state ID is tagged.
|
||||
///
|
||||
/// When a lazy state ID is tagged, then one can conclude that it is one
|
||||
/// of a match, start, dead, quit or unknown state.
|
||||
#[inline]
|
||||
pub const fn is_tagged(&self) -> bool {
|
||||
self.as_usize_unchecked() > LazyStateID::MAX
|
||||
}
|
||||
|
||||
/// Return true if and only if this represents a lazy state ID that is
|
||||
/// "unknown." That is, the state has not yet been created. When a caller
|
||||
/// sees this state ID, it generally means that a state has to be computed
|
||||
/// in order to proceed.
|
||||
#[inline]
|
||||
pub const fn is_unknown(&self) -> bool {
|
||||
self.as_usize_unchecked() & LazyStateID::MASK_UNKNOWN > 0
|
||||
}
|
||||
|
||||
/// Return true if and only if this represents a dead state. A dead state
|
||||
/// is a state that can never transition to any other state except the
|
||||
/// dead state. When a dead state is seen, it generally indicates that a
|
||||
/// search should stop.
|
||||
#[inline]
|
||||
pub const fn is_dead(&self) -> bool {
|
||||
self.as_usize_unchecked() & LazyStateID::MASK_DEAD > 0
|
||||
}
|
||||
|
||||
/// Return true if and only if this represents a quit state. A quit state
|
||||
/// is a state that is representationally equivalent to a dead state,
|
||||
/// except it indicates the automaton has reached a point at which it can
|
||||
/// no longer determine whether a match exists or not. In general, this
|
||||
/// indicates an error during search and the caller must either pass this
|
||||
/// error up or use a different search technique.
|
||||
#[inline]
|
||||
pub const fn is_quit(&self) -> bool {
|
||||
self.as_usize_unchecked() & LazyStateID::MASK_QUIT > 0
|
||||
}
|
||||
|
||||
/// Return true if and only if this lazy state ID has been tagged as a
|
||||
/// start state.
|
||||
///
|
||||
/// Note that if
|
||||
/// [`Config::specialize_start_states`](crate::hybrid::dfa::Config) is
|
||||
/// disabled (which is the default), then this will always return false
|
||||
/// since start states won't be tagged.
|
||||
#[inline]
|
||||
pub const fn is_start(&self) -> bool {
|
||||
self.as_usize_unchecked() & LazyStateID::MASK_START > 0
|
||||
}
|
||||
|
||||
/// Return true if and only if this lazy state ID has been tagged as a
|
||||
/// match state.
|
||||
#[inline]
|
||||
pub const fn is_match(&self) -> bool {
|
||||
self.as_usize_unchecked() & LazyStateID::MASK_MATCH > 0
|
||||
}
|
||||
}
|
||||
|
||||
/// This error occurs when a lazy state ID could not be constructed.
///
/// It is returned when the requested integer exceeds the maximum lazy
/// state ID value.
///
/// When the `std` feature is enabled, this implements the `Error` trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct LazyStateIDError {
    /// The rejected value, recorded for error reporting.
    attempted: u64,
}

impl LazyStateIDError {
    /// Returns the value that failed to construct a lazy state ID.
    pub(crate) fn attempted(&self) -> u64 {
        self.attempted
    }
}
|
||||
|
||||
// Gated on `std` because the implemented trait is `std::error::Error`.
#[cfg(feature = "std")]
impl std::error::Error for LazyStateIDError {}
|
||||
|
||||
impl core::fmt::Display for LazyStateIDError {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"failed to create LazyStateID from {:?}, which exceeds {:?}",
|
||||
self.attempted(),
|
||||
LazyStateID::MAX,
|
||||
)
|
||||
}
|
||||
}
|
||||
144
third-party/vendor/regex-automata/src/hybrid/mod.rs
vendored
Normal file
144
third-party/vendor/regex-automata/src/hybrid/mod.rs
vendored
Normal file
|
|
@ -0,0 +1,144 @@
|
|||
/*!
|
||||
A module for building and searching with lazy deterministic finite automata
|
||||
(DFAs).
|
||||
|
||||
Like other modules in this crate, lazy DFAs support a rich regex syntax with
|
||||
Unicode features. The key feature of a lazy DFA is that it builds itself
|
||||
incrementally during search, and never uses more than a configured capacity of
|
||||
memory. Thus, when searching with a lazy DFA, one must supply a mutable "cache"
|
||||
in which the actual DFA's transition table is stored.
|
||||
|
||||
If you're looking for fully compiled DFAs, then please see the top-level
|
||||
[`dfa` module](crate::dfa).
|
||||
|
||||
# Overview
|
||||
|
||||
This section gives a brief overview of the primary types in this module:
|
||||
|
||||
* A [`regex::Regex`] provides a way to search for matches of a regular
|
||||
expression using lazy DFAs. This includes iterating over matches with both the
|
||||
start and end positions of each match.
|
||||
* A [`dfa::DFA`] provides direct low level access to a lazy DFA.
|
||||
|
||||
# Example: basic regex searching
|
||||
|
||||
This example shows how to compile a regex using the default configuration
|
||||
and then use it to find matches in a byte string:
|
||||
|
||||
```
|
||||
use regex_automata::{hybrid::regex::Regex, Match};
|
||||
|
||||
let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
|
||||
let mut cache = re.create_cache();
|
||||
|
||||
let haystack = "2018-12-24 2016-10-08";
|
||||
let matches: Vec<Match> = re.find_iter(&mut cache, haystack).collect();
|
||||
assert_eq!(matches, vec![
|
||||
Match::must(0, 0..10),
|
||||
Match::must(0, 11..21),
|
||||
]);
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
# Example: searching with multiple regexes
|
||||
|
||||
The lazy DFAs in this module all fully support searching with multiple regexes
|
||||
simultaneously. You can use this support with standard leftmost-first style
|
||||
searching to find non-overlapping matches:
|
||||
|
||||
```
|
||||
# if cfg!(miri) { return Ok(()); } // miri takes too long
|
||||
use regex_automata::{hybrid::regex::Regex, Match};
|
||||
|
||||
let re = Regex::new_many(&[r"\w+", r"\S+"])?;
|
||||
let mut cache = re.create_cache();
|
||||
|
||||
let haystack = "@foo bar";
|
||||
let matches: Vec<Match> = re.find_iter(&mut cache, haystack).collect();
|
||||
assert_eq!(matches, vec![
|
||||
Match::must(1, 0..4),
|
||||
Match::must(0, 5..8),
|
||||
]);
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
# When should I use this?
|
||||
|
||||
Generally speaking, if you can abide the use of mutable state during search,
|
||||
and you don't need things like capturing groups or Unicode word boundary
|
||||
support in non-ASCII text, then a lazy DFA is likely a robust choice with
|
||||
respect to both search speed and memory usage. Note however that its speed
|
||||
may be worse than a general purpose regex engine if you don't select a good
|
||||
[prefilter](crate::util::prefilter).
|
||||
|
||||
If you know ahead of time that your pattern would result in a very large DFA
|
||||
if it was fully compiled, it may be better to use an NFA simulation instead
|
||||
of a lazy DFA. Either that, or increase the cache capacity of your lazy DFA
|
||||
to something that is big enough to hold the state machine (likely through
|
||||
experimentation). The issue here is that if the cache is too small, then it
|
||||
could wind up being reset too frequently and this might decrease searching
|
||||
speed significantly.
|
||||
|
||||
# Differences with fully compiled DFAs
|
||||
|
||||
A [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) and a
|
||||
[`dfa::regex::Regex`](crate::dfa::regex::Regex) both have the same capabilities
|
||||
(and similarly for their underlying DFAs), but they achieve them through
|
||||
different means. The main difference is that a hybrid or "lazy" regex builds
|
||||
its DFA lazily during search, whereas a fully compiled regex will build its
|
||||
DFA at construction time. While building a DFA at search time might sound like
|
||||
it's slow, it tends to work out where most bytes seen during a search will
|
||||
reuse pre-built parts of the DFA and thus can be almost as fast as a fully
|
||||
compiled DFA. The main downside is that searching requires mutable space to
|
||||
store the DFA, and, in the worst case, a search can result in a new state being
|
||||
created for each byte seen, which would make searching quite a bit slower.
|
||||
|
||||
A fully compiled DFA never has to worry about searches being slower once
|
||||
it's built. (Aside from, say, the transition table being so large that it
|
||||
is subject to harsh CPU cache effects.) However, of course, building a full
|
||||
DFA can be quite time consuming and memory hungry. Particularly when large
|
||||
Unicode character classes are used, which tend to translate into very large
|
||||
DFAs.
|
||||
|
||||
A lazy DFA strikes a nice balance _in practice_, particularly in the
|
||||
presence of Unicode mode, by only building what is needed. It avoids the
|
||||
worst case exponential time complexity of DFA compilation by guaranteeing that
|
||||
it will only build at most one state per byte searched. While the worst
|
||||
case here can lead to a very high constant, it will never be exponential.
|
||||
|
||||
# Syntax
|
||||
|
||||
This module supports the same syntax as the `regex` crate, since they share the
|
||||
same parser. You can find an exhaustive list of supported syntax in the
|
||||
[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax).
|
||||
|
||||
There are two things that are not supported by the lazy DFAs in this module:
|
||||
|
||||
* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top
|
||||
of them) can only find the offsets of an entire match, but cannot resolve
|
||||
the offsets of each capturing group. This is because DFAs do not have the
|
||||
expressive power necessary. Note that it is okay to build a lazy DFA from an
|
||||
NFA that contains capture groups. The capture groups will simply be ignored.
|
||||
* Unicode word boundaries. These present particularly difficult challenges for
|
||||
DFA construction and would result in an explosion in the number of states.
|
||||
One can enable [`dfa::Config::unicode_word_boundary`] though, which provides
|
||||
heuristic support for Unicode word boundaries that only works on ASCII text.
|
||||
Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work
|
||||
on any input.
|
||||
|
||||
There are no plans to lift either of these limitations.
|
||||
|
||||
Note that these restrictions are identical to the restrictions on fully
|
||||
compiled DFAs.
|
||||
*/
|
||||
|
||||
pub use self::{
|
||||
error::{BuildError, CacheError, StartError},
|
||||
id::LazyStateID,
|
||||
};
|
||||
|
||||
pub mod dfa;
|
||||
mod error;
|
||||
mod id;
|
||||
pub mod regex;
|
||||
mod search;
|
||||
895
third-party/vendor/regex-automata/src/hybrid/regex.rs
vendored
Normal file
895
third-party/vendor/regex-automata/src/hybrid/regex.rs
vendored
Normal file
|
|
@ -0,0 +1,895 @@
|
|||
/*!
|
||||
A lazy DFA backed `Regex`.
|
||||
|
||||
This module provides a [`Regex`] backed by a lazy DFA. A `Regex` implements
|
||||
convenience routines you might have come to expect, such as finding a match
|
||||
and iterating over all non-overlapping matches. This `Regex` type is limited
|
||||
in its capabilities to what a lazy DFA can provide. Therefore, APIs involving
|
||||
capturing groups, for example, are not provided.
|
||||
|
||||
Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that
|
||||
finds the end offset of a match, whereas the other is a "reverse" DFA that
finds the start offset of a match.
|
||||
|
||||
See the [parent module](crate::hybrid) for examples.
|
||||
*/
|
||||
|
||||
use crate::{
|
||||
hybrid::{
|
||||
dfa::{self, DFA},
|
||||
error::BuildError,
|
||||
},
|
||||
nfa::thompson,
|
||||
util::{
|
||||
iter,
|
||||
search::{Anchored, Input, Match, MatchError, MatchKind},
|
||||
},
|
||||
};
|
||||
|
||||
/// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs")
|
||||
/// for searching.
|
||||
///
|
||||
/// A regular expression is comprised of two lazy DFAs, a "forward" DFA and a
|
||||
/// "reverse" DFA. The forward DFA is responsible for detecting the end of
|
||||
/// a match while the reverse DFA is responsible for detecting the start
|
||||
/// of a match. Thus, in order to find the bounds of any given match, a
|
||||
/// forward search must first be run followed by a reverse search. A match
|
||||
/// found by the forward DFA guarantees that the reverse DFA will also find
|
||||
/// a match.
|
||||
///
|
||||
/// # Fallibility
|
||||
///
|
||||
/// Most of the search routines defined on this type will _panic_ when the
|
||||
/// underlying search fails. This might be because the DFA gave up because it
|
||||
/// saw a quit byte, whether configured explicitly or via heuristic Unicode
|
||||
/// word boundary support, although neither are enabled by default. It might
|
||||
/// also fail if the underlying DFA determines it isn't making effective use of
|
||||
/// the cache (which also never happens by default). Or it might fail because
|
||||
/// an invalid `Input` configuration is given, for example, with an unsupported
|
||||
/// [`Anchored`] mode.
|
||||
///
|
||||
/// If you need to handle these error cases instead of allowing them to trigger
|
||||
/// a panic, then the lower level [`Regex::try_search`] provides a fallible API
|
||||
/// that never panics.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how to cause a search to terminate if it sees a
|
||||
/// `\n` byte, and handle the error returned. This could be useful if, for
|
||||
/// example, you wanted to prevent a user supplied pattern from matching
|
||||
/// across a line boundary.
|
||||
///
|
||||
/// ```
|
||||
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
|
||||
/// use regex_automata::{hybrid::{dfa, regex::Regex}, Input, MatchError};
|
||||
///
|
||||
/// let re = Regex::builder()
|
||||
/// .dfa(dfa::Config::new().quit(b'\n', true))
|
||||
/// .build(r"foo\p{any}+bar")?;
|
||||
/// let mut cache = re.create_cache();
|
||||
///
|
||||
/// let input = Input::new("foo\nbar");
|
||||
/// // Normally this would produce a match, since \p{any} contains '\n'.
|
||||
/// // But since we instructed the automaton to enter a quit state if a
|
||||
/// // '\n' is observed, this produces a match error instead.
|
||||
/// let expected = MatchError::quit(b'\n', 3);
|
||||
/// let got = re.try_search(&mut cache, &input).unwrap_err();
|
||||
/// assert_eq!(expected, got);
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[derive(Debug)]
|
||||
pub struct Regex {
|
||||
/// The forward lazy DFA. This can only find the end of a match.
|
||||
forward: DFA,
|
||||
/// The reverse lazy DFA. This can only find the start of a match.
|
||||
///
|
||||
/// This is built with 'all' match semantics (instead of leftmost-first)
|
||||
/// so that it always finds the longest possible match (which corresponds
|
||||
/// to the leftmost starting position). It is also compiled as an anchored
|
||||
/// matcher and has 'starts_for_each_pattern' enabled. Including starting
|
||||
/// states for each pattern is necessary to ensure that we only look for
|
||||
/// matches of a pattern that matched in the forward direction. Otherwise,
|
||||
/// we might wind up finding the "leftmost" starting position of a totally
|
||||
/// different pattern!
|
||||
reverse: DFA,
|
||||
}
|
||||
|
||||
/// Convenience routines for regex and cache construction.
|
||||
impl Regex {
|
||||
/// Parse the given regular expression using the default configuration and
|
||||
/// return the corresponding regex.
|
||||
///
|
||||
/// If you want a non-default configuration, then use the [`Builder`] to
|
||||
/// set your own configuration.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{hybrid::regex::Regex, Match};
|
||||
///
|
||||
/// let re = Regex::new("foo[0-9]+bar")?;
|
||||
/// let mut cache = re.create_cache();
|
||||
/// assert_eq!(
|
||||
/// Some(Match::must(0, 3..14)),
|
||||
/// re.find(&mut cache, "zzzfoo12345barzzz"),
|
||||
/// );
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[cfg(feature = "syntax")]
|
||||
pub fn new(pattern: &str) -> Result<Regex, BuildError> {
|
||||
Regex::builder().build(pattern)
|
||||
}
|
||||
|
||||
/// Like `new`, but parses multiple patterns into a single "multi regex."
|
||||
/// This similarly uses the default regex configuration.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{hybrid::regex::Regex, Match};
|
||||
///
|
||||
/// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?;
|
||||
/// let mut cache = re.create_cache();
|
||||
///
|
||||
/// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux");
|
||||
/// assert_eq!(Some(Match::must(0, 0..3)), it.next());
|
||||
/// assert_eq!(Some(Match::must(1, 4..5)), it.next());
|
||||
/// assert_eq!(Some(Match::must(0, 6..9)), it.next());
|
||||
/// assert_eq!(Some(Match::must(1, 10..14)), it.next());
|
||||
/// assert_eq!(Some(Match::must(1, 15..16)), it.next());
|
||||
/// assert_eq!(Some(Match::must(0, 17..21)), it.next());
|
||||
/// assert_eq!(None, it.next());
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[cfg(feature = "syntax")]
|
||||
pub fn new_many<P: AsRef<str>>(
|
||||
patterns: &[P],
|
||||
) -> Result<Regex, BuildError> {
|
||||
Regex::builder().build_many(patterns)
|
||||
}
|
||||
|
||||
/// Return a builder for configuring the construction of a `Regex`.
|
||||
///
|
||||
/// This is a convenience routine to avoid needing to import the
|
||||
/// [`Builder`] type in common cases.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how to use the builder to disable UTF-8 mode
|
||||
/// everywhere.
|
||||
///
|
||||
/// ```
|
||||
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
|
||||
/// use regex_automata::{
|
||||
/// hybrid::regex::Regex, nfa::thompson, util::syntax, Match,
|
||||
/// };
|
||||
///
|
||||
/// let re = Regex::builder()
|
||||
/// .syntax(syntax::Config::new().utf8(false))
|
||||
/// .thompson(thompson::Config::new().utf8(false))
|
||||
/// .build(r"foo(?-u:[^b])ar.*")?;
|
||||
/// let mut cache = re.create_cache();
|
||||
///
|
||||
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
|
||||
/// let expected = Some(Match::must(0, 1..9));
|
||||
/// let got = re.find(&mut cache, haystack);
|
||||
/// assert_eq!(expected, got);
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
pub fn builder() -> Builder {
|
||||
Builder::new()
|
||||
}
|
||||
|
||||
/// Create a new cache for this `Regex`.
|
||||
///
|
||||
/// The cache returned should only be used for searches for this
|
||||
/// `Regex`. If you want to reuse the cache for another `Regex`, then
|
||||
/// you must call [`Cache::reset`] with that `Regex` (or, equivalently,
|
||||
/// [`Regex::reset_cache`]).
|
||||
pub fn create_cache(&self) -> Cache {
|
||||
Cache::new(self)
|
||||
}
|
||||
|
||||
/// Reset the given cache such that it can be used for searching with the
|
||||
/// this `Regex` (and only this `Regex`).
|
||||
///
|
||||
/// A cache reset permits reusing memory already allocated in this cache
|
||||
/// with a different `Regex`.
|
||||
///
|
||||
/// Resetting a cache sets its "clear count" to 0. This is relevant if the
|
||||
/// `Regex` has been configured to "give up" after it has cleared the cache
|
||||
/// a certain number of times.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This shows how to re-purpose a cache for use with a different `Regex`.
|
||||
///
|
||||
/// ```
|
||||
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
|
||||
/// use regex_automata::{hybrid::regex::Regex, Match};
|
||||
///
|
||||
/// let re1 = Regex::new(r"\w")?;
|
||||
/// let re2 = Regex::new(r"\W")?;
|
||||
///
|
||||
/// let mut cache = re1.create_cache();
|
||||
/// assert_eq!(
|
||||
/// Some(Match::must(0, 0..2)),
|
||||
/// re1.find(&mut cache, "Δ"),
|
||||
/// );
|
||||
///
|
||||
/// // Using 'cache' with re2 is not allowed. It may result in panics or
|
||||
/// // incorrect results. In order to re-purpose the cache, we must reset
|
||||
/// // it with the Regex we'd like to use it with.
|
||||
/// //
|
||||
/// // Similarly, after this reset, using the cache with 're1' is also not
|
||||
/// // allowed.
|
||||
/// re2.reset_cache(&mut cache);
|
||||
/// assert_eq!(
|
||||
/// Some(Match::must(0, 0..3)),
|
||||
/// re2.find(&mut cache, "☃"),
|
||||
/// );
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
pub fn reset_cache(&self, cache: &mut Cache) {
|
||||
self.forward().reset_cache(&mut cache.forward);
|
||||
self.reverse().reset_cache(&mut cache.reverse);
|
||||
}
|
||||
}
|
||||
|
||||
/// Standard infallible search routines for finding and iterating over matches.
|
||||
impl Regex {
|
||||
/// Returns true if and only if this regex matches the given haystack.
|
||||
///
|
||||
/// This routine may short circuit if it knows that scanning future input
|
||||
/// will never lead to a different result. In particular, if the underlying
|
||||
/// DFA enters a match state or a dead state, then this routine will return
|
||||
/// `true` or `false`, respectively, without inspecting any future input.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// This routine panics if the search could not complete. This can occur
|
||||
/// in a number of circumstances:
|
||||
///
|
||||
/// * The configuration of the lazy DFA may permit it to "quit" the search.
|
||||
/// For example, setting quit bytes or enabling heuristic support for
|
||||
/// Unicode word boundaries. The default configuration does not enable any
|
||||
/// option that could result in the lazy DFA quitting.
|
||||
/// * The configuration of the lazy DFA may also permit it to "give up"
|
||||
/// on a search if it makes ineffective use of its transition table
|
||||
/// cache. The default configuration does not enable this by default,
|
||||
/// although it is typically a good idea to.
|
||||
/// * When the provided `Input` configuration is not supported. For
|
||||
/// example, by providing an unsupported anchor mode.
|
||||
///
|
||||
/// When a search panics, callers cannot know whether a match exists or
|
||||
/// not.
|
||||
///
|
||||
/// Use [`Regex::try_search`] if you want to handle these error conditions.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::hybrid::regex::Regex;
|
||||
///
|
||||
/// let re = Regex::new("foo[0-9]+bar")?;
|
||||
/// let mut cache = re.create_cache();
|
||||
///
|
||||
/// assert!(re.is_match(&mut cache, "foo12345bar"));
|
||||
/// assert!(!re.is_match(&mut cache, "foobar"));
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[inline]
|
||||
pub fn is_match<'h, I: Into<Input<'h>>>(
|
||||
&self,
|
||||
cache: &mut Cache,
|
||||
input: I,
|
||||
) -> bool {
|
||||
// Not only can we do an "earliest" search, but we can avoid doing a
|
||||
// reverse scan too.
|
||||
self.forward()
|
||||
.try_search_fwd(&mut cache.forward, &input.into().earliest(true))
|
||||
.unwrap()
|
||||
.is_some()
|
||||
}
|
||||
|
||||
/// Returns the start and end offset of the leftmost match. If no match
|
||||
/// exists, then `None` is returned.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// This routine panics if the search could not complete. This can occur
|
||||
/// in a number of circumstances:
|
||||
///
|
||||
/// * The configuration of the lazy DFA may permit it to "quit" the search.
|
||||
/// For example, setting quit bytes or enabling heuristic support for
|
||||
/// Unicode word boundaries. The default configuration does not enable any
|
||||
/// option that could result in the lazy DFA quitting.
|
||||
/// * The configuration of the lazy DFA may also permit it to "give up"
|
||||
/// on a search if it makes ineffective use of its transition table
|
||||
/// cache. The default configuration does not enable this by default,
|
||||
/// although it is typically a good idea to.
|
||||
/// * When the provided `Input` configuration is not supported. For
|
||||
/// example, by providing an unsupported anchor mode.
|
||||
///
|
||||
/// When a search panics, callers cannot know whether a match exists or
|
||||
/// not.
|
||||
///
|
||||
/// Use [`Regex::try_search`] if you want to handle these error conditions.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{Match, hybrid::regex::Regex};
|
||||
///
|
||||
/// let re = Regex::new("foo[0-9]+")?;
|
||||
/// let mut cache = re.create_cache();
|
||||
/// assert_eq!(
|
||||
/// Some(Match::must(0, 3..11)),
|
||||
/// re.find(&mut cache, "zzzfoo12345zzz"),
|
||||
/// );
|
||||
///
|
||||
/// // Even though a match is found after reading the first byte (`a`),
|
||||
/// // the default leftmost-first match semantics demand that we find the
|
||||
/// // earliest match that prefers earlier parts of the pattern over latter
|
||||
/// // parts.
|
||||
/// let re = Regex::new("abc|a")?;
|
||||
/// let mut cache = re.create_cache();
|
||||
/// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc"));
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[inline]
|
||||
pub fn find<'h, I: Into<Input<'h>>>(
|
||||
&self,
|
||||
cache: &mut Cache,
|
||||
input: I,
|
||||
) -> Option<Match> {
|
||||
self.try_search(cache, &input.into()).unwrap()
|
||||
}
|
||||
|
||||
/// Returns an iterator over all non-overlapping leftmost matches in the
|
||||
/// given bytes. If no match exists, then the iterator yields no elements.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// This routine panics if the search could not complete. This can occur
|
||||
/// in a number of circumstances:
|
||||
///
|
||||
/// * The configuration of the lazy DFA may permit it to "quit" the search.
|
||||
/// For example, setting quit bytes or enabling heuristic support for
|
||||
/// Unicode word boundaries. The default configuration does not enable any
|
||||
/// option that could result in the lazy DFA quitting.
|
||||
/// * The configuration of the lazy DFA may also permit it to "give up"
|
||||
/// on a search if it makes ineffective use of its transition table
|
||||
/// cache. The default configuration does not enable this by default,
|
||||
/// although it is typically a good idea to.
|
||||
/// * When the provided `Input` configuration is not supported. For
|
||||
/// example, by providing an unsupported anchor mode.
|
||||
///
|
||||
/// When a search panics, callers cannot know whether a match exists or
|
||||
/// not.
|
||||
///
|
||||
/// The above conditions also apply to the iterator returned as well. For
|
||||
/// example, if the lazy DFA gives up or quits during a search using this
|
||||
/// method, then a panic will occur during iteration.
|
||||
///
|
||||
/// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher)
|
||||
/// if you want to handle these error conditions.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{hybrid::regex::Regex, Match};
|
||||
///
|
||||
/// let re = Regex::new("foo[0-9]+")?;
|
||||
/// let mut cache = re.create_cache();
|
||||
///
|
||||
/// let text = "foo1 foo12 foo123";
|
||||
/// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
|
||||
/// assert_eq!(matches, vec![
|
||||
/// Match::must(0, 0..4),
|
||||
/// Match::must(0, 5..10),
|
||||
/// Match::must(0, 11..17),
|
||||
/// ]);
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[inline]
|
||||
pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
|
||||
&'r self,
|
||||
cache: &'c mut Cache,
|
||||
input: I,
|
||||
) -> FindMatches<'r, 'c, 'h> {
|
||||
let it = iter::Searcher::new(input.into());
|
||||
FindMatches { re: self, cache, it }
|
||||
}
|
||||
}
|
||||
|
||||
/// Lower level "search" primitives that accept a `&Input` for cheap reuse
/// and return an error if one occurs instead of panicking.
impl Regex {
    /// Returns the start and end offset of the leftmost match. If no match
    /// exists, then `None` is returned.
    ///
    /// This is like [`Regex::find`] but with two differences:
    ///
    /// 1. It is not generic over `Into<Input>` and instead accepts a
    /// `&Input`. This permits reusing the same `Input` for multiple searches
    /// without needing to create a new one. This _may_ help with latency.
    /// 2. It returns an error if the search could not complete where as
    /// [`Regex::find`] will panic.
    ///
    /// # Errors
    ///
    /// This routine errors if the search could not complete. This can occur
    /// in a number of circumstances:
    ///
    /// * The configuration of the lazy DFA may permit it to "quit" the search.
    /// For example, setting quit bytes or enabling heuristic support for
    /// Unicode word boundaries. The default configuration does not enable any
    /// option that could result in the lazy DFA quitting.
    /// * The configuration of the lazy DFA may also permit it to "give up"
    /// on a search if it makes ineffective use of its transition table
    /// cache. The default configuration does not enable this by default,
    /// although it is typically a good idea to.
    /// * When the provided `Input` configuration is not supported. For
    /// example, by providing an unsupported anchor mode.
    ///
    /// When a search returns an error, callers cannot know whether a match
    /// exists or not.
    #[inline]
    pub fn try_search(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Result<Option<Match>, MatchError> {
        // Split the cache up front so the forward and reverse halves can be
        // borrowed independently below.
        let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse);
        // Phase 1: the forward DFA finds where the leftmost match *ends*.
        let end = match self.forward().try_search_fwd(fcache, input)? {
            None => return Ok(None),
            Some(end) => end,
        };
        // This special cases an empty match at the beginning of the search. If
        // our end matches our start, then since a reverse DFA can't match past
        // the start, it must follow that our starting position is also our end
        // position. So short circuit and skip the reverse search.
        if input.start() == end.offset() {
            return Ok(Some(Match::new(
                end.pattern(),
                end.offset()..end.offset(),
            )));
        }
        // We can also skip the reverse search if we know our search was
        // anchored. This occurs either when the input config is anchored or
        // when we know the regex itself is anchored. In this case, we know the
        // start of the match, if one is found, must be the start of the
        // search.
        if self.is_anchored(input) {
            return Ok(Some(Match::new(
                end.pattern(),
                input.start()..end.offset(),
            )));
        }
        // N.B. I have tentatively convinced myself that it isn't necessary
        // to specify the specific pattern for the reverse search since the
        // reverse search will always find the same pattern to match as the
        // forward search. But I lack a rigorous proof. Why not just provide
        // the pattern anyway? Well, if it is needed, then leaving it out
        // gives us a chance to find a witness. (Also, if we don't need to
        // specify the pattern, then we don't need to build the reverse DFA
        // with 'starts_for_each_pattern' enabled. It doesn't matter too much
        // for the lazy DFA, but does make the overall DFA bigger.)
        //
        // We also need to be careful to disable 'earliest' for the reverse
        // search, since it could be enabled for the forward search. In the
        // reverse case, to satisfy "leftmost" criteria, we need to match as
        // much as we can. We also need to be careful to make the search
        // anchored. We don't want the reverse search to report any matches
        // other than the one beginning at the end of our forward search.
        let revsearch = input
            .clone()
            .span(input.start()..end.offset())
            .anchored(Anchored::Yes)
            .earliest(false);
        // Phase 2: the reverse DFA scans backwards from the end offset to
        // recover where the match *starts*.
        let start = self
            .reverse()
            .try_search_rev(rcache, &revsearch)?
            .expect("reverse search must match if forward search does");
        debug_assert_eq!(
            start.pattern(),
            end.pattern(),
            "forward and reverse search must match same pattern",
        );
        debug_assert!(start.offset() <= end.offset());
        Ok(Some(Match::new(end.pattern(), start.offset()..end.offset())))
    }

    /// Returns true if either the given input specifies an anchored search
    /// or if the underlying NFA is always anchored.
    fn is_anchored(&self, input: &Input<'_>) -> bool {
        match input.get_anchored() {
            // Even an "unanchored" input is effectively anchored when the
            // compiled NFA itself must start at the search position.
            Anchored::No => {
                self.forward().get_nfa().is_always_start_anchored()
            }
            Anchored::Yes | Anchored::Pattern(_) => true,
        }
    }
}
|
||||
|
||||
/// Non-search APIs for querying information about the regex and setting a
|
||||
/// prefilter.
|
||||
impl Regex {
|
||||
/// Return the underlying lazy DFA responsible for forward matching.
|
||||
///
|
||||
/// This is useful for accessing the underlying lazy DFA and using it
|
||||
/// directly if the situation calls for it.
|
||||
pub fn forward(&self) -> &DFA {
|
||||
&self.forward
|
||||
}
|
||||
|
||||
/// Return the underlying lazy DFA responsible for reverse matching.
|
||||
///
|
||||
/// This is useful for accessing the underlying lazy DFA and using it
|
||||
/// directly if the situation calls for it.
|
||||
pub fn reverse(&self) -> &DFA {
|
||||
&self.reverse
|
||||
}
|
||||
|
||||
/// Returns the total number of patterns matched by this regex.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
|
||||
/// use regex_automata::hybrid::regex::Regex;
|
||||
///
|
||||
/// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?;
|
||||
/// assert_eq!(3, re.pattern_len());
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
pub fn pattern_len(&self) -> usize {
|
||||
assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len());
|
||||
self.forward().pattern_len()
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over all non-overlapping matches for an infallible search.
///
/// The iterator yields a [`Match`] value until no more matches could be found.
/// If the underlying regex engine returns an error, then a panic occurs.
///
/// The lifetime parameters are as follows:
///
/// * `'r` represents the lifetime of the regex object.
/// * `'c` represents the lifetime of the regex cache.
/// * `'h` represents the lifetime of the haystack being searched.
///
/// This iterator can be created with the [`Regex::find_iter`] method.
#[derive(Debug)]
pub struct FindMatches<'r, 'c, 'h> {
    // The regex driving the search.
    re: &'r Regex,
    // Mutable scratch space for the lazy DFA's transition tables.
    cache: &'c mut Cache,
    // Low-level iteration state over the haystack.
    it: iter::Searcher<'h>,
}
|
||||
|
||||
impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> {
|
||||
type Item = Match;
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<Match> {
|
||||
let FindMatches { re, ref mut cache, ref mut it } = *self;
|
||||
it.advance(|input| re.try_search(cache, input))
|
||||
}
|
||||
}
|
||||
|
||||
/// A cache represents a partially computed forward and reverse DFA.
///
/// A cache is the key component that differentiates a classical DFA and a
/// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a
/// complete transition table that can handle all possible inputs, a hybrid
/// NFA/DFA starts with an empty transition table and builds only the parts
/// required during search. The parts that are built are stored in a cache. For
/// this reason, a cache is a required parameter for nearly every operation on
/// a [`Regex`].
///
/// Caches can be created from their corresponding `Regex` via
/// [`Regex::create_cache`]. A cache can only be used with either the `Regex`
/// that created it, or the `Regex` that was most recently used to reset it
/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in
/// panics or incorrect results.
#[derive(Debug, Clone)]
pub struct Cache {
    // Lazily-built transition table state for the forward DFA.
    forward: dfa::Cache,
    // Lazily-built transition table state for the reverse DFA.
    reverse: dfa::Cache,
}
|
||||
|
||||
impl Cache {
|
||||
/// Create a new cache for the given `Regex`.
|
||||
///
|
||||
/// The cache returned should only be used for searches for the given
|
||||
/// `Regex`. If you want to reuse the cache for another `Regex`, then you
|
||||
/// must call [`Cache::reset`] with that `Regex`.
|
||||
pub fn new(re: &Regex) -> Cache {
|
||||
let forward = dfa::Cache::new(re.forward());
|
||||
let reverse = dfa::Cache::new(re.reverse());
|
||||
Cache { forward, reverse }
|
||||
}
|
||||
|
||||
/// Reset this cache such that it can be used for searching with the given
|
||||
/// `Regex` (and only that `Regex`).
|
||||
///
|
||||
/// A cache reset permits reusing memory already allocated in this cache
|
||||
/// with a different `Regex`.
|
||||
///
|
||||
/// Resetting a cache sets its "clear count" to 0. This is relevant if the
|
||||
/// `Regex` has been configured to "give up" after it has cleared the cache
|
||||
/// a certain number of times.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This shows how to re-purpose a cache for use with a different `Regex`.
|
||||
///
|
||||
/// ```
|
||||
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
|
||||
/// use regex_automata::{hybrid::regex::Regex, Match};
|
||||
///
|
||||
/// let re1 = Regex::new(r"\w")?;
|
||||
/// let re2 = Regex::new(r"\W")?;
|
||||
///
|
||||
/// let mut cache = re1.create_cache();
|
||||
/// assert_eq!(
|
||||
/// Some(Match::must(0, 0..2)),
|
||||
/// re1.find(&mut cache, "Δ"),
|
||||
/// );
|
||||
///
|
||||
/// // Using 'cache' with re2 is not allowed. It may result in panics or
|
||||
/// // incorrect results. In order to re-purpose the cache, we must reset
|
||||
/// // it with the Regex we'd like to use it with.
|
||||
/// //
|
||||
/// // Similarly, after this reset, using the cache with 're1' is also not
|
||||
/// // allowed.
|
||||
/// cache.reset(&re2);
|
||||
/// assert_eq!(
|
||||
/// Some(Match::must(0, 0..3)),
|
||||
/// re2.find(&mut cache, "☃"),
|
||||
/// );
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
pub fn reset(&mut self, re: &Regex) {
|
||||
self.forward.reset(re.forward());
|
||||
self.reverse.reset(re.reverse());
|
||||
}
|
||||
|
||||
/// Return a reference to the forward cache.
|
||||
pub fn forward(&mut self) -> &dfa::Cache {
|
||||
&self.forward
|
||||
}
|
||||
|
||||
/// Return a reference to the reverse cache.
|
||||
pub fn reverse(&mut self) -> &dfa::Cache {
|
||||
&self.reverse
|
||||
}
|
||||
|
||||
/// Return a mutable reference to the forward cache.
|
||||
///
|
||||
/// If you need mutable references to both the forward and reverse caches,
|
||||
/// then use [`Cache::as_parts_mut`].
|
||||
pub fn forward_mut(&mut self) -> &mut dfa::Cache {
|
||||
&mut self.forward
|
||||
}
|
||||
|
||||
/// Return a mutable reference to the reverse cache.
|
||||
///
|
||||
/// If you need mutable references to both the forward and reverse caches,
|
||||
/// then use [`Cache::as_parts_mut`].
|
||||
pub fn reverse_mut(&mut self) -> &mut dfa::Cache {
|
||||
&mut self.reverse
|
||||
}
|
||||
|
||||
/// Return references to the forward and reverse caches, respectively.
|
||||
pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) {
|
||||
(&self.forward, &self.reverse)
|
||||
}
|
||||
|
||||
/// Return mutable references to the forward and reverse caches,
|
||||
/// respectively.
|
||||
pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) {
|
||||
(&mut self.forward, &mut self.reverse)
|
||||
}
|
||||
|
||||
/// Returns the heap memory usage, in bytes, as a sum of the forward and
|
||||
/// reverse lazy DFA caches.
|
||||
///
|
||||
/// This does **not** include the stack size used up by this cache. To
|
||||
/// compute that, use `std::mem::size_of::<Cache>()`.
|
||||
pub fn memory_usage(&self) -> usize {
|
||||
self.forward.memory_usage() + self.reverse.memory_usage()
|
||||
}
|
||||
}
|
||||
|
||||
/// A builder for a regex based on a hybrid NFA/DFA.
///
/// This builder permits configuring options for the syntax of a pattern, the
/// NFA construction, the lazy DFA construction and finally the regex searching
/// itself. This builder is different from a general purpose regex builder
/// in that it permits fine grain configuration of the construction process.
/// The trade off for this is complexity, and the possibility of setting a
/// configuration that might not make sense. For example, there are two
/// different UTF-8 modes:
///
/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
/// whether the pattern itself can contain sub-expressions that match invalid
/// UTF-8.
/// * [`thompson::Config::utf8`] controls how the regex iterators themselves
/// advance the starting position of the next search when a match with zero
/// length is found.
///
/// Generally speaking, callers will want to either enable all of these or
/// disable all of these.
///
/// Internally, building a regex requires building two hybrid NFA/DFAs,
/// where one is responsible for finding the end of a match and the other is
/// responsible for finding the start of a match. If you only need to detect
/// whether something matched, or only the end of a match, then you should use
/// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper
/// than building two of them.
///
/// # Example
///
/// This example shows how to disable UTF-8 mode in the syntax and the regex
/// itself. This is generally what you want for matching on arbitrary bytes.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
///     hybrid::regex::Regex, nfa::thompson, util::syntax, Match,
/// };
///
/// let re = Regex::builder()
///     .syntax(syntax::Config::new().utf8(false))
///     .thompson(thompson::Config::new().utf8(false))
///     .build(r"foo(?-u:[^b])ar.*")?;
/// let mut cache = re.create_cache();
///
/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
/// let expected = Some(Match::must(0, 1..9));
/// let got = re.find(&mut cache, haystack);
/// assert_eq!(expected, got);
/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
/// // but the subsequent `.*` does not! Disabling UTF-8
/// // on the syntax permits this.
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct Builder {
    // A single lazy-DFA builder drives construction of both the forward and
    // reverse DFAs; `build_many` clones and tweaks it for the reverse side.
    dfa: dfa::Builder,
}
|
||||
|
||||
impl Builder {
|
||||
/// Create a new regex builder with the default configuration.
|
||||
pub fn new() -> Builder {
|
||||
Builder { dfa: DFA::builder() }
|
||||
}
|
||||
|
||||
/// Build a regex from the given pattern.
|
||||
///
|
||||
/// If there was a problem parsing or compiling the pattern, then an error
|
||||
/// is returned.
|
||||
#[cfg(feature = "syntax")]
|
||||
pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
|
||||
self.build_many(&[pattern])
|
||||
}
|
||||
|
||||
/// Build a regex from the given patterns.
|
||||
#[cfg(feature = "syntax")]
|
||||
pub fn build_many<P: AsRef<str>>(
|
||||
&self,
|
||||
patterns: &[P],
|
||||
) -> Result<Regex, BuildError> {
|
||||
let forward = self.dfa.build_many(patterns)?;
|
||||
let reverse = self
|
||||
.dfa
|
||||
.clone()
|
||||
.configure(
|
||||
DFA::config()
|
||||
.prefilter(None)
|
||||
.specialize_start_states(false)
|
||||
.match_kind(MatchKind::All),
|
||||
)
|
||||
.thompson(thompson::Config::new().reverse(true))
|
||||
.build_many(patterns)?;
|
||||
Ok(self.build_from_dfas(forward, reverse))
|
||||
}
|
||||
|
||||
/// Build a regex from its component forward and reverse hybrid NFA/DFAs.
|
||||
///
|
||||
/// This is useful when you've built a forward and reverse lazy DFA
|
||||
/// separately, and want to combine them into a single regex. Once build,
|
||||
/// the individual DFAs given can still be accessed via [`Regex::forward`]
|
||||
/// and [`Regex::reverse`].
|
||||
///
|
||||
/// It is important that the reverse lazy DFA be compiled under the
|
||||
/// following conditions:
|
||||
///
|
||||
/// * It should use [`MatchKind::All`] semantics.
|
||||
/// * It should match in reverse.
|
||||
/// * Otherwise, its configuration should match the forward DFA.
|
||||
///
|
||||
/// If these conditions aren't satisfied, then the behavior of searches is
|
||||
/// unspecified.
|
||||
///
|
||||
/// Note that when using this constructor, no configuration is applied.
|
||||
/// Since this routine provides the DFAs to the builder, there is no
|
||||
/// opportunity to apply other configuration options.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This shows how to build individual lazy forward and reverse DFAs, and
|
||||
/// then combine them into a single `Regex`.
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{
|
||||
/// hybrid::{dfa::DFA, regex::Regex},
|
||||
/// nfa::thompson,
|
||||
/// MatchKind,
|
||||
/// };
|
||||
///
|
||||
/// let fwd = DFA::new(r"foo[0-9]+")?;
|
||||
/// let rev = DFA::builder()
|
||||
/// .configure(DFA::config().match_kind(MatchKind::All))
|
||||
/// .thompson(thompson::Config::new().reverse(true))
|
||||
/// .build(r"foo[0-9]+")?;
|
||||
///
|
||||
/// let re = Regex::builder().build_from_dfas(fwd, rev);
|
||||
/// let mut cache = re.create_cache();
|
||||
/// assert_eq!(true, re.is_match(&mut cache, "foo123"));
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
pub fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex {
|
||||
Regex { forward, reverse }
|
||||
}
|
||||
|
||||
/// Set the syntax configuration for this builder using
|
||||
/// [`syntax::Config`](crate::util::syntax::Config).
|
||||
///
|
||||
/// This permits setting things like case insensitivity, Unicode and multi
|
||||
/// line mode.
|
||||
#[cfg(feature = "syntax")]
|
||||
pub fn syntax(
|
||||
&mut self,
|
||||
config: crate::util::syntax::Config,
|
||||
) -> &mut Builder {
|
||||
self.dfa.syntax(config);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the Thompson NFA configuration for this builder using
|
||||
/// [`nfa::thompson::Config`](thompson::Config).
|
||||
///
|
||||
/// This permits setting things like whether additional time should be
|
||||
/// spent shrinking the size of the NFA.
|
||||
#[cfg(feature = "syntax")]
|
||||
pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
|
||||
self.dfa.thompson(config);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the lazy DFA compilation configuration for this builder using
|
||||
/// [`dfa::Config`].
|
||||
///
|
||||
/// This permits setting things like whether Unicode word boundaries should
|
||||
/// be heuristically supported or settings how the behavior of the cache.
|
||||
pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder {
|
||||
self.dfa.configure(config);
|
||||
self
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Builder {
    /// Equivalent to [`Builder::new`]: a builder with the default
    /// configuration for syntax, NFA and lazy-DFA construction.
    fn default() -> Builder {
        Builder::new()
    }
}
|
||||
802
third-party/vendor/regex-automata/src/hybrid/search.rs
vendored
Normal file
802
third-party/vendor/regex-automata/src/hybrid/search.rs
vendored
Normal file
|
|
@ -0,0 +1,802 @@
|
|||
use crate::{
|
||||
hybrid::{
|
||||
dfa::{Cache, OverlappingState, DFA},
|
||||
id::LazyStateID,
|
||||
},
|
||||
util::{
|
||||
prefilter::Prefilter,
|
||||
search::{HalfMatch, Input, MatchError, Span},
|
||||
},
|
||||
};
|
||||
|
||||
#[inline(never)]
|
||||
pub(crate) fn find_fwd(
|
||||
dfa: &DFA,
|
||||
cache: &mut Cache,
|
||||
input: &Input<'_>,
|
||||
) -> Result<Option<HalfMatch>, MatchError> {
|
||||
if input.is_done() {
|
||||
return Ok(None);
|
||||
}
|
||||
let pre = if input.get_anchored().is_anchored() {
|
||||
None
|
||||
} else {
|
||||
dfa.get_config().get_prefilter()
|
||||
};
|
||||
// So what we do here is specialize four different versions of 'find_fwd':
|
||||
// one for each of the combinations for 'has prefilter' and 'is earliest
|
||||
// search'. The reason for doing this is that both of these things require
|
||||
// branches and special handling in some code that can be very hot,
|
||||
// and shaving off as much as we can when we don't need it tends to be
|
||||
// beneficial in ad hoc benchmarks. To see these differences, you often
|
||||
// need a query with a high match count. In other words, specializing these
|
||||
// four routines *tends* to help latency more than throughput.
|
||||
if pre.is_some() {
|
||||
if input.get_earliest() {
|
||||
find_fwd_imp(dfa, cache, input, pre, true)
|
||||
} else {
|
||||
find_fwd_imp(dfa, cache, input, pre, false)
|
||||
}
|
||||
} else {
|
||||
if input.get_earliest() {
|
||||
find_fwd_imp(dfa, cache, input, None, true)
|
||||
} else {
|
||||
find_fwd_imp(dfa, cache, input, None, false)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn find_fwd_imp(
|
||||
dfa: &DFA,
|
||||
cache: &mut Cache,
|
||||
input: &Input<'_>,
|
||||
pre: Option<&'_ Prefilter>,
|
||||
earliest: bool,
|
||||
) -> Result<Option<HalfMatch>, MatchError> {
|
||||
// See 'prefilter_restart' docs for explanation.
|
||||
let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty();
|
||||
let mut mat = None;
|
||||
let mut sid = init_fwd(dfa, cache, input)?;
|
||||
let mut at = input.start();
|
||||
// This could just be a closure, but then I think it would be unsound
|
||||
// because it would need to be safe to invoke. This way, the lack of safety
|
||||
// is clearer in the code below.
|
||||
macro_rules! next_unchecked {
|
||||
($sid:expr, $at:expr) => {{
|
||||
let byte = *input.haystack().get_unchecked($at);
|
||||
dfa.next_state_untagged_unchecked(cache, $sid, byte)
|
||||
}};
|
||||
}
|
||||
|
||||
if let Some(ref pre) = pre {
|
||||
let span = Span::from(at..input.end());
|
||||
match pre.find(input.haystack(), span) {
|
||||
None => return Ok(mat),
|
||||
Some(ref span) => {
|
||||
at = span.start;
|
||||
if !universal_start {
|
||||
sid = prefilter_restart(dfa, cache, &input, at)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
cache.search_start(at);
|
||||
while at < input.end() {
|
||||
if sid.is_tagged() {
|
||||
cache.search_update(at);
|
||||
sid = dfa
|
||||
.next_state(cache, sid, input.haystack()[at])
|
||||
.map_err(|_| gave_up(at))?;
|
||||
} else {
|
||||
// SAFETY: There are two safety invariants we need to uphold
|
||||
// here in the loops below: that 'sid' and 'prev_sid' are valid
|
||||
// state IDs for this DFA, and that 'at' is a valid index into
|
||||
// 'haystack'. For the former, we rely on the invariant that
|
||||
// next_state* and start_state_forward always returns a valid state
|
||||
// ID (given a valid state ID in the former case), and that we are
|
||||
// only at this place in the code if 'sid' is untagged. Moreover,
|
||||
// every call to next_state_untagged_unchecked below is guarded by
|
||||
// a check that sid is untagged. For the latter safety invariant,
|
||||
// we always guard unchecked access with a check that 'at' is less
|
||||
// than 'end', where 'end <= haystack.len()'. In the unrolled loop
|
||||
// below, we ensure that 'at' is always in bounds.
|
||||
//
|
||||
// PERF: For justification of omitting bounds checks, it gives us a
|
||||
// ~10% bump in search time. This was used for a benchmark:
|
||||
//
|
||||
// regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile
|
||||
//
|
||||
// PERF: For justification for the loop unrolling, we use a few
|
||||
// different tests:
|
||||
//
|
||||
// regex-cli find half hybrid -p '\w{50}' -UBb bigfile
|
||||
// regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile
|
||||
// regex-cli find half hybrid -p 'ZQZQZQZQ' -UBb bigfile
|
||||
//
|
||||
// And there are three different configurations:
|
||||
//
|
||||
// nounroll: this entire 'else' block vanishes and we just
|
||||
// always use 'dfa.next_state(..)'.
|
||||
// unroll1: just the outer loop below
|
||||
// unroll2: just the inner loop below
|
||||
// unroll3: both the outer and inner loops below
|
||||
//
|
||||
// This results in a matrix of timings for each of the above
|
||||
// regexes with each of the above unrolling configurations:
|
||||
//
|
||||
// '\w{50}' '(?m)^.+$' 'ZQZQZQZQ'
|
||||
// nounroll 1.51s 2.34s 1.51s
|
||||
// unroll1 1.53s 2.32s 1.56s
|
||||
// unroll2 2.22s 1.50s 0.61s
|
||||
// unroll3 1.67s 1.45s 0.61s
|
||||
//
|
||||
// Ideally we'd be able to find a configuration that yields the
|
||||
// best time for all regexes, but alas we settle for unroll3 that
|
||||
// gives us *almost* the best for '\w{50}' and the best for the
|
||||
// other two regexes.
|
||||
//
|
||||
// So what exactly is going on here? The first unrolling (grouping
|
||||
// together runs of untagged transitions) specifically targets
|
||||
// our choice of representation. The second unrolling (grouping
|
||||
// together runs of self-transitions) specifically targets a common
|
||||
// DFA topology. Let's dig in a little bit by looking at our
|
||||
// regexes:
|
||||
//
|
||||
// '\w{50}': This regex spends a lot of time outside of the DFA's
|
||||
// start state matching some part of the '\w' repetition. This
|
||||
// means that it's a bit of a worst case for loop unrolling that
|
||||
// targets self-transitions since the self-transitions in '\w{50}'
|
||||
// are not particularly active for this haystack. However, the
|
||||
// first unrolling (grouping together untagged transitions)
|
||||
// does apply quite well here since very few transitions hit
|
||||
// match/dead/quit/unknown states. It is however worth mentioning
|
||||
// that if start states are configured to be tagged (which you
|
||||
// typically want to do if you have a prefilter), then this regex
|
||||
// actually slows way down because it is constantly ping-ponging
|
||||
// out of the unrolled loop and into the handling of a tagged start
|
||||
// state below. But when start states aren't tagged, the unrolled
|
||||
// loop stays hot. (This is why it's imperative that start state
|
||||
// tagging be disabled when there isn't a prefilter!)
|
||||
//
|
||||
// '(?m)^.+$': There are two important aspects of this regex: 1)
|
||||
// on this haystack, its match count is very high, much higher
|
||||
// than the other two regex and 2) it spends the vast majority
|
||||
// of its time matching '.+'. Since Unicode mode is disabled,
|
||||
// this corresponds to repeatedly following self transitions for
|
||||
// the vast majority of the input. This does benefit from the
|
||||
// untagged unrolling since most of the transitions will be to
|
||||
// untagged states, but the untagged unrolling does more work than
|
||||
// what is actually required. Namely, it has to keep track of the
|
||||
// previous and next state IDs, which I guess requires a bit more
|
||||
// shuffling. This is supported by the fact that nounroll+unroll1
|
||||
// are both slower than unroll2+unroll3, where the latter has a
|
||||
// loop unrolling that specifically targets self-transitions.
|
||||
//
|
||||
// 'ZQZQZQZQ': This one is very similar to '(?m)^.+$' because it
|
||||
// spends the vast majority of its time in self-transitions for
|
||||
// the (implicit) unanchored prefix. The main difference with
|
||||
// '(?m)^.+$' is that it has a much lower match count. So there
|
||||
// isn't much time spent in the overhead of reporting matches. This
|
||||
// is the primary explainer in the perf difference here. We include
|
||||
// this regex and the former to make sure we have comparison points
|
||||
// with high and low match counts.
|
||||
//
|
||||
// NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'.
|
||||
//
|
||||
// NOTE: In a follow-up, it turns out that the "inner" loop
|
||||
// mentioned above was a pretty big pessimization in some other
|
||||
// cases. Namely, it resulted in too much ping-ponging into and out
|
||||
// of the loop, which resulted in nearly ~2x regressions in search
|
||||
// time when compared to the originally lazy DFA in the regex crate.
|
||||
// So I've removed the second loop unrolling that targets the
|
||||
// self-transition case.
|
||||
let mut prev_sid = sid;
|
||||
while at < input.end() {
|
||||
prev_sid = unsafe { next_unchecked!(sid, at) };
|
||||
if prev_sid.is_tagged() || at + 3 >= input.end() {
|
||||
core::mem::swap(&mut prev_sid, &mut sid);
|
||||
break;
|
||||
}
|
||||
at += 1;
|
||||
|
||||
sid = unsafe { next_unchecked!(prev_sid, at) };
|
||||
if sid.is_tagged() {
|
||||
break;
|
||||
}
|
||||
at += 1;
|
||||
|
||||
prev_sid = unsafe { next_unchecked!(sid, at) };
|
||||
if prev_sid.is_tagged() {
|
||||
core::mem::swap(&mut prev_sid, &mut sid);
|
||||
break;
|
||||
}
|
||||
at += 1;
|
||||
|
||||
sid = unsafe { next_unchecked!(prev_sid, at) };
|
||||
if sid.is_tagged() {
|
||||
break;
|
||||
}
|
||||
at += 1;
|
||||
}
|
||||
// If we quit out of the code above with an unknown state ID at
|
||||
// any point, then we need to re-compute that transition using
|
||||
// 'next_state', which will do NFA powerset construction for us.
|
||||
if sid.is_unknown() {
|
||||
cache.search_update(at);
|
||||
sid = dfa
|
||||
.next_state(cache, prev_sid, input.haystack()[at])
|
||||
.map_err(|_| gave_up(at))?;
|
||||
}
|
||||
}
|
||||
if sid.is_tagged() {
|
||||
if sid.is_start() {
|
||||
if let Some(ref pre) = pre {
|
||||
let span = Span::from(at..input.end());
|
||||
match pre.find(input.haystack(), span) {
|
||||
None => {
|
||||
cache.search_finish(span.end);
|
||||
return Ok(mat);
|
||||
}
|
||||
Some(ref span) => {
|
||||
// We want to skip any update to 'at' below
|
||||
// at the end of this iteration and just
|
||||
// jump immediately back to the next state
|
||||
// transition at the leading position of the
|
||||
// candidate match.
|
||||
//
|
||||
// ... but only if we actually made progress
|
||||
// with our prefilter, otherwise if the start
|
||||
// state has a self-loop, we can get stuck.
|
||||
if span.start > at {
|
||||
at = span.start;
|
||||
if !universal_start {
|
||||
sid = prefilter_restart(
|
||||
dfa, cache, &input, at,
|
||||
)?;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if sid.is_match() {
|
||||
let pattern = dfa.match_pattern(cache, sid, 0);
|
||||
// Since slice ranges are inclusive at the beginning and
|
||||
// exclusive at the end, and since forward searches report
|
||||
// the end, we can return 'at' as-is. This only works because
|
||||
// matches are delayed by 1 byte. So by the time we observe a
|
||||
// match, 'at' has already been set to 1 byte past the actual
|
||||
// match location, which is precisely the exclusive ending
|
||||
// bound of the match.
|
||||
mat = Some(HalfMatch::new(pattern, at));
|
||||
if earliest {
|
||||
cache.search_finish(at);
|
||||
return Ok(mat);
|
||||
}
|
||||
} else if sid.is_dead() {
|
||||
cache.search_finish(at);
|
||||
return Ok(mat);
|
||||
} else if sid.is_quit() {
|
||||
cache.search_finish(at);
|
||||
return Err(MatchError::quit(input.haystack()[at], at));
|
||||
} else {
|
||||
debug_assert!(sid.is_unknown());
|
||||
unreachable!("sid being unknown is a bug");
|
||||
}
|
||||
}
|
||||
at += 1;
|
||||
}
|
||||
eoi_fwd(dfa, cache, input, &mut sid, &mut mat)?;
|
||||
cache.search_finish(input.end());
|
||||
Ok(mat)
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
pub(crate) fn find_rev(
|
||||
dfa: &DFA,
|
||||
cache: &mut Cache,
|
||||
input: &Input<'_>,
|
||||
) -> Result<Option<HalfMatch>, MatchError> {
|
||||
if input.is_done() {
|
||||
return Ok(None);
|
||||
}
|
||||
if input.get_earliest() {
|
||||
find_rev_imp(dfa, cache, input, true)
|
||||
} else {
|
||||
find_rev_imp(dfa, cache, input, false)
|
||||
}
|
||||
}
|
||||
|
||||
/// The implementation of reverse search. The `earliest` flag is passed as a
/// plain bool (rather than read from `input`) so that the two call sites in
/// `find_rev` monomorphize into specialized copies where the "stop at first
/// match" check can be compiled away.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_rev_imp(
    dfa: &DFA,
    cache: &mut Cache,
    input: &Input<'_>,
    earliest: bool,
) -> Result<Option<HalfMatch>, MatchError> {
    let mut mat = None;
    let mut sid = init_rev(dfa, cache, input)?;
    // In reverse search, the loop below can't handle the case of searching an
    // empty slice. Ideally we could write something congruent to the forward
    // search, i.e., 'while at >= start', but 'start' might be 0. Since we use
    // an unsigned offset, 'at >= 0' is trivially always true. We could avoid
    // this extra case handling by using a signed offset, but Rust makes it
    // annoying to do. So... We just handle the empty case separately.
    if input.start() == input.end() {
        eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
        return Ok(mat);
    }

    let mut at = input.end() - 1;
    macro_rules! next_unchecked {
        ($sid:expr, $at:expr) => {{
            let byte = *input.haystack().get_unchecked($at);
            dfa.next_state_untagged_unchecked(cache, $sid, byte)
        }};
    }
    cache.search_start(at);
    loop {
        if sid.is_tagged() {
            cache.search_update(at);
            sid = dfa
                .next_state(cache, sid, input.haystack()[at])
                .map_err(|_| gave_up(at))?;
        } else {
            // SAFETY: See comments in 'find_fwd' for a safety argument.
            //
            // PERF: The comments in 'find_fwd' also provide a justification
            // from a performance perspective as to 1) why we elide bounds
            // checks and 2) why we do a specialized version of unrolling
            // below. The reverse search does have a slightly different
            // consideration in that most reverse searches tend to be
            // anchored and on shorter haystacks. However, this still makes a
            // difference. Take this command for example:
            //
            //     regex-cli find match hybrid -p '(?m)^.+$' -UBb bigfile
            //
            // (Notice that we use 'find hybrid regex', not 'find hybrid dfa'
            // like in the justification for the forward direction. The 'regex'
            // sub-command will find start-of-match and thus run the reverse
            // direction.)
            //
            // Without unrolling below, the above command takes around 3.76s.
            // But with the unrolling below, we get down to 2.55s. If we keep
            // the unrolling but add in bounds checks, then we get 2.86s.
            //
            // NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'.
            let mut prev_sid = sid;
            while at >= input.start() {
                prev_sid = unsafe { next_unchecked!(sid, at) };
                // The 'at <= start+3' guard ensures the three unchecked
                // decrements below can never underflow past 'input.start()'.
                if prev_sid.is_tagged()
                    || at <= input.start().saturating_add(3)
                {
                    core::mem::swap(&mut prev_sid, &mut sid);
                    break;
                }
                at -= 1;

                sid = unsafe { next_unchecked!(prev_sid, at) };
                if sid.is_tagged() {
                    break;
                }
                at -= 1;

                prev_sid = unsafe { next_unchecked!(sid, at) };
                if prev_sid.is_tagged() {
                    core::mem::swap(&mut prev_sid, &mut sid);
                    break;
                }
                at -= 1;

                sid = unsafe { next_unchecked!(prev_sid, at) };
                if sid.is_tagged() {
                    break;
                }
                at -= 1;
            }
            // If we quit out of the code above with an unknown state ID at
            // any point, then we need to re-compute that transition using
            // 'next_state', which will do NFA powerset construction for us.
            if sid.is_unknown() {
                cache.search_update(at);
                sid = dfa
                    .next_state(cache, prev_sid, input.haystack()[at])
                    .map_err(|_| gave_up(at))?;
            }
        }
        if sid.is_tagged() {
            if sid.is_start() {
                // do nothing
            } else if sid.is_match() {
                let pattern = dfa.match_pattern(cache, sid, 0);
                // Since reverse searches report the beginning of a match
                // and the beginning is inclusive (not exclusive like the
                // end of a match), we add 1 to make it inclusive.
                mat = Some(HalfMatch::new(pattern, at + 1));
                if earliest {
                    cache.search_finish(at);
                    return Ok(mat);
                }
            } else if sid.is_dead() {
                cache.search_finish(at);
                return Ok(mat);
            } else if sid.is_quit() {
                cache.search_finish(at);
                return Err(MatchError::quit(input.haystack()[at], at));
            } else {
                debug_assert!(sid.is_unknown());
                unreachable!("sid being unknown is a bug");
            }
        }
        if at == input.start() {
            break;
        }
        at -= 1;
    }
    cache.search_finish(input.start());
    eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
    Ok(mat)
}
|
||||
|
||||
#[inline(never)]
|
||||
pub(crate) fn find_overlapping_fwd(
|
||||
dfa: &DFA,
|
||||
cache: &mut Cache,
|
||||
input: &Input<'_>,
|
||||
state: &mut OverlappingState,
|
||||
) -> Result<(), MatchError> {
|
||||
state.mat = None;
|
||||
if input.is_done() {
|
||||
return Ok(());
|
||||
}
|
||||
let pre = if input.get_anchored().is_anchored() {
|
||||
None
|
||||
} else {
|
||||
dfa.get_config().get_prefilter()
|
||||
};
|
||||
if pre.is_some() {
|
||||
find_overlapping_fwd_imp(dfa, cache, input, pre, state)
|
||||
} else {
|
||||
find_overlapping_fwd_imp(dfa, cache, input, None, state)
|
||||
}
|
||||
}
|
||||
|
||||
/// The implementation of overlapping forward search. The search is resumable:
/// `state` carries the current DFA state, the current haystack offset and the
/// index of the next match to report at that offset, so each call reports (at
/// most) one match and can be called again to continue where it left off.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn find_overlapping_fwd_imp(
    dfa: &DFA,
    cache: &mut Cache,
    input: &Input<'_>,
    pre: Option<&'_ Prefilter>,
    state: &mut OverlappingState,
) -> Result<(), MatchError> {
    // See 'prefilter_restart' docs for explanation.
    let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty();
    let mut sid = match state.id {
        // First call: start a fresh search at the beginning of the span.
        None => {
            state.at = input.start();
            init_fwd(dfa, cache, input)?
        }
        // Resuming: if the current position still has unreported matches
        // (multiple patterns can match at one position), report the next one
        // without advancing.
        Some(sid) => {
            if let Some(match_index) = state.next_match_index {
                let match_len = dfa.match_len(cache, sid);
                if match_index < match_len {
                    state.next_match_index = Some(match_index + 1);
                    let pattern = dfa.match_pattern(cache, sid, match_index);
                    state.mat = Some(HalfMatch::new(pattern, state.at));
                    return Ok(());
                }
            }
            // Once we've reported all matches at a given position, we need to
            // advance the search to the next position.
            state.at += 1;
            if state.at > input.end() {
                return Ok(());
            }
            sid
        }
    };

    // NOTE: We don't optimize the crap out of this routine primarily because
    // it seems like most overlapping searches will have higher match counts,
    // and thus, throughput is perhaps not as important. But if you have a use
    // case for something faster, feel free to file an issue.
    cache.search_start(state.at);
    while state.at < input.end() {
        sid = dfa
            .next_state(cache, sid, input.haystack()[state.at])
            .map_err(|_| gave_up(state.at))?;
        if sid.is_tagged() {
            state.id = Some(sid);
            if sid.is_start() {
                if let Some(ref pre) = pre {
                    let span = Span::from(state.at..input.end());
                    match pre.find(input.haystack(), span) {
                        None => return Ok(()),
                        Some(ref span) => {
                            // Jump ahead to the prefilter candidate, but only
                            // if it actually made progress; otherwise a start
                            // state with a self-loop could get us stuck.
                            if span.start > state.at {
                                state.at = span.start;
                                if !universal_start {
                                    sid = prefilter_restart(
                                        dfa, cache, &input, state.at,
                                    )?;
                                }
                                continue;
                            }
                        }
                    }
                }
            } else if sid.is_match() {
                state.next_match_index = Some(1);
                let pattern = dfa.match_pattern(cache, sid, 0);
                state.mat = Some(HalfMatch::new(pattern, state.at));
                cache.search_finish(state.at);
                return Ok(());
            } else if sid.is_dead() {
                cache.search_finish(state.at);
                return Ok(());
            } else if sid.is_quit() {
                cache.search_finish(state.at);
                return Err(MatchError::quit(
                    input.haystack()[state.at],
                    state.at,
                ));
            } else {
                debug_assert!(sid.is_unknown());
                unreachable!("sid being unknown is a bug");
            }
        }
        state.at += 1;
        cache.search_update(state.at);
    }

    let result = eoi_fwd(dfa, cache, input, &mut sid, &mut state.mat);
    state.id = Some(sid);
    if state.mat.is_some() {
        // '1' is always correct here since if we get to this point, this
        // always corresponds to the first (index '0') match discovered at
        // this position. So the next match to report at this position (if
        // it exists) is at index '1'.
        state.next_match_index = Some(1);
    }
    cache.search_finish(input.end());
    result
}
|
||||
|
||||
/// An overlapping reverse search. Like the forward variant, the search is
/// resumable via `state`: each call reports at most one match (recorded in
/// `state.mat`) and subsequent calls continue from where the previous call
/// stopped. `state.rev_eoi` records whether the final end-of-input transition
/// has already been followed, in which case the search is complete.
#[inline(never)]
pub(crate) fn find_overlapping_rev(
    dfa: &DFA,
    cache: &mut Cache,
    input: &Input<'_>,
    state: &mut OverlappingState,
) -> Result<(), MatchError> {
    state.mat = None;
    if input.is_done() {
        return Ok(());
    }
    let mut sid = match state.id {
        // First call: start a fresh reverse search at the end of the span.
        None => {
            let sid = init_rev(dfa, cache, input)?;
            state.id = Some(sid);
            if input.start() == input.end() {
                // Empty span: go straight to the EOI transition below.
                state.rev_eoi = true;
            } else {
                state.at = input.end() - 1;
            }
            sid
        }
        // Resuming: report any remaining matches at the current position
        // before advancing.
        Some(sid) => {
            if let Some(match_index) = state.next_match_index {
                let match_len = dfa.match_len(cache, sid);
                if match_index < match_len {
                    state.next_match_index = Some(match_index + 1);
                    let pattern = dfa.match_pattern(cache, sid, match_index);
                    state.mat = Some(HalfMatch::new(pattern, state.at));
                    return Ok(());
                }
            }
            // Once we've reported all matches at a given position, we need
            // to advance the search to the next position. However, if we've
            // already followed the EOI transition, then we know we're done
            // with the search and there cannot be any more matches to report.
            if state.rev_eoi {
                return Ok(());
            } else if state.at == input.start() {
                // At this point, we should follow the EOI transition. This
                // will cause us to skip the main loop below and fall through
                // to the final 'eoi_rev' transition.
                state.rev_eoi = true;
            } else {
                // We haven't hit the end of the search yet, so move on.
                state.at -= 1;
            }
            sid
        }
    };
    cache.search_start(state.at);
    while !state.rev_eoi {
        sid = dfa
            .next_state(cache, sid, input.haystack()[state.at])
            .map_err(|_| gave_up(state.at))?;
        if sid.is_tagged() {
            state.id = Some(sid);
            if sid.is_start() {
                // do nothing
            } else if sid.is_match() {
                state.next_match_index = Some(1);
                let pattern = dfa.match_pattern(cache, sid, 0);
                // Reverse matches report the (inclusive) start of a match,
                // and matches are delayed by 1 byte, hence the '+ 1'.
                state.mat = Some(HalfMatch::new(pattern, state.at + 1));
                cache.search_finish(state.at);
                return Ok(());
            } else if sid.is_dead() {
                cache.search_finish(state.at);
                return Ok(());
            } else if sid.is_quit() {
                cache.search_finish(state.at);
                return Err(MatchError::quit(
                    input.haystack()[state.at],
                    state.at,
                ));
            } else {
                debug_assert!(sid.is_unknown());
                unreachable!("sid being unknown is a bug");
            }
        }
        if state.at == input.start() {
            break;
        }
        state.at -= 1;
        cache.search_update(state.at);
    }

    let result = eoi_rev(dfa, cache, input, &mut sid, &mut state.mat);
    state.rev_eoi = true;
    state.id = Some(sid);
    if state.mat.is_some() {
        // '1' is always correct here since if we get to this point, this
        // always corresponds to the first (index '0') match discovered at
        // this position. So the next match to report at this position (if
        // it exists) is at index '1'.
        state.next_match_index = Some(1);
    }
    cache.search_finish(input.start());
    result
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn init_fwd(
|
||||
dfa: &DFA,
|
||||
cache: &mut Cache,
|
||||
input: &Input<'_>,
|
||||
) -> Result<LazyStateID, MatchError> {
|
||||
let sid = dfa.start_state_forward(cache, input)?;
|
||||
// Start states can never be match states, since all matches are delayed
|
||||
// by 1 byte.
|
||||
debug_assert!(!sid.is_match());
|
||||
Ok(sid)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn init_rev(
|
||||
dfa: &DFA,
|
||||
cache: &mut Cache,
|
||||
input: &Input<'_>,
|
||||
) -> Result<LazyStateID, MatchError> {
|
||||
let sid = dfa.start_state_reverse(cache, input)?;
|
||||
// Start states can never be match states, since all matches are delayed
|
||||
// by 1 byte.
|
||||
debug_assert!(!sid.is_match());
|
||||
Ok(sid)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn eoi_fwd(
|
||||
dfa: &DFA,
|
||||
cache: &mut Cache,
|
||||
input: &Input<'_>,
|
||||
sid: &mut LazyStateID,
|
||||
mat: &mut Option<HalfMatch>,
|
||||
) -> Result<(), MatchError> {
|
||||
let sp = input.get_span();
|
||||
match input.haystack().get(sp.end) {
|
||||
Some(&b) => {
|
||||
*sid =
|
||||
dfa.next_state(cache, *sid, b).map_err(|_| gave_up(sp.end))?;
|
||||
if sid.is_match() {
|
||||
let pattern = dfa.match_pattern(cache, *sid, 0);
|
||||
*mat = Some(HalfMatch::new(pattern, sp.end));
|
||||
} else if sid.is_quit() {
|
||||
return Err(MatchError::quit(b, sp.end));
|
||||
}
|
||||
}
|
||||
None => {
|
||||
*sid = dfa
|
||||
.next_eoi_state(cache, *sid)
|
||||
.map_err(|_| gave_up(input.haystack().len()))?;
|
||||
if sid.is_match() {
|
||||
let pattern = dfa.match_pattern(cache, *sid, 0);
|
||||
*mat = Some(HalfMatch::new(pattern, input.haystack().len()));
|
||||
}
|
||||
// N.B. We don't have to check 'is_quit' here because the EOI
|
||||
// transition can never lead to a quit state.
|
||||
debug_assert!(!sid.is_quit());
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn eoi_rev(
|
||||
dfa: &DFA,
|
||||
cache: &mut Cache,
|
||||
input: &Input<'_>,
|
||||
sid: &mut LazyStateID,
|
||||
mat: &mut Option<HalfMatch>,
|
||||
) -> Result<(), MatchError> {
|
||||
let sp = input.get_span();
|
||||
if sp.start > 0 {
|
||||
let byte = input.haystack()[sp.start - 1];
|
||||
*sid = dfa
|
||||
.next_state(cache, *sid, byte)
|
||||
.map_err(|_| gave_up(sp.start))?;
|
||||
if sid.is_match() {
|
||||
let pattern = dfa.match_pattern(cache, *sid, 0);
|
||||
*mat = Some(HalfMatch::new(pattern, sp.start));
|
||||
} else if sid.is_quit() {
|
||||
return Err(MatchError::quit(byte, sp.start - 1));
|
||||
}
|
||||
} else {
|
||||
*sid =
|
||||
dfa.next_eoi_state(cache, *sid).map_err(|_| gave_up(sp.start))?;
|
||||
if sid.is_match() {
|
||||
let pattern = dfa.match_pattern(cache, *sid, 0);
|
||||
*mat = Some(HalfMatch::new(pattern, 0));
|
||||
}
|
||||
// N.B. We don't have to check 'is_quit' here because the EOI
|
||||
// transition can never lead to a quit state.
|
||||
debug_assert!(!sid.is_quit());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Re-compute the starting state that a DFA should be in after finding a
/// prefilter candidate match at the position `at`.
///
/// It is always correct to call this, but not always necessary. Namely,
/// whenever the DFA has a universal start state, the DFA can remain in the
/// start state that it was in when it ran the prefilter. Why? Because in that
/// case, there is only one start state.
///
/// When does a DFA have a universal start state? In precisely cases where
/// it has no look-around assertions in its prefix. So for example, `\bfoo`
/// does not have a universal start state because the start state depends on
/// whether the byte immediately before the start position is a word byte or
/// not. However, `foo\b` does have a universal start state because the word
/// boundary does not appear in the pattern's prefix.
///
/// So... most cases don't need this, but when a pattern doesn't have a
/// universal start state, then after a prefilter candidate has been found, the
/// current state *must* be re-litigated as if computing the start state at the
/// beginning of the search because it might change. That is, not all start
/// states are created equal.
///
/// Why avoid it? Because while it's not super expensive, it isn't a trivial
/// operation to compute the start state. It is much better to avoid it and
/// just stay in the current state if you know it to be correct.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn prefilter_restart(
    dfa: &DFA,
    cache: &mut Cache,
    input: &Input<'_>,
    at: usize,
) -> Result<LazyStateID, MatchError> {
    // Re-compute the start state as if a fresh search began at 'at', since
    // the correct start state can depend on the byte preceding 'at'.
    let mut input = input.clone();
    input.set_start(at);
    init_fwd(dfa, cache, &input)
}
|
||||
|
||||
/// A convenience routine for constructing a "gave up" match error.
///
/// This exists to keep the many `map_err(|_| gave_up(at))` call sites in
/// this module terse.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn gave_up(offset: usize) -> MatchError {
    MatchError::gave_up(offset)
}
|
||||
648
third-party/vendor/regex-automata/src/lib.rs
vendored
Normal file
648
third-party/vendor/regex-automata/src/lib.rs
vendored
Normal file
|
|
@ -0,0 +1,648 @@
|
|||
/*!
|
||||
This crate exposes a variety of regex engines used by the `regex` crate.
|
||||
It provides a vast, sprawling and "expert" level API to each regex engine.
|
||||
The regex engines provided by this crate focus heavily on finite automata
|
||||
implementations and specifically guarantee worst case `O(m * n)` time
|
||||
complexity for all searches. (Where `m ~ len(regex)` and `n ~ len(haystack)`.)
|
||||
|
||||
The primary goal of this crate is to serve as an implementation detail for the
|
||||
`regex` crate. A secondary goal is to make its internals available for use by
|
||||
others.
|
||||
|
||||
# Table of contents
|
||||
|
||||
* [Should I be using this crate?](#should-i-be-using-this-crate) gives some
|
||||
reasons for and against using this crate.
|
||||
* [Examples](#examples) provides a small selection of things you can do with
|
||||
this crate.
|
||||
* [Available regex engines](#available-regex-engines) provides a hyperlinked
|
||||
list of all regex engines in this crate.
|
||||
* [API themes](#api-themes) discusses common elements used throughout this
|
||||
crate.
|
||||
* [Crate features](#crate-features) documents the extensive list of Cargo
|
||||
features available.
|
||||
|
||||
# Should I be using this crate?
|
||||
|
||||
If you find yourself here because you just want to use regexes, then you should
|
||||
first check out whether the [`regex` crate](https://docs.rs/regex) meets
|
||||
your needs. It provides a streamlined and difficult-to-misuse API for regex
|
||||
searching.
|
||||
|
||||
If you're here because there is something specific you want to do that can't
|
||||
be easily done with `regex` crate, then you are perhaps in the right place.
|
||||
It's most likely that the first stop you'll want to make is to explore the
|
||||
[`meta` regex APIs](meta). Namely, the `regex` crate is just a light wrapper
|
||||
over a [`meta::Regex`], so its API will probably be the easiest to transition
|
||||
to. In contrast to the `regex` crate, the `meta::Regex` API supports more
|
||||
search parameters and does multi-pattern searches. However, it isn't quite as
|
||||
ergonomic.
|
||||
|
||||
Otherwise, the following is an inexhaustive list of reasons to use this crate:
|
||||
|
||||
* You want to analyze or use a [Thompson `NFA`](nfa::thompson::NFA) directly.
|
||||
* You want more powerful multi-pattern search than what is provided by
|
||||
`RegexSet` in the `regex` crate. All regex engines in this crate support
|
||||
multi-pattern searches.
|
||||
* You want to use one of the `regex` crate's internal engines directly because
|
||||
of some interesting configuration that isn't possible via the `regex` crate.
|
||||
For example, a [lazy DFA's configuration](hybrid::dfa::Config) exposes a
|
||||
dizzying number of options for controlling its execution.
|
||||
* You want to use the lower level search APIs. For example, both the [lazy
|
||||
DFA](hybrid::dfa) and [fully compiled DFAs](dfa) support searching by exploring
|
||||
the automaton one state at a time. This might be useful, for example, for
|
||||
stream searches or searches of strings stored non-contiguously in memory.
|
||||
* You want to build a fully compiled DFA and then [use zero-copy
|
||||
deserialization](dfa::dense::DFA::from_bytes) to load it into memory and use
|
||||
it for searching. This use case is supported in core-only no-std/no-alloc
|
||||
environments.
|
||||
* You want to run [anchored searches](Input::anchored) without using the `^`
|
||||
anchor in your regex pattern.
|
||||
* You need to work-around contention issues with
|
||||
sharing a regex across multiple threads. The
|
||||
[`meta::Regex::search_with`](meta::Regex::search_with) API permits bypassing
|
||||
any kind of synchronization at all by requiring the caller to provide the
|
||||
mutable scratch space needed during a search.
|
||||
* You want to build your own regex engine on top of the `regex` crate's
|
||||
infrastructure.
|
||||
|
||||
# Examples
|
||||
|
||||
This section tries to identify a few interesting things you can do with this
|
||||
crate and demonstrates them.
|
||||
|
||||
### Multi-pattern searches with capture groups
|
||||
|
||||
One of the more frustrating limitations of `RegexSet` in the `regex` crate
|
||||
(at the time of writing) is that it doesn't report match positions. With this
|
||||
crate, multi-pattern support was intentionally designed in from the beginning,
|
||||
which means it works in all regex engines and even for capture groups as well.
|
||||
|
||||
This example shows how to search for matches of multiple regexes, where each
|
||||
regex uses the same capture group names to parse different key-value formats.
|
||||
|
||||
```
|
||||
use regex_automata::{meta::Regex, PatternID};
|
||||
|
||||
let re = Regex::new_many(&[
|
||||
r#"(?m)^(?<key>[[:word:]]+)=(?<val>[[:word:]]+)$"#,
|
||||
r#"(?m)^(?<key>[[:word:]]+)="(?<val>[^"]+)"$"#,
|
||||
r#"(?m)^(?<key>[[:word:]]+)='(?<val>[^']+)'$"#,
|
||||
r#"(?m)^(?<key>[[:word:]]+):\s*(?<val>[[:word:]]+)$"#,
|
||||
])?;
|
||||
let hay = r#"
|
||||
best_album="Blow Your Face Out"
|
||||
best_quote='"then as it was, then again it will be"'
|
||||
best_year=1973
|
||||
best_simpsons_episode: HOMR
|
||||
"#;
|
||||
let mut kvs = vec![];
|
||||
for caps in re.captures_iter(hay) {
|
||||
// N.B. One could use capture indices '1' and '2' here
|
||||
// as well. Capture indices are local to each pattern.
|
||||
// (Just like names are.)
|
||||
let key = &hay[caps.get_group_by_name("key").unwrap()];
|
||||
let val = &hay[caps.get_group_by_name("val").unwrap()];
|
||||
kvs.push((key, val));
|
||||
}
|
||||
assert_eq!(kvs, vec![
|
||||
("best_album", "Blow Your Face Out"),
|
||||
("best_quote", "\"then as it was, then again it will be\""),
|
||||
("best_year", "1973"),
|
||||
("best_simpsons_episode", "HOMR"),
|
||||
]);
|
||||
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
### Build a full DFA and walk it manually
|
||||
|
||||
One of the regex engines in this crate is a fully compiled DFA. It takes worst
|
||||
case exponential time to build, but once built, it can be easily explored and
|
||||
used for searches. Here's a simple example that uses its lower level APIs to
|
||||
implement a simple anchored search by hand.
|
||||
|
||||
```
|
||||
use regex_automata::{dfa::{Automaton, dense}, Input};
|
||||
|
||||
let dfa = dense::DFA::new(r"(?-u)\b[A-Z]\w+z\b")?;
|
||||
let haystack = "Quartz";
|
||||
|
||||
// The start state is determined by inspecting the position and the
|
||||
// initial bytes of the haystack.
|
||||
let mut state = dfa.start_state_forward(&Input::new(haystack))?;
|
||||
// Walk all the bytes in the haystack.
|
||||
for &b in haystack.as_bytes().iter() {
|
||||
state = dfa.next_state(state, b);
|
||||
}
|
||||
// DFAs in this crate require an explicit
|
||||
// end-of-input transition if a search reaches
|
||||
// the end of a haystack.
|
||||
state = dfa.next_eoi_state(state);
|
||||
assert!(dfa.is_match_state(state));
|
||||
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
Or do the same with a lazy DFA that avoids exponential worst case compile time,
|
||||
but requires mutable scratch space to lazily build the DFA during the search.
|
||||
|
||||
```
|
||||
use regex_automata::{hybrid::dfa::DFA, Input};
|
||||
|
||||
let dfa = DFA::new(r"(?-u)\b[A-Z]\w+z\b")?;
|
||||
let mut cache = dfa.create_cache();
|
||||
let hay = "Quartz";
|
||||
|
||||
// The start state is determined by inspecting the position and the
|
||||
// initial bytes of the haystack.
|
||||
let mut state = dfa.start_state_forward(&mut cache, &Input::new(hay))?;
|
||||
// Walk all the bytes in the haystack.
|
||||
for &b in hay.as_bytes().iter() {
|
||||
state = dfa.next_state(&mut cache, state, b)?;
|
||||
}
|
||||
// DFAs in this crate require an explicit
|
||||
// end-of-input transition if a search reaches
|
||||
// the end of a haystack.
|
||||
state = dfa.next_eoi_state(&mut cache, state)?;
|
||||
assert!(state.is_match());
|
||||
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
### Find all overlapping matches
|
||||
|
||||
This example shows how to build a DFA and use it to find all possible matches,
|
||||
including overlapping matches. A similar example will work with a lazy DFA as
|
||||
well. This also works with multiple patterns and will report all matches at the
|
||||
same position where multiple patterns match.
|
||||
|
||||
```
|
||||
use regex_automata::{
|
||||
dfa::{dense, Automaton, OverlappingState},
|
||||
Input, MatchKind,
|
||||
};
|
||||
|
||||
let dfa = dense::DFA::builder()
|
||||
.configure(dense::DFA::config().match_kind(MatchKind::All))
|
||||
.build(r"(?-u)\w{3,}")?;
|
||||
let input = Input::new("homer marge bart lisa maggie");
|
||||
let mut state = OverlappingState::start();
|
||||
|
||||
let mut matches = vec![];
|
||||
while let Some(hm) = {
|
||||
dfa.try_search_overlapping_fwd(&input, &mut state)?;
|
||||
state.get_match()
|
||||
} {
|
||||
matches.push(hm.offset());
|
||||
}
|
||||
assert_eq!(matches, vec![
|
||||
3, 4, 5, // hom, home, homer
|
||||
9, 10, 11, // mar, marg, marge
|
||||
15, 16, // bar, bart
|
||||
20, 21, // lis, lisa
|
||||
25, 26, 27, 28, // mag, magg, maggi, maggie
|
||||
]);
|
||||
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
# Available regex engines
|
||||
|
||||
The following is a complete list of all regex engines provided by this crate,
|
||||
along with a very brief description of it and why you might want to use it.
|
||||
|
||||
* [`dfa::regex::Regex`] is a regex engine that works on top of either
|
||||
[dense](dfa::dense) or [sparse](dfa::sparse) fully compiled DFAs. You might
|
||||
use a DFA if you need the fastest possible regex engine in this crate and can
|
||||
afford the exorbitant memory usage usually required by DFAs. Low level APIs on
|
||||
fully compiled DFAs are provided by the [`Automaton` trait](dfa::Automaton).
|
||||
Fully compiled dense DFAs can handle all regexes except for searching a regex
|
||||
with a Unicode word boundary on non-ASCII haystacks. A fully compiled DFA based
|
||||
regex can only report the start and end of each match.
|
||||
* [`hybrid::regex::Regex`] is a regex engine that works on top of a lazily
|
||||
built DFA. Its performance profile is very similar to that of fully compiled
|
||||
DFAs, but can be slower in some pathological cases. Fully compiled DFAs are
|
||||
also amenable to more optimizations, such as state acceleration, that aren't
|
||||
available in a lazy DFA. You might use this lazy DFA if you can't abide the
|
||||
worst case exponential compile time of a full DFA, but still want the DFA
|
||||
search performance in the vast majority of cases. A lazy DFA based regex can
|
||||
only report the start and end of each match.
|
||||
* [`dfa::onepass::DFA`] is a regex engine that is implemented as a DFA, but
|
||||
can report the matches of each capture group in addition to the start and end
|
||||
of each match. The catch is that it only works on a somewhat small subset of
|
||||
regexes known as "one-pass." You'll want to use this for cases when you need
|
||||
capture group matches and the regex is one-pass since it is likely to be faster
|
||||
than any alternative. A one-pass DFA can handle all types of regexes, but does
|
||||
have some reasonable limits on the number of capture groups it can handle.
|
||||
* [`nfa::thompson::backtrack::BoundedBacktracker`] is a regex engine that uses
|
||||
backtracking, but keeps track of the work it has done to avoid catastrophic
|
||||
backtracking. Like the one-pass DFA, it provides the matches of each capture
|
||||
group. It retains the `O(m * n)` worst case time bound. This tends to be slower
|
||||
than the one-pass DFA regex engine, but faster than the PikeVM. It can handle
|
||||
all types of regexes, but usually only works well with small haystacks and
|
||||
small regexes due to the memory required to avoid redoing work.
|
||||
* [`nfa::thompson::pikevm::PikeVM`] is a regex engine that can handle all
|
||||
regexes, of all sizes and provides capture group matches. It tends to be a tool
|
||||
of last resort because it is also usually the slowest regex engine.
|
||||
* [`meta::Regex`] is the meta regex engine that combines *all* of the above
|
||||
engines into one. The reason for this is that each of the engines above have
|
||||
their own caveats such as, "only handles a subset of regexes" or "is generally
|
||||
slow." The meta regex engine accounts for all of these caveats and composes
|
||||
the engines in a way that attempts to mitigate each engine's weaknesses while
|
||||
emphasizing its strengths. For example, it will attempt to run a lazy DFA even
|
||||
if it might fail. In which case, it will restart the search with a likely
|
||||
slower but more capable regex engine. The meta regex engine is what you should
|
||||
default to. Use one of the above engines directly only if you have a specific
|
||||
reason to.
|
||||
|
||||
# API themes
|
||||
|
||||
While each regex engine has its own APIs and configuration options, there are
|
||||
some general themes followed by all of them.
|
||||
|
||||
### The `Input` abstraction
|
||||
|
||||
Most search routines in this crate accept anything that implements
|
||||
`Into<Input>`. Both `&str` and `&[u8]` haystacks satisfy this constraint, which
|
||||
means that things like `engine.search("foo")` will work as you would expect.
|
||||
|
||||
By virtue of accepting an `Into<Input>` though, callers can provide more than
|
||||
just a haystack. Indeed, the [`Input`] type has more details, but briefly,
|
||||
callers can use it to configure various aspects of the search:
|
||||
|
||||
* The span of the haystack to search via [`Input::span`] or [`Input::range`],
|
||||
which might be a substring of the haystack.
|
||||
* Whether to run an anchored search or not via [`Input::anchored`]. This
|
||||
permits one to require matches to start at the same offset that the search
|
||||
started.
|
||||
* Whether to ask the regex engine to stop as soon as a match is seen via
|
||||
[`Input::earliest`]. This can be used to find the offset of a match as soon
|
||||
as it is known without waiting for the full leftmost-first match to be found.
|
||||
This can also be used to avoid the worst case `O(m * n^2)` time complexity
|
||||
of iteration.
|
||||
|
||||
Some lower level search routines accept an `&Input` for performance reasons.
|
||||
In which case, `&Input::new("haystack")` can be used for a simple search.
|
||||
|
||||
### Error reporting
|
||||
|
||||
Most, but not all, regex engines in this crate can fail to execute a search.
|
||||
When a search fails, callers cannot determine whether or not a match exists.
|
||||
That is, the result is indeterminate.
|
||||
|
||||
Search failure, in all cases in this crate, is represented by a [`MatchError`].
|
||||
Routines that can fail start with the `try_` prefix in their name. For example,
|
||||
[`hybrid::regex::Regex::try_search`] can fail for a number of reasons.
|
||||
Conversely, routines that either can't fail or can panic on failure lack the
|
||||
`try_` prefix. For example, [`hybrid::regex::Regex::find`] will panic in
|
||||
cases where [`hybrid::regex::Regex::try_search`] would return an error, and
|
||||
[`meta::Regex::find`] will never panic. Therefore, callers need to pay close
|
||||
attention to the panicking conditions in the documentation.
|
||||
|
||||
In most cases, the reasons that a search fails are either predictable or
|
||||
configurable, albeit at some additional cost.
|
||||
|
||||
An example of predictable failure is
|
||||
[`BoundedBacktracker::try_search`](nfa::thompson::backtrack::BoundedBacktracker::try_search).
|
||||
Namely, it fails whenever the multiplication of the haystack, the regex and some
|
||||
constant exceeds the
|
||||
[configured visited capacity](nfa::thompson::backtrack::Config::visited_capacity).
|
||||
Callers can predict the failure in terms of haystack length via the
|
||||
[`BoundedBacktracker::max_haystack_len`](nfa::thompson::backtrack::BoundedBacktracker::max_haystack_len)
|
||||
method. While this form of failure is technically avoidable by increasing the
|
||||
visited capacity, it isn't practical to do so for all inputs because the
|
||||
memory usage required for larger haystacks becomes impractically large. So in
|
||||
practice, if one is using the bounded backtracker, you really do have to deal
|
||||
with the failure.
|
||||
|
||||
An example of configurable failure happens when one enables heuristic support
|
||||
for Unicode word boundaries in a DFA. Namely, since the DFAs in this crate
|
||||
(except for the one-pass DFA) do not support Unicode word boundaries on
|
||||
non-ASCII haystacks, building a DFA from an NFA that contains a Unicode word
|
||||
boundary will itself fail. However, one can configure DFAs to still be built in
|
||||
this case by
|
||||
[configuring heuristic support for Unicode word boundaries](hybrid::dfa::Config::unicode_word_boundary).
|
||||
If the NFA the DFA is built from contains a Unicode word boundary, then the
|
||||
DFA will still be built, but special transitions will be added to every state
|
||||
that cause the DFA to fail if any non-ASCII byte is seen. This failure happens
|
||||
at search time and it requires the caller to opt into this.
|
||||
|
||||
There are other ways for regex engines to fail in this crate, but the above
|
||||
two should represent the general theme of failures one can find. Dealing
|
||||
with these failures is, in part, one the responsibilities of the [meta regex
|
||||
engine](meta). Notice, for example, that the meta regex engine exposes an API
|
||||
that never returns an error nor panics. It carefully manages all of the ways
|
||||
in which the regex engines can fail and either avoids the predictable ones
|
||||
entirely (e.g., the bounded backtracker) or reacts to configured failures by
|
||||
falling back to a different engine (e.g., the lazy DFA quitting because it saw
|
||||
a non-ASCII byte).
|
||||
|
||||
### Configuration and Builders
|
||||
|
||||
Most of the regex engines in this crate come with two types to facilitate
|
||||
building the regex engine: a `Config` and a `Builder`. A `Config` is usually
|
||||
specific to that particular regex engine, but other objects such as parsing and
|
||||
NFA compilation have `Config` types too. A `Builder` is the thing responsible
|
||||
for taking inputs (either pattern strings or already-parsed patterns or even
|
||||
NFAs directly) and turning them into an actual regex engine that can be used
|
||||
for searching.
|
||||
|
||||
The main reason why building a regex engine is a bit complicated is because
|
||||
of the desire to permit composition with de-coupled components. For example,
|
||||
you might want to [manually construct a Thompson NFA](nfa::thompson::Builder)
|
||||
and then build a regex engine from it without ever using a regex parser
|
||||
at all. On the other hand, you might also want to build a regex engine directly
|
||||
from the concrete syntax. This demonstrates why regex engine construction is
|
||||
so flexible: it needs to support not just convenient construction, but also
|
||||
construction from parts built elsewhere.
|
||||
|
||||
This is also in turn why there are many different `Config` structs in this
|
||||
crate. Let's look more closely at an example: [`hybrid::regex::Builder`]. It
|
||||
accepts three different `Config` types for configuring construction of a lazy
|
||||
DFA regex:
|
||||
|
||||
* [`hybrid::regex::Builder::syntax`] accepts a
|
||||
[`util::syntax::Config`] for configuring the options found in the
|
||||
[`regex-syntax`](regex_syntax) crate. For example, whether to match
|
||||
case insensitively.
|
||||
* [`hybrid::regex::Builder::thompson`] accepts a [`nfa::thompson::Config`] for
|
||||
configuring construction of a [Thompson NFA](nfa::thompson::NFA). For example,
|
||||
whether to build an NFA that matches the reverse language described by the
|
||||
regex.
|
||||
* [`hybrid::regex::Builder::dfa`] accept a [`hybrid::dfa::Config`] for
|
||||
configuring construction of the pair of underlying lazy DFAs that make up the
|
||||
lazy DFA regex engine. For example, changing the capacity of the cache used to
|
||||
store the transition table.
|
||||
|
||||
The lazy DFA regex engine uses all three of those configuration objects for
|
||||
methods like [`hybrid::regex::Builder::build`], which accepts a pattern
|
||||
string containing the concrete syntax of your regex. It uses the syntax
|
||||
configuration to parse it into an AST and translate it into an HIR. Then the
|
||||
NFA configuration when compiling the HIR into an NFA. And then finally the DFA
|
||||
configuration when lazily determinizing the NFA into a DFA.
|
||||
|
||||
Notice though that the builder also has a
|
||||
[`hybrid::regex::Builder::build_from_dfas`] constructor. This permits callers
|
||||
to build the underlying pair of lazy DFAs themselves (one for the forward
|
||||
searching to find the end of a match and one for the reverse searching to find
|
||||
the start of a match), and then build the regex engine from them. The lazy
|
||||
DFAs, in turn, have their own builder that permits [construction directly from
|
||||
a Thompson NFA](hybrid::dfa::Builder::build_from_nfa). Continuing down the
|
||||
rabbit hole, a Thompson NFA has its own compiler that permits [construction
|
||||
directly from an HIR](nfa::thompson::Compiler::build_from_hir). The lazy DFA
|
||||
regex engine builder lets you follow this rabbit hole all the way down, but
|
||||
also provides convenience routines that do it for you when you don't need
|
||||
precise control over every component.
|
||||
|
||||
The [meta regex engine](meta) is a good example of something that utilizes the
|
||||
full flexibility of these builders. It often needs not only precise control
|
||||
over each component, but also shares them across multiple regex engines.
|
||||
(Most sharing is done by internal reference accounting. For example, an
|
||||
[`NFA`](nfa::thompson::NFA) is reference counted internally which makes cloning
|
||||
cheap.)
|
||||
|
||||
### Size limits
|
||||
|
||||
Unlike the `regex` crate, the `regex-automata` crate specifically does not
|
||||
enable any size limits by default. That means users of this crate need to
|
||||
be quite careful when using untrusted patterns. Namely, because bounded
|
||||
repetitions can grow exponentially by stacking them, it is possible to build a
|
||||
very large internal regex object from just a small pattern string. For example,
|
||||
the NFA built from the pattern `a{10}{10}{10}{10}{10}{10}{10}` is over 240MB.
|
||||
|
||||
There are multiple size limit options in this crate. If one or more size limits
|
||||
are relevant for the object you're building, they will be configurable via
|
||||
methods on a corresponding `Config` type.
|
||||
|
||||
# Crate features
|
||||
|
||||
This crate has a dizzying number of features. The main idea is to be able to
|
||||
control how much stuff you pull in for your specific use case, since the full
|
||||
crate is quite large and can dramatically increase compile times and binary
|
||||
size.
|
||||
|
||||
The most barebones but useful configuration is to disable all default features
|
||||
and enable only `dfa-search`. This will bring in just the DFA deserialization
|
||||
and search routines without any dependency on `std` or `alloc`. This does
|
||||
require generating and serializing a DFA, and then storing it somewhere, but
|
||||
it permits regex searches in freestanding or embedded environments.
|
||||
|
||||
Because there are so many features, they are split into a few groups.
|
||||
|
||||
The default set of features is: `std`, `syntax`, `perf`, `unicode`, `meta`,
|
||||
`nfa`, `dfa` and `hybrid`. Basically, the default is to enable everything
|
||||
except for development related features like `logging`.
|
||||
|
||||
### Ecosystem features
|
||||
|
||||
* **std** - Enables use of the standard library. In terms of APIs, this usually
|
||||
just means that error types implement the `std::error::Error` trait. Otherwise,
|
||||
`std` sometimes enables the code to be faster, for example, using a `HashMap`
|
||||
instead of a `BTreeMap`. (The `std` feature matters more for dependencies like
|
||||
`aho-corasick` and `memchr`, where `std` is required to enable certain classes
|
||||
of SIMD optimizations.) Enabling `std` automatically enables `alloc`.
|
||||
* **alloc** - Enables use of the `alloc` library. This is required for most
|
||||
APIs in this crate. The main exception is deserializing and searching with
|
||||
fully compiled DFAs.
|
||||
* **logging** - Adds a dependency on the `log` crate and makes this crate emit
|
||||
log messages of varying degrees of utility. The log messages are especially
|
||||
useful in trying to understand what the meta regex engine is doing.
|
||||
|
||||
### Performance features
|
||||
|
||||
* **perf** - Enables all of the below features.
|
||||
* **perf-inline** - When enabled, `inline(always)` is used in (many) strategic
|
||||
locations to help performance at the expense of longer compile times and
|
||||
increased binary size.
|
||||
* **perf-literal** - Enables all literal related optimizations.
|
||||
* **perf-literal-substring** - Enables all single substring literal
|
||||
optimizations. This includes adding a dependency on the `memchr` crate.
|
||||
* **perf-literal-multisubstring** - Enables all multiple substring literal
|
||||
optimizations. This includes adding a dependency on the `aho-corasick`
|
||||
crate.
|
||||
|
||||
### Unicode features
|
||||
|
||||
* **unicode** -
|
||||
Enables all Unicode features. This feature is enabled by default, and will
|
||||
always cover all Unicode features, even if more are added in the future.
|
||||
* **unicode-age** -
|
||||
Provide the data for the
|
||||
[Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age).
|
||||
This makes it possible to use classes like `\p{Age:6.0}` to refer to all
|
||||
codepoints first introduced in Unicode 6.0
|
||||
* **unicode-bool** -
|
||||
Provide the data for numerous Unicode boolean properties. The full list
|
||||
is not included here, but contains properties like `Alphabetic`, `Emoji`,
|
||||
`Lowercase`, `Math`, `Uppercase` and `White_Space`.
|
||||
* **unicode-case** -
|
||||
Provide the data for case insensitive matching using
|
||||
[Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches).
|
||||
* **unicode-gencat** -
|
||||
Provide the data for
|
||||
[Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
|
||||
This includes, but is not limited to, `Decimal_Number`, `Letter`,
|
||||
`Math_Symbol`, `Number` and `Punctuation`.
|
||||
* **unicode-perl** -
|
||||
Provide the data for supporting the Unicode-aware Perl character classes,
|
||||
corresponding to `\w`, `\s` and `\d`. This is also necessary for using
|
||||
Unicode-aware word boundary assertions. Note that if this feature is
|
||||
disabled, the `\s` and `\d` character classes are still available if the
|
||||
`unicode-bool` and `unicode-gencat` features are enabled, respectively.
|
||||
* **unicode-script** -
|
||||
Provide the data for
|
||||
[Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/).
|
||||
This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`,
|
||||
`Latin` and `Thai`.
|
||||
* **unicode-segment** -
|
||||
Provide the data necessary to provide the properties used to implement the
|
||||
[Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/).
|
||||
This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and
|
||||
`\p{sb=ATerm}`.
|
||||
* **unicode-word-boundary** -
|
||||
Enables support for Unicode word boundaries, i.e., `\b`, in regexes. When
|
||||
this and `unicode-perl` are enabled, then data tables from `regex-syntax` are
|
||||
used to implement Unicode word boundaries. However, if `regex-syntax` isn't
|
||||
enabled as a dependency then one can still enable this feature. It will
|
||||
cause `regex-automata` to bundle its own data table that would otherwise be
|
||||
redundant with `regex-syntax`'s table.
|
||||
|
||||
### Regex engine features
|
||||
|
||||
* **syntax** - Enables a dependency on `regex-syntax`. This makes APIs
|
||||
for building regex engines from pattern strings available. Without the
|
||||
`regex-syntax` dependency, the only way to build a regex engine is generally
|
||||
to deserialize a previously built DFA or to hand assemble an NFA using its
|
||||
[builder API](nfa::thompson::Builder). Once you have an NFA, you can build any
|
||||
of the regex engines in this crate. The `syntax` feature also enables `alloc`.
|
||||
* **meta** - Enables the meta regex engine. This also enables the `syntax` and
|
||||
`nfa-pikevm` features, as both are the minimal requirements needed. The meta
|
||||
regex engine benefits from enabling any of the other regex engines and will
|
||||
use them automatically when appropriate.
|
||||
* **nfa** - Enables all NFA related features below.
|
||||
* **nfa-thompson** - Enables the Thompson NFA APIs. This enables `alloc`.
|
||||
* **nfa-pikevm** - Enables the PikeVM regex engine. This enables
|
||||
`nfa-thompson`.
|
||||
* **nfa-backtrack** - Enables the bounded backtracker regex engine. This
|
||||
enables `nfa-thompson`.
|
||||
* **dfa** - Enables all DFA related features below.
|
||||
* **dfa-build** - Enables APIs for determinizing DFAs from NFAs. This
|
||||
enables `nfa-thompson` and `dfa-search`.
|
||||
* **dfa-search** - Enables APIs for searching with DFAs.
|
||||
* **dfa-onepass** - Enables the one-pass DFA API. This enables
|
||||
`nfa-thompson`.
|
||||
* **hybrid** - Enables the hybrid NFA/DFA or "lazy DFA" regex engine. This
|
||||
enables `alloc` and `nfa-thompson`.
|
||||
|
||||
*/
|
||||
|
||||
// We are no_std.
|
||||
#![no_std]
|
||||
// All APIs need docs!
|
||||
#![deny(missing_docs)]
|
||||
// Some intra-doc links are broken when certain features are disabled, so we
|
||||
// only bleat about it when most (all?) features are enabled. But when we do,
|
||||
// we block the build. Links need to work.
|
||||
#![cfg_attr(
|
||||
all(
|
||||
feature = "std",
|
||||
feature = "nfa",
|
||||
feature = "dfa",
|
||||
feature = "hybrid"
|
||||
),
|
||||
deny(rustdoc::broken_intra_doc_links)
|
||||
)]
|
||||
// Broken rustdoc links are very easy to come by when you start disabling
|
||||
// features. Namely, features tend to change imports, and imports change what's
|
||||
// available to link to.
|
||||
//
|
||||
// Basically, we just don't support rustdoc for anything other than the maximal
|
||||
// feature configuration. Other configurations will work, they just won't be
|
||||
// perfect.
|
||||
//
|
||||
// So here, we specifically allow them so we don't even get warned about them.
|
||||
#![cfg_attr(
|
||||
not(all(
|
||||
feature = "std",
|
||||
feature = "nfa",
|
||||
feature = "dfa",
|
||||
feature = "hybrid"
|
||||
)),
|
||||
allow(rustdoc::broken_intra_doc_links)
|
||||
)]
|
||||
// Kinda similar, but eliminating all of the dead code and unused import
|
||||
// warnings for every feature combo is a fool's errand. Instead, we just
|
||||
// suppress those, but still let them through in a common configuration when we
|
||||
// build most of everything.
|
||||
//
|
||||
// This does actually suggest that when features are disabled, we are actually
|
||||
// compiling more code than we need to be. And this is perhaps not so great
|
||||
// because disabling features is usually done in order to reduce compile times
|
||||
// by reducing the amount of code one compiles... However, usually, most of the
|
||||
// time this dead code is a relatively small amount from the 'util' module.
|
||||
// But... I confess... There isn't a ton of visibility on this.
|
||||
//
|
||||
// I'm happy to try to address this in a different way, but "let's annotate
|
||||
// every function in 'util' with some non-local combination of features" just
|
||||
// cannot be the way forward.
|
||||
#![cfg_attr(
|
||||
not(all(
|
||||
feature = "std",
|
||||
feature = "nfa",
|
||||
feature = "dfa",
|
||||
feature = "hybrid",
|
||||
feature = "perf-literal-substring",
|
||||
feature = "perf-literal-multisubstring",
|
||||
)),
|
||||
allow(dead_code, unused_imports, unused_variables)
|
||||
)]
|
||||
// We generally want all types to impl Debug.
|
||||
#![warn(missing_debug_implementations)]
|
||||
// No clue why this thing is still unstable because it's pretty amazing. This
|
||||
// adds Cargo feature annotations to items in the rustdoc output. Which is
|
||||
// sadly hugely beneficial for this crate due to the number of features.
|
||||
#![cfg_attr(docsrs, feature(doc_auto_cfg))]
|
||||
|
||||
// I have literally never tested this crate on 16-bit, so it is quite
|
||||
// suspicious to advertise support for it. But... the regex crate, at time
|
||||
// of writing, at least claims to support it by not doing any conditional
|
||||
// compilation based on the target pointer width. So I guess I remain
|
||||
// consistent with that here.
|
||||
//
|
||||
// If you are here because you're on a 16-bit system and you were somehow using
|
||||
// the regex crate previously, please file an issue. Please be prepared to
|
||||
// provide some kind of reproduction or carve out some path to getting 16-bit
|
||||
// working in CI. (Via qemu?)
|
||||
#[cfg(not(any(
|
||||
target_pointer_width = "16",
|
||||
target_pointer_width = "32",
|
||||
target_pointer_width = "64"
|
||||
)))]
|
||||
compile_error!("not supported on non-{16,32,64}, please file an issue");
|
||||
|
||||
#[cfg(any(test, feature = "std"))]
|
||||
extern crate std;
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
extern crate alloc;
|
||||
|
||||
#[cfg(doctest)]
|
||||
doc_comment::doctest!("../README.md");
|
||||
|
||||
#[doc(inline)]
|
||||
pub use crate::util::primitives::PatternID;
|
||||
pub use crate::util::search::*;
|
||||
|
||||
#[macro_use]
|
||||
mod macros;
|
||||
|
||||
#[cfg(any(feature = "dfa-search", feature = "dfa-onepass"))]
|
||||
pub mod dfa;
|
||||
#[cfg(feature = "hybrid")]
|
||||
pub mod hybrid;
|
||||
#[cfg(feature = "meta")]
|
||||
pub mod meta;
|
||||
#[cfg(feature = "nfa-thompson")]
|
||||
pub mod nfa;
|
||||
pub mod util;
|
||||
20
third-party/vendor/regex-automata/src/macros.rs
vendored
Normal file
20
third-party/vendor/regex-automata/src/macros.rs
vendored
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
// Some feature combinations result in some of these macros never being used.
|
||||
// Which is fine. Just squash the warnings.
|
||||
#![allow(unused_macros)]
|
||||
|
||||
macro_rules! log {
|
||||
($($tt:tt)*) => {
|
||||
#[cfg(feature = "logging")]
|
||||
{
|
||||
$($tt)*
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! debug {
|
||||
($($tt:tt)*) => { log!(log::debug!($($tt)*)) }
|
||||
}
|
||||
|
||||
macro_rules! trace {
|
||||
($($tt:tt)*) => { log!(log::trace!($($tt)*)) }
|
||||
}
|
||||
241
third-party/vendor/regex-automata/src/meta/error.rs
vendored
Normal file
241
third-party/vendor/regex-automata/src/meta/error.rs
vendored
Normal file
|
|
@ -0,0 +1,241 @@
|
|||
use regex_syntax::{ast, hir};
|
||||
|
||||
use crate::{nfa, util::search::MatchError, PatternID};
|
||||
|
||||
/// An error that occurs when construction of a `Regex` fails.
|
||||
///
|
||||
/// A build error is generally a result of one of two possible failure
|
||||
/// modes. First is a parse or syntax error in the concrete syntax of a
|
||||
/// pattern. Second is that the construction of the underlying regex matcher
|
||||
/// fails, usually because it gets too big with respect to limits like
|
||||
/// [`Config::nfa_size_limit`](crate::meta::Config::nfa_size_limit).
|
||||
///
|
||||
/// This error provides very little introspection capabilities. You can:
|
||||
///
|
||||
/// * Ask for the [`PatternID`] of the pattern that caused an error, if one
|
||||
/// is available. This is available for things like syntax errors, but not for
|
||||
/// cases where build limits are exceeded.
|
||||
/// * Ask for the underlying syntax error, but only if the error is a syntax
|
||||
/// error.
|
||||
/// * Ask for a human readable message corresponding to the underlying error.
|
||||
/// * The `BuildError::source` method (from the `std::error::Error`
|
||||
/// trait implementation) may be used to query for an underlying error if one
|
||||
/// exists. There are no API guarantees about which error is returned.
|
||||
///
|
||||
/// When the `std` feature is enabled, this implements `std::error::Error`.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct BuildError {
|
||||
kind: BuildErrorKind,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
enum BuildErrorKind {
|
||||
Syntax { pid: PatternID, err: regex_syntax::Error },
|
||||
NFA(nfa::thompson::BuildError),
|
||||
}
|
||||
|
||||
impl BuildError {
|
||||
/// If it is known which pattern ID caused this build error to occur, then
|
||||
/// this method returns it.
|
||||
///
|
||||
/// Some errors are not associated with a particular pattern. However, any
|
||||
/// errors that occur as part of parsing a pattern are guaranteed to be
|
||||
/// associated with a pattern ID.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{meta::Regex, PatternID};
|
||||
///
|
||||
/// let err = Regex::new_many(&["a", "b", r"\p{Foo}", "c"]).unwrap_err();
|
||||
/// assert_eq!(Some(PatternID::must(2)), err.pattern());
|
||||
/// ```
|
||||
pub fn pattern(&self) -> Option<PatternID> {
|
||||
match self.kind {
|
||||
BuildErrorKind::Syntax { pid, .. } => Some(pid),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// If this error occurred because the regex exceeded the configured size
|
||||
/// limit before being built, then this returns the configured size limit.
|
||||
///
|
||||
/// The limit returned is what was configured, and corresponds to the
|
||||
/// maximum amount of heap usage in bytes.
|
||||
pub fn size_limit(&self) -> Option<usize> {
|
||||
match self.kind {
|
||||
BuildErrorKind::NFA(ref err) => err.size_limit(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// If this error corresponds to a syntax error, then a reference to it is
|
||||
/// returned by this method.
|
||||
pub fn syntax_error(&self) -> Option<®ex_syntax::Error> {
|
||||
match self.kind {
|
||||
BuildErrorKind::Syntax { ref err, .. } => Some(err),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn ast(pid: PatternID, err: ast::Error) -> BuildError {
|
||||
let err = regex_syntax::Error::from(err);
|
||||
BuildError { kind: BuildErrorKind::Syntax { pid, err } }
|
||||
}
|
||||
|
||||
pub(crate) fn hir(pid: PatternID, err: hir::Error) -> BuildError {
|
||||
let err = regex_syntax::Error::from(err);
|
||||
BuildError { kind: BuildErrorKind::Syntax { pid, err } }
|
||||
}
|
||||
|
||||
pub(crate) fn nfa(err: nfa::thompson::BuildError) -> BuildError {
|
||||
BuildError { kind: BuildErrorKind::NFA(err) }
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
impl std::error::Error for BuildError {
|
||||
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
|
||||
match self.kind {
|
||||
BuildErrorKind::Syntax { ref err, .. } => Some(err),
|
||||
BuildErrorKind::NFA(ref err) => Some(err),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl core::fmt::Display for BuildError {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
match self.kind {
|
||||
BuildErrorKind::Syntax { pid, .. } => {
|
||||
write!(f, "error parsing pattern {}", pid.as_usize())
|
||||
}
|
||||
BuildErrorKind::NFA(_) => write!(f, "error building NFA"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An error that occurs when a search should be retried.
|
||||
///
|
||||
/// This retry error distinguishes between two different failure modes.
|
||||
///
|
||||
/// The first is one where potential quadratic behavior has been detected.
|
||||
/// In this case, whatever optimization that led to this behavior should be
|
||||
/// stopped, and the next best strategy should be used.
|
||||
///
|
||||
/// The second indicates that the underlying regex engine has failed for some
|
||||
/// reason. This usually occurs because either a lazy DFA's cache has become
|
||||
/// ineffective or because a non-ASCII byte has been seen *and* a Unicode word
|
||||
/// boundary was used in one of the patterns. In this failure case, a different
|
||||
/// regex engine that won't fail in these ways (PikeVM, backtracker or the
|
||||
/// one-pass DFA) should be used.
|
||||
///
|
||||
/// This is an internal error only and should never bleed into the public
|
||||
/// API.
|
||||
#[derive(Debug)]
|
||||
pub(crate) enum RetryError {
|
||||
Quadratic(RetryQuadraticError),
|
||||
Fail(RetryFailError),
|
||||
}
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
impl std::error::Error for RetryError {}
|
||||
|
||||
impl core::fmt::Display for RetryError {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
match *self {
|
||||
RetryError::Quadratic(ref err) => err.fmt(f),
|
||||
RetryError::Fail(ref err) => err.fmt(f),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<MatchError> for RetryError {
|
||||
fn from(merr: MatchError) -> RetryError {
|
||||
RetryError::Fail(RetryFailError::from(merr))
|
||||
}
|
||||
}
|
||||
|
||||
/// An error that occurs when potential quadratic behavior has been detected
|
||||
/// when applying either the "reverse suffix" or "reverse inner" optimizations.
|
||||
///
|
||||
/// When this error occurs, callers should abandon the "reverse" optimization
|
||||
/// and use a normal forward search.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct RetryQuadraticError(());
|
||||
|
||||
impl RetryQuadraticError {
|
||||
pub(crate) fn new() -> RetryQuadraticError {
|
||||
RetryQuadraticError(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
impl std::error::Error for RetryQuadraticError {}
|
||||
|
||||
impl core::fmt::Display for RetryQuadraticError {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
write!(f, "regex engine gave up to avoid quadratic behavior")
|
||||
}
|
||||
}
|
||||
|
||||
impl From<RetryQuadraticError> for RetryError {
|
||||
fn from(err: RetryQuadraticError) -> RetryError {
|
||||
RetryError::Quadratic(err)
|
||||
}
|
||||
}
|
||||
|
||||
/// An error that occurs when a regex engine "gives up" for some reason before
|
||||
/// finishing a search. Usually this occurs because of heuristic Unicode word
|
||||
/// boundary support or because of ineffective cache usage in the lazy DFA.
|
||||
///
|
||||
/// When this error occurs, callers should retry the regex search with a
|
||||
/// different regex engine.
|
||||
///
|
||||
/// Note that this has convenient `From` impls that will automatically
|
||||
/// convert a `MatchError` into this error. This works because the meta
|
||||
/// regex engine internals guarantee that errors like `HaystackTooLong` and
|
||||
/// `UnsupportedAnchored` will never occur. The only errors left are `Quit` and
|
||||
/// `GaveUp`, which both correspond to this "failure" error.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct RetryFailError {
|
||||
offset: usize,
|
||||
}
|
||||
|
||||
impl RetryFailError {
|
||||
pub(crate) fn from_offset(offset: usize) -> RetryFailError {
|
||||
RetryFailError { offset }
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
impl std::error::Error for RetryFailError {}
|
||||
|
||||
impl core::fmt::Display for RetryFailError {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
write!(f, "regex engine failed at offset {:?}", self.offset)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<RetryFailError> for RetryError {
|
||||
fn from(err: RetryFailError) -> RetryError {
|
||||
RetryError::Fail(err)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<MatchError> for RetryFailError {
|
||||
fn from(merr: MatchError) -> RetryFailError {
|
||||
use crate::util::search::MatchErrorKind::*;
|
||||
|
||||
match *merr.kind() {
|
||||
Quit { offset, .. } => RetryFailError::from_offset(offset),
|
||||
GaveUp { offset } => RetryFailError::from_offset(offset),
|
||||
// These can never occur because we avoid them by construction
|
||||
// or with higher level control flow logic. For example, the
|
||||
// backtracker's wrapper will never hand out a backtracker engine
|
||||
// when the haystack would be too long.
|
||||
HaystackTooLong { .. } | UnsupportedAnchored { .. } => {
|
||||
unreachable!("found impossible error in meta engine: {}", merr)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
255
third-party/vendor/regex-automata/src/meta/limited.rs
vendored
Normal file
255
third-party/vendor/regex-automata/src/meta/limited.rs
vendored
Normal file
|
|
@ -0,0 +1,255 @@
|
|||
/*!
|
||||
This module defines two bespoke reverse DFA searching routines. (One for the
|
||||
lazy DFA and one for the fully compiled DFA.) These routines differ from the
|
||||
usual ones by permitting the caller to specify a minimum starting position.
|
||||
That is, the search will begin at `input.end()` and will usually stop at
|
||||
`input.start()`, unless `min_start > input.start()`, in which case, the search
|
||||
will stop at `min_start`.
|
||||
|
||||
In other words, this lets you say, "no, the search must not extend past this
|
||||
point, even if it's within the bounds of the given `Input`." And if the search
|
||||
*does* want to go past that point, it stops and returns a "may be quadratic"
|
||||
error, which indicates that the caller should retry using some other technique.
|
||||
|
||||
These routines specifically exist to protect against quadratic behavior when
|
||||
employing the "reverse suffix" and "reverse inner" optimizations. Without the
|
||||
backstop these routines provide, it is possible for parts of the haystack to
|
||||
get re-scanned over and over again. The backstop not only prevents this, but
|
||||
*tells you when it is happening* so that you can change the strategy.
|
||||
|
||||
Why can't we just use the normal search routines? We could use the normal
|
||||
search routines and just set the start bound on the provided `Input` to our
|
||||
`min_start` position. The problem here is that it's impossible to distinguish
|
||||
between "no match because we reached the end of input" and "determined there
|
||||
was no match well before the end of input." The former case is what we care
|
||||
about with respect to quadratic behavior. The latter case is totally fine.
|
||||
|
||||
Why don't we modify the normal search routines to report the position at which
|
||||
the search stops? I considered this, and I still wonder if it is indeed the
|
||||
right thing to do. However, I think the straight-forward thing to do there
|
||||
would be to complicate the return type signature of almost every search routine
|
||||
in this crate, which I really do not want to do. It therefore might make more
|
||||
sense to provide a richer way for search routines to report meta data, but that
|
||||
was beyond my bandwidth to work on at the time of writing.
|
||||
|
||||
See the 'opt/reverse-inner' and 'opt/reverse-suffix' benchmarks in rebar for a
|
||||
real demonstration of how quadratic behavior is mitigated.
|
||||
*/
|
||||
|
||||
use crate::{
|
||||
meta::error::{RetryError, RetryQuadraticError},
|
||||
HalfMatch, Input, MatchError,
|
||||
};
|
||||
|
||||
/// Runs a reverse half-match search with the fully compiled dense DFA,
/// starting at `input.end()` and scanning leftward, but refusing to move
/// past `min_start`.
///
/// If the scan would need to visit a position before `min_start`, a
/// `RetryError::Quadratic` is returned so the caller can abandon the
/// optimization that led here. (See the module docs above for why.)
#[cfg(feature = "dfa-build")]
pub(crate) fn dfa_try_search_half_rev(
    dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
    input: &Input<'_>,
    min_start: usize,
) -> Result<Option<HalfMatch>, RetryError> {
    use crate::dfa::Automaton;

    // Tracks the most recent (i.e., leftmost so far) half match found.
    let mut mat = None;
    let mut sid = dfa.start_state_reverse(input)?;
    // Empty span: only the EOI transition can possibly yield a match.
    if input.start() == input.end() {
        dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?;
        return Ok(mat);
    }
    let mut at = input.end() - 1;
    loop {
        sid = dfa.next_state(sid, input.haystack()[at]);
        if dfa.is_special_state(sid) {
            if dfa.is_match_state(sid) {
                let pattern = dfa.match_pattern(sid, 0);
                // Since reverse searches report the beginning of a
                // match and the beginning is inclusive (not exclusive
                // like the end of a match), we add 1 to make it
                // inclusive.
                mat = Some(HalfMatch::new(pattern, at + 1));
            } else if dfa.is_dead_state(sid) {
                return Ok(mat);
            } else if dfa.is_quit_state(sid) {
                return Err(MatchError::quit(input.haystack()[at], at).into());
            }
        }
        if at == input.start() {
            break;
        }
        at -= 1;
        // This is the whole point of this routine: bail out (instead of
        // continuing to scan) once we'd cross the caller's backstop.
        if at < min_start {
            trace!(
                "reached position {} which is before the previous literal \
                 match, quitting to avoid quadratic behavior",
                at,
            );
            return Err(RetryError::Quadratic(RetryQuadraticError::new()));
        }
    }
    // Capture "dead-ness" *before* the EOI transition, since that transition
    // usually lands in a dead state; we need the pre-EOI answer below.
    let was_dead = dfa.is_dead_state(sid);
    dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?;
    // If we reach the beginning of the search and we could otherwise still
    // potentially keep matching if there was more to match, then we actually
    // return an error to indicate giving up on this optimization. Why? Because
    // we can't prove that the real match begins at where we would report it.
    //
    // This only happens when all of the following are true:
    //
    // 1) We reach the starting point of our search span.
    // 2) The match we found is before the starting point.
    // 3) The FSM reports we could possibly find a longer match.
    //
    // We need (1) because otherwise the search stopped before the starting
    // point and there is no possible way to find a more leftmost position.
    //
    // We need (2) because if the match found has an offset equal to the minimum
    // possible offset, then there is no possible more leftmost match.
    //
    // We need (3) because if the FSM couldn't continue anyway (i.e., it's in
    // a dead state), then we know we couldn't find anything more leftmost
    // than what we have. (We have to check the state we were in prior to the
    // EOI transition since the EOI transition will usually bring us to a dead
    // state by virtue of it represents the end-of-input.)
    if at == input.start()
        && mat.map_or(false, |m| m.offset() > input.start())
        && !was_dead
    {
        trace!(
            "reached beginning of search at offset {} without hitting \
             a dead state, quitting to avoid potential false positive match",
            at,
        );
        return Err(RetryError::Quadratic(RetryQuadraticError::new()));
    }
    Ok(mat)
}
|
||||
|
||||
/// Runs a reverse half-match search with the lazy (hybrid) DFA, starting at
/// `input.end()` and scanning leftward, but refusing to move past
/// `min_start`.
///
/// Mirrors `dfa_try_search_half_rev` above; the only differences are the
/// lazy DFA's fallible transitions (cache exhaustion maps to "gave up") and
/// its tagged state representation.
#[cfg(feature = "hybrid")]
pub(crate) fn hybrid_try_search_half_rev(
    dfa: &crate::hybrid::dfa::DFA,
    cache: &mut crate::hybrid::dfa::Cache,
    input: &Input<'_>,
    min_start: usize,
) -> Result<Option<HalfMatch>, RetryError> {
    // Tracks the most recent (i.e., leftmost so far) half match found.
    let mut mat = None;
    let mut sid = dfa.start_state_reverse(cache, input)?;
    // Empty span: only the EOI transition can possibly yield a match.
    if input.start() == input.end() {
        hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
        return Ok(mat);
    }
    let mut at = input.end() - 1;
    loop {
        // A transition failure means the cache gave out; surface it as a
        // "gave up" error (which becomes a `RetryError::Fail` via `?`).
        sid = dfa
            .next_state(cache, sid, input.haystack()[at])
            .map_err(|_| MatchError::gave_up(at))?;
        if sid.is_tagged() {
            if sid.is_match() {
                let pattern = dfa.match_pattern(cache, sid, 0);
                // Since reverse searches report the beginning of a
                // match and the beginning is inclusive (not exclusive
                // like the end of a match), we add 1 to make it
                // inclusive.
                mat = Some(HalfMatch::new(pattern, at + 1));
            } else if sid.is_dead() {
                return Ok(mat);
            } else if sid.is_quit() {
                return Err(MatchError::quit(input.haystack()[at], at).into());
            }
        }
        if at == input.start() {
            break;
        }
        at -= 1;
        // Bail out once we'd cross the caller's backstop instead of
        // re-scanning haystack we've already covered.
        if at < min_start {
            trace!(
                "reached position {} which is before the previous literal \
                 match, quitting to avoid quadratic behavior",
                at,
            );
            return Err(RetryError::Quadratic(RetryQuadraticError::new()));
        }
    }
    // Capture "dead-ness" *before* the EOI transition; see the full DFA
    // routine above for why the pre-EOI answer is the relevant one.
    let was_dead = sid.is_dead();
    hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
    // See the comments in the full DFA routine above for why we need this.
    if at == input.start()
        && mat.map_or(false, |m| m.offset() > input.start())
        && !was_dead
    {
        trace!(
            "reached beginning of search at offset {} without hitting \
             a dead state, quitting to avoid potential false positive match",
            at,
        );
        return Err(RetryError::Quadratic(RetryQuadraticError::new()));
    }
    Ok(mat)
}
|
||||
|
||||
/// Performs the final "end of input" step of a reverse search for the dense
/// DFA, updating `sid` and recording a match starting at `sp.start` in
/// `mat` if one is found.
#[cfg(feature = "dfa-build")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn dfa_eoi_rev(
    dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
    input: &Input<'_>,
    sid: &mut crate::util::primitives::StateID,
    mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
    use crate::dfa::Automaton;

    let sp = input.get_span();
    if sp.start > 0 {
        // For a reverse search that doesn't begin at the haystack boundary,
        // the byte just before the span plays the role of the final
        // transition (it provides look-behind context).
        let byte = input.haystack()[sp.start - 1];
        *sid = dfa.next_state(*sid, byte);
        if dfa.is_match_state(*sid) {
            let pattern = dfa.match_pattern(*sid, 0);
            *mat = Some(HalfMatch::new(pattern, sp.start));
        } else if dfa.is_quit_state(*sid) {
            return Err(MatchError::quit(byte, sp.start - 1));
        }
    } else {
        // At the haystack boundary, take the true EOI transition.
        *sid = dfa.next_eoi_state(*sid);
        if dfa.is_match_state(*sid) {
            let pattern = dfa.match_pattern(*sid, 0);
            *mat = Some(HalfMatch::new(pattern, 0));
        }
        // N.B. We don't have to check 'is_quit' here because the EOI
        // transition can never lead to a quit state.
        debug_assert!(!dfa.is_quit_state(*sid));
    }
    Ok(())
}
|
||||
|
||||
/// Performs the final "end of input" step of a reverse search for the lazy
/// (hybrid) DFA, updating `sid` and recording a match starting at
/// `sp.start` in `mat` if one is found.
///
/// Mirrors `dfa_eoi_rev` above, with fallible transitions (cache exhaustion
/// maps to a "gave up" error).
#[cfg(feature = "hybrid")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn hybrid_eoi_rev(
    dfa: &crate::hybrid::dfa::DFA,
    cache: &mut crate::hybrid::dfa::Cache,
    input: &Input<'_>,
    sid: &mut crate::hybrid::LazyStateID,
    mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
    let sp = input.get_span();
    if sp.start > 0 {
        // The byte just before the span provides look-behind context and
        // serves as the final transition of the reverse scan.
        let byte = input.haystack()[sp.start - 1];
        *sid = dfa
            .next_state(cache, *sid, byte)
            .map_err(|_| MatchError::gave_up(sp.start))?;
        if sid.is_match() {
            let pattern = dfa.match_pattern(cache, *sid, 0);
            *mat = Some(HalfMatch::new(pattern, sp.start));
        } else if sid.is_quit() {
            return Err(MatchError::quit(byte, sp.start - 1));
        }
    } else {
        // At the haystack boundary, take the true EOI transition.
        *sid = dfa
            .next_eoi_state(cache, *sid)
            .map_err(|_| MatchError::gave_up(sp.start))?;
        if sid.is_match() {
            let pattern = dfa.match_pattern(cache, *sid, 0);
            *mat = Some(HalfMatch::new(pattern, 0));
        }
        // N.B. We don't have to check 'is_quit' here because the EOI
        // transition can never lead to a quit state.
        debug_assert!(!sid.is_quit());
    }
    Ok(())
}
|
||||
81
third-party/vendor/regex-automata/src/meta/literal.rs
vendored
Normal file
81
third-party/vendor/regex-automata/src/meta/literal.rs
vendored
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
use alloc::{vec, vec::Vec};
|
||||
|
||||
use regex_syntax::hir::Hir;
|
||||
|
||||
use crate::{meta::regex::RegexInfo, util::search::MatchKind};
|
||||
|
||||
/// Pull out an alternation of literals from the given sequence of HIR
|
||||
/// expressions.
|
||||
///
|
||||
/// There are numerous ways for this to fail. Generally, this only applies
|
||||
/// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there
|
||||
/// are "too few" alternates, in which case, the regex engine is likely faster.
|
||||
///
|
||||
/// And currently, this only returns something when 'hirs.len() == 1'.
|
||||
pub(crate) fn alternation_literals(
|
||||
info: &RegexInfo,
|
||||
hirs: &[&Hir],
|
||||
) -> Option<Vec<Vec<u8>>> {
|
||||
use regex_syntax::hir::{HirKind, Literal};
|
||||
|
||||
// Might as well skip the work below if we know we can't build an
|
||||
// Aho-Corasick searcher.
|
||||
if !cfg!(feature = "perf-literal-multisubstring") {
|
||||
return None;
|
||||
}
|
||||
// This is pretty hacky, but basically, if `is_alternation_literal` is
|
||||
// true, then we can make several assumptions about the structure of our
|
||||
// HIR. This is what justifies the `unreachable!` statements below.
|
||||
if hirs.len() != 1
|
||||
|| !info.props()[0].look_set().is_empty()
|
||||
|| info.props()[0].explicit_captures_len() > 0
|
||||
|| !info.props()[0].is_alternation_literal()
|
||||
|| info.config().get_match_kind() != MatchKind::LeftmostFirst
|
||||
{
|
||||
return None;
|
||||
}
|
||||
let hir = &hirs[0];
|
||||
let alts = match *hir.kind() {
|
||||
HirKind::Alternation(ref alts) => alts,
|
||||
_ => return None, // one literal isn't worth it
|
||||
};
|
||||
|
||||
let mut lits = vec![];
|
||||
for alt in alts {
|
||||
let mut lit = vec![];
|
||||
match *alt.kind() {
|
||||
HirKind::Literal(Literal(ref bytes)) => {
|
||||
lit.extend_from_slice(bytes)
|
||||
}
|
||||
HirKind::Concat(ref exprs) => {
|
||||
for e in exprs {
|
||||
match *e.kind() {
|
||||
HirKind::Literal(Literal(ref bytes)) => {
|
||||
lit.extend_from_slice(bytes);
|
||||
}
|
||||
_ => unreachable!("expected literal, got {:?}", e),
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => unreachable!("expected literal or concat, got {:?}", alt),
|
||||
}
|
||||
lits.push(lit);
|
||||
}
|
||||
// Why do this? Well, when the number of literals is small, it's likely
|
||||
// that we'll use the lazy DFA which is in turn likely to be faster than
|
||||
// Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have
|
||||
// a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use
|
||||
// the latter because it is so hungry (in time and space), and the former
|
||||
// is decently fast, but not as fast as a well oiled lazy DFA.
|
||||
//
|
||||
// However, once the number starts getting large, the lazy DFA is likely
|
||||
// to start thrashing because of the modest default cache size. When
|
||||
// exactly does this happen? Dunno. But at whatever point that is (we make
|
||||
// a guess below based on ad hoc benchmarking), we'll want to cut over to
|
||||
// Aho-Corasick, where even the contiguous NFA is likely to do much better.
|
||||
if lits.len() < 3000 {
|
||||
debug!("skipping Aho-Corasick because there are too few literals");
|
||||
return None;
|
||||
}
|
||||
Some(lits)
|
||||
}
|
||||
62
third-party/vendor/regex-automata/src/meta/mod.rs
vendored
Normal file
62
third-party/vendor/regex-automata/src/meta/mod.rs
vendored
Normal file
|
|
@ -0,0 +1,62 @@
|
|||
/*!
|
||||
Provides a regex matcher that composes several other regex matchers
|
||||
automatically.
|
||||
|
||||
This module is home to a meta [`Regex`], which provides a convenient high
|
||||
level API for executing regular expressions in linear time.
|
||||
|
||||
# Comparison with the `regex` crate
|
||||
|
||||
A meta `Regex` is the implementation used directly by the `regex` crate.
|
||||
Indeed, the `regex` crate API is essentially just a light wrapper over a meta
|
||||
`Regex`. This means that if you need the full flexibility offered by this
|
||||
API, then you should be able to switch to using this API directly without
|
||||
any changes in match semantics or syntax. However, there are some API level
|
||||
differences:
|
||||
|
||||
* The `regex` crate API returns match objects that include references to the
|
||||
haystack itself, which in turn makes it easy to access the matching strings
|
||||
without having to slice the haystack yourself. In contrast, a meta `Regex`
|
||||
returns match objects that only have offsets in them.
|
||||
* At time of writing, a meta `Regex` doesn't have some of the convenience
|
||||
routines that the `regex` crate has, such as replacements. Note though that
|
||||
[`Captures::interpolate_string`](crate::util::captures::Captures::interpolate_string)
|
||||
will handle the replacement string interpolation for you.
|
||||
* A meta `Regex` supports the [`Input`](crate::Input) abstraction, which
|
||||
provides a way to configure a search in more ways than is supported by the
|
||||
`regex` crate. For example, [`Input::anchored`](crate::Input::anchored) can
|
||||
be used to run an anchored search, regardless of whether the pattern is itself
|
||||
anchored with a `^`.
|
||||
* A meta `Regex` supports multi-pattern searching everywhere.
|
||||
Indeed, every [`Match`](crate::Match) returned by the search APIs
|
||||
include a [`PatternID`](crate::PatternID) indicating which pattern
|
||||
matched. In the single pattern case, all matches correspond to
|
||||
[`PatternID::ZERO`](crate::PatternID::ZERO). In contrast, the `regex` crate
|
||||
has distinct `Regex` and a `RegexSet` APIs. The former only supports a single
|
||||
pattern, while the latter supports multiple patterns but cannot report the
|
||||
offsets of a match.
|
||||
* A meta `Regex` provides the explicit capability of bypassing its internal
|
||||
memory pool for automatically acquiring mutable scratch space required by its
|
||||
internal regex engines. Namely, a [`Cache`] can be explicitly provided to lower
|
||||
level routines such as [`Regex::search_with`].
|
||||
|
||||
*/
|
||||
|
||||
pub use self::{
|
||||
error::BuildError,
|
||||
regex::{
|
||||
Builder, Cache, CapturesMatches, Config, FindMatches, Regex, Split,
|
||||
SplitN,
|
||||
},
|
||||
};
|
||||
|
||||
mod error;
|
||||
#[cfg(any(feature = "dfa-build", feature = "hybrid"))]
|
||||
mod limited;
|
||||
mod literal;
|
||||
mod regex;
|
||||
mod reverse_inner;
|
||||
#[cfg(any(feature = "dfa-build", feature = "hybrid"))]
|
||||
mod stopat;
|
||||
mod strategy;
|
||||
mod wrappers;
|
||||
3649
third-party/vendor/regex-automata/src/meta/regex.rs
vendored
Normal file
3649
third-party/vendor/regex-automata/src/meta/regex.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
220
third-party/vendor/regex-automata/src/meta/reverse_inner.rs
vendored
Normal file
220
third-party/vendor/regex-automata/src/meta/reverse_inner.rs
vendored
Normal file
|
|
@ -0,0 +1,220 @@
|
|||
/*!
|
||||
A module dedicated to plucking inner literals out of a regex pattern, and
|
||||
then constructing a prefilter for them. We also include a regex pattern
|
||||
"prefix" that corresponds to the bits of the regex that need to match before
|
||||
the literals do. The reverse inner optimization then proceeds by looking for
|
||||
matches of the inner literal(s), and then doing a reverse search of the prefix
|
||||
from the start of the literal match to find the overall start position of the
|
||||
match.
|
||||
|
||||
The essential invariant we want to uphold here is that the literals we return
|
||||
reflect a set where *at least* one of them must match in order for the overall
|
||||
regex to match. We also need to maintain the invariant that the regex prefix
|
||||
returned corresponds to the entirety of the regex up until the literals we
|
||||
return.
|
||||
|
||||
This somewhat limits what we can do. That is, if we a regex like
|
||||
`\w+(@!|%%)\w+`, then we can pluck the `{@!, %%}` out and build a prefilter
|
||||
from it. Then we just need to compile `\w+` in reverse. No fuss no muss. But if
|
||||
we have a regex like \d+@!|\w+%%`, then we get kind of stymied. Technically,
|
||||
we could still extract `{@!, %%}`, and it is true that at least of them must
|
||||
match. But then, what is our regex prefix? Again, in theory, that could be
|
||||
`\d+|\w+`, but that's not quite right, because the `\d+` only matches when `@!`
|
||||
matches, and `\w+` only matches when `%%` matches.
|
||||
|
||||
All of that is technically possible to do, but it seemingly requires a lot of
|
||||
sophistication and machinery. Probably the way to tackle that is with some kind
|
||||
of formalism and approach this problem more generally.
|
||||
|
||||
For now, the code below basically just looks for a top-level concatenation.
|
||||
And if it can find one, it looks for literals in each of the direct child
|
||||
sub-expressions of that concatenation. If some good ones are found, we return
|
||||
those and a concatenation of the Hir expressions seen up to that point.
|
||||
*/
|
||||
|
||||
use alloc::vec::Vec;
|
||||
|
||||
use regex_syntax::hir::{self, literal, Hir, HirKind};
|
||||
|
||||
use crate::{util::prefilter::Prefilter, MatchKind};
|
||||
|
||||
/// Attempts to extract an "inner" prefilter from the given HIR expressions. If
|
||||
/// one was found, then a concatenation of the HIR expressions that precede it
|
||||
/// is returned.
|
||||
///
|
||||
/// The idea here is that the prefilter returned can be used to find candidate
|
||||
/// matches. And then the HIR returned can be used to build a reverse regex
|
||||
/// matcher, which will find the start of the candidate match. Finally, the
|
||||
/// match still has to be confirmed with a normal anchored forward scan to find
|
||||
/// the end position of the match.
|
||||
///
|
||||
/// Note that this assumes leftmost-first match semantics, so callers must
|
||||
/// not call this otherwise.
|
||||
pub(crate) fn extract(hirs: &[&Hir]) -> Option<(Hir, Prefilter)> {
    if hirs.len() != 1 {
        debug!(
            "skipping reverse inner optimization since it only \
             supports 1 pattern, {} were given",
            hirs.len(),
        );
        return None;
    }
    let mut concat = match top_concat(hirs[0]) {
        Some(concat) => concat,
        None => {
            debug!(
                "skipping reverse inner optimization because a top-level \
                 concatenation could not found",
            );
            return None;
        }
    };
    // We skip the first HIR because if it did have a prefix prefilter in it,
    // we probably wouldn't be here looking for an inner prefilter.
    for i in 1..concat.len() {
        let hir = &concat[i];
        let pre = match prefilter(hir) {
            None => continue,
            Some(pre) => pre,
        };
        // Even if we got a prefilter, if it isn't consider "fast," then we
        // probably don't want to bother with it. Namely, since the reverse
        // inner optimization requires some overhead, it likely only makes
        // sense if the prefilter scan itself is (believed) to be much faster
        // than the regex engine.
        if !pre.is_fast() {
            debug!(
                "skipping extracted inner prefilter because \
                 it probably isn't fast"
            );
            continue;
        }
        // `split_off(i)` leaves `concat` holding expressions [0, i), which
        // form the regex prefix; [i, len) become the suffix the prefilter
        // candidates were drawn from.
        let concat_suffix = Hir::concat(concat.split_off(i));
        let concat_prefix = Hir::concat(concat);
        // Look for a prefilter again. Why? Because above we only looked for
        // a prefilter on the individual 'hir', but we might be able to find
        // something better and more discriminatory by looking at the entire
        // suffix. We don't do this above to avoid making this loop worst case
        // quadratic in the length of 'concat'.
        let pre2 = match prefilter(&concat_suffix) {
            None => pre,
            Some(pre2) => {
                // Only prefer the suffix-wide prefilter when it is also
                // believed to be fast; otherwise keep the original.
                if pre2.is_fast() {
                    pre2
                } else {
                    pre
                }
            }
        };
        return Some((concat_prefix, pre2));
    }
    debug!(
        "skipping reverse inner optimization because a top-level \
         sub-expression with a fast prefilter could not be found"
    );
    None
}
|
||||
|
||||
/// Attempt to extract a prefilter from an HIR expression.
|
||||
///
|
||||
/// We do a little massaging here to do our best that the prefilter we get out
|
||||
/// of this is *probably* fast. Basically, the false positive rate has a much
|
||||
/// higher impact for things like the reverse inner optimization because more
|
||||
/// work needs to potentially be done for each candidate match.
|
||||
///
|
||||
/// Note that this assumes leftmost-first match semantics, so callers must
|
||||
/// not call this otherwise.
|
||||
fn prefilter(hir: &Hir) -> Option<Prefilter> {
|
||||
let mut extractor = literal::Extractor::new();
|
||||
extractor.kind(literal::ExtractKind::Prefix);
|
||||
let mut prefixes = extractor.extract(hir);
|
||||
debug!(
|
||||
"inner prefixes (len={:?}) extracted before optimization: {:?}",
|
||||
prefixes.len(),
|
||||
prefixes
|
||||
);
|
||||
// Since these are inner literals, we know they cannot be exact. But the
|
||||
// extractor doesn't know this. We mark them as inexact because this might
|
||||
// impact literal optimization. Namely, optimization weights "all literals
|
||||
// are exact" as very high, because it presumes that any match results in
|
||||
// an overall match. But of course, that is not the case here.
|
||||
//
|
||||
// In practice, this avoids plucking out a ASCII-only \s as an alternation
|
||||
// of single-byte whitespace characters.
|
||||
prefixes.make_inexact();
|
||||
prefixes.optimize_for_prefix_by_preference();
|
||||
debug!(
|
||||
"inner prefixes (len={:?}) extracted after optimization: {:?}",
|
||||
prefixes.len(),
|
||||
prefixes
|
||||
);
|
||||
prefixes
|
||||
.literals()
|
||||
.and_then(|lits| Prefilter::new(MatchKind::LeftmostFirst, lits))
|
||||
}
|
||||
|
||||
/// Looks for a "top level" HirKind::Concat item in the given HIR. This will
|
||||
/// try to return one even if it's embedded in a capturing group, but is
|
||||
/// otherwise pretty conservative in what is returned.
|
||||
///
|
||||
/// The HIR returned is a complete copy of the concat with all capturing
|
||||
/// groups removed. In effect, the concat returned is "flattened" with respect
|
||||
/// to capturing groups. This makes the detection logic above for prefixes
|
||||
/// a bit simpler, and it works because 1) capturing groups never influence
|
||||
/// whether a match occurs or not and 2) capturing groups are not used when
|
||||
/// doing the reverse inner search to find the start of the match.
|
||||
fn top_concat(mut hir: &Hir) -> Option<Vec<Hir>> {
    loop {
        hir = match hir.kind() {
            // None of these can yield a usable top-level concatenation.
            HirKind::Empty
            | HirKind::Literal(_)
            | HirKind::Class(_)
            | HirKind::Look(_)
            | HirKind::Repetition(_)
            | HirKind::Alternation(_) => return None,
            // Look straight through capturing groups.
            HirKind::Capture(hir::Capture { ref sub, .. }) => sub,
            HirKind::Concat(ref subs) => {
                // We are careful to only do the flattening/copy when we know
                // we have a "top level" concat we can inspect. This avoids
                // doing extra work in cases where we definitely won't use it.
                // (This might still be wasted work if we can't go on to find
                // some literals to extract.)
                let concat =
                    Hir::concat(subs.iter().map(|h| flatten(h)).collect());
                return match concat.into_kind() {
                    HirKind::Concat(xs) => Some(xs),
                    // It is actually possible for this case to occur, because
                    // 'Hir::concat' might simplify the expression to the point
                    // that concatenations are actually removed. One wonders
                    // whether this leads to other cases where we should be
                    // extracting literals, but in theory, I believe if we do
                    // get here, then it means that a "real" prefilter failed
                    // to be extracted and we should probably leave well enough
                    // alone. (A "real" prefilter is unbothered by "top-level
                    // concats" and "capturing groups.")
                    _ => return None,
                };
            }
        };
    }
}
|
||||
|
||||
/// Returns a copy of the given HIR but with all capturing groups removed.
|
||||
fn flatten(hir: &Hir) -> Hir {
|
||||
match hir.kind() {
|
||||
HirKind::Empty => Hir::empty(),
|
||||
HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()),
|
||||
HirKind::Class(ref x) => Hir::class(x.clone()),
|
||||
HirKind::Look(ref x) => Hir::look(x.clone()),
|
||||
HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))),
|
||||
// This is the interesting case. We just drop the group information
|
||||
// entirely and use the child HIR itself.
|
||||
HirKind::Capture(hir::Capture { ref sub, .. }) => flatten(sub),
|
||||
HirKind::Alternation(ref xs) => {
|
||||
Hir::alternation(xs.iter().map(|x| flatten(x)).collect())
|
||||
}
|
||||
HirKind::Concat(ref xs) => {
|
||||
Hir::concat(xs.iter().map(|x| flatten(x)).collect())
|
||||
}
|
||||
}
|
||||
}
|
||||
212
third-party/vendor/regex-automata/src/meta/stopat.rs
vendored
Normal file
212
third-party/vendor/regex-automata/src/meta/stopat.rs
vendored
Normal file
|
|
@ -0,0 +1,212 @@
|
|||
/*!
|
||||
This module defines two bespoke forward DFA search routines. One for the lazy
|
||||
DFA and one for the fully compiled DFA. These routines differ from the normal
|
||||
ones by reporting the position at which the search terminates when a match
|
||||
*isn't* found.
|
||||
|
||||
This position at which a search terminates is useful in contexts where the meta
|
||||
regex engine runs optimizations that could go quadratic if we aren't careful.
|
||||
Namely, a regex search *could* scan to the end of the haystack only to report a
|
||||
non-match. If the caller doesn't know that the search scanned to the end of the
|
||||
haystack, it might restart the search at the next literal candidate it finds
|
||||
and repeat the process.
|
||||
|
||||
Providing the caller with the position at which the search stopped provides a
|
||||
way for the caller to determine the point at which subsequent scans should not
|
||||
pass. This is principally used in the "reverse inner" optimization, which works
|
||||
like this:
|
||||
|
||||
1. Look for a match of an inner literal. Say, 'Z' in '\w+Z\d+'.
|
||||
2. At the spot where 'Z' matches, do a reverse anchored search from there for
|
||||
'\w+'.
|
||||
3. If the reverse search matches, it corresponds to the start position of a
|
||||
(possible) match. At this point, do a forward anchored search to find the end
|
||||
position. If an end position is found, then we have a match and we know its
|
||||
bounds.
|
||||
|
||||
If the forward anchored search in (3) searches the entire rest of the haystack
|
||||
but reports a non-match, then a naive implementation of the above will continue
|
||||
back at step 1 looking for more candidates. There might still be a match to be
|
||||
found! It's possible. But we already scanned the whole haystack. So if we keep
|
||||
repeating the process, then we might wind up taking quadratic time in the size
|
||||
of the haystack, which is not great.
|
||||
|
||||
So if the forward anchored search in (3) reports the position at which it
|
||||
stops, then we can detect whether quadratic behavior might be occurring in
|
||||
steps (1) and (2). For (1), it occurs if the literal candidate found occurs
|
||||
*before* the end of the previous search in (3), since that means we're now
|
||||
going to look for another match in a place where the forward search has already
|
||||
scanned. It is *correct* to do so, but our technique has become inefficient.
|
||||
For (2), quadratic behavior occurs similarly when its reverse search extends
|
||||
past the point where the previous forward search in (3) terminated. Indeed, to
|
||||
implement (2), we use the sibling 'limited' module for ensuring our reverse
|
||||
scan doesn't go further than we want.
|
||||
|
||||
See the 'opt/reverse-inner' benchmarks in rebar for a real demonstration of
|
||||
how quadratic behavior is mitigated.
|
||||
*/
|
||||
|
||||
use crate::{meta::error::RetryFailError, HalfMatch, Input, MatchError};
|
||||
|
||||
/// A forward half-match search using a fully compiled dense DFA that, on a
/// non-match, also reports the offset at which the search stopped.
///
/// The outer `Result` carries failures from computing the start state or
/// from the DFA entering a quit state. The inner `Result` is `Ok` with the
/// half match found, or `Err` with the haystack offset at which the search
/// terminated without a match (used by callers to bound subsequent scans).
#[cfg(feature = "dfa-build")]
pub(crate) fn dfa_try_search_half_fwd(
    dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
    input: &Input<'_>,
) -> Result<Result<HalfMatch, usize>, RetryFailError> {
    use crate::dfa::{accel, Automaton};

    let mut mat = None;
    let mut sid = dfa.start_state_forward(input)?;
    let mut at = input.start();
    while at < input.end() {
        sid = dfa.next_state(sid, input.haystack()[at]);
        // Only "special" states need extra handling; all other states just
        // advance to the next byte.
        if dfa.is_special_state(sid) {
            if dfa.is_match_state(sid) {
                let pattern = dfa.match_pattern(sid, 0);
                mat = Some(HalfMatch::new(pattern, at));
                if input.get_earliest() {
                    return Ok(mat.ok_or(at));
                }
                // A state can be both a match state and accelerated; use the
                // accelerator to skip ahead to the next relevant byte.
                if dfa.is_accel_state(sid) {
                    let needs = dfa.accelerator(sid);
                    at = accel::find_fwd(needs, input.haystack(), at)
                        .unwrap_or(input.end());
                    continue;
                }
            } else if dfa.is_accel_state(sid) {
                let needs = dfa.accelerator(sid);
                at = accel::find_fwd(needs, input.haystack(), at)
                    .unwrap_or(input.end());
                continue;
            } else if dfa.is_dead_state(sid) {
                // A dead state can never lead to a match; report where we
                // stopped (or the match recorded earlier).
                return Ok(mat.ok_or(at));
            } else if dfa.is_quit_state(sid) {
                return Err(MatchError::quit(input.haystack()[at], at).into());
            } else {
                // Ideally we wouldn't use a DFA that specialized start states
                // and thus 'is_start_state()' could never be true here, but in
                // practice we reuse the DFA created for the full regex which
                // will specialize start states whenever there is a prefilter.
                debug_assert!(dfa.is_start_state(sid));
            }
        }
        at += 1;
    }
    // Process the end-of-input transition, which may still produce a match.
    dfa_eoi_fwd(dfa, input, &mut sid, &mut mat)?;
    Ok(mat.ok_or(at))
}
|
||||
|
||||
/// A forward half-match search using a lazy (hybrid NFA/DFA) DFA that, on a
/// non-match, also reports the offset at which the search stopped.
///
/// The outer `Result` carries start-state errors and "gave up" errors from
/// the lazy DFA's cache. The inner `Result` is `Ok` with the half match
/// found, or `Err` with the haystack offset at which the search terminated
/// without a match.
#[cfg(feature = "hybrid")]
pub(crate) fn hybrid_try_search_half_fwd(
    dfa: &crate::hybrid::dfa::DFA,
    cache: &mut crate::hybrid::dfa::Cache,
    input: &Input<'_>,
) -> Result<Result<HalfMatch, usize>, RetryFailError> {
    let mut mat = None;
    let mut sid = dfa.start_state_forward(cache, input)?;
    let mut at = input.start();
    while at < input.end() {
        // A lazy DFA transition can fail (cache exhaustion); surface that as
        // a "gave up" error at the current position.
        sid = dfa
            .next_state(cache, sid, input.haystack()[at])
            .map_err(|_| MatchError::gave_up(at))?;
        // Only "tagged" state IDs need extra handling.
        if sid.is_tagged() {
            if sid.is_match() {
                let pattern = dfa.match_pattern(cache, sid, 0);
                mat = Some(HalfMatch::new(pattern, at));
                if input.get_earliest() {
                    return Ok(mat.ok_or(at));
                }
            } else if sid.is_dead() {
                return Ok(mat.ok_or(at));
            } else if sid.is_quit() {
                return Err(MatchError::quit(input.haystack()[at], at).into());
            } else {
                // We should NEVER get an unknown state ID back from
                // dfa.next_state().
                debug_assert!(!sid.is_unknown());
                // Ideally we wouldn't use a lazy DFA that specialized start
                // states and thus 'sid.is_start()' could never be true here,
                // but in practice we reuse the lazy DFA created for the full
                // regex which will specialize start states whenever there is
                // a prefilter.
                debug_assert!(sid.is_start());
            }
        }
        at += 1;
    }
    // Process the end-of-input transition, which may still produce a match.
    hybrid_eoi_fwd(dfa, cache, input, &mut sid, &mut mat)?;
    Ok(mat.ok_or(at))
}
|
||||
|
||||
/// Processes the final transition for a forward dense DFA search: either the
/// byte just past the search span (when the span ends before the haystack
/// does) or the special end-of-input transition. Records any match found
/// into `mat` and leaves the resulting state in `sid`.
#[cfg(feature = "dfa-build")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn dfa_eoi_fwd(
    dfa: &crate::dfa::dense::DFA<alloc::vec::Vec<u32>>,
    input: &Input<'_>,
    sid: &mut crate::util::primitives::StateID,
    mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
    use crate::dfa::Automaton;

    let sp = input.get_span();
    match input.haystack().get(sp.end) {
        // The span ends before the haystack does, so the "EOI" transition is
        // driven by the actual next byte.
        Some(&b) => {
            *sid = dfa.next_state(*sid, b);
            if dfa.is_match_state(*sid) {
                let pattern = dfa.match_pattern(*sid, 0);
                *mat = Some(HalfMatch::new(pattern, sp.end));
            } else if dfa.is_quit_state(*sid) {
                return Err(MatchError::quit(b, sp.end));
            }
        }
        // The span ends at the haystack's end, so take the true EOI
        // transition.
        None => {
            *sid = dfa.next_eoi_state(*sid);
            if dfa.is_match_state(*sid) {
                let pattern = dfa.match_pattern(*sid, 0);
                *mat = Some(HalfMatch::new(pattern, input.haystack().len()));
            }
            // N.B. We don't have to check 'is_quit' here because the EOI
            // transition can never lead to a quit state.
            debug_assert!(!dfa.is_quit_state(*sid));
        }
    }
    Ok(())
}
|
||||
|
||||
/// Processes the final transition for a forward lazy DFA search: either the
/// byte just past the search span (when the span ends before the haystack
/// does) or the special end-of-input transition. Records any match found
/// into `mat` and leaves the resulting state in `sid`. Lazy transition
/// failures are reported as "gave up" errors.
#[cfg(feature = "hybrid")]
#[cfg_attr(feature = "perf-inline", inline(always))]
fn hybrid_eoi_fwd(
    dfa: &crate::hybrid::dfa::DFA,
    cache: &mut crate::hybrid::dfa::Cache,
    input: &Input<'_>,
    sid: &mut crate::hybrid::LazyStateID,
    mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
    let sp = input.get_span();
    match input.haystack().get(sp.end) {
        // The span ends before the haystack does, so the "EOI" transition is
        // driven by the actual next byte.
        Some(&b) => {
            *sid = dfa
                .next_state(cache, *sid, b)
                .map_err(|_| MatchError::gave_up(sp.end))?;
            if sid.is_match() {
                let pattern = dfa.match_pattern(cache, *sid, 0);
                *mat = Some(HalfMatch::new(pattern, sp.end));
            } else if sid.is_quit() {
                return Err(MatchError::quit(b, sp.end));
            }
        }
        // The span ends at the haystack's end, so take the true EOI
        // transition.
        None => {
            *sid = dfa
                .next_eoi_state(cache, *sid)
                .map_err(|_| MatchError::gave_up(input.haystack().len()))?;
            if sid.is_match() {
                let pattern = dfa.match_pattern(cache, *sid, 0);
                *mat = Some(HalfMatch::new(pattern, input.haystack().len()));
            }
            // N.B. We don't have to check 'is_quit' here because the EOI
            // transition can never lead to a quit state.
            debug_assert!(!sid.is_quit());
        }
    }
    Ok(())
}
|
||||
1914
third-party/vendor/regex-automata/src/meta/strategy.rs
vendored
Normal file
1914
third-party/vendor/regex-automata/src/meta/strategy.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
1351
third-party/vendor/regex-automata/src/meta/wrappers.rs
vendored
Normal file
1351
third-party/vendor/regex-automata/src/meta/wrappers.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
55
third-party/vendor/regex-automata/src/nfa/mod.rs
vendored
Normal file
55
third-party/vendor/regex-automata/src/nfa/mod.rs
vendored
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
/*!
|
||||
Provides non-deterministic finite automata (NFA) and regex engines that use
|
||||
them.
|
||||
|
||||
While NFAs and DFAs (deterministic finite automata) have equivalent *theoretical*
|
||||
power, their usage in practice tends to result in different engineering trade
|
||||
offs. While this isn't meant to be a comprehensive treatment of the topic, here
|
||||
are a few key trade offs that are, at minimum, true for this crate:
|
||||
|
||||
* NFAs tend to be represented sparsely where as DFAs are represented densely.
|
||||
Sparse representations use less memory, but are slower to traverse. Conversely,
|
||||
dense representations use more memory, but are faster to traverse. (Sometimes
|
||||
these lines are blurred. For example, an `NFA` might choose to represent a
|
||||
particular state in a dense fashion, and a DFA can be built using a sparse
|
||||
representation via [`sparse::DFA`](crate::dfa::sparse::DFA).)
|
||||
* NFAs have epsilon transitions and DFAs don't. In practice, this means that
|
||||
handling a single byte in a haystack with an NFA at search time may require
|
||||
visiting multiple NFA states. In a DFA, each byte only requires visiting
|
||||
a single state. Stated differently, NFAs require a variable number of CPU
|
||||
instructions to process one byte in a haystack where as a DFA uses a constant
|
||||
number of CPU instructions to process one byte.
|
||||
* NFAs are generally easier to amend with secondary storage. For example, the
|
||||
[`thompson::pikevm::PikeVM`] uses an NFA to match, but also uses additional
|
||||
memory beyond the model of a finite state machine to track offsets for matching
|
||||
capturing groups. Conversely, the most a DFA can do is report the offset (and
|
||||
pattern ID) at which a match occurred. This is generally why we also compile
|
||||
DFAs in reverse, so that we can run them after finding the end of a match to
|
||||
also find the start of a match.
|
||||
* NFAs take worst case linear time to build, but DFAs take worst case
|
||||
exponential time to build. The [hybrid NFA/DFA](crate::hybrid) mitigates this
|
||||
challenge for DFAs in many practical cases.
|
||||
|
||||
There are likely other differences, but the bottom line is that NFAs tend to be
|
||||
more memory efficient and give easier opportunities for increasing expressive
|
||||
power, where as DFAs are faster to search with.
|
||||
|
||||
# Why only a Thompson NFA?
|
||||
|
||||
Currently, the only kind of NFA we support in this crate is a [Thompson
|
||||
NFA](https://en.wikipedia.org/wiki/Thompson%27s_construction). This refers
|
||||
to a specific construction algorithm that takes the syntax of a regex
|
||||
pattern and converts it to an NFA. Specifically, it makes gratuitous use of
|
||||
epsilon transitions in order to keep its structure simple. In exchange, its
|
||||
construction time is linear in the size of the regex. A Thompson NFA also makes
|
||||
the guarantee that given any state and a character in a haystack, there is at
|
||||
most one transition defined for it. (Although there may be many epsilon
|
||||
transitions.)
|
||||
|
||||
It is possible that other types of NFAs will be added in the future, such as a
|
||||
[Glushkov NFA](https://en.wikipedia.org/wiki/Glushkov%27s_construction_algorithm).
|
||||
But currently, this crate only provides a Thompson NFA.
|
||||
*/
|
||||
|
||||
#[cfg(feature = "nfa-thompson")]
|
||||
pub mod thompson;
|
||||
1908
third-party/vendor/regex-automata/src/nfa/thompson/backtrack.rs
vendored
Normal file
1908
third-party/vendor/regex-automata/src/nfa/thompson/backtrack.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
1337
third-party/vendor/regex-automata/src/nfa/thompson/builder.rs
vendored
Normal file
1337
third-party/vendor/regex-automata/src/nfa/thompson/builder.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
2346
third-party/vendor/regex-automata/src/nfa/thompson/compiler.rs
vendored
Normal file
2346
third-party/vendor/regex-automata/src/nfa/thompson/compiler.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
185
third-party/vendor/regex-automata/src/nfa/thompson/error.rs
vendored
Normal file
185
third-party/vendor/regex-automata/src/nfa/thompson/error.rs
vendored
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
use crate::util::{
|
||||
captures, look,
|
||||
primitives::{PatternID, StateID},
|
||||
};
|
||||
|
||||
/// An error that can occur during the construction of a thompson NFA.
///
/// This error does not provide many introspection capabilities. There are
/// generally only two things you can do with it:
///
/// * Obtain a human readable message via its `std::fmt::Display` impl.
/// * Access an underlying [`regex_syntax::Error`] type from its `source`
/// method via the `std::error::Error` trait. This error only occurs when using
/// convenience routines for building an NFA directly from a pattern string.
///
/// Otherwise, errors typically occur when a limit has been breached. For
/// example, if the total heap usage of the compiled NFA exceeds the limit
/// set by [`Config::nfa_size_limit`](crate::nfa::thompson::Config), then
/// building the NFA will fail.
#[derive(Clone, Debug)]
pub struct BuildError {
    /// The underlying kind of error. Kept private so the set of error kinds
    /// can grow without breaking the public API.
    kind: BuildErrorKind,
}
|
||||
|
||||
/// The kind of error that occurred during the construction of a thompson NFA.
#[derive(Clone, Debug)]
enum BuildErrorKind {
    /// An error that occurred while parsing a regular expression. Note that
    /// this error may be printed over multiple lines, and is generally
    /// intended to be end user readable on its own.
    #[cfg(feature = "syntax")]
    Syntax(regex_syntax::Error),
    /// An error that occurs if the capturing groups provided to an NFA builder
    /// do not satisfy the documented invariants. For example, things like
    /// too many groups, missing groups, having the first (zeroth) group be
    /// named or duplicate group names within the same pattern.
    Captures(captures::GroupInfoError),
    /// An error that occurs when an NFA contains a Unicode word boundary, but
    /// where the crate was compiled without the necessary data for dealing
    /// with Unicode word boundaries.
    Word(look::UnicodeWordBoundaryError),
    /// An error that occurs if too many patterns were given to the NFA
    /// compiler.
    TooManyPatterns {
        /// The number of patterns given, which exceeds the limit.
        given: usize,
        /// The limit on the number of patterns.
        limit: usize,
    },
    /// An error that occurs if too many states are produced while building an
    /// NFA.
    TooManyStates {
        /// The minimum number of states that are desired, which exceeds the
        /// limit.
        given: usize,
        /// The limit on the number of states.
        limit: usize,
    },
    /// An error that occurs when NFA compilation exceeds a configured heap
    /// limit.
    ExceededSizeLimit {
        /// The configured limit, in bytes.
        limit: usize,
    },
    /// An error that occurs when an invalid capture group index is added to
    /// the NFA. An "invalid" index can be one that would otherwise overflow
    /// a `usize` on the current target.
    InvalidCaptureIndex {
        /// The invalid index that was given.
        index: u32,
    },
    /// An error that occurs when one tries to build a reverse NFA with
    /// captures enabled. Currently, this isn't supported, but we probably
    /// should support it at some point.
    #[cfg(feature = "syntax")]
    UnsupportedCaptures,
}
|
||||
|
||||
impl BuildError {
|
||||
/// If this error occurred because the NFA exceeded the configured size
|
||||
/// limit before being built, then this returns the configured size limit.
|
||||
///
|
||||
/// The limit returned is what was configured, and corresponds to the
|
||||
/// maximum amount of heap usage in bytes.
|
||||
pub fn size_limit(&self) -> Option<usize> {
|
||||
match self.kind {
|
||||
BuildErrorKind::ExceededSizeLimit { limit } => Some(limit),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn kind(&self) -> &BuildErrorKind {
|
||||
&self.kind
|
||||
}
|
||||
|
||||
#[cfg(feature = "syntax")]
|
||||
pub(crate) fn syntax(err: regex_syntax::Error) -> BuildError {
|
||||
BuildError { kind: BuildErrorKind::Syntax(err) }
|
||||
}
|
||||
|
||||
pub(crate) fn captures(err: captures::GroupInfoError) -> BuildError {
|
||||
BuildError { kind: BuildErrorKind::Captures(err) }
|
||||
}
|
||||
|
||||
pub(crate) fn word(err: look::UnicodeWordBoundaryError) -> BuildError {
|
||||
BuildError { kind: BuildErrorKind::Word(err) }
|
||||
}
|
||||
|
||||
pub(crate) fn too_many_patterns(given: usize) -> BuildError {
|
||||
let limit = PatternID::LIMIT;
|
||||
BuildError { kind: BuildErrorKind::TooManyPatterns { given, limit } }
|
||||
}
|
||||
|
||||
pub(crate) fn too_many_states(given: usize) -> BuildError {
|
||||
let limit = StateID::LIMIT;
|
||||
BuildError { kind: BuildErrorKind::TooManyStates { given, limit } }
|
||||
}
|
||||
|
||||
pub(crate) fn exceeded_size_limit(limit: usize) -> BuildError {
|
||||
BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } }
|
||||
}
|
||||
|
||||
pub(crate) fn invalid_capture_index(index: u32) -> BuildError {
|
||||
BuildError { kind: BuildErrorKind::InvalidCaptureIndex { index } }
|
||||
}
|
||||
|
||||
#[cfg(feature = "syntax")]
|
||||
pub(crate) fn unsupported_captures() -> BuildError {
|
||||
BuildError { kind: BuildErrorKind::UnsupportedCaptures }
|
||||
}
|
||||
}
|
||||
|
||||
// Only the syntax and capture-group variants wrap an underlying error; all
// other kinds are leaf errors with no source.
#[cfg(feature = "std")]
impl std::error::Error for BuildError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self.kind() {
            #[cfg(feature = "syntax")]
            BuildErrorKind::Syntax(ref err) => Some(err),
            BuildErrorKind::Captures(ref err) => Some(err),
            _ => None,
        }
    }
}
|
||||
|
||||
// Human readable messages for each error kind. Messages for variants that
// wrap another error are intentionally terse; the details are available via
// 'std::error::Error::source'.
impl core::fmt::Display for BuildError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self.kind() {
            #[cfg(feature = "syntax")]
            BuildErrorKind::Syntax(_) => write!(f, "error parsing regex"),
            BuildErrorKind::Captures(_) => {
                write!(f, "error with capture groups")
            }
            BuildErrorKind::Word(_) => {
                write!(f, "NFA contains Unicode word boundary")
            }
            BuildErrorKind::TooManyPatterns { given, limit } => write!(
                f,
                "attempted to compile {} patterns, \
                 which exceeds the limit of {}",
                given, limit,
            ),
            BuildErrorKind::TooManyStates { given, limit } => write!(
                f,
                "attempted to compile {} NFA states, \
                 which exceeds the limit of {}",
                given, limit,
            ),
            BuildErrorKind::ExceededSizeLimit { limit } => write!(
                f,
                "heap usage during NFA compilation exceeded limit of {}",
                limit,
            ),
            BuildErrorKind::InvalidCaptureIndex { index } => write!(
                f,
                "capture group index {} is invalid (too big or discontinuous)",
                index,
            ),
            #[cfg(feature = "syntax")]
            BuildErrorKind::UnsupportedCaptures => write!(
                f,
                "currently captures must be disabled when compiling \
                 a reverse NFA",
            ),
        }
    }
}
|
||||
528
third-party/vendor/regex-automata/src/nfa/thompson/literal_trie.rs
vendored
Normal file
528
third-party/vendor/regex-automata/src/nfa/thompson/literal_trie.rs
vendored
Normal file
|
|
@ -0,0 +1,528 @@
|
|||
use core::mem;
|
||||
|
||||
use alloc::{vec, vec::Vec};
|
||||
|
||||
use crate::{
|
||||
nfa::thompson::{self, compiler::ThompsonRef, BuildError, Builder},
|
||||
util::primitives::{IteratorIndexExt, StateID},
|
||||
};
|
||||
|
||||
/// A trie that preserves leftmost-first match semantics.
|
||||
///
|
||||
/// This is a purpose-built data structure for optimizing 'lit1|lit2|..|litN'
|
||||
/// patterns. It can *only* handle alternations of literals, which makes it
|
||||
/// somewhat restricted in its scope, but literal alternations are fairly
|
||||
/// common.
|
||||
///
|
||||
/// At a 5,000 foot level, the main idea of this trie is make an alternation of
|
||||
/// literals look more like a DFA than an NFA via epsilon removal.
|
||||
///
|
||||
/// More precisely, the main issue is in how alternations are compiled into
|
||||
/// a Thompson NFA. Namely, each alternation gets a single NFA "union" state
|
||||
/// with an epsilon transition for every branch of the alternation pointing to
|
||||
/// an NFA state corresponding to the start of that branch. The main problem
|
||||
/// with this representation is the cost of computing an epsilon closure. Once
|
||||
/// you hit the alternation's start state, it acts as a sort of "clog" that
|
||||
/// requires you to traverse all of the epsilon transitions to compute the full
|
||||
/// closure.
|
||||
///
|
||||
/// Fixing such clogs in the general case is pretty tricky without going
/// to a DFA (or perhaps a Glushkov NFA, but that comes with other problems).
|
||||
/// But at least in the case of an alternation of literals, we can convert
|
||||
/// that to a prefix trie without too much cost. In theory, that's all you
|
||||
/// really need to do: build the trie and then compile it to a Thompson NFA.
|
||||
/// For example, if you have the pattern 'bar|baz|foo', then using a trie, it
|
||||
/// is transformed to something like 'b(a(r|z))|f'. This reduces the clog by
|
||||
/// reducing the number of epsilon transitions out of the alternation's start
|
||||
/// state from 3 to 2 (it actually gets down to 1 when you use a sparse state,
|
||||
/// which we do below). It's a small effect here, but when your alternation is
|
||||
/// huge, the savings is also huge.
|
||||
///
|
||||
/// And that is... essentially what a LiteralTrie does. But there is one
|
||||
/// hiccup. Consider a regex like 'sam|samwise'. How does a prefix trie compile
|
||||
/// that when leftmost-first semantics are used? If 'sam|samwise' was the
|
||||
/// entire regex, then you could just drop the 'samwise' branch entirely since
|
||||
/// it is impossible to match ('sam' will always take priority, and since it
|
||||
/// is a prefix of 'samwise', 'samwise' will never match). But what about the
|
||||
/// regex '\b(sam|samwise)\b'? In that case, you can't remove 'samwise' because
|
||||
/// it might match when 'sam' doesn't fall on a word boundary.
|
||||
///
|
||||
/// The main idea is that 'sam|samwise' can be translated to 'sam(?:|wise)',
|
||||
/// which is a precisely equivalent regex that also gets rid of the clog.
|
||||
///
|
||||
/// Another example is 'zapper|z|zap'. That gets translated to
|
||||
/// 'z(?:apper||ap)'.
|
||||
///
|
||||
/// We accomplish this by giving each state in the trie multiple "chunks" of
|
||||
/// transitions. Each chunk barrier represents a match. The idea is that once
|
||||
/// you know a match occurs, none of the transitions after the match can be
|
||||
/// re-ordered and mixed in with the transitions before the match. Otherwise,
|
||||
/// the match semantics could be changed.
|
||||
///
|
||||
/// See the 'State' data type for a bit more detail.
|
||||
///
|
||||
/// Future work:
|
||||
///
|
||||
/// * In theory, it would be nice to generalize the idea of removing clogs and
|
||||
/// apply it to the NFA graph itself. Then this could in theory work for
|
||||
/// case insensitive alternations of literals, or even just alternations where
|
||||
/// each branch starts with a non-epsilon transition.
|
||||
/// * Could we instead use the Aho-Corasick algorithm here? The aho-corasick
|
||||
/// crate deals with leftmost-first matches correctly, but I think this implies
|
||||
/// encoding failure transitions into a Thompson NFA somehow. Which seems fine,
|
||||
/// because failure transitions are just unconditional epsilon transitions?
|
||||
/// * Or perhaps even better, could we use an aho_corasick::AhoCorasick
|
||||
/// directly? At time of writing, 0.7 is the current version of the
|
||||
/// aho-corasick crate, and that definitely cannot be used as-is. But if we
|
||||
/// expose the underlying finite state machine API, then could we use it? That
|
||||
/// would be super. If we could figure that out, it might also lend itself to
|
||||
/// more general composition of finite state machines.
|
||||
#[derive(Clone)]
pub(crate) struct LiteralTrie {
    /// The set of trie states. Each state contains one or more chunks, where
    /// each chunk is a sparse set of transitions to other states. A leaf state
    /// is always a match state that contains only empty chunks (i.e., no
    /// transitions).
    ///
    /// The state at index 0 ('StateID::ZERO') is always the root.
    states: Vec<State>,
    /// Whether to add literals in reverse to the trie. Useful when building
    /// a reverse NFA automaton.
    rev: bool,
}
|
||||
|
||||
impl LiteralTrie {
|
||||
/// Create a new literal trie that adds literals in the forward direction.
|
||||
pub(crate) fn forward() -> LiteralTrie {
|
||||
let root = State::default();
|
||||
LiteralTrie { states: vec![root], rev: false }
|
||||
}
|
||||
|
||||
/// Create a new literal trie that adds literals in reverse.
|
||||
pub(crate) fn reverse() -> LiteralTrie {
|
||||
let root = State::default();
|
||||
LiteralTrie { states: vec![root], rev: true }
|
||||
}
|
||||
|
||||
/// Add the given literal to this trie.
|
||||
///
|
||||
/// If the literal could not be added because the `StateID` space was
|
||||
/// exhausted, then an error is returned. If an error returns, the trie
|
||||
/// is in an unspecified state.
|
||||
pub(crate) fn add(&mut self, bytes: &[u8]) -> Result<(), BuildError> {
|
||||
let mut prev = StateID::ZERO;
|
||||
let mut it = bytes.iter().copied();
|
||||
while let Some(b) = if self.rev { it.next_back() } else { it.next() } {
|
||||
prev = self.get_or_add_state(prev, b)?;
|
||||
}
|
||||
self.states[prev].add_match();
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// If the given transition is defined, then return the next state ID.
|
||||
/// Otherwise, add the transition to `from` and point it to a new state.
|
||||
///
|
||||
/// If a new state ID could not be allocated, then an error is returned.
|
||||
fn get_or_add_state(
|
||||
&mut self,
|
||||
from: StateID,
|
||||
byte: u8,
|
||||
) -> Result<StateID, BuildError> {
|
||||
let active = self.states[from].active_chunk();
|
||||
match active.binary_search_by_key(&byte, |t| t.byte) {
|
||||
Ok(i) => Ok(active[i].next),
|
||||
Err(i) => {
|
||||
// Add a new state and get its ID.
|
||||
let next = StateID::new(self.states.len()).map_err(|_| {
|
||||
BuildError::too_many_states(self.states.len())
|
||||
})?;
|
||||
self.states.push(State::default());
|
||||
// Offset our position to account for all transitions and not
|
||||
// just the ones in the active chunk.
|
||||
let i = self.states[from].active_chunk_start() + i;
|
||||
let t = Transition { byte, next };
|
||||
self.states[from].transitions.insert(i, t);
|
||||
Ok(next)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Compile this literal trie to the NFA builder given.
|
||||
///
|
||||
/// This forwards any errors that may occur while using the given builder.
|
||||
pub(crate) fn compile(
|
||||
&self,
|
||||
builder: &mut Builder,
|
||||
) -> Result<ThompsonRef, BuildError> {
|
||||
// Compilation proceeds via depth-first traversal of the trie.
|
||||
//
|
||||
// This is overall pretty brutal. The recursive version of this is
|
||||
// deliciously simple. (See 'compile_to_hir' below for what it might
|
||||
// look like.) But recursion on a trie means your call stack grows
|
||||
// in accordance with the longest literal, which just does not seem
|
||||
// appropriate. So we push the call stack to the heap. But as a result,
|
||||
// the trie traversal becomes pretty brutal because we essentially
|
||||
// have to encode the state of a double for-loop into an explicit call
|
||||
// frame. If someone can simplify this without using recursion, that'd
|
||||
// be great.
|
||||
|
||||
// 'end' is our match state for this trie, but represented in the the
|
||||
// NFA. Any time we see a match in the trie, we insert a transition
|
||||
// from the current state we're in to 'end'.
|
||||
let end = builder.add_empty()?;
|
||||
let mut stack = vec![];
|
||||
let mut f = Frame::new(&self.states[StateID::ZERO]);
|
||||
loop {
|
||||
if let Some(t) = f.transitions.next() {
|
||||
if self.states[t.next].is_leaf() {
|
||||
f.sparse.push(thompson::Transition {
|
||||
start: t.byte,
|
||||
end: t.byte,
|
||||
next: end,
|
||||
});
|
||||
} else {
|
||||
f.sparse.push(thompson::Transition {
|
||||
start: t.byte,
|
||||
end: t.byte,
|
||||
// This is a little funny, but when the frame we create
|
||||
// below completes, it will pop this parent frame off
|
||||
// and modify this transition to point to the correct
|
||||
// state.
|
||||
next: StateID::ZERO,
|
||||
});
|
||||
stack.push(f);
|
||||
f = Frame::new(&self.states[t.next]);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
// At this point, we have visited all transitions in f.chunk, so
|
||||
// add it as a sparse NFA state. Unless the chunk was empty, in
|
||||
// which case, we don't do anything.
|
||||
if !f.sparse.is_empty() {
|
||||
let chunk_id = if f.sparse.len() == 1 {
|
||||
builder.add_range(f.sparse.pop().unwrap())?
|
||||
} else {
|
||||
let sparse = mem::replace(&mut f.sparse, vec![]);
|
||||
builder.add_sparse(sparse)?
|
||||
};
|
||||
f.union.push(chunk_id);
|
||||
}
|
||||
// Now we need to look to see if there are other chunks to visit.
|
||||
if let Some(chunk) = f.chunks.next() {
|
||||
// If we're here, it means we're on the second (or greater)
|
||||
// chunk, which implies there is a match at this point. So
|
||||
// connect this state to the final end state.
|
||||
f.union.push(end);
|
||||
// Advance to the next chunk.
|
||||
f.transitions = chunk.iter();
|
||||
continue;
|
||||
}
|
||||
// Now that we are out of chunks, we have completely visited
|
||||
// this state. So turn our union of chunks into an NFA union
|
||||
// state, and add that union state to the parent state's current
|
||||
// sparse state. (If there is no parent, we're done.)
|
||||
let start = builder.add_union(f.union)?;
|
||||
match stack.pop() {
|
||||
None => {
|
||||
return Ok(ThompsonRef { start, end });
|
||||
}
|
||||
Some(mut parent) => {
|
||||
// OK because the only way a frame gets pushed on to the
|
||||
// stack (aside from the root) is when a transition has
|
||||
// been added to 'sparse'.
|
||||
parent.sparse.last_mut().unwrap().next = start;
|
||||
f = parent;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
    /// Converts this trie to an equivalent HIR expression.
    ///
    /// We don't actually use this, but it's useful for tests. In particular,
    /// it provides a (somewhat) human readable representation of the trie
    /// itself.
    #[cfg(test)]
    fn compile_to_hir(&self) -> regex_syntax::hir::Hir {
        // Compilation always begins at the root state of the trie.
        self.compile_state_to_hir(StateID::ZERO)
    }
|
||||
|
||||
/// The recursive implementation of 'to_hir'.
|
||||
///
|
||||
/// Notice how simple this is compared to 'compile' above. 'compile' could
|
||||
/// be similarly simple, but we opt to not use recursion in order to avoid
|
||||
/// overflowing the stack in the case of a longer literal.
|
||||
#[cfg(test)]
|
||||
fn compile_state_to_hir(&self, sid: StateID) -> regex_syntax::hir::Hir {
|
||||
use regex_syntax::hir::Hir;
|
||||
|
||||
let mut alt = vec![];
|
||||
for (i, chunk) in self.states[sid].chunks().enumerate() {
|
||||
if i > 0 {
|
||||
alt.push(Hir::empty());
|
||||
}
|
||||
if chunk.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let mut chunk_alt = vec![];
|
||||
for t in chunk.iter() {
|
||||
chunk_alt.push(Hir::concat(vec![
|
||||
Hir::literal(vec![t.byte]),
|
||||
self.compile_state_to_hir(t.next),
|
||||
]));
|
||||
}
|
||||
alt.push(Hir::alternation(chunk_alt));
|
||||
}
|
||||
Hir::alternation(alt)
|
||||
}
|
||||
}
|
||||
|
||||
impl core::fmt::Debug for LiteralTrie {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
writeln!(f, "LiteralTrie(")?;
|
||||
for (sid, state) in self.states.iter().with_state_ids() {
|
||||
writeln!(f, "{:06?}: {:?}", sid.as_usize(), state)?;
|
||||
}
|
||||
writeln!(f, ")")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// An explicit stack frame used for traversing the trie without using
/// recursion.
///
/// Each frame is tied to the traversal of a single trie state. The frame is
/// dropped once the entire state (and all of its children) have been visited.
/// The "output" of compiling a state is the 'union' vector, which is in turn
/// converted to a NFA union state. Each branch of the union corresponds to a
/// chunk in the trie state.
///
/// 'sparse' corresponds to the set of transitions for a particular chunk in a
/// trie state. It is ultimately converted to an NFA sparse state. The 'sparse'
/// field, after being converted to a sparse NFA state, is reused for any
/// subsequent chunks in the trie state, if any exist.
#[derive(Debug)]
struct Frame<'a> {
    /// The remaining chunks to visit for a trie state.
    chunks: StateChunksIter<'a>,
    /// The transitions of the current chunk that we're iterating over. Since
    /// every trie state has at least one chunk, every frame is initialized
    /// with the first chunk's transitions ready to be consumed.
    transitions: core::slice::Iter<'a, Transition>,
    /// The NFA state IDs pointing to the start of each chunk compiled by
    /// this trie state. This ultimately gets converted to an NFA union once
    /// the entire trie state (and all of its children) have been compiled.
    /// The order of these matters for leftmost-first match semantics, since
    /// earlier matches in the union are preferred over later ones.
    union: Vec<StateID>,
    /// The actual NFA transitions for a single chunk in a trie state. This
    /// gets converted to an NFA sparse state, and its corresponding NFA state
    /// ID should get added to 'union'.
    sparse: Vec<thompson::Transition>,
}
|
||||
|
||||
impl<'a> Frame<'a> {
|
||||
/// Create a new stack frame for trie traversal. This initializes the
|
||||
/// 'transitions' iterator to the transitions for the first chunk, with the
|
||||
/// 'chunks' iterator being every chunk after the first one.
|
||||
fn new(state: &'a State) -> Frame<'a> {
|
||||
let mut chunks = state.chunks();
|
||||
// every state has at least 1 chunk
|
||||
let chunk = chunks.next().unwrap();
|
||||
let transitions = chunk.iter();
|
||||
Frame { chunks, transitions, union: vec![], sparse: vec![] }
|
||||
}
|
||||
}
|
||||
|
||||
/// A state in a trie.
///
/// This uses a sparse representation. Since we don't use literal tries for
/// searching, and compilation ultimately requires visiting every transition
/// anyway, we use a sparse representation for transitions. This means we save
/// on memory, at the expense of 'LiteralTrie::add' being perhaps a bit slower.
///
/// While 'transitions' is pretty standard as far as tries goes, the 'chunks'
/// piece here is more unusual. In effect, 'chunks' defines a partitioning
/// of 'transitions', where each chunk corresponds to a distinct set of
/// transitions. The key invariant is that a transition in one chunk cannot
/// be moved to another chunk. This is the secret sauce that preserves
/// leftmost-first match semantics.
///
/// A new chunk is added whenever we mark a state as a match state. Once a
/// new chunk is added, the old active chunk is frozen and is never mutated
/// again. The new chunk becomes the active chunk, which is defined as
/// '&transitions[chunks.last().map_or(0, |c| c.1)..]'. Thus, a state where
/// 'chunks' is empty actually contains one chunk. Thus, every state contains
/// at least one (possibly empty) chunk.
///
/// A "leaf" state is a state that has no outgoing transitions (so
/// 'transitions' is empty). Note that there is no way for a leaf state to be a
/// non-matching state. (Although while building the trie, within 'add', a leaf
/// state may exist while not containing any matches. But this invariant is
/// only broken within 'add'. Once 'add' returns, the invariant is upheld.)
#[derive(Clone, Default)]
struct State {
    /// All outgoing transitions of this state, across every chunk.
    transitions: Vec<Transition>,
    /// The frozen chunks, as half-open '(start, end)' ranges into
    /// 'transitions'. The active chunk is not recorded here; it is always
    /// the suffix of 'transitions' beginning at the last frozen chunk's
    /// end (or 0 when no chunk has been frozen).
    chunks: Vec<(usize, usize)>,
}
|
||||
|
||||
impl State {
|
||||
/// Mark this state as a match state and freeze the active chunk such that
|
||||
/// it can not be further mutated.
|
||||
fn add_match(&mut self) {
|
||||
// This is not strictly necessary, but there's no point in recording
|
||||
// another match by adding another chunk if the state has no
|
||||
// transitions. Note though that we only skip this if we already know
|
||||
// this is a match state, which is only true if 'chunks' is not empty.
|
||||
// Basically, if we didn't do this, nothing semantically would change,
|
||||
// but we'd end up pushing another chunk and potentially triggering an
|
||||
// alloc.
|
||||
if self.transitions.is_empty() && !self.chunks.is_empty() {
|
||||
return;
|
||||
}
|
||||
let chunk_start = self.active_chunk_start();
|
||||
let chunk_end = self.transitions.len();
|
||||
self.chunks.push((chunk_start, chunk_end));
|
||||
}
|
||||
|
||||
/// Returns true if and only if this state is a leaf state. That is, a
|
||||
/// state that has no outgoing transitions.
|
||||
fn is_leaf(&self) -> bool {
|
||||
self.transitions.is_empty()
|
||||
}
|
||||
|
||||
/// Returns an iterator over all of the chunks (including the currently
|
||||
/// active chunk) in this state. Since the active chunk is included, the
|
||||
/// iterator is guaranteed to always yield at least one chunk (although the
|
||||
/// chunk may be empty).
|
||||
fn chunks(&self) -> StateChunksIter<'_> {
|
||||
StateChunksIter {
|
||||
transitions: &*self.transitions,
|
||||
chunks: self.chunks.iter(),
|
||||
active: Some(self.active_chunk()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the active chunk as a slice of transitions.
|
||||
fn active_chunk(&self) -> &[Transition] {
|
||||
let start = self.active_chunk_start();
|
||||
&self.transitions[start..]
|
||||
}
|
||||
|
||||
/// Returns the index into 'transitions' where the active chunk starts.
|
||||
fn active_chunk_start(&self) -> usize {
|
||||
self.chunks.last().map_or(0, |&(_, end)| end)
|
||||
}
|
||||
}
|
||||
|
||||
impl core::fmt::Debug for State {
    // Renders each chunk's transitions in order, with a "MATCH" marker
    // between chunks (each chunk boundary corresponds to a match recorded
    // at this state). The 'spacing' variable tracks whether the previously
    // printed chunk emitted any transitions, so that "MATCH" is preceded
    // by a space only when something was printed before it.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        let mut spacing = " ";
        for (i, chunk) in self.chunks().enumerate() {
            if i > 0 {
                write!(f, "{}MATCH", spacing)?;
            }
            spacing = "";
            for (j, t) in chunk.iter().enumerate() {
                spacing = " ";
                if j == 0 && i > 0 {
                    // First transition after a "MATCH" marker.
                    write!(f, " ")?;
                } else if j > 0 {
                    write!(f, ", ")?;
                }
                write!(f, "{:?}", t)?;
            }
        }
        Ok(())
    }
}
|
||||
|
||||
/// An iterator over all of the chunks in a state, including the active chunk.
///
/// This iterator is created by `State::chunks`. We name this iterator so that
/// we can include it in the `Frame` type for non-recursive trie traversal.
#[derive(Debug)]
struct StateChunksIter<'a> {
    /// All transitions for the state. The frozen chunk ranges below index
    /// into this slice.
    transitions: &'a [Transition],
    /// The frozen '(start, end)' chunk ranges remaining to be yielded.
    chunks: core::slice::Iter<'a, (usize, usize)>,
    /// The active chunk, yielded exactly once after all frozen chunks.
    active: Option<&'a [Transition]>,
}
|
||||
|
||||
impl<'a> Iterator for StateChunksIter<'a> {
|
||||
type Item = &'a [Transition];
|
||||
|
||||
fn next(&mut self) -> Option<&'a [Transition]> {
|
||||
if let Some(&(start, end)) = self.chunks.next() {
|
||||
return Some(&self.transitions[start..end]);
|
||||
}
|
||||
if let Some(chunk) = self.active.take() {
|
||||
return Some(chunk);
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// A single transition in a trie to another state.
#[derive(Clone, Copy)]
struct Transition {
    /// The byte this transition is defined for.
    byte: u8,
    /// The trie state this transition leads to.
    next: StateID,
}
|
||||
|
||||
impl core::fmt::Debug for Transition {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"{:?} => {}",
|
||||
crate::util::escape::DebugByte(self.byte),
|
||||
self.next.as_usize()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use bstr::B;
    use regex_syntax::hir::Hir;

    use super::*;

    // Literals that are prefixes of one another ("zapper", "z", "zap")
    // share the common prefix "z", with the remainder of each literal
    // appearing in the alternation in insertion order. The standalone "z"
    // contributes the Hir::empty() branch in the middle.
    #[test]
    fn zap() {
        let mut trie = LiteralTrie::forward();
        trie.add(b"zapper").unwrap();
        trie.add(b"z").unwrap();
        trie.add(b"zap").unwrap();

        let got = trie.compile_to_hir();
        let expected = Hir::concat(vec![
            Hir::literal(B("z")),
            Hir::alternation(vec![
                Hir::literal(B("apper")),
                Hir::empty(),
                Hir::literal(B("ap")),
            ]),
        ]);
        assert_eq!(expected, got);
    }

    // A shared prefix ("ma") is factored out, and the match recorded at
    // the end of "make" shows up as the Hir::empty() branch inside the
    // nested alternation (before the "r" continuation of "maker").
    #[test]
    fn maker() {
        let mut trie = LiteralTrie::forward();
        trie.add(b"make").unwrap();
        trie.add(b"maple").unwrap();
        trie.add(b"maker").unwrap();

        let got = trie.compile_to_hir();
        let expected = Hir::concat(vec![
            Hir::literal(B("ma")),
            Hir::alternation(vec![
                Hir::concat(vec![
                    Hir::literal(B("ke")),
                    Hir::alternation(vec![Hir::empty(), Hir::literal(B("r"))]),
                ]),
                Hir::literal(B("ple")),
            ]),
        ]);
        assert_eq!(expected, got);
    }
}
|
||||
296
third-party/vendor/regex-automata/src/nfa/thompson/map.rs
vendored
Normal file
296
third-party/vendor/regex-automata/src/nfa/thompson/map.rs
vendored
Normal file
|
|
@ -0,0 +1,296 @@
|
|||
// This module contains a couple simple and purpose built hash maps. The key
|
||||
// trade off they make is that they serve as caches rather than true maps. That
|
||||
// is, inserting a new entry may cause eviction of another entry. This gives
|
||||
// us two things. First, there's less overhead associated with inserts and
|
||||
// lookups. Secondly, it lets us control our memory usage.
|
||||
//
|
||||
// These maps are used in some fairly hot code when generating NFA states for
|
||||
// large Unicode character classes.
|
||||
//
|
||||
// Instead of exposing a rich hashmap entry API, we just permit the caller to
|
||||
// produce a hash of the key directly. The hash can then be reused for both
|
||||
// lookups and insertions at the cost of leaking abstraction a bit. But these
|
||||
// are for internal use only, so it's fine.
|
||||
//
|
||||
// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a
|
||||
// (almost) minimal DFA for large Unicode character classes in linear time.
|
||||
// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
|
||||
// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
|
||||
// since there's a bit more expense in the reverse direction.)
|
||||
//
|
||||
// The Utf8SuffixMap is used when compiling large Unicode character classes for
|
||||
// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
|
||||
// construction of UTF-8 automata by caching common suffixes. This doesn't
|
||||
// get the same space savings as Daciuk's algorithm, but it's basically as
|
||||
// fast as the naive approach and typically winds up using less memory (since
|
||||
// it generates smaller NFAs) despite the presence of the cache.
|
||||
//
|
||||
// These maps effectively represent caching mechanisms for sparse and
|
||||
// byte-range NFA states, respectively. The former represents a single NFA
|
||||
// state with many transitions of equivalent priority while the latter
|
||||
// represents a single NFA state with a single transition. (Neither state ever
|
||||
// has or is an epsilon transition.) Thus, they have different key types. It's
|
||||
// likely we could make one generic map, but the machinery didn't seem worth
|
||||
// it. They are simple enough.
|
||||
|
||||
use alloc::{vec, vec::Vec};
|
||||
|
||||
use crate::{
|
||||
nfa::thompson::Transition,
|
||||
util::{
|
||||
int::{Usize, U64},
|
||||
primitives::StateID,
|
||||
},
|
||||
};
|
||||
|
||||
// Basic FNV-1a hash constants as described in:
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
//
// PRIME is the 64-bit FNV prime and INIT is the 64-bit FNV offset basis.
const PRIME: u64 = 1099511628211;
const INIT: u64 = 14695981039346656037;
|
||||
|
||||
/// A bounded hash map where the key is a sequence of NFA transitions and the
/// value is a pre-existing NFA state ID.
///
/// std's hashmap can be used for this, however, this map has two important
/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
/// control our memory usage by limiting the number of slots. In general, the
/// cost here is that this map acts as a cache. That is, inserting a new entry
/// may remove an old entry. We are okay with this, since it does not impact
/// correctness in the cases where it is used. The only effect that dropping
/// states from the cache has is that the resulting NFA generated may be bigger
/// than it otherwise would be.
///
/// This improves benchmarks that compile large Unicode character classes,
/// since it makes the generation of (almost) minimal UTF-8 automaton faster.
/// Specifically, one could observe the difference with std's hashmap via
/// something like the following benchmark:
///
///   hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'"
///
/// But to observe that difference, you'd have to modify the code to use
/// std's hashmap.
///
/// It is quite possible that there is a better way to approach this problem.
/// For example, if there happens to be a very common state that collides with
/// a lot of less frequent states, then we could wind up with very poor caching
/// behavior. Alas, the effectiveness of this cache has not been measured.
/// Instead, ad hoc experiments suggest that it is "good enough." Additional
/// smarts (such as an LRU eviction policy) have to be weighed against the
/// amount of extra time they cost.
#[derive(Clone, Debug)]
pub struct Utf8BoundedMap {
    /// The current version of this map. Only entries with matching versions
    /// are considered during lookups. If an entry is found with a mismatched
    /// version, then the map behaves as if the entry does not exist.
    ///
    /// This makes it possible to clear the map by simply incrementing the
    /// version number instead of actually deallocating any storage.
    version: u16,
    /// The total number of entries this map can store.
    capacity: usize,
    /// The actual entries, keyed by hash. Collisions between different states
    /// result in the old state being dropped.
    map: Vec<Utf8BoundedEntry>,
}
|
||||
|
||||
/// An entry in this map.
#[derive(Clone, Debug, Default)]
struct Utf8BoundedEntry {
    /// The version of the map used to produce this entry. If this entry's
    /// version does not match the current version of the map, then the map
    /// should behave as if this entry does not exist.
    version: u16,
    /// The key, which is a sorted sequence of non-overlapping NFA transitions.
    key: Vec<Transition>,
    /// The state ID corresponding to the state containing the transitions in
    /// this entry.
    val: StateID,
}
|
||||
|
||||
impl Utf8BoundedMap {
|
||||
/// Create a new bounded map with the given capacity. The map will never
|
||||
/// grow beyond the given size.
|
||||
///
|
||||
/// Note that this does not allocate. Instead, callers must call `clear`
|
||||
/// before using this map. `clear` will allocate space if necessary.
|
||||
///
|
||||
/// This avoids the need to pay for the allocation of this map when
|
||||
/// compiling regexes that lack large Unicode character classes.
|
||||
pub fn new(capacity: usize) -> Utf8BoundedMap {
|
||||
assert!(capacity > 0);
|
||||
Utf8BoundedMap { version: 0, capacity, map: vec![] }
|
||||
}
|
||||
|
||||
/// Clear this map of all entries, but permit the reuse of allocation
|
||||
/// if possible.
|
||||
///
|
||||
/// This must be called before the map can be used.
|
||||
pub fn clear(&mut self) {
|
||||
if self.map.is_empty() {
|
||||
self.map = vec![Utf8BoundedEntry::default(); self.capacity];
|
||||
} else {
|
||||
self.version = self.version.wrapping_add(1);
|
||||
// If we loop back to version 0, then we forcefully clear the
|
||||
// entire map. Otherwise, it might be possible to incorrectly
|
||||
// match entries used to generate other NFAs.
|
||||
if self.version == 0 {
|
||||
self.map = vec![Utf8BoundedEntry::default(); self.capacity];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Return a hash of the given transitions.
|
||||
pub fn hash(&self, key: &[Transition]) -> usize {
|
||||
let mut h = INIT;
|
||||
for t in key {
|
||||
h = (h ^ u64::from(t.start)).wrapping_mul(PRIME);
|
||||
h = (h ^ u64::from(t.end)).wrapping_mul(PRIME);
|
||||
h = (h ^ t.next.as_u64()).wrapping_mul(PRIME);
|
||||
}
|
||||
(h % self.map.len().as_u64()).as_usize()
|
||||
}
|
||||
|
||||
/// Retrieve the cached state ID corresponding to the given key. The hash
|
||||
/// given must have been computed with `hash` using the same key value.
|
||||
///
|
||||
/// If there is no cached state with the given transitions, then None is
|
||||
/// returned.
|
||||
pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
|
||||
let entry = &self.map[hash];
|
||||
if entry.version != self.version {
|
||||
return None;
|
||||
}
|
||||
// There may be a hash collision, so we need to confirm real equality.
|
||||
if entry.key != key {
|
||||
return None;
|
||||
}
|
||||
Some(entry.val)
|
||||
}
|
||||
|
||||
/// Add a cached state to this map with the given key. Callers should
|
||||
/// ensure that `state_id` points to a state that contains precisely the
|
||||
/// NFA transitions given.
|
||||
///
|
||||
/// `hash` must have been computed using the `hash` method with the same
|
||||
/// key.
|
||||
pub fn set(
|
||||
&mut self,
|
||||
key: Vec<Transition>,
|
||||
hash: usize,
|
||||
state_id: StateID,
|
||||
) {
|
||||
self.map[hash] =
|
||||
Utf8BoundedEntry { version: self.version, key, val: state_id };
|
||||
}
|
||||
}
|
||||
|
||||
/// A cache of suffixes used to modestly compress UTF-8 automata for large
/// Unicode character classes.
#[derive(Clone, Debug)]
pub struct Utf8SuffixMap {
    /// The current version of this map. Only entries with matching versions
    /// are considered during lookups. If an entry is found with a mismatched
    /// version, then the map behaves as if the entry does not exist.
    version: u16,
    /// The total number of entries this map can store.
    capacity: usize,
    /// The actual entries, keyed by hash. Collisions between different states
    /// result in the old state being dropped.
    map: Vec<Utf8SuffixEntry>,
}
|
||||
|
||||
/// A key that uniquely identifies an NFA state. It is a triple that represents
/// a transition from one state for a particular byte range.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct Utf8SuffixKey {
    /// The state this transition originates from.
    pub from: StateID,
    /// The first byte of the transition's byte range.
    pub start: u8,
    /// The last byte of the transition's byte range.
    pub end: u8,
}
|
||||
|
||||
/// An entry in this map.
#[derive(Clone, Debug, Default)]
struct Utf8SuffixEntry {
    /// The version of the map used to produce this entry. If this entry's
    /// version does not match the current version of the map, then the map
    /// should behave as if this entry does not exist.
    version: u16,
    /// The key, which consists of a transition in a particular state.
    key: Utf8SuffixKey,
    /// The identifier that the transition in the key maps to.
    val: StateID,
}
|
||||
|
||||
impl Utf8SuffixMap {
|
||||
/// Create a new bounded map with the given capacity. The map will never
|
||||
/// grow beyond the given size.
|
||||
///
|
||||
/// Note that this does not allocate. Instead, callers must call `clear`
|
||||
/// before using this map. `clear` will allocate space if necessary.
|
||||
///
|
||||
/// This avoids the need to pay for the allocation of this map when
|
||||
/// compiling regexes that lack large Unicode character classes.
|
||||
pub fn new(capacity: usize) -> Utf8SuffixMap {
|
||||
assert!(capacity > 0);
|
||||
Utf8SuffixMap { version: 0, capacity, map: vec![] }
|
||||
}
|
||||
|
||||
/// Clear this map of all entries, but permit the reuse of allocation
|
||||
/// if possible.
|
||||
///
|
||||
/// This must be called before the map can be used.
|
||||
pub fn clear(&mut self) {
|
||||
if self.map.is_empty() {
|
||||
self.map = vec![Utf8SuffixEntry::default(); self.capacity];
|
||||
} else {
|
||||
self.version = self.version.wrapping_add(1);
|
||||
if self.version == 0 {
|
||||
self.map = vec![Utf8SuffixEntry::default(); self.capacity];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Return a hash of the given transition.
|
||||
pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
|
||||
// Basic FNV-1a hash as described:
|
||||
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
|
||||
const PRIME: u64 = 1099511628211;
|
||||
const INIT: u64 = 14695981039346656037;
|
||||
|
||||
let mut h = INIT;
|
||||
h = (h ^ key.from.as_u64()).wrapping_mul(PRIME);
|
||||
h = (h ^ u64::from(key.start)).wrapping_mul(PRIME);
|
||||
h = (h ^ u64::from(key.end)).wrapping_mul(PRIME);
|
||||
(h % self.map.len().as_u64()).as_usize()
|
||||
}
|
||||
|
||||
/// Retrieve the cached state ID corresponding to the given key. The hash
|
||||
/// given must have been computed with `hash` using the same key value.
|
||||
///
|
||||
/// If there is no cached state with the given key, then None is returned.
|
||||
pub fn get(
|
||||
&mut self,
|
||||
key: &Utf8SuffixKey,
|
||||
hash: usize,
|
||||
) -> Option<StateID> {
|
||||
let entry = &self.map[hash];
|
||||
if entry.version != self.version {
|
||||
return None;
|
||||
}
|
||||
if key != &entry.key {
|
||||
return None;
|
||||
}
|
||||
Some(entry.val)
|
||||
}
|
||||
|
||||
/// Add a cached state to this map with the given key. Callers should
|
||||
/// ensure that `state_id` points to a state that contains precisely the
|
||||
/// NFA transition given.
|
||||
///
|
||||
/// `hash` must have been computed using the `hash` method with the same
|
||||
/// key.
|
||||
pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
|
||||
self.map[hash] =
|
||||
Utf8SuffixEntry { version: self.version, key, val: state_id };
|
||||
}
|
||||
}
|
||||
81
third-party/vendor/regex-automata/src/nfa/thompson/mod.rs
vendored
Normal file
81
third-party/vendor/regex-automata/src/nfa/thompson/mod.rs
vendored
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
/*!
|
||||
Defines a Thompson NFA and provides the [`PikeVM`](pikevm::PikeVM) and
|
||||
[`BoundedBacktracker`](backtrack::BoundedBacktracker) regex engines.
|
||||
|
||||
A Thompson NFA (non-deterministic finite automaton) is arguably _the_ central
|
||||
data type in this library. It is the result of what is commonly referred to as
|
||||
"regex compilation." That is, turning a regex pattern from its concrete syntax
|
||||
string into something that can run a search looks roughly like this:
|
||||
|
||||
* A `&str` is parsed into a [`regex-syntax::ast::Ast`](regex_syntax::ast::Ast).
|
||||
* An `Ast` is translated into a [`regex-syntax::hir::Hir`](regex_syntax::hir::Hir).
|
||||
* An `Hir` is compiled into a [`NFA`].
|
||||
* The `NFA` is then used to build one of a few different regex engines:
|
||||
* An `NFA` is used directly in the `PikeVM` and `BoundedBacktracker` engines.
|
||||
* An `NFA` is used by a [hybrid NFA/DFA](crate::hybrid) to build out a DFA's
|
||||
transition table at search time.
|
||||
* An `NFA`, assuming it is one-pass, is used to build a full
|
||||
[one-pass DFA](crate::dfa::onepass) ahead of time.
|
||||
* An `NFA` is used to build a [full DFA](crate::dfa) ahead of time.
|
||||
|
||||
The [`meta`](crate::meta) regex engine makes all of these choices for you based
|
||||
on various criteria. However, if you have a lower level use case, _you_ can
|
||||
build any of the above regex engines and use them directly. But you must start
|
||||
here by building an `NFA`.
|
||||
|
||||
# Details
|
||||
|
||||
It is perhaps worth expanding a bit more on what it means to go through the
|
||||
`&str`->`Ast`->`Hir`->`NFA` process.
|
||||
|
||||
* Parsing a string into an `Ast` gives it a structured representation.
|
||||
Crucially, the size and amount of work done in this step is proportional to the
|
||||
size of the original string. No optimization or Unicode handling is done at
|
||||
this point. This means that parsing into an `Ast` has very predictable costs.
|
||||
Moreover, an `Ast` can be roundtripped back to its original pattern string as
|
||||
written.
|
||||
* Translating an `Ast` into an `Hir` is a process by which the structured
|
||||
representation is simplified down to its most fundamental components.
|
||||
Translation deals with flags such as case insensitivity by converting things
|
||||
like `(?i:a)` to `[Aa]`. Translation is also where Unicode tables are consulted
|
||||
to resolve things like `\p{Emoji}` and `\p{Greek}`. It also flattens each
|
||||
character class, regardless of how deeply nested it is, into a single sequence
|
||||
of non-overlapping ranges. All the various literal forms are thrown out in
|
||||
favor of one common representation. Overall, the `Hir` is small enough to fit
|
||||
into your head and makes analysis and other tasks much simpler.
|
||||
* Compiling an `Hir` into an `NFA` formulates the regex into a finite state
|
||||
machine whose transitions are defined over bytes. For example, an `Hir` might
|
||||
have a Unicode character class corresponding to a sequence of ranges defined
|
||||
in terms of `char`. Compilation is then responsible for turning those ranges
|
||||
into a UTF-8 automaton. That is, an automaton that matches the UTF-8 encoding
|
||||
of just the codepoints specified by those ranges. Otherwise, the main job of
|
||||
an `NFA` is to serve as a byte-code of sorts for a virtual machine. It can be
|
||||
seen as a sequence of instructions for how to match a regex.
|
||||
*/
|
||||
|
||||
#[cfg(feature = "nfa-backtrack")]
|
||||
pub mod backtrack;
|
||||
mod builder;
|
||||
#[cfg(feature = "syntax")]
|
||||
mod compiler;
|
||||
mod error;
|
||||
#[cfg(feature = "syntax")]
|
||||
mod literal_trie;
|
||||
#[cfg(feature = "syntax")]
|
||||
mod map;
|
||||
mod nfa;
|
||||
#[cfg(feature = "nfa-pikevm")]
|
||||
pub mod pikevm;
|
||||
#[cfg(feature = "syntax")]
|
||||
mod range_trie;
|
||||
|
||||
pub use self::{
|
||||
builder::Builder,
|
||||
error::BuildError,
|
||||
nfa::{
|
||||
DenseTransitions, PatternIter, SparseTransitions, State, Transition,
|
||||
NFA,
|
||||
},
|
||||
};
|
||||
#[cfg(feature = "syntax")]
|
||||
pub use compiler::{Compiler, Config, WhichCaptures};
|
||||
2099
third-party/vendor/regex-automata/src/nfa/thompson/nfa.rs
vendored
Normal file
2099
third-party/vendor/regex-automata/src/nfa/thompson/nfa.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
2359
third-party/vendor/regex-automata/src/nfa/thompson/pikevm.rs
vendored
Normal file
2359
third-party/vendor/regex-automata/src/nfa/thompson/pikevm.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
1051
third-party/vendor/regex-automata/src/nfa/thompson/range_trie.rs
vendored
Normal file
1051
third-party/vendor/regex-automata/src/nfa/thompson/range_trie.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
1139
third-party/vendor/regex-automata/src/util/alphabet.rs
vendored
Normal file
1139
third-party/vendor/regex-automata/src/util/alphabet.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
2548
third-party/vendor/regex-automata/src/util/captures.rs
vendored
Normal file
2548
third-party/vendor/regex-automata/src/util/captures.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
682
third-party/vendor/regex-automata/src/util/determinize/mod.rs
vendored
Normal file
682
third-party/vendor/regex-automata/src/util/determinize/mod.rs
vendored
Normal file
|
|
@ -0,0 +1,682 @@
|
|||
/*!
|
||||
This module contains types and routines for implementing determinization.
|
||||
|
||||
In this crate, there are at least two places where we implement
|
||||
determinization: fully ahead-of-time compiled DFAs in the `dfa` module and
|
||||
lazily compiled DFAs in the `hybrid` module. The stuff in this module
|
||||
corresponds to the things that are in common between these implementations.
|
||||
|
||||
There are three broad things that our implementations of determinization have
|
||||
in common, as defined by this module:
|
||||
|
||||
* The classification of start states. That is, whether we're dealing with
|
||||
word boundaries, line boundaries, etc., is all the same. This also includes
|
||||
the look-behind assertions that are satisfied by each starting state
|
||||
classification.
|
||||
* The representation of DFA states as sets of NFA states, including
|
||||
convenience types for building these DFA states that are amenable to reusing
|
||||
allocations.
|
||||
* Routines for the "classical" parts of determinization: computing the
|
||||
epsilon closure, tracking match states (with corresponding pattern IDs, since
|
||||
we support multi-pattern finite automata) and, of course, computing the
|
||||
transition function between states for units of input.
|
||||
|
||||
I did consider a couple of alternatives to this particular form of code reuse:
|
||||
|
||||
1. Don't do any code reuse. The problem here is that we *really* want both
|
||||
forms of determinization to do exactly identical things when it comes to
|
||||
their handling of NFA states. While our tests generally ensure this, the code
|
||||
is tricky and large enough where not reusing code is a pretty big bummer.
|
||||
|
||||
2. Implement all of determinization once and make it generic over fully
|
||||
compiled DFAs and lazily compiled DFAs. While I didn't actually try this
|
||||
approach, my instinct is that it would be more complex than is needed here.
|
||||
And the interface required would be pretty hairy. Instead, I think splitting
|
||||
it into logical sub-components works better.
|
||||
*/
|
||||
|
||||
use alloc::vec::Vec;
|
||||
|
||||
pub(crate) use self::state::{
|
||||
State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA,
|
||||
};
|
||||
|
||||
use crate::{
|
||||
nfa::thompson,
|
||||
util::{
|
||||
alphabet,
|
||||
look::{Look, LookSet},
|
||||
primitives::StateID,
|
||||
search::MatchKind,
|
||||
sparse_set::{SparseSet, SparseSets},
|
||||
start::Start,
|
||||
utf8,
|
||||
},
|
||||
};
|
||||
|
||||
mod state;
|
||||
|
||||
/// Compute the set of all reachable NFA states, including the full epsilon
/// closure, from a DFA state for a single unit of input. The set of reachable
/// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned
/// also includes any look-behind assertions satisfied by `unit`, in addition
/// to whether it is a match state. For multi-pattern DFAs, the builder will
/// also include the pattern IDs that match (in the order seen).
///
/// `nfa` must be able to resolve any NFA state in `state` and any NFA state
/// reachable via the epsilon closure of any NFA state in `state`. `sparses`
/// must have capacity equivalent to `nfa.len()`.
///
/// `match_kind` should correspond to the match semantics implemented by the
/// DFA being built. Generally speaking, for leftmost-first match semantics,
/// states that appear after the first NFA match state will not be included in
/// the `StateBuilderNFA` returned since they are impossible to visit.
///
/// `sparses` is used as scratch space for NFA traversal. Other than their
/// capacity requirements (detailed above), there are no requirements on what's
/// contained within them (if anything). Similarly, what's inside of them once
/// this routine returns is unspecified.
///
/// `stack` must have length 0. It is used as scratch space for depth first
/// traversal. After returning, it is guaranteed that `stack` will have length
/// 0.
///
/// `state` corresponds to the current DFA state on which one wants to compute
/// the transition for the input `unit`.
///
/// `empty_builder` corresponds to the builder allocation to use to produce a
/// complete `StateBuilderNFA` state. If the state is not needed (or is already
/// cached), then it can be cleared and reused without needing to create a new
/// `State`. The `StateBuilderNFA` state returned is final and ready to be
/// turned into a `State` if necessary.
pub(crate) fn next(
    nfa: &thompson::NFA,
    match_kind: MatchKind,
    sparses: &mut SparseSets,
    stack: &mut Vec<StateID>,
    state: &State,
    unit: alphabet::Unit,
    empty_builder: StateBuilderEmpty,
) -> StateBuilderNFA {
    // Both sparse sets are reused scratch space; start from a clean slate.
    sparses.clear();

    // Whether the NFA is matched in reverse or not. We use this in some
    // conditional logic for dealing with the exceptionally annoying CRLF-aware
    // line anchors.
    let rev = nfa.is_reverse();
    // The look-around matcher that our NFA is configured with. We don't
    // actually use it to match look-around assertions, but we do need its
    // configuration for constructing states consistent with how it matches.
    let lookm = nfa.look_matcher();

    // Put the NFA state IDs into a sparse set in case we need to
    // re-compute their epsilon closure.
    //
    // Doing this state shuffling is technically not necessary unless some
    // kind of look-around is used in the DFA. Some ad hoc experiments
    // suggested that avoiding this didn't lead to much of an improvement,
    // but perhaps more rigorous experimentation should be done. And in
    // particular, avoiding this check requires some light refactoring of
    // the code below.
    state.iter_nfa_state_ids(|nfa_id| {
        sparses.set1.insert(nfa_id);
    });

    // Compute look-ahead assertions originating from the current state. Based
    // on the input unit we're transitioning over, some additional set of
    // assertions may be true. Thus, we re-compute this state's epsilon closure
    // (but only if necessary). Notably, when we build a DFA state initially,
    // we don't enable any look-ahead assertions because we don't know whether
    // they're true or not at that point.
    if !state.look_need().is_empty() {
        // Add look-ahead assertions that are now true based on the current
        // input unit.
        let mut look_have = state.look_have().clone();
        match unit.as_u8() {
            Some(b'\r') => {
                if !rev || !state.is_half_crlf() {
                    look_have = look_have.insert(Look::EndCRLF);
                }
            }
            Some(b'\n') => {
                if rev || !state.is_half_crlf() {
                    look_have = look_have.insert(Look::EndCRLF);
                }
            }
            Some(_) => {}
            // `None` is the special end-of-input (EOI) sentinel, where every
            // end assertion is satisfied.
            None => {
                look_have = look_have
                    .insert(Look::End)
                    .insert(Look::EndLF)
                    .insert(Look::EndCRLF);
            }
        }
        if unit.is_byte(lookm.get_line_terminator()) {
            look_have = look_have.insert(Look::EndLF);
        }
        if state.is_half_crlf()
            && ((rev && !unit.is_byte(b'\r'))
                || (!rev && !unit.is_byte(b'\n')))
        {
            look_have = look_have.insert(Look::StartCRLF);
        }
        if state.is_from_word() == unit.is_word_byte() {
            look_have = look_have
                .insert(Look::WordAsciiNegate)
                .insert(Look::WordUnicodeNegate);
        } else {
            look_have =
                look_have.insert(Look::WordAscii).insert(Look::WordUnicode);
        }
        if !unit.is_word_byte() {
            look_have = look_have
                .insert(Look::WordEndHalfAscii)
                .insert(Look::WordEndHalfUnicode);
        }
        if state.is_from_word() && !unit.is_word_byte() {
            look_have = look_have
                .insert(Look::WordEndAscii)
                .insert(Look::WordEndUnicode);
        } else if !state.is_from_word() && unit.is_word_byte() {
            look_have = look_have
                .insert(Look::WordStartAscii)
                .insert(Look::WordStartUnicode);
        }
        // If we have new assertions satisfied that are among the set of
        // assertions that exist in this state (that is, just because we added
        // an EndLF assertion above doesn't mean there is an EndLF conditional
        // epsilon transition in this state), then we re-compute this state's
        // epsilon closure using the updated set of assertions.
        //
        // Note that since our DFA states omit unconditional epsilon
        // transitions, this check is necessary for correctness. If we re-did
        // the epsilon closure below needlessly, it could change based on the
        // fact that we omitted epsilon states originally.
        if !look_have
            .subtract(state.look_have())
            .intersect(state.look_need())
            .is_empty()
        {
            for nfa_id in sparses.set1.iter() {
                epsilon_closure(
                    nfa,
                    nfa_id,
                    look_have,
                    stack,
                    &mut sparses.set2,
                );
            }
            // The freshly computed closure (set2) becomes the working set
            // (set1); set2 is cleared so it can collect the next-state
            // closure below.
            sparses.swap();
            sparses.set2.clear();
        }
    }

    // Convert our empty builder into one that can record assertions and match
    // pattern IDs.
    let mut builder = empty_builder.into_matches();
    // Set whether the StartLF look-behind assertion is true for this
    // transition or not. The look-behind assertion for ASCII word boundaries
    // is handled below.
    if nfa.look_set_any().contains_anchor_line()
        && unit.is_byte(lookm.get_line_terminator())
    {
        // Why only handle StartLF here and not Start? That's because Start
        // can only impact the starting state, which is special cased in
        // start state handling.
        builder.set_look_have(|have| have.insert(Look::StartLF));
    }
    // We also need to add StartCRLF to our assertions too, if we can. This
    // is unfortunately a bit more complicated, because it depends on the
    // direction of the search. In the forward direction, ^ matches after a
    // \n, but in the reverse direction, ^ only matches after a \r. (This is
    // further complicated by the fact that reverse a regex means changing a ^
    // to a $ and vice versa.)
    if nfa.look_set_any().contains_anchor_crlf()
        && ((rev && unit.is_byte(b'\r')) || (!rev && unit.is_byte(b'\n')))
    {
        builder.set_look_have(|have| have.insert(Look::StartCRLF));
    }
    // And also for the start-half word boundary assertions. As long as the
    // look-behind byte is not a word char, then the assertions are satisfied.
    if nfa.look_set_any().contains_word() && !unit.is_word_byte() {
        builder.set_look_have(|have| {
            have.insert(Look::WordStartHalfAscii)
                .insert(Look::WordStartHalfUnicode)
        });
    }
    for nfa_id in sparses.set1.iter() {
        match *nfa.state(nfa_id) {
            thompson::State::Union { .. }
            | thompson::State::BinaryUnion { .. }
            | thompson::State::Fail
            | thompson::State::Look { .. }
            | thompson::State::Capture { .. } => {}
            thompson::State::Match { pattern_id } => {
                // Notice here that we are calling the NEW state a match
                // state if the OLD state we are transitioning from
                // contains an NFA match state. This is precisely how we
                // delay all matches by one byte and also what therefore
                // guarantees that starting states cannot be match states.
                //
                // If we didn't delay matches by one byte, then whether
                // a DFA is a matching state or not would be determined
                // by whether one of its own constituent NFA states
                // was a match state. (And that would be done in
                // 'add_nfa_states'.)
                //
                // Also, 'add_match_pattern_id' requires that callers never
                // pass duplicative pattern IDs. We do in fact uphold that
                // guarantee here, but it's subtle. In particular, a Thompson
                // NFA guarantees that each pattern has exactly one match
                // state. Moreover, since we're iterating over the NFA state
                // IDs in a set, we are guaranteed not to have any duplicative
                // match states. Thus, it is impossible to add the same pattern
                // ID more than once.
                //
                // N.B. We delay matches by 1 byte as a way to hack 1-byte
                // look-around into DFA searches. This lets us support ^, $
                // and ASCII-only \b. The delay is also why we need a special
                // "end-of-input" (EOI) sentinel and why we need to follow the
                // EOI sentinel at the end of every search. This final EOI
                // transition is necessary to report matches found at the end
                // of a haystack.
                builder.add_match_pattern_id(pattern_id);
                if !match_kind.continue_past_first_match() {
                    break;
                }
            }
            thompson::State::ByteRange { ref trans } => {
                if trans.matches_unit(unit) {
                    epsilon_closure(
                        nfa,
                        trans.next,
                        builder.look_have(),
                        stack,
                        &mut sparses.set2,
                    );
                }
            }
            thompson::State::Sparse(ref sparse) => {
                if let Some(next) = sparse.matches_unit(unit) {
                    epsilon_closure(
                        nfa,
                        next,
                        builder.look_have(),
                        stack,
                        &mut sparses.set2,
                    );
                }
            }
            thompson::State::Dense(ref dense) => {
                if let Some(next) = dense.matches_unit(unit) {
                    epsilon_closure(
                        nfa,
                        next,
                        builder.look_have(),
                        stack,
                        &mut sparses.set2,
                    );
                }
            }
        }
    }
    // We only set the word byte if there's a word boundary look-around
    // anywhere in this regex. Otherwise, there's no point in bloating the
    // number of states if we don't have one.
    //
    // We also only set it when the state has a non-zero number of NFA states.
    // Otherwise, we could wind up with states that *should* be DEAD states
    // but are otherwise distinct from DEAD states because of this look-behind
    // assertion being set. While this can't technically impact correctness *in
    // theory*, it can create pathological DFAs that consume input until EOI or
    // a quit byte is seen. Consuming until EOI isn't a correctness problem,
    // but a (serious) perf problem. Hitting a quit byte, however, could be a
    // correctness problem since it could cause search routines to report an
    // error instead of a detected match once the quit state is entered. (The
    // search routine could be made to be a bit smarter by reporting a match
    // if one was detected once it enters a quit state (and indeed, the search
    // routines in this crate do just that), but it seems better to prevent
    // these things by construction if possible.)
    if !sparses.set2.is_empty() {
        if nfa.look_set_any().contains_word() && unit.is_word_byte() {
            builder.set_is_from_word();
        }
        if nfa.look_set_any().contains_anchor_crlf()
            && ((rev && unit.is_byte(b'\n')) || (!rev && unit.is_byte(b'\r')))
        {
            builder.set_is_half_crlf();
        }
    }
    let mut builder_nfa = builder.into_nfa();
    add_nfa_states(nfa, &sparses.set2, &mut builder_nfa);
    builder_nfa
}
|
||||
|
||||
/// Compute the epsilon closure for the given NFA state. The epsilon closure
|
||||
/// consists of all NFA state IDs, including `start_nfa_id`, that can be
|
||||
/// reached from `start_nfa_id` without consuming any input. These state IDs
|
||||
/// are written to `set` in the order they are visited, but only if they are
|
||||
/// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA
|
||||
/// given.
|
||||
///
|
||||
/// `look_have` consists of the satisfied assertions at the current
|
||||
/// position. For conditional look-around epsilon transitions, these are
|
||||
/// only followed if they are satisfied by `look_have`.
|
||||
///
|
||||
/// `stack` must have length 0. It is used as scratch space for depth first
|
||||
/// traversal. After returning, it is guaranteed that `stack` will have length
|
||||
/// 0.
|
||||
pub(crate) fn epsilon_closure(
|
||||
nfa: &thompson::NFA,
|
||||
start_nfa_id: StateID,
|
||||
look_have: LookSet,
|
||||
stack: &mut Vec<StateID>,
|
||||
set: &mut SparseSet,
|
||||
) {
|
||||
assert!(stack.is_empty());
|
||||
// If this isn't an epsilon state, then the epsilon closure is always just
|
||||
// itself, so there's no need to spin up the machinery below to handle it.
|
||||
if !nfa.state(start_nfa_id).is_epsilon() {
|
||||
set.insert(start_nfa_id);
|
||||
return;
|
||||
}
|
||||
|
||||
stack.push(start_nfa_id);
|
||||
while let Some(mut id) = stack.pop() {
|
||||
// In many cases, we can avoid stack operations when an NFA state only
|
||||
// adds one new state to visit. In that case, we just set our ID to
|
||||
// that state and mush on. We only use the stack when an NFA state
|
||||
// introduces multiple new states to visit.
|
||||
loop {
|
||||
// Insert this NFA state, and if it's already in the set and thus
|
||||
// already visited, then we can move on to the next one.
|
||||
if !set.insert(id) {
|
||||
break;
|
||||
}
|
||||
match *nfa.state(id) {
|
||||
thompson::State::ByteRange { .. }
|
||||
| thompson::State::Sparse { .. }
|
||||
| thompson::State::Dense { .. }
|
||||
| thompson::State::Fail
|
||||
| thompson::State::Match { .. } => break,
|
||||
thompson::State::Look { look, next } => {
|
||||
if !look_have.contains(look) {
|
||||
break;
|
||||
}
|
||||
id = next;
|
||||
}
|
||||
thompson::State::Union { ref alternates } => {
|
||||
id = match alternates.get(0) {
|
||||
None => break,
|
||||
Some(&id) => id,
|
||||
};
|
||||
// We need to process our alternates in order to preserve
|
||||
// match preferences, so put the earliest alternates closer
|
||||
// to the top of the stack.
|
||||
stack.extend(alternates[1..].iter().rev());
|
||||
}
|
||||
thompson::State::BinaryUnion { alt1, alt2 } => {
|
||||
id = alt1;
|
||||
stack.push(alt2);
|
||||
}
|
||||
thompson::State::Capture { next, .. } => {
|
||||
id = next;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Add the NFA state IDs in the given `set` to the given DFA builder state.
|
||||
/// The order in which states are added corresponds to the order in which they
|
||||
/// were added to `set`.
|
||||
///
|
||||
/// The DFA builder state given should already have its complete set of match
|
||||
/// pattern IDs added (if any) and any look-behind assertions (StartLF, Start
|
||||
/// and whether this state is being generated for a transition over a word byte
|
||||
/// when applicable) that are true immediately prior to transitioning into this
|
||||
/// state (via `builder.look_have()`). The match pattern IDs should correspond
|
||||
/// to matches that occurred on the previous transition, since all matches are
|
||||
/// delayed by one byte. The things that should _not_ be set are look-ahead
|
||||
/// assertions (EndLF, End and whether the next byte is a word byte or not).
|
||||
/// The builder state should also not have anything in `look_need` set, as this
|
||||
/// routine will compute that for you.
|
||||
///
|
||||
/// The given NFA should be able to resolve all identifiers in `set` to a
|
||||
/// particular NFA state. Additionally, `set` must have capacity equivalent
|
||||
/// to `nfa.len()`.
|
||||
pub(crate) fn add_nfa_states(
|
||||
nfa: &thompson::NFA,
|
||||
set: &SparseSet,
|
||||
builder: &mut StateBuilderNFA,
|
||||
) {
|
||||
for nfa_id in set.iter() {
|
||||
match *nfa.state(nfa_id) {
|
||||
thompson::State::ByteRange { .. } => {
|
||||
builder.add_nfa_state_id(nfa_id);
|
||||
}
|
||||
thompson::State::Sparse { .. } => {
|
||||
builder.add_nfa_state_id(nfa_id);
|
||||
}
|
||||
thompson::State::Dense { .. } => {
|
||||
builder.add_nfa_state_id(nfa_id);
|
||||
}
|
||||
thompson::State::Look { look, .. } => {
|
||||
builder.add_nfa_state_id(nfa_id);
|
||||
builder.set_look_need(|need| need.insert(look));
|
||||
}
|
||||
thompson::State::Union { .. }
|
||||
| thompson::State::BinaryUnion { .. } => {
|
||||
// Pure epsilon transitions don't need to be tracked as part
|
||||
// of the DFA state. Tracking them is actually superfluous;
|
||||
// they won't cause any harm other than making determinization
|
||||
// slower.
|
||||
//
|
||||
// Why aren't these needed? Well, in an NFA, epsilon
|
||||
// transitions are really just jumping points to other states.
|
||||
// So once you hit an epsilon transition, the same set of
|
||||
// resulting states always appears. Therefore, putting them in
|
||||
// a DFA's set of ordered NFA states is strictly redundant.
|
||||
//
|
||||
// Look-around states are also epsilon transitions, but
|
||||
// they are *conditional*. So their presence could be
|
||||
// discriminatory, and thus, they are tracked above.
|
||||
//
|
||||
// But wait... why are epsilon states in our `set` in the first
|
||||
// place? Why not just leave them out? They're in our `set`
|
||||
// because it was generated by computing an epsilon closure,
|
||||
// and we want to keep track of all states we visited to avoid
|
||||
// re-visiting them. In exchange, we have to do this second
|
||||
// iteration over our collected states to finalize our DFA
|
||||
// state. In theory, we could avoid this second iteration if
|
||||
// we maintained two sets during epsilon closure: the set of
|
||||
// visited states (to avoid cycles) and the set of states that
|
||||
// will actually be used to construct the next DFA state.
|
||||
//
|
||||
// Note that this optimization requires that we re-compute the
|
||||
// epsilon closure to account for look-ahead in 'next' *only
|
||||
// when necessary*. Namely, only when the set of look-around
|
||||
// assertions changes and only when those changes are within
|
||||
// the set of assertions that are needed in order to step
|
||||
// through the closure correctly. Otherwise, if we re-do the
|
||||
// epsilon closure needlessly, it could change based on the
|
||||
// fact that we are omitting epsilon states here.
|
||||
//
|
||||
// -----
|
||||
//
|
||||
// Welp, scratch the above. It turns out that recording these
|
||||
// is in fact necessary to seemingly handle one particularly
|
||||
// annoying case: when a conditional epsilon transition is
|
||||
// put inside of a repetition operator. One specific case I
|
||||
// ran into was the regex `(?:\b|%)+` on the haystack `z%`.
|
||||
// The correct leftmost first matches are: [0, 0] and [1, 1].
|
||||
// But the DFA was reporting [0, 0] and [1, 2]. To understand
|
||||
// why this happens, consider the NFA for the aforementioned
|
||||
// regex:
|
||||
//
|
||||
// >000000: binary-union(4, 1)
|
||||
// 000001: \x00-\xFF => 0
|
||||
// 000002: WordAscii => 5
|
||||
// 000003: % => 5
|
||||
// ^000004: binary-union(2, 3)
|
||||
// 000005: binary-union(4, 6)
|
||||
// 000006: MATCH(0)
|
||||
//
|
||||
// The problem here is that one of the DFA start states is
|
||||
// going to consist of the NFA states [2, 3] by computing the
|
||||
// epsilon closure of state 4. State 4 isn't included because
|
||||
// we previously were not keeping track of union states. But
|
||||
// only a subset of transitions out of this state will be able
|
||||
// to follow WordAscii, and in those cases, the epsilon closure
|
||||
// is redone. The only problem is that computing the epsilon
|
||||
// closure from [2, 3] is different than computing the epsilon
|
||||
// closure from [4]. In the former case, assuming the WordAscii
|
||||
// assertion is satisfied, you get: [2, 3, 6]. In the latter
|
||||
// case, you get: [2, 6, 3]. Notice that '6' is the match state
|
||||
// and appears AFTER '3' in the former case. This leads to a
|
||||
// preferential but incorrect match of '%' before returning
|
||||
// a match. In the latter case, the match is preferred over
|
||||
// continuing to accept the '%'.
|
||||
//
|
||||
// It almost feels like we might be able to fix the NFA states
|
||||
// to avoid this, or to at least only keep track of union
|
||||
// states where this actually matters, since in the vast
|
||||
// majority of cases, this doesn't matter.
|
||||
//
|
||||
// Another alternative would be to define a new HIR property
|
||||
// called "assertion is repeated anywhere" and compute it
|
||||
// inductively over the entire pattern. If it happens anywhere,
|
||||
// which is probably pretty rare, then we record union states.
|
||||
// Otherwise we don't.
|
||||
builder.add_nfa_state_id(nfa_id);
|
||||
}
|
||||
// Capture states we definitely do not need to record, since they
|
||||
// are unconditional epsilon transitions with no branching.
|
||||
thompson::State::Capture { .. } => {}
|
||||
// It's not totally clear whether we need to record fail states or
|
||||
// not, but we do so out of an abundance of caution. Since they are
|
||||
// quite rare in practice, there isn't much cost to recording them.
|
||||
thompson::State::Fail => {
|
||||
builder.add_nfa_state_id(nfa_id);
|
||||
}
|
||||
thompson::State::Match { .. } => {
|
||||
// Normally, the NFA match state doesn't actually need to
|
||||
// be inside the DFA state. But since we delay matches by
|
||||
// one byte, the matching DFA state corresponds to states
|
||||
// that transition from the one we're building here. And
|
||||
// the way we detect those cases is by looking for an NFA
|
||||
// match state. See 'next' for how this is handled.
|
||||
builder.add_nfa_state_id(nfa_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
// If we know this state contains no look-around assertions, then
|
||||
// there's no reason to track which look-around assertions were
|
||||
// satisfied when this state was created.
|
||||
if builder.look_need().is_empty() {
|
||||
builder.set_look_have(|_| LookSet::empty());
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the appropriate look-behind assertions on the given state based on
|
||||
/// this starting configuration.
|
||||
pub(crate) fn set_lookbehind_from_start(
|
||||
nfa: &thompson::NFA,
|
||||
start: &Start,
|
||||
builder: &mut StateBuilderMatches,
|
||||
) {
|
||||
let rev = nfa.is_reverse();
|
||||
let lineterm = nfa.look_matcher().get_line_terminator();
|
||||
let lookset = nfa.look_set_any();
|
||||
match *start {
|
||||
Start::NonWordByte => {
|
||||
if lookset.contains_word() {
|
||||
builder.set_look_have(|have| {
|
||||
have.insert(Look::WordStartHalfAscii)
|
||||
.insert(Look::WordStartHalfUnicode)
|
||||
});
|
||||
}
|
||||
}
|
||||
Start::WordByte => {
|
||||
if lookset.contains_word() {
|
||||
builder.set_is_from_word();
|
||||
}
|
||||
}
|
||||
Start::Text => {
|
||||
if lookset.contains_anchor_haystack() {
|
||||
builder.set_look_have(|have| have.insert(Look::Start));
|
||||
}
|
||||
if lookset.contains_anchor_line() {
|
||||
builder.set_look_have(|have| {
|
||||
have.insert(Look::StartLF).insert(Look::StartCRLF)
|
||||
});
|
||||
}
|
||||
if lookset.contains_word() {
|
||||
builder.set_look_have(|have| {
|
||||
have.insert(Look::WordStartHalfAscii)
|
||||
.insert(Look::WordStartHalfUnicode)
|
||||
});
|
||||
}
|
||||
}
|
||||
Start::LineLF => {
|
||||
if rev {
|
||||
if lookset.contains_anchor_crlf() {
|
||||
builder.set_is_half_crlf();
|
||||
}
|
||||
if lookset.contains_anchor_line() {
|
||||
builder.set_look_have(|have| have.insert(Look::StartLF));
|
||||
}
|
||||
} else {
|
||||
if lookset.contains_anchor_line() {
|
||||
builder.set_look_have(|have| have.insert(Look::StartCRLF));
|
||||
}
|
||||
}
|
||||
if lookset.contains_anchor_line() && lineterm == b'\n' {
|
||||
builder.set_look_have(|have| have.insert(Look::StartLF));
|
||||
}
|
||||
if lookset.contains_word() {
|
||||
builder.set_look_have(|have| {
|
||||
have.insert(Look::WordStartHalfAscii)
|
||||
.insert(Look::WordStartHalfUnicode)
|
||||
});
|
||||
}
|
||||
}
|
||||
Start::LineCR => {
|
||||
if lookset.contains_anchor_crlf() {
|
||||
if rev {
|
||||
builder.set_look_have(|have| have.insert(Look::StartCRLF));
|
||||
} else {
|
||||
builder.set_is_half_crlf();
|
||||
}
|
||||
}
|
||||
if lookset.contains_anchor_line() && lineterm == b'\r' {
|
||||
builder.set_look_have(|have| have.insert(Look::StartLF));
|
||||
}
|
||||
if lookset.contains_word() {
|
||||
builder.set_look_have(|have| {
|
||||
have.insert(Look::WordStartHalfAscii)
|
||||
.insert(Look::WordStartHalfUnicode)
|
||||
});
|
||||
}
|
||||
}
|
||||
Start::CustomLineTerminator => {
|
||||
if lookset.contains_anchor_line() {
|
||||
builder.set_look_have(|have| have.insert(Look::StartLF));
|
||||
}
|
||||
// This is a bit of a tricky case, but if the line terminator was
|
||||
// set to a word byte, then we also need to behave as if the start
|
||||
// configuration is Start::WordByte. That is, we need to mark our
|
||||
// state as having come from a word byte.
|
||||
if lookset.contains_word() {
|
||||
if utf8::is_word_byte(lineterm) {
|
||||
builder.set_is_from_word();
|
||||
} else {
|
||||
builder.set_look_have(|have| {
|
||||
have.insert(Look::WordStartHalfAscii)
|
||||
.insert(Look::WordStartHalfUnicode)
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
907
third-party/vendor/regex-automata/src/util/determinize/state.rs
vendored
Normal file
907
third-party/vendor/regex-automata/src/util/determinize/state.rs
vendored
Normal file
|
|
@ -0,0 +1,907 @@
|
|||
/*!
|
||||
This module defines a DFA state representation and builders for constructing
|
||||
DFA states.
|
||||
|
||||
This representation is specifically for use in implementations of NFA-to-DFA
|
||||
conversion via powerset construction. (Also called "determinization" in this
|
||||
crate.)
|
||||
|
||||
The term "DFA state" is somewhat overloaded in this crate. In some cases, it
|
||||
refers to the set of transitions over an alphabet for a particular state. In
|
||||
other cases, it refers to a set of NFA states. The former is really about the
|
||||
final representation of a state in a DFA's transition table, whereas the
|
||||
latter---what this module is focused on---is closer to an intermediate form
|
||||
that is used to help eventually build the transition table.
|
||||
|
||||
This module exports four types. All four types represent the same idea: an
|
||||
ordered set of NFA states. This ordered set represents the epsilon closure of a
|
||||
particular NFA state, where the "epsilon closure" is the set of NFA states that
|
||||
can be transitioned to without consuming any input. i.e., Follow all of the NFA
|
||||
state's epsilon transitions. In addition, this implementation of DFA states
|
||||
cares about two other things: the ordered set of pattern IDs corresponding
|
||||
to the patterns that match if the state is a match state, and the set of
|
||||
look-behind assertions that were true when the state was created.
|
||||
|
||||
The first, `State`, is a frozen representation of a state that cannot be
|
||||
modified. It may be cheaply cloned without copying the state itself and can be
|
||||
accessed safely from multiple threads simultaneously. This type is useful for
|
||||
when one knows that the DFA state being constructed is distinct from any other
|
||||
previously constructed states. Namely, powerset construction, in practice,
|
||||
requires one to keep a cache of previously created DFA states. Otherwise,
|
||||
the number of DFA states created in memory balloons to an impractically
|
||||
large number. For this reason, equivalent states should endeavor to have an
|
||||
equivalent byte-level representation. (In general, "equivalency" here means,
|
||||
"equivalent assertions, pattern IDs and NFA state IDs." We do not require that
|
||||
full DFA minimization be implemented here. This form of equivalency is only
|
||||
surface deep and is more-or-less a practical necessity.)
|
||||
|
||||
The other three types represent different phases in the construction of a
|
||||
DFA state. Internally, these three types (and `State`) all use the same
|
||||
byte-oriented representation. That means one can use any of the builder types
|
||||
to check whether the state it represents already exists or not. If it does,
|
||||
then there is no need to freeze it into a `State` (which requires an alloc and
|
||||
a copy). Here are the three types described succinctly:
|
||||
|
||||
* `StateBuilderEmpty` represents a state with no pattern IDs, no assertions
|
||||
and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A
|
||||
`StateBuilderEmpty` can only be used to query its underlying memory capacity,
|
||||
or to convert into a builder for recording pattern IDs and/or assertions.
|
||||
|
||||
* `StateBuilderMatches` represents a state with zero or more pattern IDs, zero
|
||||
or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches`
|
||||
can only be used for adding pattern IDs and recording assertions.
|
||||
|
||||
* `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or
|
||||
more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA`
|
||||
can only be used for adding NFA state IDs and recording some assertions.
|
||||
|
||||
The expected flow here is to use the above builders to construct a candidate
|
||||
DFA state to check if it already exists. If it does, then there's no need to
|
||||
freeze it into a `State`. If it doesn't exist, then `StateBuilderNFA::to_state`
|
||||
can be called to freeze the builder into an immutable `State`. In either
|
||||
case, `clear` should be called on the builder to turn it back into a
|
||||
`StateBuilderEmpty` that reuses the underlying memory.
|
||||
|
||||
The main purpose for splitting the builder into these distinct types is to
|
||||
make it impossible to do things like adding a pattern ID after adding an NFA
|
||||
state ID. Namely, this makes it simpler to use a space-and-time efficient
|
||||
binary representation for the state. (The format is documented on the `Repr`
|
||||
type below.) If we just used one type for everything, it would be possible for
|
||||
callers to use an incorrect interleaving of calls and thus result in a corrupt
|
||||
representation. I chose to use more type machinery to make this impossible to
|
||||
do because 1) determinization is itself pretty complex and it wouldn't be too
|
||||
hard to foul this up and 2) there isn't too much machinery involved and it's
|
||||
well contained.
|
||||
|
||||
As an optimization, sometimes states won't have certain things set. For
|
||||
example, if the underlying NFA has no word boundary assertions, then there is
|
||||
no reason to set a state's look-behind assertion as to whether it was generated
|
||||
from a word byte or not. Similarly, if a state has no NFA states corresponding
|
||||
to look-around assertions, then there is no reason to set `look_have` to a
|
||||
non-empty set. Finally, callers usually omit unconditional epsilon transitions
|
||||
when adding NFA state IDs since they aren't discriminatory.
|
||||
|
||||
Finally, the binary representation used by these states is, thankfully, not
|
||||
serialized anywhere. So any kind of change can be made with reckless abandon,
|
||||
as long as everything in this module agrees.
|
||||
*/
|
||||
|
||||
use core::mem;
|
||||
|
||||
use alloc::{sync::Arc, vec::Vec};
|
||||
|
||||
use crate::util::{
|
||||
int::{I32, U32},
|
||||
look::LookSet,
|
||||
primitives::{PatternID, StateID},
|
||||
wire::{self, Endian},
|
||||
};
|
||||
|
||||
/// A DFA state that, at its core, is represented by an ordered set of NFA
/// states.
///
/// This type is intended to be used only in NFA-to-DFA conversion via powerset
/// construction.
///
/// It may be cheaply cloned and accessed safely from multiple threads
/// simultaneously.
#[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)]
pub(crate) struct State(Arc<[u8]>);

/// This Borrow impl permits us to lookup any state in a map by its byte
/// representation. This is particularly convenient when one has a StateBuilder
/// and we want to see if a correspondingly equivalent state already exists. If
/// one does exist, then we can reuse the allocation required by StateBuilder
/// without having to convert it into a State first.
impl core::borrow::Borrow<[u8]> for State {
    fn borrow(&self) -> &[u8] {
        &*self.0
    }
}

impl core::fmt::Debug for State {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Delegate to Repr's Debug impl, which decodes the packed byte
        // representation into something human readable.
        f.debug_tuple("State").field(&self.repr()).finish()
    }
}

/// For docs on these routines, see the internal Repr and ReprVec types below.
impl State {
    // A "dead" state: not a match state, no assertions set and no NFA
    // state IDs.
    pub(crate) fn dead() -> State {
        StateBuilderEmpty::new().into_matches().into_nfa().to_state()
    }

    pub(crate) fn is_match(&self) -> bool {
        self.repr().is_match()
    }

    pub(crate) fn is_from_word(&self) -> bool {
        self.repr().is_from_word()
    }

    pub(crate) fn is_half_crlf(&self) -> bool {
        self.repr().is_half_crlf()
    }

    pub(crate) fn look_have(&self) -> LookSet {
        self.repr().look_have()
    }

    pub(crate) fn look_need(&self) -> LookSet {
        self.repr().look_need()
    }

    pub(crate) fn match_len(&self) -> usize {
        self.repr().match_len()
    }

    pub(crate) fn match_pattern(&self, index: usize) -> PatternID {
        self.repr().match_pattern(index)
    }

    pub(crate) fn match_pattern_ids(&self) -> Option<Vec<PatternID>> {
        self.repr().match_pattern_ids()
    }

    #[cfg(all(test, not(miri)))]
    pub(crate) fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, f: F) {
        self.repr().iter_match_pattern_ids(f)
    }

    pub(crate) fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, f: F) {
        self.repr().iter_nfa_state_ids(f)
    }

    // The size of this state's byte representation. (The Arc bookkeeping
    // itself is not counted.)
    pub(crate) fn memory_usage(&self) -> usize {
        self.0.len()
    }

    // Get a read-only structured view into this state's raw bytes.
    fn repr(&self) -> Repr<'_> {
        Repr(&*self.0)
    }
}
|
||||
|
||||
/// A state builder that represents an empty state.
|
||||
///
|
||||
/// This is a useful "initial condition" for state construction. It has no
|
||||
/// NFA state IDs, no assertions set and no pattern IDs. No allocations are
|
||||
/// made when new() is called. Its main use is for being converted into a
|
||||
/// builder that can capture assertions and pattern IDs.
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct StateBuilderEmpty(Vec<u8>);
|
||||
|
||||
/// For docs on these routines, see the internal Repr and ReprVec types below.
|
||||
impl StateBuilderEmpty {
|
||||
pub(crate) fn new() -> StateBuilderEmpty {
|
||||
StateBuilderEmpty(alloc::vec![])
|
||||
}
|
||||
|
||||
pub(crate) fn into_matches(mut self) -> StateBuilderMatches {
|
||||
self.0.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0]);
|
||||
StateBuilderMatches(self.0)
|
||||
}
|
||||
|
||||
fn clear(&mut self) {
|
||||
self.0.clear();
|
||||
}
|
||||
|
||||
pub(crate) fn capacity(&self) -> usize {
|
||||
self.0.capacity()
|
||||
}
|
||||
}
|
||||
|
||||
/// A state builder that collects assertions and pattern IDs.
///
/// When collecting pattern IDs is finished, this can be converted into a
/// builder that collects NFA state IDs.
#[derive(Clone)]
pub(crate) struct StateBuilderMatches(Vec<u8>);

impl core::fmt::Debug for StateBuilderMatches {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        f.debug_tuple("StateBuilderMatches").field(&self.repr()).finish()
    }
}

/// For docs on these routines, see the internal Repr and ReprVec types below.
impl StateBuilderMatches {
    // Finalize the pattern IDs (writing their count into the header) and
    // transition to the NFA-state-ID-collecting phase.
    pub(crate) fn into_nfa(mut self) -> StateBuilderNFA {
        self.repr_vec().close_match_pattern_ids();
        StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO }
    }

    pub(crate) fn set_is_from_word(&mut self) {
        self.repr_vec().set_is_from_word()
    }

    pub(crate) fn set_is_half_crlf(&mut self) {
        self.repr_vec().set_is_half_crlf()
    }

    // The look-behind assertions satisfied by the transition that created
    // this state. (The set lives at byte offsets 1.. of the representation;
    // see the format docs on `Repr` below.)
    pub(crate) fn look_have(&self) -> LookSet {
        LookSet::read_repr(&self.0[1..])
    }

    pub(crate) fn set_look_have(
        &mut self,
        set: impl FnMut(LookSet) -> LookSet,
    ) {
        self.repr_vec().set_look_have(set)
    }

    pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) {
        self.repr_vec().add_match_pattern_id(pid)
    }

    // Read-only structured view into the underlying bytes.
    fn repr(&self) -> Repr<'_> {
        Repr(&self.0)
    }

    // Write view of the underlying bytes.
    fn repr_vec(&mut self) -> ReprVec<'_> {
        ReprVec(&mut self.0)
    }
}
|
||||
|
||||
/// A state builder that collects some assertions and NFA state IDs.
|
||||
///
|
||||
/// When collecting NFA state IDs is finished, this can be used to build a
|
||||
/// `State` if necessary.
|
||||
///
|
||||
/// When done with building a state (regardless of whether it got kept or not),
|
||||
/// it's usually a good idea to call `clear` to get an empty builder back so
|
||||
/// that it can be reused to build the next state.
|
||||
#[derive(Clone)]
pub(crate) struct StateBuilderNFA {
    // The raw byte representation built up so far.
    repr: Vec<u8>,
    // The most recently added NFA state ID, used to delta encode the next
    // one (see `add_nfa_state_id` below).
    prev_nfa_state_id: StateID,
}

impl core::fmt::Debug for StateBuilderNFA {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        f.debug_tuple("StateBuilderNFA").field(&self.repr()).finish()
    }
}

/// For docs on these routines, see the internal Repr and ReprVec types below.
impl StateBuilderNFA {
    // Freeze the current representation into an immutable, cheaply clonable
    // `State`. This allocates and copies the underlying bytes.
    pub(crate) fn to_state(&self) -> State {
        State(Arc::from(&*self.repr))
    }

    // Reset this builder back to an empty one, retaining the underlying
    // allocation for reuse.
    pub(crate) fn clear(self) -> StateBuilderEmpty {
        let mut builder = StateBuilderEmpty(self.repr);
        builder.clear();
        builder
    }

    pub(crate) fn look_need(&self) -> LookSet {
        self.repr().look_need()
    }

    pub(crate) fn set_look_have(
        &mut self,
        set: impl FnMut(LookSet) -> LookSet,
    ) {
        self.repr_vec().set_look_have(set)
    }

    pub(crate) fn set_look_need(
        &mut self,
        set: impl FnMut(LookSet) -> LookSet,
    ) {
        self.repr_vec().set_look_need(set)
    }

    // NFA state IDs are delta encoded against the previously added ID, so
    // the running "previous" value is threaded through explicitly.
    pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) {
        ReprVec(&mut self.repr)
            .add_nfa_state_id(&mut self.prev_nfa_state_id, sid)
    }

    // The raw bytes, e.g. for map lookups via State's Borrow<[u8]> impl.
    pub(crate) fn as_bytes(&self) -> &[u8] {
        &self.repr
    }

    fn repr(&self) -> Repr<'_> {
        Repr(&self.repr)
    }

    fn repr_vec(&mut self) -> ReprVec<'_> {
        ReprVec(&mut self.repr)
    }
}
|
||||
|
||||
/// Repr is a read-only view into the representation of a DFA state.
///
/// Primarily, a Repr is how we achieve DRY: we implement decoding the format
/// in one place, and then use a Repr to implement the various methods on the
/// public state types.
///
/// The format is as follows:
///
/// The first nine bytes correspond to bitsets: a one-byte flag set followed
/// by two 4-byte look-around sets.
///
/// Byte 0 is a bitset corresponding to miscellaneous flags associated with the
/// state. Bit 0 is set to 1 if the state is a match state. Bit 1 is set to 1
/// if the state has pattern IDs explicitly written to it. (This is a flag that
/// is not meant to be set by determinization, but rather, is used as part of
/// an internal space-saving optimization.) Bit 2 is set to 1 if the state was
/// generated by a transition over a "word" byte. (Callers may not always set
/// this. For example, if the NFA has no word boundary assertion, then needing
/// to track whether a state came from a word byte or not is superfluous and
/// wasteful.) Bit 3 is set to 1 if the state was generated by a transition
/// from a `\r` (forward search) or a `\n` (reverse search) when CRLF mode is
/// enabled.
///
/// Bytes 1..5 correspond to the look-behind assertions that were satisfied
/// by the transition that created this state. (Look-ahead assertions are not
/// tracked as part of states. Instead, these are applied by re-computing the
/// epsilon closure of a state when computing the transition function. See
/// `next` in the parent module.)
///
/// Bytes 5..9 correspond to the set of look-around assertions (including both
/// look-behind and look-ahead) that appear somewhere in this state's set of
/// NFA state IDs. This is used to determine whether this state's epsilon
/// closure should be re-computed when computing the transition function.
/// Namely, look-around assertions are "just" conditional epsilon transitions,
/// so if there are new assertions available when computing the transition
/// function, we should only re-compute the epsilon closure if those new
/// assertions are relevant to this particular state.
///
/// Bytes 9..13 correspond to a 32-bit native-endian encoded integer
/// corresponding to the number of patterns encoded in this state. If the state
/// is not a match state (byte 0 bit 0 is 0) or if its only pattern ID is
/// PatternID::ZERO, then no integer is encoded at this position. Instead, byte
/// offset 9 is the position at which the first NFA state ID is encoded.
///
/// For a match state with at least one non-ZERO pattern ID, the next bytes
/// correspond to a sequence of 32-bit native endian encoded integers that
/// represent each pattern ID, in order, that this match state represents.
///
/// After the pattern IDs (if any), NFA state IDs are delta encoded as
/// varints.[1] The first NFA state ID is encoded as itself, and each
/// subsequent NFA state ID is encoded as the difference between itself and the
/// previous NFA state ID.
///
/// [1] - https://developers.google.com/protocol-buffers/docs/encoding#varints
struct Repr<'a>(&'a [u8]);

impl<'a> Repr<'a> {
    /// Returns true if and only if this is a match state.
    ///
    /// If callers have added pattern IDs to this state, then callers MUST set
    /// this state as a match state explicitly. However, as a special case,
    /// states that are marked as match states but with no pattern IDs, then
    /// the state is treated as if it had a single pattern ID equivalent to
    /// PatternID::ZERO.
    fn is_match(&self) -> bool {
        self.0[0] & (1 << 0) > 0
    }

    /// Returns true if and only if this state has had at least one pattern
    /// ID added to it.
    ///
    /// This is an internal-only flag that permits the representation to save
    /// space in the common case of an NFA with one pattern in it. In that
    /// case, a match state can only ever have exactly one pattern ID:
    /// PatternID::ZERO. So there's no need to represent it.
    fn has_pattern_ids(&self) -> bool {
        self.0[0] & (1 << 1) > 0
    }

    /// Returns true if and only if this state is marked as having been created
    /// from a transition over a word byte. This is useful for checking whether
    /// a word boundary assertion is true or not, which requires look-behind
    /// (whether the current state came from a word byte or not) and look-ahead
    /// (whether the transition byte is a word byte or not).
    ///
    /// Since states with this set are distinct from states that don't have
    /// this set (even if they are otherwise equivalent), callers should not
    /// set this assertion unless the underlying NFA has at least one word
    /// boundary assertion somewhere. Otherwise, a superfluous number of states
    /// may be created.
    fn is_from_word(&self) -> bool {
        self.0[0] & (1 << 2) > 0
    }

    /// Returns true if and only if this state is marked as being inside of a
    /// CRLF terminator. In the forward direction, this means the state was
    /// created after seeing a `\r`. In the reverse direction, this means the
    /// state was created after seeing a `\n`.
    fn is_half_crlf(&self) -> bool {
        self.0[0] & (1 << 3) > 0
    }

    /// The set of look-behind assertions that were true in the transition that
    /// created this state.
    ///
    /// Generally, this should be empty if 'look_need' is empty, since there is
    /// no reason to track which look-behind assertions are true if the state
    /// has no conditional epsilon transitions.
    ///
    /// Satisfied look-ahead assertions are not tracked in states. Instead,
    /// these are re-computed on demand via epsilon closure when computing the
    /// transition function.
    fn look_have(&self) -> LookSet {
        LookSet::read_repr(&self.0[1..])
    }

    /// The set of look-around (both behind and ahead) assertions that appear
    /// at least once in this state's set of NFA states.
    ///
    /// This is used to determine whether the epsilon closure needs to be
    /// re-computed when computing the transition function. Namely, if the
    /// state has no conditional epsilon transitions, then there is no need
    /// to re-compute the epsilon closure.
    fn look_need(&self) -> LookSet {
        LookSet::read_repr(&self.0[5..])
    }

    /// Returns the total number of match pattern IDs in this state.
    ///
    /// If this state is not a match state, then this always returns 0.
    fn match_len(&self) -> usize {
        if !self.is_match() {
            return 0;
        } else if !self.has_pattern_ids() {
            // Match state without explicit pattern IDs: implicitly a single
            // PatternID::ZERO.
            1
        } else {
            self.encoded_pattern_len()
        }
    }

    /// Returns the pattern ID for this match state at the given index.
    ///
    /// If the given index is greater than or equal to `match_len()` for this
    /// state, then this could panic or return incorrect results.
    fn match_pattern(&self, index: usize) -> PatternID {
        if !self.has_pattern_ids() {
            PatternID::ZERO
        } else {
            // 13 = 9 header bytes + 4-byte pattern count.
            let offset = 13 + index * PatternID::SIZE;
            // This is OK since we only ever serialize valid PatternIDs to
            // states.
            wire::read_pattern_id_unchecked(&self.0[offset..]).0
        }
    }

    /// Returns a copy of all match pattern IDs in this state. If this state
    /// is not a match state, then this returns None.
    fn match_pattern_ids(&self) -> Option<Vec<PatternID>> {
        if !self.is_match() {
            return None;
        }
        let mut pids = alloc::vec![];
        self.iter_match_pattern_ids(|pid| pids.push(pid));
        Some(pids)
    }

    /// Calls the given function on every pattern ID in this state.
    fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, mut f: F) {
        if !self.is_match() {
            return;
        }
        // As an optimization for a very common case, when this is a match
        // state for an NFA with only one pattern, we don't actually write the
        // pattern ID to the state representation. Instead, we know it must
        // be there since it is the only possible choice.
        if !self.has_pattern_ids() {
            f(PatternID::ZERO);
            return;
        }
        let mut pids = &self.0[13..self.pattern_offset_end()];
        while !pids.is_empty() {
            let pid = wire::read_u32(pids);
            pids = &pids[PatternID::SIZE..];
            // This is OK since we only ever serialize valid PatternIDs to
            // states. And since pattern IDs can never exceed a usize, the
            // unwrap is OK.
            f(PatternID::new_unchecked(usize::try_from(pid).unwrap()));
        }
    }

    /// Calls the given function on every NFA state ID in this state.
    fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, mut f: F) {
        let mut sids = &self.0[self.pattern_offset_end()..];
        // Each ID is stored as a zig-zag varint delta from the previous one;
        // accumulate the deltas to recover the absolute IDs.
        let mut prev = 0i32;
        while !sids.is_empty() {
            let (delta, nr) = read_vari32(sids);
            sids = &sids[nr..];
            let sid = prev + delta;
            prev = sid;
            // This is OK since we only ever serialize valid StateIDs to
            // states. And since state IDs can never exceed an isize, they must
            // always be able to fit into a usize, and thus cast is OK.
            f(StateID::new_unchecked(sid.as_usize()))
        }
    }

    /// Returns the offset into this state's representation where the pattern
    /// IDs end and the NFA state IDs begin.
    fn pattern_offset_end(&self) -> usize {
        let encoded = self.encoded_pattern_len();
        if encoded == 0 {
            // No explicit pattern IDs: NFA state IDs start right after the
            // 9-byte header.
            return 9;
        }
        // This arithmetic is OK since we were able to address this many bytes
        // when writing to the state, thus, it must fit into a usize.
        encoded.checked_mul(4).unwrap().checked_add(13).unwrap()
    }

    /// Returns the total number of *encoded* pattern IDs in this state.
    ///
    /// This may return 0 even when this is a match state, since the pattern
    /// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in
    /// the match state (the overwhelming common case).
    fn encoded_pattern_len(&self) -> usize {
        if !self.has_pattern_ids() {
            return 0;
        }
        // This unwrap is OK since the total number of patterns is always
        // guaranteed to fit into a usize.
        usize::try_from(wire::read_u32(&self.0[9..13])).unwrap()
    }
}
|
||||
|
||||
impl<'a> core::fmt::Debug for Repr<'a> {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
let mut nfa_ids = alloc::vec![];
|
||||
self.iter_nfa_state_ids(|sid| nfa_ids.push(sid));
|
||||
f.debug_struct("Repr")
|
||||
.field("is_match", &self.is_match())
|
||||
.field("is_from_word", &self.is_from_word())
|
||||
.field("is_half_crlf", &self.is_half_crlf())
|
||||
.field("look_have", &self.look_have())
|
||||
.field("look_need", &self.look_need())
|
||||
.field("match_pattern_ids", &self.match_pattern_ids())
|
||||
.field("nfa_state_ids", &nfa_ids)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// ReprVec is a write-only view into the representation of a DFA state.
///
/// See Repr for more details on the purpose of this type and also the format.
///
/// Note that not all possible combinations of methods may be called. This is
/// precisely what the various StateBuilder types encapsulate: they only
/// permit valid combinations via Rust's linear typing.
struct ReprVec<'a>(&'a mut Vec<u8>);

impl<'a> ReprVec<'a> {
    /// Set this state as a match state.
    ///
    /// This should not be exposed explicitly outside of this module. It is
    /// set automatically when a pattern ID is added.
    fn set_is_match(&mut self) {
        self.0[0] |= 1 << 0;
    }

    /// Set that this state has pattern IDs explicitly written to it.
    ///
    /// This should not be exposed explicitly outside of this module. This is
    /// used internally as a space saving optimization. Namely, if the state
    /// is a match state but does not have any pattern IDs written to it,
    /// then it is automatically inferred to have a pattern ID of ZERO.
    fn set_has_pattern_ids(&mut self) {
        self.0[0] |= 1 << 1;
    }

    /// Set this state as being built from a transition over a word byte.
    ///
    /// Setting this is only necessary when one needs to deal with word
    /// boundary assertions. Therefore, if the underlying NFA has no word
    /// boundary assertions, callers should not set this.
    fn set_is_from_word(&mut self) {
        self.0[0] |= 1 << 2;
    }

    /// Set this state as having seen half of a CRLF terminator.
    ///
    /// In the forward direction, this should be set when a `\r` has been seen.
    /// In the reverse direction, this should be set when a `\n` has been seen.
    fn set_is_half_crlf(&mut self) {
        self.0[0] |= 1 << 3;
    }

    /// The set of look-behind assertions that were true in the transition that
    /// created this state.
    fn look_have(&self) -> LookSet {
        self.repr().look_have()
    }

    /// The set of look-around (both behind and ahead) assertions that appear
    /// at least once in this state's set of NFA states.
    fn look_need(&self) -> LookSet {
        self.repr().look_need()
    }

    /// Mutate the set of look-behind assertions that were true in the
    /// transition that created this state.
    fn set_look_have(&mut self, mut set: impl FnMut(LookSet) -> LookSet) {
        // The look_have set lives at byte offsets 1..5 (see Repr docs).
        set(self.look_have()).write_repr(&mut self.0[1..]);
    }

    /// Mutate the set of look-around (both behind and ahead) assertions that
    /// appear at least once in this state's set of NFA states.
    fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) {
        // The look_need set lives at byte offsets 5..9 (see Repr docs).
        set(self.look_need()).write_repr(&mut self.0[5..]);
    }

    /// Add a pattern ID to this state. All match states must have at least
    /// one pattern ID associated with it.
    ///
    /// Callers must never add duplicative pattern IDs.
    ///
    /// The order in which patterns are added must correspond to the order
    /// in which patterns are reported as matches.
    fn add_match_pattern_id(&mut self, pid: PatternID) {
        // As a (somewhat small) space saving optimization, in the case where
        // a matching state has exactly one pattern ID, PatternID::ZERO, we do
        // not write either the pattern ID or the number of patterns encoded.
        // Instead, all we do is set the 'is_match' bit on this state. Overall,
        // this saves 8 bytes per match state for the overwhelming majority of
        // match states.
        //
        // In order to know whether pattern IDs need to be explicitly read or
        // not, we use another internal-only bit, 'has_pattern_ids', to
        // indicate whether they have been explicitly written or not.
        if !self.repr().has_pattern_ids() {
            if pid == PatternID::ZERO {
                self.set_is_match();
                return;
            }
            // Make room for 'close_match_pattern_ids' to write the total
            // number of pattern IDs written. (This reserves the 4 bytes at
            // offsets 9..13 of the representation.)
            self.0.extend(core::iter::repeat(0).take(PatternID::SIZE));
            self.set_has_pattern_ids();
            // If this was already a match state, then the only way that's
            // possible when the state doesn't have pattern IDs is if
            // PatternID::ZERO was added by the caller previously. In this
            // case, we are now adding a non-ZERO pattern ID after it, in
            // which case, we want to make sure to represent ZERO explicitly
            // now.
            if self.repr().is_match() {
                write_u32(self.0, 0)
            } else {
                // Otherwise, just make sure the 'is_match' bit is set.
                self.set_is_match();
            }
        }
        write_u32(self.0, pid.as_u32());
    }

    /// Indicate that no more pattern IDs will be added to this state.
    ///
    /// Once this is called, callers must not call it or 'add_match_pattern_id'
    /// again.
    ///
    /// This should not be exposed explicitly outside of this module. It
    /// should be called only when converting a StateBuilderMatches into a
    /// StateBuilderNFA.
    fn close_match_pattern_ids(&mut self) {
        // If we never wrote any pattern IDs, then there's nothing to do here.
        if !self.repr().has_pattern_ids() {
            return;
        }
        let patsize = PatternID::SIZE;
        // 13 = 9 header bytes + the 4-byte count slot reserved in
        // 'add_match_pattern_id'; everything after that is pattern IDs.
        let pattern_bytes = self.0.len() - 13;
        // Every pattern ID uses 4 bytes, so number of bytes should be
        // divisible by 4.
        assert_eq!(pattern_bytes % patsize, 0);
        // This unwrap is OK since we are guaranteed that the maximum number
        // of possible patterns fits into a u32.
        let count32 = u32::try_from(pattern_bytes / patsize).unwrap();
        wire::NE::write_u32(count32, &mut self.0[9..13]);
    }

    /// Add an NFA state ID to this state. The order in which NFA states are
    /// added matters. It is the caller's responsibility to ensure that
    /// duplicate NFA state IDs are not added.
    fn add_nfa_state_id(&mut self, prev: &mut StateID, sid: StateID) {
        // IDs are stored as zig-zag varint deltas from the previously added
        // ID; see 'iter_nfa_state_ids' in Repr for the matching decoder.
        let delta = sid.as_i32() - prev.as_i32();
        write_vari32(self.0, delta);
        *prev = sid;
    }

    /// Return a read-only view of this state's representation.
    fn repr(&self) -> Repr<'_> {
        Repr(self.0.as_slice())
    }
}
|
||||
|
||||
/// Write a signed 32-bit integer using zig-zag encoding.
|
||||
///
|
||||
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
|
||||
fn write_vari32(data: &mut Vec<u8>, n: i32) {
|
||||
let mut un = n.to_bits() << 1;
|
||||
if n < 0 {
|
||||
un = !un;
|
||||
}
|
||||
write_varu32(data, un)
|
||||
}
|
||||
|
||||
/// Read a signed 32-bit integer using zig-zag encoding. Also, return the
|
||||
/// number of bytes read.
|
||||
///
|
||||
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
|
||||
fn read_vari32(data: &[u8]) -> (i32, usize) {
|
||||
let (un, i) = read_varu32(data);
|
||||
let mut n = i32::from_bits(un >> 1);
|
||||
if un & 1 != 0 {
|
||||
n = !n;
|
||||
}
|
||||
(n, i)
|
||||
}
|
||||
|
||||
/// Write an unsigned 32-bit integer as a varint. In essence, `n` is written
|
||||
/// as a sequence of bytes where all bytes except for the last one have the
|
||||
/// most significant bit set. The least significant 7 bits correspond to the
|
||||
/// actual bits of `n`. So in the worst case, a varint uses 5 bytes, but in
|
||||
/// very common cases, it uses fewer than 4.
|
||||
///
|
||||
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
|
||||
fn write_varu32(data: &mut Vec<u8>, mut n: u32) {
|
||||
while n >= 0b1000_0000 {
|
||||
data.push(n.low_u8() | 0b1000_0000);
|
||||
n >>= 7;
|
||||
}
|
||||
data.push(n.low_u8());
|
||||
}
|
||||
|
||||
/// Read an unsigned 32-bit varint. Also, return the number of bytes read.
///
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
fn read_varu32(data: &[u8]) -> (u32, usize) {
    // N.B. We can assume correctness here since we know that all varuints are
    // written with write_varu32. Hence, the unchecked arithmetic is all okay.
    let mut acc: u32 = 0;
    let mut shift: u32 = 0;
    for (i, &byte) in data.iter().enumerate() {
        acc |= (u32::from(byte) & 0b0111_1111) << shift;
        if byte & 0b1000_0000 == 0 {
            // Continuation bit is clear: this was the final byte.
            return (acc, i + 1);
        }
        shift += 7;
    }
    // Ran out of input before a terminating byte was seen.
    (0, 0)
}
|
||||
|
||||
/// Push a native-endian encoded `n` on to `dst`.
fn write_u32(dst: &mut Vec<u8>, n: u32) {
    // `u32::to_ne_bytes` yields the native-endian encoding directly, so
    // there is no need to zero-fill four bytes and then overwrite them via
    // `wire::NE::write_u32` as before. Same bytes, one step.
    dst.extend_from_slice(&n.to_ne_bytes());
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use alloc::vec;

    use quickcheck::quickcheck;

    use super::*;

    #[cfg(not(miri))]
    quickcheck! {
        fn prop_state_read_write_nfa_state_ids(sids: Vec<StateID>) -> bool {
            // Builders states do not permit duplicate IDs.
            let sids = dedup_state_ids(sids);

            let mut builder =
                StateBuilderEmpty::new().into_matches().into_nfa();
            for &sid in sids.iter() {
                builder.add_nfa_state_id(sid);
            }
            let state = builder.to_state();
            // Round-trip: decoding must yield exactly what was encoded.
            let mut collected = vec![];
            state.iter_nfa_state_ids(|sid| collected.push(sid));
            collected == sids
        }

        fn prop_state_read_write_pattern_ids(pids: Vec<PatternID>) -> bool {
            // Builders states do not permit duplicate IDs.
            let pids = dedup_pattern_ids(pids);

            let mut builder = StateBuilderEmpty::new().into_matches();
            for &pid in pids.iter() {
                builder.add_match_pattern_id(pid);
            }
            let state = builder.into_nfa().to_state();
            let mut collected = vec![];
            state.iter_match_pattern_ids(|pid| collected.push(pid));
            collected == pids
        }

        fn prop_state_read_write_nfa_state_and_pattern_ids(
            sids: Vec<StateID>,
            pids: Vec<PatternID>
        ) -> bool {
            // Builders states do not permit duplicate IDs.
            let sids = dedup_state_ids(sids);
            let pids = dedup_pattern_ids(pids);

            let mut builder = StateBuilderEmpty::new().into_matches();
            for &pid in pids.iter() {
                builder.add_match_pattern_id(pid);
            }

            let mut builder = builder.into_nfa();
            for &sid in sids.iter() {
                builder.add_nfa_state_id(sid);
            }

            let state = builder.to_state();
            let mut collected_pids = vec![];
            state.iter_match_pattern_ids(|pid| collected_pids.push(pid));
            let mut collected_sids = vec![];
            state.iter_nfa_state_ids(|sid| collected_sids.push(sid));
            collected_pids == pids && collected_sids == sids
        }
    }

    quickcheck! {
        fn prop_read_write_varu32(n: u32) -> bool {
            let mut buf = vec![];
            write_varu32(&mut buf, n);
            let (decoded, nread) = read_varu32(&buf);
            nread == buf.len() && decoded == n
        }

        fn prop_read_write_vari32(n: i32) -> bool {
            let mut buf = vec![];
            write_vari32(&mut buf, n);
            let (decoded, nread) = read_vari32(&buf);
            nread == buf.len() && decoded == n
        }
    }

    /// Drop all but the first occurrence of each state ID, preserving order.
    #[cfg(not(miri))]
    fn dedup_state_ids(sids: Vec<StateID>) -> Vec<StateID> {
        let mut seen = alloc::collections::BTreeSet::new();
        let mut deduped = vec![];
        for sid in sids {
            // `insert` returns true only for IDs not seen before.
            if seen.insert(sid) {
                deduped.push(sid);
            }
        }
        deduped
    }

    /// Drop all but the first occurrence of each pattern ID, preserving order.
    #[cfg(not(miri))]
    fn dedup_pattern_ids(pids: Vec<PatternID>) -> Vec<PatternID> {
        let mut seen = alloc::collections::BTreeSet::new();
        let mut deduped = vec![];
        for pid in pids {
            if seen.insert(pid) {
                deduped.push(pid);
            }
        }
        deduped
    }
}
|
||||
265
third-party/vendor/regex-automata/src/util/empty.rs
vendored
Normal file
265
third-party/vendor/regex-automata/src/util/empty.rs
vendored
Normal file
|
|
@ -0,0 +1,265 @@
|
|||
/*!
|
||||
This module provides helper routines for dealing with zero-width matches.
|
||||
|
||||
The main problem being solved here is this:
|
||||
|
||||
1. The caller wants to search something that they know is valid UTF-8, such
|
||||
as a Rust `&str`.
|
||||
2. The regex used by the caller can match the empty string. For example, `a*`.
|
||||
3. The caller should never get match offsets returned that occur within the
|
||||
encoding of a UTF-8 codepoint. It is logically incorrect, and also means that,
|
||||
e.g., slicing the `&str` at those offsets will lead to a panic.
|
||||
|
||||
So the question here is, how do we prevent the caller from getting match
|
||||
offsets that split a codepoint? For example, strictly speaking, the regex `a*`
|
||||
matches `☃` at the positions `[0, 0]`, `[1, 1]`, `[2, 2]` and `[3, 3]` since
|
||||
the UTF-8 encoding of `☃` is `\xE2\x98\x83`. In particular, the `NFA` that
|
||||
underlies all of the matching engines in this crate doesn't have anything in
|
||||
its state graph that prevents matching between UTF-8 code units. Indeed, any
|
||||
engine derived from the `NFA` will match at those positions by virtue of the
|
||||
fact that the `NFA` is byte oriented. That is, its transitions are defined over
|
||||
bytes and the matching engines work by proceeding one byte at a time.
|
||||
|
||||
(An alternative architecture would be to define the transitions in an `NFA`
|
||||
over codepoints, or `char`. And then make the matching engines proceed by
|
||||
decoding one codepoint at a time. This is a viable strategy, but it doesn't
|
||||
work for DFA matching engines because designing a fast and memory efficient
|
||||
transition table for an alphabet as large as Unicode is quite difficult. More
|
||||
to the point, the top-level `regex` crate supports matching on arbitrary bytes
|
||||
when Unicode mode is disabled and one is searching a `&[u8]`. So in that case,
|
||||
you can't just limit yourself to decoding codepoints and matching those. You
|
||||
really do need to be able to follow byte oriented transitions on the `NFA`.)
|
||||
|
||||
In an older version of the regex crate, we handled this case not in the regex
|
||||
engine, but in the iterators over matches. Namely, since this case only arises
|
||||
when the match is empty, we "just" incremented the next starting position
|
||||
of the search by `N`, where `N` is the length of the codepoint encoded at
|
||||
the current position. The alternative or more "natural" solution of just
|
||||
incrementing by `1` would result in executing a search of `a*` on `☃` like
|
||||
this:
|
||||
|
||||
* Start search at `0`.
|
||||
* Found match at `[0, 0]`.
|
||||
* Next start position is `0`.
|
||||
* To avoid an infinite loop, since it's an empty match, increment by `1`.
|
||||
* Start search at `1`.
|
||||
* Found match at `[1, 1]`. Oops.
|
||||
|
||||
But if we instead incremented by `3` (the length in bytes of `☃`), then we get
|
||||
the following:
|
||||
|
||||
* Start search at `0`.
|
||||
* Found match at `[0, 0]`.
|
||||
* Next start position is `0`.
|
||||
* To avoid an infinite loop, since it's an empty match, increment by `3`.
|
||||
* Start search at `3`.
|
||||
* Found match at `[3, 3]`.
|
||||
|
||||
And we get the correct result. But does this technique work in all cases?
|
||||
Crucially, it requires that a zero-width match that splits a codepoint never
|
||||
occurs beyond the starting position of the search. Because if it did, merely
|
||||
incrementing the start position by the number of bytes in the codepoint at
|
||||
the current position wouldn't be enough. A zero-width match could just occur
|
||||
anywhere. It turns out that it is _almost_ true. We can convince ourselves by
|
||||
looking at all possible patterns that can match the empty string:
|
||||
|
||||
* Patterns like `a*`, `a{0}`, `(?:)`, `a|` and `|a` all unconditionally match
|
||||
the empty string. That is, assuming there isn't an `a` at the current position,
|
||||
they will all match the empty string at the start of a search. There is no way
|
||||
to move past it because any other match would not be "leftmost."
|
||||
* `^` only matches at the beginning of the haystack, where the start position
|
||||
is `0`. Since we know we're searching valid UTF-8 (if it isn't valid UTF-8,
|
||||
then this entire problem goes away because it implies your string type supports
|
||||
invalid UTF-8 and thus must deal with offsets that not only split a codepoint
|
||||
but occur in entirely invalid UTF-8 somehow), it follows that `^` never matches
|
||||
between the code units of a codepoint because the start of a valid UTF-8 string
|
||||
is never within the encoding of a codepoint.
|
||||
* `$` basically the same logic as `^`, but for the end of a string. A valid
|
||||
UTF-8 string can't have an incomplete codepoint at the end of it.
|
||||
* `(?m:^)` follows similarly to `^`, but it can match immediately following
|
||||
a `\n`. However, since a `\n` is always a codepoint itself and can never
|
||||
appear within a codepoint, it follows that the position immediately following
|
||||
a `\n` in a string that is valid UTF-8 is guaranteed to not be between the
|
||||
code units of another codepoint. (One caveat here is that the line terminator
|
||||
for multi-line anchors can now be changed to any arbitrary byte, including
|
||||
things like `\x98` which might occur within a codepoint. However, this wasn't
|
||||
supported by the old regex crate. If it was, it would pose the same problems as
|
||||
`(?-u:\B)`, as we'll discuss below.)
|
||||
* `(?m:$)` a similar argument as for `(?m:^)`. The only difference is that a
|
||||
`(?m:$)` matches just before a `\n`. But the same argument applies.
|
||||
* `(?Rm:^)` and `(?Rm:$)` weren't supported by the old regex crate, but the
|
||||
CRLF aware line anchors follow a similar argument as for `(?m:^)` and `(?m:$)`.
|
||||
Namely, since they only ever match at a boundary where one side is either a
|
||||
`\r` or a `\n`, neither of which can occur within a codepoint.
|
||||
* `\b` only matches at positions where both sides are valid codepoints, so
|
||||
this cannot split a codepoint.
|
||||
* `\B`, like `\b`, also only matches at positions where both sides are valid
|
||||
codepoints. So this cannot split a codepoint either.
|
||||
* `(?-u:\b)` matches only at positions where at least one side of it is an ASCII
|
||||
word byte. Since ASCII bytes cannot appear as code units in non-ASCII codepoints
|
||||
(one of the many amazing qualities of UTF-8), it follows that this too cannot
|
||||
split a codepoint.
|
||||
* `(?-u:\B)` finally represents a problem. It can match between *any* two
|
||||
bytes that are either both word bytes or non-word bytes. Since code units like
|
||||
`\xE2` and `\x98` (from the UTF-8 encoding of `☃`) are both non-word bytes,
|
||||
`(?-u:\B)` will match at the position between them.
|
||||
|
||||
Thus, our approach of incrementing one codepoint at a time after seeing an
|
||||
empty match is flawed because `(?-u:\B)` can result in an empty match that
|
||||
splits a codepoint at a position past the starting point of a search. For
|
||||
example, searching `(?-u:\B)` on `a☃` would produce the following matches: `[2,
|
||||
2]`, `[3, 3]` and `[4, 4]`. The positions at `0` and `1` don't match because
|
||||
they correspond to word boundaries since `a` is an ASCII word byte.
|
||||
|
||||
So what did the old regex crate do to avoid this? It banned `(?-u:\B)` from
|
||||
regexes that could match `&str`. That might sound extreme, but a lot of other
|
||||
things were banned too. For example, all of `(?-u:.)`, `(?-u:[^a])` and
|
||||
`(?-u:\W)` can match invalid UTF-8 too, including individual code units with a
|
||||
codepoint. The key difference is that those expressions could never produce an
|
||||
empty match. That ban happens when translating an `Ast` to an `Hir`, because
|
||||
that process reasons about whether an `Hir` can produce *non-empty* matches
|
||||
at invalid UTF-8 boundaries. Bottom line though is that we side-stepped the
|
||||
`(?-u:\B)` issue by banning it.
|
||||
|
||||
If banning `(?-u:\B)` were the only issue with the old regex crate's approach,
|
||||
then I probably would have kept it. `\B` is rarely used, so it's not such a big
|
||||
deal to have to work-around it. However, the problem with the above approach
|
||||
is that it doesn't compose. The logic for avoiding splitting a codepoint only
|
||||
lived in the iterator, which means if anyone wants to implement their own
|
||||
iterator over regex matches, they have to deal with this extremely subtle edge
|
||||
case to get full correctness.
|
||||
|
||||
Instead, in this crate, we take the approach of pushing this complexity down
|
||||
to the lowest layers of each regex engine. The approach is pretty simple:
|
||||
|
||||
* If this corner case doesn't apply, don't do anything. (For example, if UTF-8
|
||||
mode isn't enabled or if the regex cannot match the empty string.)
|
||||
* If an empty match is reported, explicitly check if it splits a codepoint.
|
||||
* If it doesn't, we're done, return the match.
|
||||
* If it does, then ignore the match and re-run the search.
|
||||
* Repeat the above process until the end of the haystack is reached or a match
|
||||
is found that doesn't split a codepoint or isn't zero width.
|
||||
|
||||
And that's pretty much what this module provides. Every regex engine uses these
|
||||
methods in their lowest level public APIs, but just above the layer where
|
||||
their internal engine is used. That way, all regex engines can be arbitrarily
|
||||
composed without worrying about handling this case, and iterators don't need to
|
||||
handle it explicitly.
|
||||
|
||||
(It turns out that a new feature I added, support for changing the line
|
||||
terminator in a regex to any arbitrary byte, also provokes the above problem.
|
||||
Namely, the byte could be invalid UTF-8 or a UTF-8 continuation byte. So that
|
||||
support would need to be limited or banned when UTF-8 mode is enabled, just
|
||||
like we did for `(?-u:\B)`. But thankfully our more robust approach in this
|
||||
crate handles that case just fine too.)
|
||||
*/
|
||||
|
||||
use crate::util::search::{Input, MatchError};
|
||||
|
||||
#[cold]
|
||||
#[inline(never)]
|
||||
pub(crate) fn skip_splits_fwd<T, F>(
|
||||
input: &Input<'_>,
|
||||
init_value: T,
|
||||
match_offset: usize,
|
||||
find: F,
|
||||
) -> Result<Option<T>, MatchError>
|
||||
where
|
||||
F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>,
|
||||
{
|
||||
skip_splits(true, input, init_value, match_offset, find)
|
||||
}
|
||||
|
||||
#[cold]
|
||||
#[inline(never)]
|
||||
pub(crate) fn skip_splits_rev<T, F>(
|
||||
input: &Input<'_>,
|
||||
init_value: T,
|
||||
match_offset: usize,
|
||||
find: F,
|
||||
) -> Result<Option<T>, MatchError>
|
||||
where
|
||||
F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>,
|
||||
{
|
||||
skip_splits(false, input, init_value, match_offset, find)
|
||||
}
|
||||
|
||||
/// The shared implementation of `skip_splits_fwd` and `skip_splits_rev`:
/// given a match ending at `match_offset` that may split a codepoint,
/// keep re-running `find` at adjusted positions until a match on a char
/// boundary is found (or the search space is exhausted).
fn skip_splits<T, F>(
    forward: bool,
    input: &Input<'_>,
    init_value: T,
    mut match_offset: usize,
    mut find: F,
) -> Result<Option<T>, MatchError>
where
    F: FnMut(&Input<'_>) -> Result<Option<(T, usize)>, MatchError>,
{
    // If our config says to do an anchored search, then we're definitely
    // done. We just need to determine whether we have a valid match or
    // not. If we don't, then we're not allowed to continue, so we report
    // no match.
    //
    // This is actually quite a subtle correctness thing. The key here is
    // that if we got an empty match that splits a codepoint after doing an
    // anchored search in UTF-8 mode, then that implies that we must have
    // *started* the search at a location that splits a codepoint. This
    // follows from the fact that if a match is reported from an anchored
    // search, then the start offset of the match *must* match the start
    // offset of the search.
    //
    // It also follows that no other non-empty match is possible. For
    // example, you might write a regex like '(?:)|SOMETHING' and start its
    // search in the middle of a codepoint. The first branch is an empty
    // regex that will bubble up a match at the first position, and then
    // get rejected here and report no match. But what if 'SOMETHING' could
    // have matched? We reason that such a thing is impossible, because
    // if it does, it must report a match that starts in the middle of a
    // codepoint. This in turn implies that a match is reported whose span
    // does not correspond to valid UTF-8, and this breaks the promise
    // made when UTF-8 mode is enabled. (That promise *can* be broken, for
    // example, by enabling UTF-8 mode but building an NFA by hand that
    // produces non-empty matches that span invalid UTF-8. This is an unchecked
    // but documented precondition violation of UTF-8 mode, and is documented
    // to have unspecified behavior.)
    //
    // I believe this actually means that if an anchored search is run, and
    // UTF-8 mode is enabled and the start position splits a codepoint,
    // then it is correct to immediately report no match without even
    // executing the regex engine. But it doesn't really seem worth writing
    // out that case in every regex engine to save a tiny bit of work in an
    // extremely pathological case, so we just handle it here.
    if input.get_anchored().is_anchored() {
        return Ok(if input.is_char_boundary(match_offset) {
            Some(init_value)
        } else {
            None
        });
    }
    // Otherwise, we have an unanchored search, so just keep looking for
    // matches until we have one that does not split a codepoint or we hit
    // EOI.
    let mut value = init_value;
    let mut input = input.clone();
    while !input.is_char_boundary(match_offset) {
        if forward {
            // The unwrap is OK here because overflowing usize while
            // iterating over a slice is impossible, as it would require
            // a slice of length greater than isize::MAX, which is itself
            // impossible.
            input.set_start(input.start().checked_add(1).unwrap());
        } else {
            input.set_end(match input.end().checked_sub(1) {
                None => return Ok(None),
                Some(end) => end,
            });
        }
        match find(&input)? {
            None => return Ok(None),
            Some((new_value, new_match_end)) => {
                value = new_value;
                match_offset = new_match_end;
            }
        }
    }
    Ok(Some(value))
}
|
||||
84
third-party/vendor/regex-automata/src/util/escape.rs
vendored
Normal file
84
third-party/vendor/regex-automata/src/util/escape.rs
vendored
Normal file
|
|
@ -0,0 +1,84 @@
|
|||
/*!
|
||||
Provides convenience routines for escaping raw bytes.
|
||||
|
||||
Since this crate tends to deal with `&[u8]` everywhere and the default
|
||||
`Debug` implementation just shows decimal integers, it makes debugging those
|
||||
representations quite difficult. This module provides types that show `&[u8]`
|
||||
as if it were a string, with invalid UTF-8 escaped into its byte-by-byte hex
|
||||
representation.
|
||||
*/
|
||||
|
||||
use crate::util::utf8;
|
||||
|
||||
/// A convenient `Debug` wrapper for a single `u8`.
///
/// The `Debug` impl renders the byte as human readable ASCII when it is,
/// and as a `\xNN` hex escape sequence (with upper-cased hex digits)
/// when it isn't.
#[derive(Clone, Copy)]
pub struct DebugByte(pub u8);

impl core::fmt::Debug for DebugByte {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Special case ASCII space: a bare space is nearly invisible in
        // debug output, so wrap it in single quotes.
        if self.0 == b' ' {
            return write!(f, "' '");
        }
        // ascii::escape_default never yields more than 10 bytes, so a
        // small stack buffer is always enough.
        let mut buf = [0u8; 10];
        let mut len = 0;
        for (pos, byte) in core::ascii::escape_default(self.0).enumerate() {
            // Upper-case the hex digits of escapes like \xab -> \xAB.
            // Positions 0 and 1 are the literal '\' and 'x', so only
            // bytes from position 2 onward can be hex digits.
            buf[len] = if pos >= 2 && (b'a'..=b'f').contains(&byte) {
                byte.to_ascii_uppercase()
            } else {
                byte
            };
            len += 1;
        }
        write!(f, "{}", core::str::from_utf8(&buf[..len]).unwrap())
    }
}
|
||||
|
||||
/// Provides a convenient `Debug` implementation for `&[u8]`.
|
||||
///
|
||||
/// This generally works best when the bytes are presumed to be mostly UTF-8,
|
||||
/// but will work for anything. For any bytes that aren't UTF-8, they are
|
||||
/// emitted as hex escape sequences.
|
||||
pub struct DebugHaystack<'a>(pub &'a [u8]);
|
||||
|
||||
impl<'a> core::fmt::Debug for DebugHaystack<'a> {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
write!(f, "\"")?;
|
||||
// This is a sad re-implementation of a similar impl found in bstr.
|
||||
let mut bytes = self.0;
|
||||
while let Some(result) = utf8::decode(bytes) {
|
||||
let ch = match result {
|
||||
Ok(ch) => ch,
|
||||
Err(byte) => {
|
||||
write!(f, r"\x{:02x}", byte)?;
|
||||
bytes = &bytes[1..];
|
||||
continue;
|
||||
}
|
||||
};
|
||||
bytes = &bytes[ch.len_utf8()..];
|
||||
match ch {
|
||||
'\0' => write!(f, "\\0")?,
|
||||
// ASCII control characters except \0, \n, \r, \t
|
||||
'\x01'..='\x08'
|
||||
| '\x0b'
|
||||
| '\x0c'
|
||||
| '\x0e'..='\x19'
|
||||
| '\x7f' => {
|
||||
write!(f, "\\x{:02x}", u32::from(ch))?;
|
||||
}
|
||||
'\n' | '\r' | '\t' | _ => {
|
||||
write!(f, "{}", ch.escape_debug())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
write!(f, "\"")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
246
third-party/vendor/regex-automata/src/util/int.rs
vendored
Normal file
246
third-party/vendor/regex-automata/src/util/int.rs
vendored
Normal file
|
|
@ -0,0 +1,246 @@
|
|||
/*!
|
||||
This module provides several integer oriented traits for converting between
|
||||
both fixed size integers and integers whose size varies based on the target
|
||||
(like `usize`).
|
||||
|
||||
The driving design principle of this module is to attempt to centralize as many
|
||||
`as` casts as possible here. And in particular, we separate casts into two
|
||||
buckets:
|
||||
|
||||
* Casts that we use for their truncating behavior. In this case, we use more
|
||||
descriptive names, like `low_u32` and `high_u32`.
|
||||
* Casts that we use for converting back-and-forth between `usize`. These
|
||||
conversions are generally necessary because we often store indices in different
|
||||
formats to save on memory, which requires converting to and from `usize`. In
|
||||
this case, we very specifically do not want to overflow, and so the methods
|
||||
defined here will panic if the `as` cast would be lossy in debug mode. (A
|
||||
normal `as` cast will never panic!)
|
||||
|
||||
For `as` casts between raw pointers, we use `cast`, so `as` isn't needed there.
|
||||
|
||||
For regex engines, floating point is just never used, so we don't have to worry
|
||||
about `as` casts for those.
|
||||
|
||||
Otherwise, this module pretty much covers all of our `as` needs except for one
|
||||
thing: const contexts. There are a select few places in this crate where we
|
||||
still need to use `as` because const functions on traits aren't stable yet.
|
||||
If we wind up significantly expanding our const footprint in this crate, it
|
||||
might be worth defining free functions to handle those cases. But at the time
|
||||
of writing, that just seemed like too much ceremony. Instead, I comment each
|
||||
such use of `as` in a const context with a "fixme" notice.
|
||||
|
||||
NOTE: for simplicity, we don't take target pointer width into account here for
|
||||
`usize` conversions. Since we currently only panic in debug mode, skipping the
|
||||
check when it can be proven it isn't needed at compile time doesn't really
|
||||
matter. Now, if we wind up wanting to do as many checks as possible in release
|
||||
mode, then we would want to skip those when we know the conversions are always
|
||||
non-lossy.
|
||||
|
||||
NOTE: this module isn't an exhaustive API. For example, we still use things
|
||||
like `u64::from` where possible, or even `usize::try_from()` for when we do
|
||||
explicitly want to panic or when we want to return an error for overflow.
|
||||
*/
|
||||
|
||||
// We define a little more than what we need, but I'd rather just have
|
||||
// everything via a consistent and uniform API than have holes.
|
||||
#![allow(dead_code)]
|
||||
|
||||
/// Conversions from `u8` used throughout the crate in lieu of bare
/// `as` casts.
pub(crate) trait U8 {
    fn as_usize(self) -> usize;
}

impl U8 for u8 {
    fn as_usize(self) -> usize {
        // Widening conversion: never lossy, so no checking needed.
        usize::from(self)
    }
}
|
||||
|
||||
/// Conversions from `u16` used throughout the crate in lieu of bare
/// `as` casts. The `low_`/`high_` methods are deliberate truncations.
pub(crate) trait U16 {
    fn as_usize(self) -> usize;
    fn low_u8(self) -> u8;
    fn high_u8(self) -> u8;
}

impl U16 for u16 {
    fn as_usize(self) -> usize {
        // Widening conversion: never lossy, so no checking needed.
        usize::from(self)
    }

    fn low_u8(self) -> u8 {
        // Deliberate truncation to the low 8 bits.
        self as u8
    }

    fn high_u8(self) -> u8 {
        // The high 8 bits, shifted down.
        (self >> 8) as u8
    }
}
|
||||
|
||||
/// Conversions from `u32` used throughout the crate in lieu of bare
/// `as` casts. `as_usize` panics on overflow in debug builds only; the
/// `low_`/`high_` methods are deliberate truncations.
pub(crate) trait U32 {
    fn as_usize(self) -> usize;
    fn low_u8(self) -> u8;
    fn low_u16(self) -> u16;
    fn high_u16(self) -> u16;
}

impl U32 for u32 {
    fn as_usize(self) -> usize {
        // Checked in debug builds, a plain (possibly lossy on 16-bit
        // targets) cast in release builds.
        if cfg!(debug_assertions) {
            usize::try_from(self).expect("u32 overflowed usize")
        } else {
            self as usize
        }
    }

    fn low_u8(self) -> u8 {
        // Deliberate truncation to the low 8 bits.
        self as u8
    }

    fn low_u16(self) -> u16 {
        // Deliberate truncation to the low 16 bits.
        self as u16
    }

    fn high_u16(self) -> u16 {
        // The high 16 bits, shifted down.
        (self >> 16) as u16
    }
}
|
||||
|
||||
/// Conversions from `u64` used throughout the crate in lieu of bare
/// `as` casts. `as_usize` panics on overflow in debug builds only; the
/// `low_`/`high_` methods are deliberate truncations.
pub(crate) trait U64 {
    fn as_usize(self) -> usize;
    fn low_u8(self) -> u8;
    fn low_u16(self) -> u16;
    fn low_u32(self) -> u32;
    fn high_u32(self) -> u32;
}

impl U64 for u64 {
    fn as_usize(self) -> usize {
        // Checked in debug builds, a plain (possibly lossy on 32-bit
        // targets) cast in release builds.
        if cfg!(debug_assertions) {
            usize::try_from(self).expect("u64 overflowed usize")
        } else {
            self as usize
        }
    }

    fn low_u8(self) -> u8 {
        // Deliberate truncation to the low 8 bits.
        self as u8
    }

    fn low_u16(self) -> u16 {
        // Deliberate truncation to the low 16 bits.
        self as u16
    }

    fn low_u32(self) -> u32 {
        // Deliberate truncation to the low 32 bits.
        self as u32
    }

    fn high_u32(self) -> u32 {
        // The high 32 bits, shifted down.
        (self >> 32) as u32
    }
}
|
||||
|
||||
/// Conversions from `i32` used throughout the crate in lieu of bare
/// `as` casts. `as_usize` panics on negative values (or overflow) in
/// debug builds only; `to_bits`/`from_bits` reinterpret the raw bits.
pub(crate) trait I32 {
    fn as_usize(self) -> usize;
    fn to_bits(self) -> u32;
    fn from_bits(n: u32) -> i32;
}

impl I32 for i32 {
    fn as_usize(self) -> usize {
        // Checked in debug builds (negative values are rejected), a
        // plain sign-extending cast in release builds.
        if cfg!(debug_assertions) {
            usize::try_from(self).expect("i32 overflowed usize")
        } else {
            self as usize
        }
    }

    fn to_bits(self) -> u32 {
        // Bitwise reinterpretation (two's complement), not a value cast.
        self as u32
    }

    fn from_bits(n: u32) -> i32 {
        // Bitwise reinterpretation (two's complement), not a value cast.
        n as i32
    }
}
|
||||
|
||||
/// Conversions from `usize` into fixed-width unsigned integers, used
/// throughout the crate in lieu of bare `as` casts. Each method panics
/// on overflow in debug builds only.
pub(crate) trait Usize {
    fn as_u8(self) -> u8;
    fn as_u16(self) -> u16;
    fn as_u32(self) -> u32;
    fn as_u64(self) -> u64;
}

impl Usize for usize {
    fn as_u8(self) -> u8 {
        // Checked in debug builds, a truncating cast in release builds.
        if cfg!(debug_assertions) {
            u8::try_from(self).expect("usize overflowed u8")
        } else {
            self as u8
        }
    }

    fn as_u16(self) -> u16 {
        // Checked in debug builds, a truncating cast in release builds.
        if cfg!(debug_assertions) {
            u16::try_from(self).expect("usize overflowed u16")
        } else {
            self as u16
        }
    }

    fn as_u32(self) -> u32 {
        // Checked in debug builds, a truncating cast in release builds.
        if cfg!(debug_assertions) {
            u32::try_from(self).expect("usize overflowed u32")
        } else {
            self as u32
        }
    }

    fn as_u64(self) -> u64 {
        // Checked in debug builds; never lossy on supported targets.
        if cfg!(debug_assertions) {
            u64::try_from(self).expect("usize overflowed u64")
        } else {
            self as u64
        }
    }
}
||||
|
||||
// Pointers aren't integers, but we convert pointers to integers to perform
|
||||
// offset arithmetic in some places. (And no, we don't convert the integers
|
||||
// back to pointers.) So add 'as_usize' conversions here too for completeness.
|
||||
//
|
||||
// These 'as' casts are actually okay because they're always non-lossy. But the
|
||||
// idea here is to just try and remove as much 'as' as possible, particularly
|
||||
// in this crate where we are being really paranoid about offsets and making
|
||||
// sure we don't panic on inputs that might be untrusted. This way, the 'as'
|
||||
// casts become easier to audit if they're all in one place, even when some of
|
||||
// them are actually okay 100% of the time.
|
||||
|
||||
/// A conversion from a raw pointer to `usize`, for offset arithmetic.
pub(crate) trait Pointer {
    fn as_usize(self) -> usize;
}

impl<T> Pointer for *const T {
    fn as_usize(self) -> usize {
        // Always non-lossy: pointers and usize have the same width on
        // all supported targets.
        self as usize
    }
}
|
||||
579
third-party/vendor/regex-automata/src/util/interpolate.rs
vendored
Normal file
579
third-party/vendor/regex-automata/src/util/interpolate.rs
vendored
Normal file
|
|
@ -0,0 +1,579 @@
|
|||
/*!
|
||||
Provides routines for interpolating capture group references.
|
||||
|
||||
That is, if a replacement string contains references like `$foo` or `${foo1}`,
|
||||
then they are replaced with the corresponding capture values for the groups
|
||||
named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}`
|
||||
is supported as well, with `1` corresponding to a capture group index and not
|
||||
a name.
|
||||
|
||||
This module provides the free functions [`string`] and [`bytes`], which
|
||||
interpolate Rust Unicode strings and byte strings, respectively.
|
||||
|
||||
# Format
|
||||
|
||||
These routines support two different kinds of capture references: unbraced and
|
||||
braced.
|
||||
|
||||
For the unbraced format, the format supported is `$ref` where `ref` can be
|
||||
any character in the class `[0-9A-Za-z_]`. `ref` is always the longest
|
||||
possible parse. So for example, `$1a` corresponds to the capture group named
|
||||
`1a` and not the capture group at index `1`. If `ref` matches `^[0-9]+$`, then
|
||||
it is treated as a capture group index itself and not a name.
|
||||
|
||||
For the braced format, the format supported is `${ref}` where `ref` can be any
|
||||
sequence of bytes except for `}`. If no closing brace occurs, then it is not
|
||||
considered a capture reference. As with the unbraced format, if `ref` matches
|
||||
`^[0-9]+$`, then it is treated as a capture group index and not a name.
|
||||
|
||||
The braced format is useful for exerting precise control over the name of the
|
||||
capture reference. For example, `${1}a` corresponds to the capture group
|
||||
reference `1` followed by the letter `a`, whereas `$1a` (as mentioned above)
|
||||
corresponds to the capture group reference `1a`. The braced format is also
|
||||
useful for expressing capture group names that use characters not supported by
|
||||
the unbraced format. For example, `${foo[bar].baz}` refers to the capture group
|
||||
named `foo[bar].baz`.
|
||||
|
||||
If a capture group reference is found and it does not refer to a valid capture
|
||||
group, then it will be replaced with the empty string.
|
||||
|
||||
To write a literal `$`, use `$$`.
|
||||
|
||||
To be clear, and as exhibited via the type signatures in the routines in this
|
||||
module, it is impossible for a replacement string to be invalid. A replacement
|
||||
string may not have the intended semantics, but the interpolation procedure
|
||||
itself can never fail.
|
||||
*/
|
||||
|
||||
use alloc::{string::String, vec::Vec};
|
||||
|
||||
use crate::util::memchr::memchr;
|
||||
|
||||
/// Accepts a replacement string and interpolates capture references with their
|
||||
/// corresponding values.
|
||||
///
|
||||
/// `append` should be a function that appends the string value of a capture
|
||||
/// group at a particular index to the string given. If the capture group
|
||||
/// index is invalid, then nothing should be appended.
|
||||
///
|
||||
/// `name_to_index` should be a function that maps a capture group name to a
|
||||
/// capture group index. If the given name doesn't exist, then `None` should
|
||||
/// be returned.
|
||||
///
|
||||
/// Finally, `dst` is where the final interpolated contents should be written.
|
||||
/// If `replacement` contains no capture group references, then `dst` will be
|
||||
/// equivalent to `replacement`.
|
||||
///
|
||||
/// See the [module documentation](self) for details about the format
|
||||
/// supported.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::util::interpolate;
|
||||
///
|
||||
/// let mut dst = String::new();
|
||||
/// interpolate::string(
|
||||
/// "foo $bar baz",
|
||||
/// |index, dst| {
|
||||
/// if index == 0 {
|
||||
/// dst.push_str("BAR");
|
||||
/// }
|
||||
/// },
|
||||
/// |name| {
|
||||
/// if name == "bar" {
|
||||
/// Some(0)
|
||||
/// } else {
|
||||
/// None
|
||||
/// }
|
||||
/// },
|
||||
/// &mut dst,
|
||||
/// );
|
||||
/// assert_eq!("foo BAR baz", dst);
|
||||
/// ```
|
||||
pub fn string(
|
||||
mut replacement: &str,
|
||||
mut append: impl FnMut(usize, &mut String),
|
||||
mut name_to_index: impl FnMut(&str) -> Option<usize>,
|
||||
dst: &mut String,
|
||||
) {
|
||||
while !replacement.is_empty() {
|
||||
match memchr(b'$', replacement.as_bytes()) {
|
||||
None => break,
|
||||
Some(i) => {
|
||||
dst.push_str(&replacement[..i]);
|
||||
replacement = &replacement[i..];
|
||||
}
|
||||
}
|
||||
// Handle escaping of '$'.
|
||||
if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
|
||||
dst.push_str("$");
|
||||
replacement = &replacement[2..];
|
||||
continue;
|
||||
}
|
||||
debug_assert!(!replacement.is_empty());
|
||||
let cap_ref = match find_cap_ref(replacement.as_bytes()) {
|
||||
Some(cap_ref) => cap_ref,
|
||||
None => {
|
||||
dst.push_str("$");
|
||||
replacement = &replacement[1..];
|
||||
continue;
|
||||
}
|
||||
};
|
||||
replacement = &replacement[cap_ref.end..];
|
||||
match cap_ref.cap {
|
||||
Ref::Number(i) => append(i, dst),
|
||||
Ref::Named(name) => {
|
||||
if let Some(i) = name_to_index(name) {
|
||||
append(i, dst);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
dst.push_str(replacement);
|
||||
}
|
||||
|
||||
/// Accepts a replacement byte string and interpolates capture references with
|
||||
/// their corresponding values.
|
||||
///
|
||||
/// `append` should be a function that appends the byte string value of a
|
||||
/// capture group at a particular index to the byte string given. If the
|
||||
/// capture group index is invalid, then nothing should be appended.
|
||||
///
|
||||
/// `name_to_index` should be a function that maps a capture group name to a
|
||||
/// capture group index. If the given name doesn't exist, then `None` should
|
||||
/// be returned.
|
||||
///
|
||||
/// Finally, `dst` is where the final interpolated contents should be written.
|
||||
/// If `replacement` contains no capture group references, then `dst` will be
|
||||
/// equivalent to `replacement`.
|
||||
///
|
||||
/// See the [module documentation](self) for details about the format
|
||||
/// supported.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::util::interpolate;
|
||||
///
|
||||
/// let mut dst = vec![];
|
||||
/// interpolate::bytes(
|
||||
/// b"foo $bar baz",
|
||||
/// |index, dst| {
|
||||
/// if index == 0 {
|
||||
/// dst.extend_from_slice(b"BAR");
|
||||
/// }
|
||||
/// },
|
||||
/// |name| {
|
||||
/// if name == "bar" {
|
||||
/// Some(0)
|
||||
/// } else {
|
||||
/// None
|
||||
/// }
|
||||
/// },
|
||||
/// &mut dst,
|
||||
/// );
|
||||
/// assert_eq!(&b"foo BAR baz"[..], dst);
|
||||
/// ```
|
||||
pub fn bytes(
    mut replacement: &[u8],
    mut append: impl FnMut(usize, &mut Vec<u8>),
    mut name_to_index: impl FnMut(&str) -> Option<usize>,
    dst: &mut Vec<u8>,
) {
    while !replacement.is_empty() {
        // Copy everything up to (but not including) the next '$' verbatim.
        // If there is no '$' at all, leave the loop and copy the tail below.
        match memchr(b'$', replacement) {
            None => break,
            Some(i) => {
                dst.extend_from_slice(&replacement[..i]);
                replacement = &replacement[i..];
            }
        }
        // Handle escaping of '$'.
        // At this point replacement starts with b'$'. A second '$'
        // immediately following is the escape sequence "$$", which emits a
        // single literal '$'.
        if replacement.get(1).map_or(false, |&b| b == b'$') {
            dst.push(b'$');
            replacement = &replacement[2..];
            continue;
        }
        debug_assert!(!replacement.is_empty());
        // Parse the capture reference beginning at the '$'. If it isn't a
        // valid reference, the '$' is emitted literally and scanning resumes
        // just after it.
        let cap_ref = match find_cap_ref(replacement) {
            Some(cap_ref) => cap_ref,
            None => {
                dst.push(b'$');
                replacement = &replacement[1..];
                continue;
            }
        };
        replacement = &replacement[cap_ref.end..];
        // Append the capture's value. For a named reference, an unknown name
        // (name_to_index returns None) appends nothing, i.e., the reference
        // is replaced with the empty string.
        match cap_ref.cap {
            Ref::Number(i) => append(i, dst),
            Ref::Named(name) => {
                if let Some(i) = name_to_index(name) {
                    append(i, dst);
                }
            }
        }
    }
    // Copy whatever remains after the last capture reference (or the entire
    // replacement if it contained no '$').
    dst.extend_from_slice(replacement);
}
|
||||
|
||||
/// `CaptureRef` represents a reference to a capture group inside some text.
/// The reference is either a capture group name or a number.
///
/// It is also tagged with the position in the text following the
/// capture reference.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct CaptureRef<'a> {
    // The name or number of the referenced capture group.
    cap: Ref<'a>,
    // The byte offset just past the end of this reference within the
    // replacement text it was parsed from (includes a closing '}', if any).
    end: usize,
}
|
||||
|
||||
/// A reference to a capture group in some text.
///
/// e.g., `$2`, `$foo`, `${foo}`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
    // A reference by name, borrowed from the replacement text.
    Named(&'a str),
    // A reference by group index.
    Number(usize),
}
|
||||
|
||||
impl<'a> From<&'a str> for Ref<'a> {
|
||||
fn from(x: &'a str) -> Ref<'a> {
|
||||
Ref::Named(x)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<usize> for Ref<'static> {
|
||||
fn from(x: usize) -> Ref<'static> {
|
||||
Ref::Number(x)
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses a possible reference to a capture group name in the given text,
/// starting at the beginning of `replacement`.
///
/// If no such valid reference could be found, None is returned.
///
/// Note that this returns a "possible" reference because this routine doesn't
/// know whether the reference is to a valid group or not. If it winds up not
/// being a valid reference, then it should be replaced with the empty string.
fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
    let mut i = 0;
    let rep: &[u8] = replacement;
    // A reference requires at least a '$' plus one more byte.
    if rep.len() <= 1 || rep[0] != b'$' {
        return None;
    }
    i += 1;
    // '${...}' is handled separately, since braced names permit a much wider
    // set of characters.
    if rep[i] == b'{' {
        return find_cap_ref_braced(rep, i + 1);
    }
    // Consume the longest run of bytes valid in an unbraced capture name.
    let mut cap_end = i;
    while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) {
        cap_end += 1;
    }
    // A '$' followed by no valid name bytes (e.g., "$ ") is not a reference.
    if cap_end == i {
        return None;
    }
    // We just verified that the range 0..cap_end is valid ASCII, so it must
    // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
    // check via an unchecked conversion or by parsing the number straight from
    // &[u8].
    let cap = core::str::from_utf8(&rep[i..cap_end])
        .expect("valid UTF-8 capture name");
    Some(CaptureRef {
        // An all-digit name is a numbered reference; any parse failure means
        // it is treated as a name (e.g., "$1a" refers to the group named
        // "1a", not to group 1).
        cap: match cap.parse::<usize>() {
            Ok(i) => Ref::Number(i),
            Err(_) => Ref::Named(cap),
        },
        end: cap_end,
    })
}
|
||||
|
||||
/// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening
/// brace has been found at `i-1` in `rep`. This then looks for a closing
/// brace and returns the capture reference within the brace.
fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
    // Sanity check the caller's claim that an opening brace precedes `i`.
    assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]);
    let start = i;
    // Scan forward to the closing brace (or the end of the input).
    while rep.get(i).map_or(false, |&b| b != b'}') {
        i += 1;
    }
    // An unclosed brace means there is no reference here at all.
    if !rep.get(i).map_or(false, |&b| b == b'}') {
        return None;
    }
    // When looking at braced names, we don't put any restrictions on the name,
    // so it's possible it could be invalid UTF-8. But a capture group name
    // can never be invalid UTF-8, so if we have invalid UTF-8, then we can
    // safely return None.
    let cap = match core::str::from_utf8(&rep[start..i]) {
        Err(_) => return None,
        Ok(cap) => cap,
    };
    Some(CaptureRef {
        // All-digit names are numbered references; anything else is a name.
        cap: match cap.parse::<usize>() {
            Ok(i) => Ref::Number(i),
            Err(_) => Ref::Named(cap),
        },
        // The end offset includes the closing brace itself.
        end: i + 1,
    })
}
|
||||
|
||||
/// Returns true if and only if the given byte is allowed in a capture name
/// written in non-brace form.
fn is_valid_cap_letter(b: u8) -> bool {
    // ASCII alphanumerics and underscore only; everything else terminates an
    // unbraced capture name.
    matches!(b, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use alloc::{string::String, vec, vec::Vec};

    use super::{find_cap_ref, CaptureRef};

    // Asserts the result of parsing a capture reference at the start of the
    // given text: the two-argument form expects no reference to be found,
    // while the three-argument form expects exactly the given `CaptureRef`.
    macro_rules! find {
        ($name:ident, $text:expr) => {
            #[test]
            fn $name() {
                assert_eq!(None, find_cap_ref($text.as_bytes()));
            }
        };
        ($name:ident, $text:expr, $capref:expr) => {
            #[test]
            fn $name() {
                assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
            }
        };
    }

    // Shorthand for building a `CaptureRef` from a name-or-number and the
    // end offset of the reference.
    macro_rules! c {
        ($name_or_number:expr, $pos:expr) => {
            CaptureRef { cap: $name_or_number.into(), end: $pos }
        };
    }

    find!(find_cap_ref1, "$foo", c!("foo", 4));
    find!(find_cap_ref2, "${foo}", c!("foo", 6));
    find!(find_cap_ref3, "$0", c!(0, 2));
    find!(find_cap_ref4, "$5", c!(5, 2));
    find!(find_cap_ref5, "$10", c!(10, 3));
    // See https://github.com/rust-lang/regex/pull/585
    // for more on characters following numbers
    find!(find_cap_ref6, "$42a", c!("42a", 4));
    find!(find_cap_ref7, "${42}a", c!(42, 5));
    find!(find_cap_ref8, "${42");
    find!(find_cap_ref9, "${42 ");
    find!(find_cap_ref10, " $0 ");
    find!(find_cap_ref11, "$");
    find!(find_cap_ref12, " ");
    find!(find_cap_ref13, "");
    find!(find_cap_ref14, "$1-$2", c!(1, 2));
    find!(find_cap_ref15, "$1_$2", c!("1_", 3));
    find!(find_cap_ref16, "$x-$y", c!("x", 2));
    find!(find_cap_ref17, "$x_$y", c!("x_", 3));
    find!(find_cap_ref18, "${#}", c!("#", 4));
    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
    find!(find_cap_ref20, "${¾}", c!("¾", 5));
    find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
    find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
    find!(find_cap_ref23, "${☃}", c!("☃", 6));
    find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
    find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
    find!(find_cap_ref26, "${名字}", c!("名字", 9));

    // Runs `super::string` with positional capture values and a simple
    // name->index map, returning the interpolated result.
    fn interpolate_string(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        // Sorted so the name lookup closure below can binary search.
        name_to_index.sort_by_key(|x| x.0);

        let mut dst = String::new();
        super::string(
            replacement,
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.push_str(s);
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        dst
    }

    // Like `interpolate_string`, but exercises `super::bytes`; the result is
    // asserted to be valid UTF-8 so both routines can share expected values.
    fn interpolate_bytes(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        // Sorted so the name lookup closure below can binary search.
        name_to_index.sort_by_key(|x| x.0);

        let mut dst = vec![];
        super::bytes(
            replacement.as_bytes(),
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.extend_from_slice(s.as_bytes());
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        String::from_utf8(dst).unwrap()
    }

    // Asserts that both `string` and `bytes` interpolation produce the same
    // expected output for the given map, captures and replacement.
    macro_rules! interp {
        ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => {
            #[test]
            fn $name() {
                assert_eq!(
                    $expected,
                    interpolate_string($map, $caps, $hay),
                    "interpolate::string failed",
                );
                assert_eq!(
                    $expected,
                    interpolate_bytes($map, $caps, $hay),
                    "interpolate::bytes failed",
                );
            }
        };
    }

    interp!(
        interp1,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo test",
        "test xxx test",
    );

    interp!(
        interp2,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$footest",
        "test",
    );

    interp!(
        interp3,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${foo}test",
        "testxxxtest",
    );

    interp!(
        interp4,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$2test",
        "test",
    );

    interp!(
        interp5,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${2}test",
        "testxxxtest",
    );

    interp!(
        interp6,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $$foo test",
        "test $foo test",
    );

    interp!(
        interp7,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo",
        "test xxx",
    );

    interp!(
        interp8,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "$foo test",
        "xxx test",
    );

    interp!(
        interp9,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $bar$foo",
        "test yyyxxx",
    );

    interp!(
        interp10,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $ test",
        "test $ test",
    );

    interp!(
        interp11,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${} test",
        "test  test",
    );

    interp!(
        interp12,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${ } test",
        "test  test",
    );

    interp!(
        interp13,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a b} test",
        "test  test",
    );

    interp!(
        interp14,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a} test",
        "test  test",
    );

    // This is a funny case where a braced reference is never closed, but
    // within the unclosed braced reference, there is an unbraced reference.
    // In this case, the braced reference is just treated literally and the
    // unbraced reference is found.
    interp!(
        interp15,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${wat $bar ok",
        "test ${wat yyy ok",
    );
}
|
||||
1027
third-party/vendor/regex-automata/src/util/iter.rs
vendored
Normal file
1027
third-party/vendor/regex-automata/src/util/iter.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
461
third-party/vendor/regex-automata/src/util/lazy.rs
vendored
Normal file
461
third-party/vendor/regex-automata/src/util/lazy.rs
vendored
Normal file
|
|
@ -0,0 +1,461 @@
|
|||
/*!
|
||||
A lazily initialized value for safe sharing between threads.
|
||||
|
||||
The principal type in this module is `Lazy`, which makes it easy to construct
|
||||
values that are shared safely across multiple threads simultaneously.
|
||||
*/
|
||||
|
||||
use core::fmt;
|
||||
|
||||
/// A lazily initialized value that implements `Deref` for `T`.
|
||||
///
|
||||
/// A `Lazy` takes an initialization function and permits callers from any
|
||||
/// thread to access the result of that initialization function in a safe
|
||||
/// manner. In effect, this permits one-time initialization of global resources
|
||||
/// in a (possibly) multi-threaded program.
|
||||
///
|
||||
/// This type and its functionality are available even when neither the `alloc`
|
||||
/// nor the `std` features are enabled. In exchange, a `Lazy` does **not**
|
||||
/// guarantee that the given `create` function is called at most once. It
|
||||
/// might be called multiple times. Moreover, a call to `Lazy::get` (either
|
||||
/// explicitly or implicitly via `Lazy`'s `Deref` impl) may block until a `T`
|
||||
/// is available.
|
||||
///
|
||||
/// This is very similar to `lazy_static` or `once_cell`, except it doesn't
|
||||
/// guarantee that the initialization function will be run once and it works
|
||||
/// in no-alloc no-std environments. With that said, if you need stronger
|
||||
/// guarantees or a more flexible API, then it is recommended to use either
|
||||
/// `lazy_static` or `once_cell`.
|
||||
///
|
||||
/// # Warning: may use a spin lock
|
||||
///
|
||||
/// When this crate is compiled _without_ the `alloc` feature, then this type
|
||||
/// may use a spin lock internally. This can have subtle effects that may
|
||||
/// be undesirable. See [Spinlocks Considered Harmful][spinharm] for a more
|
||||
/// thorough treatment of this topic.
|
||||
///
|
||||
/// [spinharm]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This type is useful for creating regexes once, and then using them from
|
||||
/// multiple threads simultaneously without worrying about synchronization.
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{dfa::regex::Regex, util::lazy::Lazy, Match};
|
||||
///
|
||||
/// static RE: Lazy<Regex> = Lazy::new(|| Regex::new("foo[0-9]+bar").unwrap());
|
||||
///
|
||||
/// let expected = Some(Match::must(0, 3..14));
|
||||
/// assert_eq!(expected, RE.find(b"zzzfoo12345barzzz"));
|
||||
/// ```
|
||||
// Newtype over the internal implementation selected at compile time by the
// `alloc` feature (see the two `mod lazy` definitions below).
pub struct Lazy<T, F = fn() -> T>(lazy::Lazy<T, F>);
|
||||
|
||||
impl<T, F> Lazy<T, F> {
    /// Create a new `Lazy` value that is initialized via the given function.
    ///
    /// The `T` type is automatically inferred from the return type of the
    /// `create` function given.
    pub const fn new(create: F) -> Lazy<T, F> {
        // Defer to whichever internal implementation was selected at compile
        // time (pointer-swap based with 'alloc', spinlock based without).
        Lazy(lazy::Lazy::new(create))
    }
}
|
||||
|
||||
impl<T, F: Fn() -> T> Lazy<T, F> {
    /// Return a reference to the lazily initialized value.
    ///
    /// This routine may block if another thread is initializing a `T`.
    ///
    /// Note that given a `x` which has type `Lazy`, this must be called via
    /// `Lazy::get(x)` and not `x.get()`. This routine is defined this way
    /// because `Lazy` impls `Deref` with a target of `T`.
    ///
    /// # Panics
    ///
    /// This panics if the `create` function inside this lazy value panics.
    /// If the panic occurred in another thread, then this routine _may_ also
    /// panic (but is not guaranteed to do so).
    pub fn get(this: &Lazy<T, F>) -> &T {
        // All of the initialization logic lives in the cfg-selected
        // internal implementation.
        this.0.get()
    }
}
|
||||
|
||||
impl<T, F: Fn() -> T> core::ops::Deref for Lazy<T, F> {
    type Target = T;

    fn deref(&self) -> &T {
        // Deref makes `*lazy` (and method calls on `lazy`) transparently
        // initialize, or wait for, the underlying value.
        Lazy::get(self)
    }
}
|
||||
|
||||
impl<T: fmt::Debug, F: Fn() -> T> fmt::Debug for Lazy<T, F> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Forward to the internal implementation, which shows the value if
        // it has been initialized and `None` otherwise (it never forces
        // initialization just to print).
        self.0.fmt(f)
    }
}
|
||||
|
||||
#[cfg(feature = "alloc")]
mod lazy {
    use core::{
        fmt,
        marker::PhantomData,
        sync::atomic::{AtomicPtr, Ordering},
    };

    use alloc::boxed::Box;

    /// A non-std lazy initialized value.
    ///
    /// This might run the initialization function more than once, but will
    /// never block.
    ///
    /// I wish I could get these semantics into the non-alloc non-std Lazy
    /// type below, but I'm not sure how to do it. If you can do an alloc,
    /// then the implementation becomes very simple if you don't care about
    /// redundant work precisely because a pointer can be atomically swapped.
    ///
    /// Perhaps making this approach work in the non-alloc non-std case
    /// requires asking the caller for a pointer? It would make the API less
    /// convenient I think.
    pub(super) struct Lazy<T, F> {
        // Null until an initialized T has been published; then it points at
        // a Box-allocated T that this Lazy owns.
        data: AtomicPtr<T>,
        create: F,
        // This indicates to the compiler that this type can drop T. It's not
        // totally clear how the absence of this marker could lead to trouble,
        // but putting it here doesn't have any downsides so we hedge until
        // someone from the Unsafe Working Group can tell us definitively that
        // we don't need it.
        //
        // See: https://github.com/BurntSushi/regex-automata/issues/30
        owned: PhantomData<Box<T>>,
    }

    // SAFETY: So long as T and &T (and F and &F) can themselves be safely
    // shared among threads, so too can a Lazy<T, _>. Namely, the Lazy API
    // only permits accessing a &T and initialization is free of data races.
    // So if T is thread safe, then so too is Lazy<T, _>.
    //
    // We specifically require that T: Send in order for Lazy<T> to be Sync.
    // Without that requirement, it's possible to send a T from one thread to
    // another via Lazy's destructor.
    //
    // It's not clear whether we need F: Send+Sync for Lazy to be Sync. But
    // we're conservative for now and keep both.
    unsafe impl<T: Send + Sync, F: Send + Sync> Sync for Lazy<T, F> {}

    impl<T, F> Lazy<T, F> {
        /// Create a new alloc but non-std lazy value that is racily
        /// initialized. That is, the 'create' function may be called more than
        /// once.
        pub(super) const fn new(create: F) -> Lazy<T, F> {
            Lazy {
                data: AtomicPtr::new(core::ptr::null_mut()),
                create,
                owned: PhantomData,
            }
        }
    }

    impl<T, F: Fn() -> T> Lazy<T, F> {
        /// Get the underlying lazy value. If it hasn't been initialized
        /// yet, then always attempt to initialize it (even if some other
        /// thread is initializing it) and atomically attach it to this lazy
        /// value before returning it.
        pub(super) fn get(&self) -> &T {
            // Fast path: someone already published a value.
            if let Some(data) = self.poll() {
                return data;
            }
            // Run initialization unconditionally; redundant work is the
            // accepted cost of never blocking.
            let data = (self.create)();
            let mut ptr = Box::into_raw(Box::new(data));
            // We attempt to stuff our initialized value into our atomic
            // pointer. Upon success, we don't need to do anything. But if
            // someone else beat us to the punch, then we need to make sure
            // our newly created value is dropped.
            let result = self.data.compare_exchange(
                core::ptr::null_mut(),
                ptr,
                Ordering::AcqRel,
                Ordering::Acquire,
            );
            if let Err(old) = result {
                // SAFETY: We created 'ptr' via Box::into_raw above, so turning
                // it back into a Box via from_raw is safe.
                drop(unsafe { Box::from_raw(ptr) });
                ptr = old;
            }
            // SAFETY: We just set the pointer above to a non-null value, even
            // in the error case, and set it to a fully initialized value
            // returned by 'create'.
            unsafe { &*ptr }
        }

        /// If this lazy value has been initialized successfully, then return
        /// that value. Otherwise return None immediately. This never attempts
        /// to run initialization itself.
        fn poll(&self) -> Option<&T> {
            let ptr = self.data.load(Ordering::Acquire);
            if ptr.is_null() {
                return None;
            }
            // SAFETY: We just checked that the pointer is not null. Since it's
            // not null, it must have been fully initialized by 'get' at some
            // point.
            Some(unsafe { &*ptr })
        }
    }

    impl<T: fmt::Debug, F: Fn() -> T> fmt::Debug for Lazy<T, F> {
        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
            f.debug_struct("Lazy").field("data", &self.poll()).finish()
        }
    }

    impl<T, F> Drop for Lazy<T, F> {
        fn drop(&mut self) {
            let ptr = *self.data.get_mut();
            if !ptr.is_null() {
                // SAFETY: We just checked that 'ptr' is not null. And since
                // we have exclusive access, there are no races to worry about.
                drop(unsafe { Box::from_raw(ptr) });
            }
        }
    }
}
|
||||
|
||||
#[cfg(not(feature = "alloc"))]
mod lazy {
    use core::{
        cell::Cell,
        fmt,
        mem::MaybeUninit,
        panic::{RefUnwindSafe, UnwindSafe},
        sync::atomic::{AtomicU8, Ordering},
    };

    /// Our 'Lazy' value can be in one of three states:
    ///
    /// * INIT is where it starts, and also ends up back here if the
    /// 'create' routine panics.
    /// * BUSY is where it sits while initialization is running in exactly
    /// one thread.
    /// * DONE is where it sits after 'create' has completed and 'data' has
    /// been fully initialized.
    const LAZY_STATE_INIT: u8 = 0;
    const LAZY_STATE_BUSY: u8 = 1;
    const LAZY_STATE_DONE: u8 = 2;

    /// A non-alloc non-std lazy initialized value.
    ///
    /// This guarantees initialization only happens once, but uses a spinlock
    /// to block in the case of simultaneous access. Blocking occurs so that
    /// one thread waits while another thread initializes the value.
    ///
    /// I would much rather have the semantics of the 'alloc' Lazy type above.
    /// Namely, that we might run the initialization function more than once,
    /// but we never otherwise block. However, I don't know how to do that in
    /// a non-alloc non-std context.
    pub(super) struct Lazy<T, F> {
        // One of the LAZY_STATE_* constants above.
        state: AtomicU8,
        // Holds Some(create) until initialization runs; taken (set to None)
        // by the initializing thread, which also implements poisoning.
        create: Cell<Option<F>>,
        // Uninitialized until 'state' reaches DONE.
        data: Cell<MaybeUninit<T>>,
    }

    // SAFETY: So long as T and &T (and F and &F) can themselves be safely
    // shared among threads, so too can a Lazy<T, _>. Namely, the Lazy API
    // only permits accessing a &T and initialization is free of data races.
    // So if T is thread safe, then so too is Lazy<T, _>.
    unsafe impl<T: Send + Sync, F: Send + Sync> Sync for Lazy<T, F> {}
    // A reference to a Lazy is unwind safe because we specifically take
    // precautions to poison all accesses to a Lazy if the caller-provided
    // 'create' function panics.
    impl<T: UnwindSafe, F: UnwindSafe + RefUnwindSafe> RefUnwindSafe
        for Lazy<T, F>
    {
    }

    impl<T, F> Lazy<T, F> {
        /// Create a new non-alloc non-std lazy value that is initialized
        /// exactly once on first use using the given function.
        pub(super) const fn new(create: F) -> Lazy<T, F> {
            Lazy {
                state: AtomicU8::new(LAZY_STATE_INIT),
                create: Cell::new(Some(create)),
                data: Cell::new(MaybeUninit::uninit()),
            }
        }
    }

    impl<T, F: FnOnce() -> T> Lazy<T, F> {
        /// Get the underlying lazy value. If it hasn't been initialized
        /// yet, then either initialize it or block until some other thread
        /// initializes it. If the 'create' function given to Lazy::new panics
        /// (even in another thread), then this panics too.
        pub(super) fn get(&self) -> &T {
            // This is effectively a spinlock. We loop until we enter a DONE
            // state, and if possible, initialize it ourselves. The only way
            // we exit the loop is if 'create' panics, we initialize 'data' or
            // some other thread initializes 'data'.
            //
            // Yes, I have read spinlocks considered harmful[1]. And that
            // article is why this spinlock is only active when 'alloc' isn't
            // enabled. I did this because I don't think there is really
            // another choice without 'alloc', other than not providing this at
            // all. But I think that's a big bummer.
            //
            // [1]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
            while self.state.load(Ordering::Acquire) != LAZY_STATE_DONE {
                // Check if we're the first ones to get here. If so, we'll be
                // the ones who initialize.
                let result = self.state.compare_exchange(
                    LAZY_STATE_INIT,
                    LAZY_STATE_BUSY,
                    Ordering::AcqRel,
                    Ordering::Acquire,
                );
                // This means we saw the INIT state and nobody else can. So we
                // must take responsibility for initializing. And by virtue of
                // observing INIT, we have also told anyone else trying to
                // get here that we are BUSY. If someone else sees BUSY, then
                // they will spin until we finish initialization.
                if let Ok(_) = result {
                    // Since we are guaranteed to be the only ones here, we
                    // know that 'create' is there... Unless someone else got
                    // here before us and 'create' panicked. In which case,
                    // 'self.create' is now 'None' and we forward the panic
                    // to the caller. (i.e., We implement poisoning.)
                    //
                    // SAFETY: Our use of 'self.state' guarantees that we are
                    // the only thread executing this line, and thus there are
                    // no races.
                    let create = unsafe {
                        (*self.create.as_ptr()).take().expect(
                            "Lazy's create function panicked, \
                             preventing initialization,
                             poisoning current thread",
                        )
                    };
                    let guard = Guard { state: &self.state };
                    // SAFETY: Our use of 'self.state' guarantees that we are
                    // the only thread executing this line, and thus there are
                    // no races.
                    unsafe {
                        (*self.data.as_ptr()).as_mut_ptr().write(create());
                    }
                    // All is well. 'self.create' ran successfully, so we
                    // forget the guard.
                    core::mem::forget(guard);
                    // Everything is initialized, so we can declare success.
                    self.state.store(LAZY_STATE_DONE, Ordering::Release);
                    break;
                }
                core::hint::spin_loop();
            }
            // We only get here if data is fully initialized, and thus poll
            // will always return something.
            self.poll().unwrap()
        }

        /// If this lazy value has been initialized successfully, then return
        /// that value. Otherwise return None immediately. This never blocks.
        fn poll(&self) -> Option<&T> {
            if self.state.load(Ordering::Acquire) == LAZY_STATE_DONE {
                // SAFETY: The DONE state only occurs when data has been fully
                // initialized.
                Some(unsafe { &*(*self.data.as_ptr()).as_ptr() })
            } else {
                None
            }
        }
    }

    impl<T: fmt::Debug, F: FnMut() -> T> fmt::Debug for Lazy<T, F> {
        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
            f.debug_struct("Lazy")
                .field("state", &self.state.load(Ordering::Acquire))
                .field("create", &"<closure>")
                .field("data", &self.poll())
                .finish()
        }
    }

    impl<T, F> Drop for Lazy<T, F> {
        fn drop(&mut self) {
            if *self.state.get_mut() == LAZY_STATE_DONE {
                // SAFETY: state is DONE if and only if data has been fully
                // initialized. At which point, it is safe to drop.
                unsafe {
                    self.data.get_mut().assume_init_drop();
                }
            }
        }
    }

    /// A guard that will reset a Lazy's state back to INIT when dropped. The
    /// idea here is to 'forget' this guard on success. On failure (when a
    /// panic occurs), the Drop impl runs and causes all in-progress and future
    /// 'get' calls to panic. Without this guard, all in-progress and future
    /// 'get' calls would spin forever. Crashing is much better than getting
    /// stuck in an infinite loop.
    struct Guard<'a> {
        state: &'a AtomicU8,
    }

    impl<'a> Drop for Guard<'a> {
        fn drop(&mut self) {
            // We force ourselves back into an INIT state. This will in turn
            // cause any future 'get' calls to attempt calling 'self.create'
            // again which will in turn panic because 'self.create' will now
            // be 'None'.
            self.state.store(LAZY_STATE_INIT, Ordering::Release);
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Compile-time probes: each only compiles if Lazy has the auto trait.
    fn assert_send<T: Send>() {}
    fn assert_sync<T: Sync>() {}
    fn assert_unwind<T: core::panic::UnwindSafe>() {}
    fn assert_refunwind<T: core::panic::RefUnwindSafe>() {}

    #[test]
    fn oibits() {
        assert_send::<Lazy<u64>>();
        assert_sync::<Lazy<u64>>();
        assert_unwind::<Lazy<u64>>();
        assert_refunwind::<Lazy<u64>>();
    }

    // This is a regression test because we used to rely on the inferred Sync
    // impl for the Lazy type defined above (for 'alloc' mode). In the
    // inferred impl, it only requires that T: Sync for Lazy<T>: Sync. But
    // if we have that, we can actually make use of the fact that Lazy<T> drops
    // T to create a value on one thread and drop it on another. This *should*
    // require T: Send, but our missing bounds before let it sneak by.
    //
    // Basically, this test should not compile, so we... comment it out. We
    // don't have a great way of testing compile-fail tests right now.
    //
    // See: https://github.com/BurntSushi/regex-automata/issues/30
    /*
    #[test]
    fn sync_not_send() {
        #[allow(dead_code)]
        fn inner<T: Sync + Default>() {
            let lazy = Lazy::new(move || T::default());
            std::thread::scope(|scope| {
                scope.spawn(|| {
                    Lazy::get(&lazy); // We create T in this thread
                });
            });
            // And drop in this thread.
            drop(lazy);
            // So we have sent a !Send type over threads. (with some more
            // legwork, it's possible to even sneak the value out of drop
            // through thread local)
        }
    }
    */
}
|
||||
2547
third-party/vendor/regex-automata/src/util/look.rs
vendored
Normal file
2547
third-party/vendor/regex-automata/src/util/look.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
93
third-party/vendor/regex-automata/src/util/memchr.rs
vendored
Normal file
93
third-party/vendor/regex-automata/src/util/memchr.rs
vendored
Normal file
|
|
@ -0,0 +1,93 @@
|
|||
/*!
|
||||
This module defines simple wrapper routines for the memchr functions from the
|
||||
`memchr` crate. Basically, when the `memchr` crate is available, we use it,
|
||||
otherwise we use a naive implementation which is still pretty fast.
|
||||
*/
|
||||
|
||||
pub(crate) use self::inner::*;
|
||||
|
||||
#[cfg(feature = "perf-literal-substring")]
|
||||
pub(super) mod inner {
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
|
||||
memchr::memchr(n1, haystack)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
|
||||
memchr::memchr2(n1, n2, haystack)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn memchr3(
|
||||
n1: u8,
|
||||
n2: u8,
|
||||
n3: u8,
|
||||
haystack: &[u8],
|
||||
) -> Option<usize> {
|
||||
memchr::memchr3(n1, n2, n3, haystack)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
|
||||
memchr::memrchr(n1, haystack)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
|
||||
memchr::memrchr2(n1, n2, haystack)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn memrchr3(
|
||||
n1: u8,
|
||||
n2: u8,
|
||||
n3: u8,
|
||||
haystack: &[u8],
|
||||
) -> Option<usize> {
|
||||
memchr::memrchr3(n1, n2, n3, haystack)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "perf-literal-substring"))]
|
||||
pub(super) mod inner {
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> {
|
||||
haystack.iter().position(|&b| b == n1)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
|
||||
haystack.iter().position(|&b| b == n1 || b == n2)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn memchr3(
|
||||
n1: u8,
|
||||
n2: u8,
|
||||
n3: u8,
|
||||
haystack: &[u8],
|
||||
) -> Option<usize> {
|
||||
haystack.iter().position(|&b| b == n1 || b == n2 || b == n3)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> {
|
||||
haystack.iter().rposition(|&b| b == n1)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> {
|
||||
haystack.iter().rposition(|&b| b == n1 || b == n2)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn memrchr3(
|
||||
n1: u8,
|
||||
n2: u8,
|
||||
n3: u8,
|
||||
haystack: &[u8],
|
||||
) -> Option<usize> {
|
||||
haystack.iter().rposition(|&b| b == n1 || b == n2 || b == n3)
|
||||
}
|
||||
}
|
||||
57
third-party/vendor/regex-automata/src/util/mod.rs
vendored
Normal file
57
third-party/vendor/regex-automata/src/util/mod.rs
vendored
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
/*!
|
||||
A collection of modules that provide APIs that are useful across many regex
|
||||
engines.
|
||||
|
||||
While one should explore the sub-modules directly to get a sense of what's
|
||||
there, here are some highlights that tie the sub-modules to higher level
|
||||
use cases:
|
||||
|
||||
* `alphabet` contains APIs that are useful if you're doing low level things
|
||||
with the DFAs in this crate. For example, implementing determinization or
|
||||
walking its state graph directly.
|
||||
* `captures` contains APIs for dealing with capture group matches and their
|
||||
mapping to "slots" used inside an NFA graph. This is also where you can find
|
||||
iterators over capture group names.
|
||||
* `escape` contains types for pretty-printing raw byte slices as strings.
|
||||
* `iter` contains API helpers for writing regex iterators.
|
||||
* `lazy` contains a no-std and no-alloc variant of `lazy_static!` and
|
||||
`once_cell`.
|
||||
* `look` contains APIs for matching and configuring look-around assertions.
|
||||
* `pool` provides a way to reuse mutable memory allocated in a thread safe
|
||||
manner.
|
||||
* `prefilter` provides APIs for building prefilters and using them in searches.
|
||||
* `primitives` are what you might use if you're doing lower level work on
|
||||
automata, such as walking an NFA state graph.
|
||||
* `syntax` provides some higher level convenience functions for interacting
|
||||
with the `regex-syntax` crate.
|
||||
* `wire` is useful if you're working with DFA serialization.
|
||||
*/
|
||||
|
||||
pub mod alphabet;
|
||||
#[cfg(feature = "alloc")]
|
||||
pub mod captures;
|
||||
pub mod escape;
|
||||
#[cfg(feature = "alloc")]
|
||||
pub mod interpolate;
|
||||
pub mod iter;
|
||||
pub mod lazy;
|
||||
pub mod look;
|
||||
#[cfg(feature = "alloc")]
|
||||
pub mod pool;
|
||||
pub mod prefilter;
|
||||
pub mod primitives;
|
||||
pub mod start;
|
||||
#[cfg(feature = "syntax")]
|
||||
pub mod syntax;
|
||||
pub mod wire;
|
||||
|
||||
#[cfg(any(feature = "dfa-build", feature = "hybrid"))]
|
||||
pub(crate) mod determinize;
|
||||
pub(crate) mod empty;
|
||||
pub(crate) mod int;
|
||||
pub(crate) mod memchr;
|
||||
pub(crate) mod search;
|
||||
#[cfg(feature = "alloc")]
|
||||
pub(crate) mod sparse_set;
|
||||
pub(crate) mod unicode_data;
|
||||
pub(crate) mod utf8;
|
||||
1199
third-party/vendor/regex-automata/src/util/pool.rs
vendored
Normal file
1199
third-party/vendor/regex-automata/src/util/pool.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
149
third-party/vendor/regex-automata/src/util/prefilter/aho_corasick.rs
vendored
Normal file
149
third-party/vendor/regex-automata/src/util/prefilter/aho_corasick.rs
vendored
Normal file
|
|
@ -0,0 +1,149 @@
|
|||
use crate::util::{
|
||||
prefilter::PrefilterI,
|
||||
search::{MatchKind, Span},
|
||||
};
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct AhoCorasick {
|
||||
#[cfg(not(feature = "perf-literal-multisubstring"))]
|
||||
_unused: (),
|
||||
#[cfg(feature = "perf-literal-multisubstring")]
|
||||
ac: aho_corasick::AhoCorasick,
|
||||
}
|
||||
|
||||
impl AhoCorasick {
|
||||
pub(crate) fn new<B: AsRef<[u8]>>(
|
||||
kind: MatchKind,
|
||||
needles: &[B],
|
||||
) -> Option<AhoCorasick> {
|
||||
#[cfg(not(feature = "perf-literal-multisubstring"))]
|
||||
{
|
||||
None
|
||||
}
|
||||
#[cfg(feature = "perf-literal-multisubstring")]
|
||||
{
|
||||
// We used to use `aho_corasick::MatchKind::Standard` here when
|
||||
// `kind` was `MatchKind::All`, but this is not correct. The
|
||||
// "standard" Aho-Corasick match semantics are to report a match
|
||||
// immediately as soon as it is seen, but `All` isn't like that.
|
||||
// In particular, with "standard" semantics, given the needles
|
||||
// "abc" and "b" and the haystack "abc," it would report a match
|
||||
// at offset 1 before a match at offset 0. This is never what we
|
||||
// want in the context of the regex engine, regardless of whether
|
||||
// we have leftmost-first or 'all' semantics. Namely, we always
|
||||
// want the leftmost match.
|
||||
let ac_match_kind = match kind {
|
||||
MatchKind::LeftmostFirst | MatchKind::All => {
|
||||
aho_corasick::MatchKind::LeftmostFirst
|
||||
}
|
||||
};
|
||||
// This is kind of just an arbitrary number, but basically, if we
|
||||
// have a small enough set of literals, then we try to use the VERY
|
||||
// memory hungry DFA. Otherwise, we whimp out and use an NFA. The
|
||||
// upshot is that the NFA is quite lean and decently fast. Faster
|
||||
// than a naive Aho-Corasick NFA anyway.
|
||||
let ac_kind = if needles.len() <= 500 {
|
||||
aho_corasick::AhoCorasickKind::DFA
|
||||
} else {
|
||||
aho_corasick::AhoCorasickKind::ContiguousNFA
|
||||
};
|
||||
let result = aho_corasick::AhoCorasick::builder()
|
||||
.kind(Some(ac_kind))
|
||||
.match_kind(ac_match_kind)
|
||||
.start_kind(aho_corasick::StartKind::Both)
|
||||
// We try to handle all of the prefilter cases in the super
|
||||
// module, and only use Aho-Corasick for the actual automaton.
|
||||
// The aho-corasick crate does have some extra prefilters,
|
||||
// namely, looking for rare bytes to feed to memchr{,2,3}
|
||||
// instead of just the first byte. If we end up wanting
|
||||
// those---and they are somewhat tricky to implement---then
|
||||
// we could port them to this crate.
|
||||
//
|
||||
// The main reason for doing things this way is so we have a
|
||||
// complete and easy to understand picture of which prefilters
|
||||
// are available and how they work. Otherwise it seems too
|
||||
// easy to get into a situation where we have a prefilter
|
||||
// layered on top of prefilter, and that might have unintended
|
||||
// consequences.
|
||||
.prefilter(false)
|
||||
.build(needles);
|
||||
let ac = match result {
|
||||
Ok(ac) => ac,
|
||||
Err(_err) => {
|
||||
debug!("aho-corasick prefilter failed to build: {}", _err);
|
||||
return None;
|
||||
}
|
||||
};
|
||||
Some(AhoCorasick { ac })
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PrefilterI for AhoCorasick {
|
||||
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
#[cfg(not(feature = "perf-literal-multisubstring"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "perf-literal-multisubstring")]
|
||||
{
|
||||
let input =
|
||||
aho_corasick::Input::new(haystack).span(span.start..span.end);
|
||||
self.ac
|
||||
.find(input)
|
||||
.map(|m| Span { start: m.start(), end: m.end() })
|
||||
}
|
||||
}
|
||||
|
||||
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
#[cfg(not(feature = "perf-literal-multisubstring"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "perf-literal-multisubstring")]
|
||||
{
|
||||
let input = aho_corasick::Input::new(haystack)
|
||||
.anchored(aho_corasick::Anchored::Yes)
|
||||
.span(span.start..span.end);
|
||||
self.ac
|
||||
.find(input)
|
||||
.map(|m| Span { start: m.start(), end: m.end() })
|
||||
}
|
||||
}
|
||||
|
||||
fn memory_usage(&self) -> usize {
|
||||
#[cfg(not(feature = "perf-literal-multisubstring"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "perf-literal-multisubstring")]
|
||||
{
|
||||
self.ac.memory_usage()
|
||||
}
|
||||
}
|
||||
|
||||
fn is_fast(&self) -> bool {
|
||||
#[cfg(not(feature = "perf-literal-multisubstring"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "perf-literal-multisubstring")]
|
||||
{
|
||||
// Aho-Corasick is never considered "fast" because it's never
|
||||
// going to be even close to an order of magnitude faster than the
|
||||
// regex engine itself (assuming a DFA is used). In fact, it is
|
||||
// usually slower. The magic of Aho-Corasick is that it can search
|
||||
// a *large* number of literals with a relatively small amount of
|
||||
// memory. The regex engines are far more wasteful.
|
||||
//
|
||||
// Aho-Corasick may be "fast" when the regex engine corresponds
|
||||
// to, say, the PikeVM. That happens when the lazy DFA couldn't be
|
||||
// built or used for some reason. But in these cases, the regex
|
||||
// itself is likely quite big and we're probably hosed no matter
|
||||
// what we do. (In this case, the best bet is for the caller to
|
||||
// increase some of the memory limits on the hybrid cache capacity
|
||||
// and hope that's enough.)
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
58
third-party/vendor/regex-automata/src/util/prefilter/byteset.rs
vendored
Normal file
58
third-party/vendor/regex-automata/src/util/prefilter/byteset.rs
vendored
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
use crate::util::{
|
||||
prefilter::PrefilterI,
|
||||
search::{MatchKind, Span},
|
||||
};
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct ByteSet([bool; 256]);
|
||||
|
||||
impl ByteSet {
|
||||
pub(crate) fn new<B: AsRef<[u8]>>(
|
||||
_kind: MatchKind,
|
||||
needles: &[B],
|
||||
) -> Option<ByteSet> {
|
||||
#[cfg(not(feature = "perf-literal-multisubstring"))]
|
||||
{
|
||||
None
|
||||
}
|
||||
#[cfg(feature = "perf-literal-multisubstring")]
|
||||
{
|
||||
let mut set = [false; 256];
|
||||
for needle in needles.iter() {
|
||||
let needle = needle.as_ref();
|
||||
if needle.len() != 1 {
|
||||
return None;
|
||||
}
|
||||
set[usize::from(needle[0])] = true;
|
||||
}
|
||||
Some(ByteSet(set))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PrefilterI for ByteSet {
|
||||
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
haystack[span].iter().position(|&b| self.0[usize::from(b)]).map(|i| {
|
||||
let start = span.start + i;
|
||||
let end = start + 1;
|
||||
Span { start, end }
|
||||
})
|
||||
}
|
||||
|
||||
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
let b = *haystack.get(span.start)?;
|
||||
if self.0[usize::from(b)] {
|
||||
Some(Span { start: span.start, end: span.start + 1 })
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn memory_usage(&self) -> usize {
|
||||
0
|
||||
}
|
||||
|
||||
fn is_fast(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
186
third-party/vendor/regex-automata/src/util/prefilter/memchr.rs
vendored
Normal file
186
third-party/vendor/regex-automata/src/util/prefilter/memchr.rs
vendored
Normal file
|
|
@ -0,0 +1,186 @@
|
|||
use crate::util::{
|
||||
prefilter::PrefilterI,
|
||||
search::{MatchKind, Span},
|
||||
};
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct Memchr(u8);
|
||||
|
||||
impl Memchr {
|
||||
pub(crate) fn new<B: AsRef<[u8]>>(
|
||||
_kind: MatchKind,
|
||||
needles: &[B],
|
||||
) -> Option<Memchr> {
|
||||
#[cfg(not(feature = "perf-literal-substring"))]
|
||||
{
|
||||
None
|
||||
}
|
||||
#[cfg(feature = "perf-literal-substring")]
|
||||
{
|
||||
if needles.len() != 1 {
|
||||
return None;
|
||||
}
|
||||
if needles[0].as_ref().len() != 1 {
|
||||
return None;
|
||||
}
|
||||
Some(Memchr(needles[0].as_ref()[0]))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PrefilterI for Memchr {
|
||||
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
#[cfg(not(feature = "perf-literal-substring"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "perf-literal-substring")]
|
||||
{
|
||||
memchr::memchr(self.0, &haystack[span]).map(|i| {
|
||||
let start = span.start + i;
|
||||
let end = start + 1;
|
||||
Span { start, end }
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
let b = *haystack.get(span.start)?;
|
||||
if self.0 == b {
|
||||
Some(Span { start: span.start, end: span.start + 1 })
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn memory_usage(&self) -> usize {
|
||||
0
|
||||
}
|
||||
|
||||
fn is_fast(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct Memchr2(u8, u8);
|
||||
|
||||
impl Memchr2 {
|
||||
pub(crate) fn new<B: AsRef<[u8]>>(
|
||||
_kind: MatchKind,
|
||||
needles: &[B],
|
||||
) -> Option<Memchr2> {
|
||||
#[cfg(not(feature = "perf-literal-substring"))]
|
||||
{
|
||||
None
|
||||
}
|
||||
#[cfg(feature = "perf-literal-substring")]
|
||||
{
|
||||
if needles.len() != 2 {
|
||||
return None;
|
||||
}
|
||||
if !needles.iter().all(|n| n.as_ref().len() == 1) {
|
||||
return None;
|
||||
}
|
||||
let b1 = needles[0].as_ref()[0];
|
||||
let b2 = needles[1].as_ref()[0];
|
||||
Some(Memchr2(b1, b2))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PrefilterI for Memchr2 {
|
||||
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
#[cfg(not(feature = "perf-literal-substring"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "perf-literal-substring")]
|
||||
{
|
||||
memchr::memchr2(self.0, self.1, &haystack[span]).map(|i| {
|
||||
let start = span.start + i;
|
||||
let end = start + 1;
|
||||
Span { start, end }
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
let b = *haystack.get(span.start)?;
|
||||
if self.0 == b || self.1 == b {
|
||||
Some(Span { start: span.start, end: span.start + 1 })
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn memory_usage(&self) -> usize {
|
||||
0
|
||||
}
|
||||
|
||||
fn is_fast(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct Memchr3(u8, u8, u8);
|
||||
|
||||
impl Memchr3 {
|
||||
pub(crate) fn new<B: AsRef<[u8]>>(
|
||||
_kind: MatchKind,
|
||||
needles: &[B],
|
||||
) -> Option<Memchr3> {
|
||||
#[cfg(not(feature = "perf-literal-substring"))]
|
||||
{
|
||||
None
|
||||
}
|
||||
#[cfg(feature = "perf-literal-substring")]
|
||||
{
|
||||
if needles.len() != 3 {
|
||||
return None;
|
||||
}
|
||||
if !needles.iter().all(|n| n.as_ref().len() == 1) {
|
||||
return None;
|
||||
}
|
||||
let b1 = needles[0].as_ref()[0];
|
||||
let b2 = needles[1].as_ref()[0];
|
||||
let b3 = needles[2].as_ref()[0];
|
||||
Some(Memchr3(b1, b2, b3))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PrefilterI for Memchr3 {
|
||||
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
#[cfg(not(feature = "perf-literal-substring"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "perf-literal-substring")]
|
||||
{
|
||||
memchr::memchr3(self.0, self.1, self.2, &haystack[span]).map(|i| {
|
||||
let start = span.start + i;
|
||||
let end = start + 1;
|
||||
Span { start, end }
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
let b = *haystack.get(span.start)?;
|
||||
if self.0 == b || self.1 == b || self.2 == b {
|
||||
Some(Span { start: span.start, end: span.start + 1 })
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
fn memory_usage(&self) -> usize {
|
||||
0
|
||||
}
|
||||
|
||||
fn is_fast(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
88
third-party/vendor/regex-automata/src/util/prefilter/memmem.rs
vendored
Normal file
88
third-party/vendor/regex-automata/src/util/prefilter/memmem.rs
vendored
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
use crate::util::{
|
||||
prefilter::PrefilterI,
|
||||
search::{MatchKind, Span},
|
||||
};
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct Memmem {
|
||||
#[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
|
||||
_unused: (),
|
||||
#[cfg(all(feature = "std", feature = "perf-literal-substring"))]
|
||||
finder: memchr::memmem::Finder<'static>,
|
||||
}
|
||||
|
||||
impl Memmem {
|
||||
pub(crate) fn new<B: AsRef<[u8]>>(
|
||||
_kind: MatchKind,
|
||||
needles: &[B],
|
||||
) -> Option<Memmem> {
|
||||
#[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
|
||||
{
|
||||
None
|
||||
}
|
||||
#[cfg(all(feature = "std", feature = "perf-literal-substring"))]
|
||||
{
|
||||
if needles.len() != 1 {
|
||||
return None;
|
||||
}
|
||||
let needle = needles[0].as_ref();
|
||||
let finder = memchr::memmem::Finder::new(needle).into_owned();
|
||||
Some(Memmem { finder })
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PrefilterI for Memmem {
|
||||
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
#[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(all(feature = "std", feature = "perf-literal-substring"))]
|
||||
{
|
||||
self.finder.find(&haystack[span]).map(|i| {
|
||||
let start = span.start + i;
|
||||
let end = start + self.finder.needle().len();
|
||||
Span { start, end }
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
#[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(all(feature = "std", feature = "perf-literal-substring"))]
|
||||
{
|
||||
let needle = self.finder.needle();
|
||||
if haystack[span].starts_with(needle) {
|
||||
Some(Span { end: span.start + needle.len(), ..span })
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn memory_usage(&self) -> usize {
|
||||
#[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(all(feature = "std", feature = "perf-literal-substring"))]
|
||||
{
|
||||
self.finder.needle().len()
|
||||
}
|
||||
}
|
||||
|
||||
fn is_fast(&self) -> bool {
|
||||
#[cfg(not(all(feature = "std", feature = "perf-literal-substring")))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(all(feature = "std", feature = "perf-literal-substring"))]
|
||||
{
|
||||
true
|
||||
}
|
||||
}
|
||||
}
|
||||
719
third-party/vendor/regex-automata/src/util/prefilter/mod.rs
vendored
Normal file
719
third-party/vendor/regex-automata/src/util/prefilter/mod.rs
vendored
Normal file
|
|
@ -0,0 +1,719 @@
|
|||
/*!
|
||||
Defines a prefilter for accelerating regex searches.
|
||||
|
||||
A prefilter can be created by building a [`Prefilter`] value.
|
||||
|
||||
A prefilter represents one of the most important optimizations available for
|
||||
accelerating regex searches. The idea of a prefilter is to very quickly find
|
||||
candidate locations in a haystack where a regex _could_ match. Once a candidate
|
||||
is found, it is then intended for the regex engine to run at that position to
|
||||
determine whether the candidate is a match or a false positive.
|
||||
|
||||
In the aforementioned description of the prefilter optimization also lay its
|
||||
demise. Namely, if a prefilter has a high false positive rate and it produces
|
||||
lots of candidates, then a prefilter can overall make a regex search slower.
|
||||
It can run more slowly because more time is spent ping-ponging between the
|
||||
prefilter search and the regex engine attempting to confirm each candidate as
|
||||
a match. This ping-ponging has overhead that adds up, and is exacerbated by
|
||||
a high false positive rate.
|
||||
|
||||
Nevertheless, the optimization is still generally worth performing in most
|
||||
cases. Particularly given just how much throughput can be improved. (It is not
|
||||
uncommon for prefilter optimizations to improve throughput by one or two orders
|
||||
of magnitude.)
|
||||
|
||||
Typically a prefilter is used to find occurrences of literal prefixes from a
|
||||
regex pattern, but this isn't required. A prefilter can be used to look for
|
||||
suffixes or even inner literals.
|
||||
|
||||
Note that as of now, prefilters throw away information about which pattern
|
||||
each literal comes from. In other words, when a prefilter finds a match,
|
||||
there's no way to know which pattern (or patterns) it came from. Therefore,
|
||||
in order to confirm a match, you'll have to check all of the patterns by
|
||||
running the full regex engine.
|
||||
*/
|
||||
|
||||
mod aho_corasick;
|
||||
mod byteset;
|
||||
mod memchr;
|
||||
mod memmem;
|
||||
mod teddy;
|
||||
|
||||
use core::{
|
||||
borrow::Borrow,
|
||||
fmt::Debug,
|
||||
panic::{RefUnwindSafe, UnwindSafe},
|
||||
};
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
use alloc::sync::Arc;
|
||||
|
||||
#[cfg(feature = "syntax")]
|
||||
use regex_syntax::hir::{literal, Hir};
|
||||
|
||||
use crate::util::search::{MatchKind, Span};
|
||||
|
||||
pub(crate) use crate::util::prefilter::{
|
||||
aho_corasick::AhoCorasick,
|
||||
byteset::ByteSet,
|
||||
memchr::{Memchr, Memchr2, Memchr3},
|
||||
memmem::Memmem,
|
||||
teddy::Teddy,
|
||||
};
|
||||
|
||||
/// A prefilter for accelerating regex searches.
|
||||
///
|
||||
/// If you already have your literals that you want to search with,
|
||||
/// then the vanilla [`Prefilter::new`] constructor is for you. But
|
||||
/// if you have an [`Hir`] value from the `regex-syntax` crate, then
|
||||
/// [`Prefilter::from_hir_prefix`] might be more convenient. Namely, it uses
|
||||
/// the [`regex-syntax::hir::literal`](regex_syntax::hir::literal) module to
|
||||
/// extract literal prefixes for you, optimize them and then select and build a
|
||||
/// prefilter matcher.
|
||||
///
|
||||
/// A prefilter must have **zero false negatives**. However, by its very
|
||||
/// nature, it may produce false positives. That is, a prefilter will never
|
||||
/// skip over a position in the haystack that corresponds to a match of the
|
||||
/// original regex pattern, but it *may* produce a match for a position
|
||||
/// in the haystack that does *not* correspond to a match of the original
|
||||
/// regex pattern. If you use either the [`Prefilter::from_hir_prefix`] or
|
||||
/// [`Prefilter::from_hirs_prefix`] constructors, then this guarantee is
|
||||
/// upheld for you automatically. This guarantee is not preserved if you use
|
||||
/// [`Prefilter::new`] though, since it is up to the caller to provide correct
|
||||
/// literal strings with respect to the original regex pattern.
|
||||
///
|
||||
/// # Cloning
|
||||
///
|
||||
/// It is an API guarantee that cloning a prefilter is cheap. That is, cloning
|
||||
/// it will not duplicate whatever heap memory is used to represent the
|
||||
/// underlying matcher.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how to attach a `Prefilter` to the
|
||||
/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) in order to accelerate
|
||||
/// searches.
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{
|
||||
/// nfa::thompson::pikevm::PikeVM,
|
||||
/// util::prefilter::Prefilter,
|
||||
/// Match, MatchKind,
|
||||
/// };
|
||||
///
|
||||
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Bruce "])
|
||||
/// .expect("a prefilter");
|
||||
/// let re = PikeVM::builder()
|
||||
/// .configure(PikeVM::config().prefilter(Some(pre)))
|
||||
/// .build(r"Bruce \w+")?;
|
||||
/// let mut cache = re.create_cache();
|
||||
/// assert_eq!(
|
||||
/// Some(Match::must(0, 6..23)),
|
||||
/// re.find(&mut cache, "Hello Bruce Springsteen!"),
|
||||
/// );
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
///
|
||||
/// But note that if you get your prefilter incorrect, it could lead to an
|
||||
/// incorrect result!
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{
|
||||
/// nfa::thompson::pikevm::PikeVM,
|
||||
/// util::prefilter::Prefilter,
|
||||
/// Match, MatchKind,
|
||||
/// };
|
||||
///
|
||||
/// // This prefilter is wrong!
|
||||
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Patti "])
|
||||
/// .expect("a prefilter");
|
||||
/// let re = PikeVM::builder()
|
||||
/// .configure(PikeVM::config().prefilter(Some(pre)))
|
||||
/// .build(r"Bruce \w+")?;
|
||||
/// let mut cache = re.create_cache();
|
||||
/// // We find no match even though the regex does match.
|
||||
/// assert_eq!(
|
||||
/// None,
|
||||
/// re.find(&mut cache, "Hello Bruce Springsteen!"),
|
||||
/// );
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Prefilter {
|
||||
#[cfg(not(feature = "alloc"))]
|
||||
_unused: (),
|
||||
#[cfg(feature = "alloc")]
|
||||
pre: Arc<dyn PrefilterI>,
|
||||
#[cfg(feature = "alloc")]
|
||||
is_fast: bool,
|
||||
#[cfg(feature = "alloc")]
|
||||
max_needle_len: usize,
|
||||
}
|
||||
|
||||
impl Prefilter {
|
||||
/// Create a new prefilter from a sequence of needles and a corresponding
|
||||
/// match semantics.
|
||||
///
|
||||
/// This may return `None` for a variety of reasons, for example, if
|
||||
/// a suitable prefilter could not be constructed. That might occur
|
||||
/// if they are unavailable (e.g., the `perf-literal-substring` and
|
||||
/// `perf-literal-multisubstring` features aren't enabled), or it might
|
||||
/// occur because of heuristics or other artifacts of how the prefilter
|
||||
/// works.
|
||||
///
|
||||
/// Note that if you have an [`Hir`] expression, it may be more convenient
|
||||
/// to use [`Prefilter::from_hir_prefix`]. It will automatically handle the
|
||||
/// task of extracting prefix literals for you.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how match semantics can impact the matching
|
||||
/// algorithm used by the prefilter. For this reason, it is important to
|
||||
/// ensure that the match semantics given here are consistent with the
|
||||
/// match semantics intended for the regular expression that the literals
|
||||
/// were extracted from.
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{
|
||||
/// util::{prefilter::Prefilter, syntax},
|
||||
/// MatchKind, Span,
|
||||
/// };
|
||||
///
|
||||
/// let hay = "Hello samwise";
|
||||
///
|
||||
/// // With leftmost-first, we find 'samwise' here because it comes
|
||||
/// // before 'sam' in the sequence we give it..
|
||||
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["samwise", "sam"])
|
||||
/// .expect("a prefilter");
|
||||
/// assert_eq!(
|
||||
/// Some(Span::from(6..13)),
|
||||
/// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
|
||||
/// );
|
||||
/// // Still with leftmost-first but with the literals reverse, now 'sam'
|
||||
/// // will match instead!
|
||||
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["sam", "samwise"])
|
||||
/// .expect("a prefilter");
|
||||
/// assert_eq!(
|
||||
/// Some(Span::from(6..9)),
|
||||
/// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
|
||||
/// );
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
pub fn new<B: AsRef<[u8]>>(
|
||||
kind: MatchKind,
|
||||
needles: &[B],
|
||||
) -> Option<Prefilter> {
|
||||
Choice::new(kind, needles).and_then(|choice| {
|
||||
let max_needle_len =
|
||||
needles.iter().map(|b| b.as_ref().len()).max().unwrap_or(0);
|
||||
Prefilter::from_choice(choice, max_needle_len)
|
||||
})
|
||||
}
|
||||
|
||||
/// This turns a prefilter selection into a `Prefilter`. That is, in turns
|
||||
/// the enum given into a trait object.
|
||||
fn from_choice(
|
||||
choice: Choice,
|
||||
max_needle_len: usize,
|
||||
) -> Option<Prefilter> {
|
||||
#[cfg(not(feature = "alloc"))]
|
||||
{
|
||||
None
|
||||
}
|
||||
#[cfg(feature = "alloc")]
|
||||
{
|
||||
let pre: Arc<dyn PrefilterI> = match choice {
|
||||
Choice::Memchr(p) => Arc::new(p),
|
||||
Choice::Memchr2(p) => Arc::new(p),
|
||||
Choice::Memchr3(p) => Arc::new(p),
|
||||
Choice::Memmem(p) => Arc::new(p),
|
||||
Choice::Teddy(p) => Arc::new(p),
|
||||
Choice::ByteSet(p) => Arc::new(p),
|
||||
Choice::AhoCorasick(p) => Arc::new(p),
|
||||
};
|
||||
let is_fast = pre.is_fast();
|
||||
Some(Prefilter { pre, is_fast, max_needle_len })
|
||||
}
|
||||
}
|
||||
|
||||
/// This attempts to extract prefixes from the given `Hir` expression for
|
||||
/// the given match semantics, and if possible, builds a prefilter for
|
||||
/// them.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how to build a prefilter directly from an [`Hir`]
|
||||
/// expression, and use to find an occurrence of a prefix from the regex
|
||||
/// pattern.
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{
|
||||
/// util::{prefilter::Prefilter, syntax},
|
||||
/// MatchKind, Span,
|
||||
/// };
|
||||
///
|
||||
/// let hir = syntax::parse(r"(Bruce|Patti) \w+")?;
|
||||
/// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir)
|
||||
/// .expect("a prefilter");
|
||||
/// let hay = "Hello Patti Scialfa!";
|
||||
/// assert_eq!(
|
||||
/// Some(Span::from(6..12)),
|
||||
/// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
|
||||
/// );
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[cfg(feature = "syntax")]
|
||||
pub fn from_hir_prefix(kind: MatchKind, hir: &Hir) -> Option<Prefilter> {
|
||||
Prefilter::from_hirs_prefix(kind, &[hir])
|
||||
}
|
||||
|
||||
/// This attempts to extract prefixes from the given `Hir` expressions for
|
||||
/// the given match semantics, and if possible, builds a prefilter for
|
||||
/// them.
|
||||
///
|
||||
/// Note that as of now, prefilters throw away information about which
|
||||
/// pattern each literal comes from. In other words, when a prefilter finds
|
||||
/// a match, there's no way to know which pattern (or patterns) it came
|
||||
/// from. Therefore, in order to confirm a match, you'll have to check all
|
||||
/// of the patterns by running the full regex engine.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how to build a prefilter directly from multiple
|
||||
/// `Hir` expressions expression, and use it to find an occurrence of a
|
||||
/// prefix from the regex patterns.
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{
|
||||
/// util::{prefilter::Prefilter, syntax},
|
||||
/// MatchKind, Span,
|
||||
/// };
|
||||
///
|
||||
/// let hirs = syntax::parse_many(&[
|
||||
/// r"(Bruce|Patti) \w+",
|
||||
/// r"Mrs?\. Doubtfire",
|
||||
/// ])?;
|
||||
/// let pre = Prefilter::from_hirs_prefix(MatchKind::LeftmostFirst, &hirs)
|
||||
/// .expect("a prefilter");
|
||||
/// let hay = "Hello Mrs. Doubtfire";
|
||||
/// assert_eq!(
|
||||
/// Some(Span::from(6..20)),
|
||||
/// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
|
||||
/// );
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[cfg(feature = "syntax")]
|
||||
pub fn from_hirs_prefix<H: Borrow<Hir>>(
|
||||
kind: MatchKind,
|
||||
hirs: &[H],
|
||||
) -> Option<Prefilter> {
|
||||
prefixes(kind, hirs)
|
||||
.literals()
|
||||
.and_then(|lits| Prefilter::new(kind, lits))
|
||||
}
|
||||
|
||||
/// Run this prefilter on `haystack[span.start..end]` and return a matching
|
||||
/// span if one exists.
|
||||
///
|
||||
/// The span returned is guaranteed to have a start position greater than
|
||||
/// or equal to the one given, and an end position less than or equal to
|
||||
/// the one given.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how to build a prefilter directly from an [`Hir`]
|
||||
/// expression, and use it to find an occurrence of a prefix from the regex
|
||||
/// pattern.
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{
|
||||
/// util::{prefilter::Prefilter, syntax},
|
||||
/// MatchKind, Span,
|
||||
/// };
|
||||
///
|
||||
/// let hir = syntax::parse(r"Bruce \w+")?;
|
||||
/// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir)
|
||||
/// .expect("a prefilter");
|
||||
/// let hay = "Hello Bruce Springsteen!";
|
||||
/// assert_eq!(
|
||||
/// Some(Span::from(6..12)),
|
||||
/// pre.find(hay.as_bytes(), Span::from(0..hay.len())),
|
||||
/// );
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[inline]
|
||||
pub fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
#[cfg(not(feature = "alloc"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "alloc")]
|
||||
{
|
||||
self.pre.find(haystack, span)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the span of a prefix of `haystack[span.start..span.end]` if
|
||||
/// the prefilter matches.
|
||||
///
|
||||
/// The span returned is guaranteed to have a start position equivalent to
|
||||
/// the one given, and an end position less than or equal to the one given.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how to build a prefilter directly from an [`Hir`]
|
||||
/// expression, and use it to find an occurrence of a prefix from the regex
|
||||
/// pattern that begins at the start of a haystack only.
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{
|
||||
/// util::{prefilter::Prefilter, syntax},
|
||||
/// MatchKind, Span,
|
||||
/// };
|
||||
///
|
||||
/// let hir = syntax::parse(r"Bruce \w+")?;
|
||||
/// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir)
|
||||
/// .expect("a prefilter");
|
||||
/// let hay = "Hello Bruce Springsteen!";
|
||||
/// // Nothing is found here because 'Bruce' does
|
||||
/// // not occur at the beginning of our search.
|
||||
/// assert_eq!(
|
||||
/// None,
|
||||
/// pre.prefix(hay.as_bytes(), Span::from(0..hay.len())),
|
||||
/// );
|
||||
/// // But if we change where we start the search
|
||||
/// // to begin where 'Bruce ' begins, then a
|
||||
/// // match will be found.
|
||||
/// assert_eq!(
|
||||
/// Some(Span::from(6..12)),
|
||||
/// pre.prefix(hay.as_bytes(), Span::from(6..hay.len())),
|
||||
/// );
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[inline]
|
||||
pub fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
#[cfg(not(feature = "alloc"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "alloc")]
|
||||
{
|
||||
self.pre.prefix(haystack, span)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the heap memory, in bytes, used by the underlying prefilter.
|
||||
#[inline]
|
||||
pub fn memory_usage(&self) -> usize {
|
||||
#[cfg(not(feature = "alloc"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "alloc")]
|
||||
{
|
||||
self.pre.memory_usage()
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the length of the longest needle
|
||||
/// in this Prefilter
|
||||
#[inline]
|
||||
pub fn max_needle_len(&self) -> usize {
|
||||
#[cfg(not(feature = "alloc"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "alloc")]
|
||||
{
|
||||
self.max_needle_len
|
||||
}
|
||||
}
|
||||
|
||||
/// Implementations might return true here if they believe themselves to
|
||||
/// be "fast." The concept of "fast" is deliberately left vague, but in
|
||||
/// practice this usually corresponds to whether it's believed that SIMD
|
||||
/// will be used.
|
||||
///
|
||||
/// Why do we care about this? Well, some prefilter tricks tend to come
|
||||
/// with their own bits of overhead, and so might only make sense if we
|
||||
/// know that a scan will be *much* faster than the regex engine itself.
|
||||
/// Otherwise, the trick may not be worth doing. Whether something is
|
||||
/// "much" faster than the regex engine generally boils down to whether
|
||||
/// SIMD is used. (But not always. Even a SIMD matcher with a high false
|
||||
/// positive rate can become quite slow.)
|
||||
///
|
||||
/// Even if this returns true, it is still possible for the prefilter to
|
||||
/// be "slow." Remember, prefilters are just heuristics. We can't really
|
||||
/// *know* a prefilter will be fast without actually trying the prefilter.
|
||||
/// (Which of course we cannot afford to do.)
|
||||
#[inline]
|
||||
pub fn is_fast(&self) -> bool {
|
||||
#[cfg(not(feature = "alloc"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "alloc")]
|
||||
{
|
||||
self.is_fast
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A trait for abstracting over prefilters. Basically, a prefilter is
|
||||
/// something that do an unanchored *and* an anchored search in a haystack
|
||||
/// within a given span.
|
||||
///
|
||||
/// This exists pretty much only so that we can use prefilters as a trait
|
||||
/// object (which is what `Prefilter` is). If we ever move off of trait objects
|
||||
/// and to an enum, then it's likely this trait could be removed.
|
||||
pub(crate) trait PrefilterI:
|
||||
Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static
|
||||
{
|
||||
/// Run this prefilter on `haystack[span.start..end]` and return a matching
|
||||
/// span if one exists.
|
||||
///
|
||||
/// The span returned is guaranteed to have a start position greater than
|
||||
/// or equal to the one given, and an end position less than or equal to
|
||||
/// the one given.
|
||||
fn find(&self, haystack: &[u8], span: Span) -> Option<Span>;
|
||||
|
||||
/// Returns the span of a prefix of `haystack[span.start..span.end]` if
|
||||
/// the prefilter matches.
|
||||
///
|
||||
/// The span returned is guaranteed to have a start position equivalent to
|
||||
/// the one given, and an end position less than or equal to the one given.
|
||||
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span>;
|
||||
|
||||
/// Returns the heap memory, in bytes, used by the underlying prefilter.
|
||||
fn memory_usage(&self) -> usize;
|
||||
|
||||
/// Implementations might return true here if they believe themselves to
|
||||
/// be "fast." See [`Prefilter::is_fast`] for more details.
|
||||
fn is_fast(&self) -> bool;
|
||||
}
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P> {
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
(&**self).find(haystack, span)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
(&**self).prefix(haystack, span)
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn memory_usage(&self) -> usize {
|
||||
(&**self).memory_usage()
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn is_fast(&self) -> bool {
|
||||
(&**self).is_fast()
|
||||
}
|
||||
}
|
||||
|
||||
/// A type that encapsulates the selection of a prefilter algorithm from a
|
||||
/// sequence of needles.
|
||||
///
|
||||
/// The existence of this type is a little tricky, because we don't (currently)
|
||||
/// use it for performing a search. Instead, we really only consume it by
|
||||
/// converting the underlying prefilter into a trait object, whether that be
|
||||
/// `dyn PrefilterI` or `dyn Strategy` (for the meta regex engine). In order
|
||||
/// to avoid re-copying the prefilter selection logic, we isolate it here, and
|
||||
/// then force anything downstream that wants to convert it to a trait object
|
||||
/// to do trivial case analysis on it.
|
||||
///
|
||||
/// One wonders whether we *should* use an enum instead of a trait object.
|
||||
/// At time of writing, I chose trait objects based on instinct because 1) I
|
||||
/// knew I wasn't going to inline anything and 2) there would potentially be
|
||||
/// many different choices. However, as of time of writing, I haven't actually
|
||||
/// compared the trait object approach to the enum approach. That probably
|
||||
/// should be litigated, but I ran out of steam.
|
||||
///
|
||||
/// Note that if the `alloc` feature is disabled, then values of this type
|
||||
/// are (and should) never be constructed. Also, in practice, for any of the
|
||||
/// prefilters to be selected, you'll need at least one of the `perf-literal-*`
|
||||
/// features enabled.
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) enum Choice {
|
||||
Memchr(Memchr),
|
||||
Memchr2(Memchr2),
|
||||
Memchr3(Memchr3),
|
||||
Memmem(Memmem),
|
||||
Teddy(Teddy),
|
||||
ByteSet(ByteSet),
|
||||
AhoCorasick(AhoCorasick),
|
||||
}
|
||||
|
||||
impl Choice {
|
||||
/// Select what is believed to be the best prefilter algorithm for the
|
||||
/// match semantics and sequence of needles given.
|
||||
///
|
||||
/// This selection algorithm uses the needles as given without any
|
||||
/// modification. For example, if `[bar]` is given, then this doesn't
|
||||
/// try to select `memchr` for `b`. Instead, it would select `memmem`
|
||||
/// for `bar`. If callers would want `memchr` selected for `[bar]`, then
|
||||
/// callers should massages the literals themselves. That is, callers are
|
||||
/// responsible for heuristics surrounding which sequence of literals is
|
||||
/// best.
|
||||
///
|
||||
/// What this selection algorithm does is attempt to use the fastest
|
||||
/// prefilter that works for the literals given. So if `[a, b]`, is given,
|
||||
/// then `memchr2` is selected.
|
||||
///
|
||||
/// Of course, which prefilter is selected is also subject to what
|
||||
/// is available. For example, if `alloc` isn't enabled, then
|
||||
/// that limits which prefilters can be selected. Similarly, if
|
||||
/// `perf-literal-substring` isn't enabled, then nothing from the `memchr`
|
||||
/// crate can be returned.
|
||||
pub(crate) fn new<B: AsRef<[u8]>>(
|
||||
kind: MatchKind,
|
||||
needles: &[B],
|
||||
) -> Option<Choice> {
|
||||
// An empty set means the regex matches nothing, so no sense in
|
||||
// building a prefilter.
|
||||
if needles.len() == 0 {
|
||||
debug!("prefilter building failed: found empty set of literals");
|
||||
return None;
|
||||
}
|
||||
// If the regex can match the empty string, then the prefilter
|
||||
// will by definition match at every position. This is obviously
|
||||
// completely ineffective.
|
||||
if needles.iter().any(|n| n.as_ref().is_empty()) {
|
||||
debug!("prefilter building failed: literals match empty string");
|
||||
return None;
|
||||
}
|
||||
// BREADCRUMBS: Perhaps the literal optimizer should special case
|
||||
// sequences of length two or three if the leading bytes of each are
|
||||
// "rare"? Or perhaps, if there are two or three total possible leading
|
||||
// bytes, regardless of the number of literals, and all are rare...
|
||||
// Then well, perhaps we should use memchr2 or memchr3 in those cases?
|
||||
if let Some(pre) = Memchr::new(kind, needles) {
|
||||
debug!("prefilter built: memchr");
|
||||
return Some(Choice::Memchr(pre));
|
||||
}
|
||||
if let Some(pre) = Memchr2::new(kind, needles) {
|
||||
debug!("prefilter built: memchr2");
|
||||
return Some(Choice::Memchr2(pre));
|
||||
}
|
||||
if let Some(pre) = Memchr3::new(kind, needles) {
|
||||
debug!("prefilter built: memchr3");
|
||||
return Some(Choice::Memchr3(pre));
|
||||
}
|
||||
if let Some(pre) = Memmem::new(kind, needles) {
|
||||
debug!("prefilter built: memmem");
|
||||
return Some(Choice::Memmem(pre));
|
||||
}
|
||||
if let Some(pre) = Teddy::new(kind, needles) {
|
||||
debug!("prefilter built: teddy");
|
||||
return Some(Choice::Teddy(pre));
|
||||
}
|
||||
if let Some(pre) = ByteSet::new(kind, needles) {
|
||||
debug!("prefilter built: byteset");
|
||||
return Some(Choice::ByteSet(pre));
|
||||
}
|
||||
if let Some(pre) = AhoCorasick::new(kind, needles) {
|
||||
debug!("prefilter built: aho-corasick");
|
||||
return Some(Choice::AhoCorasick(pre));
|
||||
}
|
||||
debug!("prefilter building failed: no strategy could be found");
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts all of the prefix literals from the given HIR expressions into a
|
||||
/// single `Seq`. The literals in the sequence are ordered with respect to the
|
||||
/// order of the given HIR expressions and consistent with the match semantics
|
||||
/// given.
|
||||
///
|
||||
/// The sequence returned is "optimized." That is, they may be shrunk or even
|
||||
/// truncated according to heuristics with the intent of making them more
|
||||
/// useful as a prefilter. (Which translates to both using faster algorithms
|
||||
/// and minimizing the false positive rate.)
|
||||
///
|
||||
/// Note that this erases any connection between the literals and which pattern
|
||||
/// (or patterns) they came from.
|
||||
///
|
||||
/// The match kind given must correspond to the match semantics of the regex
|
||||
/// that is represented by the HIRs given. The match semantics may change the
|
||||
/// literal sequence returned.
|
||||
#[cfg(feature = "syntax")]
|
||||
pub(crate) fn prefixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq
|
||||
where
|
||||
H: core::borrow::Borrow<Hir>,
|
||||
{
|
||||
let mut extractor = literal::Extractor::new();
|
||||
extractor.kind(literal::ExtractKind::Prefix);
|
||||
|
||||
let mut prefixes = literal::Seq::empty();
|
||||
for hir in hirs {
|
||||
prefixes.union(&mut extractor.extract(hir.borrow()));
|
||||
}
|
||||
debug!(
|
||||
"prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
|
||||
prefixes.len(),
|
||||
prefixes.is_exact(),
|
||||
prefixes
|
||||
);
|
||||
match kind {
|
||||
MatchKind::All => {
|
||||
prefixes.sort();
|
||||
prefixes.dedup();
|
||||
}
|
||||
MatchKind::LeftmostFirst => {
|
||||
prefixes.optimize_for_prefix_by_preference();
|
||||
}
|
||||
}
|
||||
debug!(
|
||||
"prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
|
||||
prefixes.len(),
|
||||
prefixes.is_exact(),
|
||||
prefixes
|
||||
);
|
||||
prefixes
|
||||
}
|
||||
|
||||
/// Like `prefixes`, but for all suffixes of all matches for the given HIRs.
|
||||
#[cfg(feature = "syntax")]
|
||||
pub(crate) fn suffixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq
|
||||
where
|
||||
H: core::borrow::Borrow<Hir>,
|
||||
{
|
||||
let mut extractor = literal::Extractor::new();
|
||||
extractor.kind(literal::ExtractKind::Suffix);
|
||||
|
||||
let mut suffixes = literal::Seq::empty();
|
||||
for hir in hirs {
|
||||
suffixes.union(&mut extractor.extract(hir.borrow()));
|
||||
}
|
||||
debug!(
|
||||
"suffixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
|
||||
suffixes.len(),
|
||||
suffixes.is_exact(),
|
||||
suffixes
|
||||
);
|
||||
match kind {
|
||||
MatchKind::All => {
|
||||
suffixes.sort();
|
||||
suffixes.dedup();
|
||||
}
|
||||
MatchKind::LeftmostFirst => {
|
||||
suffixes.optimize_for_suffix_by_preference();
|
||||
}
|
||||
}
|
||||
debug!(
|
||||
"suffixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
|
||||
suffixes.len(),
|
||||
suffixes.is_exact(),
|
||||
suffixes
|
||||
);
|
||||
suffixes
|
||||
}
|
||||
160
third-party/vendor/regex-automata/src/util/prefilter/teddy.rs
vendored
Normal file
160
third-party/vendor/regex-automata/src/util/prefilter/teddy.rs
vendored
Normal file
|
|
@ -0,0 +1,160 @@
|
|||
use crate::util::{
|
||||
prefilter::PrefilterI,
|
||||
search::{MatchKind, Span},
|
||||
};
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct Teddy {
|
||||
#[cfg(not(feature = "perf-literal-multisubstring"))]
|
||||
_unused: (),
|
||||
/// The actual Teddy searcher.
|
||||
///
|
||||
/// Technically, it's possible that Teddy doesn't actually get used, since
|
||||
/// Teddy does require its haystack to at least be of a certain size
|
||||
/// (usually around the size of whatever vector is being used, so ~16
|
||||
/// or ~32 bytes). For haystacks shorter than that, the implementation
|
||||
/// currently uses Rabin-Karp.
|
||||
#[cfg(feature = "perf-literal-multisubstring")]
|
||||
searcher: aho_corasick::packed::Searcher,
|
||||
/// When running an anchored search, the packed searcher can't handle it so
|
||||
/// we defer to Aho-Corasick itself. Kind of sad, but changing the packed
|
||||
/// searchers to support anchored search would be difficult at worst and
|
||||
/// annoying at best. Since packed searchers only apply to small numbers of
|
||||
/// literals, we content ourselves that this is not much of an added cost.
|
||||
/// (That packed searchers only work with a small number of literals is
|
||||
/// also why we use a DFA here. Otherwise, the memory usage of a DFA would
|
||||
/// likely be unacceptable.)
|
||||
#[cfg(feature = "perf-literal-multisubstring")]
|
||||
anchored_ac: aho_corasick::dfa::DFA,
|
||||
/// The length of the smallest literal we look for.
|
||||
///
|
||||
/// We use this as a heuristic to figure out whether this will be "fast" or
|
||||
/// not. Generally, the longer the better, because longer needles are more
|
||||
/// discriminating and thus reduce false positive rate.
|
||||
#[cfg(feature = "perf-literal-multisubstring")]
|
||||
minimum_len: usize,
|
||||
}
|
||||
|
||||
impl Teddy {
|
||||
pub(crate) fn new<B: AsRef<[u8]>>(
|
||||
kind: MatchKind,
|
||||
needles: &[B],
|
||||
) -> Option<Teddy> {
|
||||
#[cfg(not(feature = "perf-literal-multisubstring"))]
|
||||
{
|
||||
None
|
||||
}
|
||||
#[cfg(feature = "perf-literal-multisubstring")]
|
||||
{
|
||||
// We only really support leftmost-first semantics. In
|
||||
// theory we could at least support leftmost-longest, as the
|
||||
// aho-corasick crate does, but regex-automata doesn't know about
|
||||
// leftmost-longest currently.
|
||||
//
|
||||
// And like the aho-corasick prefilter, if we're using `All`
|
||||
// semantics, then we can still use leftmost semantics for a
|
||||
// prefilter. (This might be a suspicious choice for the literal
|
||||
// engine, which uses a prefilter as a regex engine directly, but
|
||||
// that only happens when using leftmost-first semantics.)
|
||||
let (packed_match_kind, ac_match_kind) = match kind {
|
||||
MatchKind::LeftmostFirst | MatchKind::All => (
|
||||
aho_corasick::packed::MatchKind::LeftmostFirst,
|
||||
aho_corasick::MatchKind::LeftmostFirst,
|
||||
),
|
||||
};
|
||||
let minimum_len =
|
||||
needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0);
|
||||
let packed = aho_corasick::packed::Config::new()
|
||||
.match_kind(packed_match_kind)
|
||||
.builder()
|
||||
.extend(needles)
|
||||
.build()?;
|
||||
let anchored_ac = aho_corasick::dfa::DFA::builder()
|
||||
.match_kind(ac_match_kind)
|
||||
.start_kind(aho_corasick::StartKind::Anchored)
|
||||
.prefilter(false)
|
||||
.build(needles)
|
||||
.ok()?;
|
||||
Some(Teddy { searcher: packed, anchored_ac, minimum_len })
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl PrefilterI for Teddy {
|
||||
fn find(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
#[cfg(not(feature = "perf-literal-multisubstring"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "perf-literal-multisubstring")]
|
||||
{
|
||||
let ac_span =
|
||||
aho_corasick::Span { start: span.start, end: span.end };
|
||||
self.searcher
|
||||
.find_in(haystack, ac_span)
|
||||
.map(|m| Span { start: m.start(), end: m.end() })
|
||||
}
|
||||
}
|
||||
|
||||
fn prefix(&self, haystack: &[u8], span: Span) -> Option<Span> {
|
||||
#[cfg(not(feature = "perf-literal-multisubstring"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "perf-literal-multisubstring")]
|
||||
{
|
||||
use aho_corasick::automaton::Automaton;
|
||||
let input = aho_corasick::Input::new(haystack)
|
||||
.anchored(aho_corasick::Anchored::Yes)
|
||||
.span(span.start..span.end);
|
||||
self.anchored_ac
|
||||
.try_find(&input)
|
||||
// OK because we build the DFA with anchored support.
|
||||
.expect("aho-corasick DFA should never fail")
|
||||
.map(|m| Span { start: m.start(), end: m.end() })
|
||||
}
|
||||
}
|
||||
|
||||
fn memory_usage(&self) -> usize {
|
||||
#[cfg(not(feature = "perf-literal-multisubstring"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "perf-literal-multisubstring")]
|
||||
{
|
||||
use aho_corasick::automaton::Automaton;
|
||||
self.searcher.memory_usage() + self.anchored_ac.memory_usage()
|
||||
}
|
||||
}
|
||||
|
||||
fn is_fast(&self) -> bool {
|
||||
#[cfg(not(feature = "perf-literal-multisubstring"))]
|
||||
{
|
||||
unreachable!()
|
||||
}
|
||||
#[cfg(feature = "perf-literal-multisubstring")]
|
||||
{
|
||||
// Teddy is usually quite fast, but I have seen some cases where
|
||||
// a large number of literals can overwhelm it and make it not so
|
||||
// fast. We make an educated but conservative guess at a limit, at
|
||||
// which point, we're not so comfortable thinking Teddy is "fast."
|
||||
//
|
||||
// Well... this used to incorporate a "limit" on the *number*
|
||||
// of literals, but I have since changed it to a minimum on the
|
||||
// *smallest* literal. Namely, when there is a very small literal
|
||||
// (1 or 2 bytes), it is far more likely that it leads to a higher
|
||||
// false positive rate. (Although, of course, not always. For
|
||||
// example, 'zq' is likely to have a very low false positive rate.)
|
||||
// But when we have 3 bytes, we have a really good chance of being
|
||||
// quite discriminatory and thus fast.
|
||||
//
|
||||
// We may still want to add some kind of limit on the number of
|
||||
// literals here, but keep in mind that Teddy already has its own
|
||||
// somewhat small limit (64 at time of writing). The main issue
|
||||
// here is that if 'is_fast' is false, it opens the door for the
|
||||
// reverse inner optimization to kick in. We really only want to
|
||||
// resort to the reverse inner optimization if we absolutely must.
|
||||
self.minimum_len >= 3
|
||||
}
|
||||
}
|
||||
}
|
||||
776
third-party/vendor/regex-automata/src/util/primitives.rs
vendored
Normal file
776
third-party/vendor/regex-automata/src/util/primitives.rs
vendored
Normal file
|
|
@ -0,0 +1,776 @@
|
|||
/*!
|
||||
Lower level primitive types that are useful in a variety of circumstances.
|
||||
|
||||
# Overview
|
||||
|
||||
This list represents the principle types in this module and briefly describes
|
||||
when you might want to use them.
|
||||
|
||||
* [`PatternID`] - A type that represents the identifier of a regex pattern.
|
||||
This is probably the most widely used type in this module (which is why it's
|
||||
also re-exported in the crate root).
|
||||
* [`StateID`] - A type the represents the identifier of a finite automaton
|
||||
state. This is used for both NFAs and DFAs, with the notable exception of
|
||||
the hybrid NFA/DFA. (The hybrid NFA/DFA uses a special purpose "lazy" state
|
||||
identifier.)
|
||||
* [`SmallIndex`] - The internal representation of both a `PatternID` and a
|
||||
`StateID`. Its purpose is to serve as a type that can index memory without
|
||||
being as big as a `usize` on 64-bit targets. The main idea behind this type
|
||||
is that there are many things in regex engines that will, in practice, never
|
||||
overflow a 32-bit integer. (For example, like the number of patterns in a regex
|
||||
or the number of states in an NFA.) Thus, a `SmallIndex` can be used to index
|
||||
memory without peppering `as` casts everywhere. Moreover, it forces callers
|
||||
to handle errors in the case where, somehow, the value would otherwise overflow
|
||||
either a 32-bit integer or a `usize` (e.g., on 16-bit targets).
|
||||
* [`NonMaxUsize`] - Represents a `usize` that cannot be `usize::MAX`. As a
|
||||
result, `Option<NonMaxUsize>` has the same size in memory as a `usize`. This
|
||||
useful, for example, when representing the offsets of submatches since it
|
||||
reduces memory usage by a factor of 2. It is a legal optimization since Rust
|
||||
guarantees that slices never have a length that exceeds `isize::MAX`.
|
||||
*/
|
||||
|
||||
use core::num::NonZeroUsize;
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
use alloc::vec::Vec;
|
||||
|
||||
use crate::util::int::{Usize, U16, U32, U64};
|
||||
|
||||
/// A `usize` that can never be `usize::MAX`.
|
||||
///
|
||||
/// This is similar to `core::num::NonZeroUsize`, but instead of not permitting
|
||||
/// a zero value, this does not permit a max value.
|
||||
///
|
||||
/// This is useful in certain contexts where one wants to optimize the memory
|
||||
/// usage of things that contain match offsets. Namely, since Rust slices
|
||||
/// are guaranteed to never have a length exceeding `isize::MAX`, we can use
|
||||
/// `usize::MAX` as a sentinel to indicate that no match was found. Indeed,
|
||||
/// types like `Option<NonMaxUsize>` have exactly the same size in memory as a
|
||||
/// `usize`.
|
||||
///
|
||||
/// This type is defined to be `repr(transparent)` for
|
||||
/// `core::num::NonZeroUsize`, which is in turn defined to be
|
||||
/// `repr(transparent)` for `usize`.
|
||||
#[derive(Clone, Copy, Eq, Hash, PartialEq, PartialOrd, Ord)]
|
||||
#[repr(transparent)]
|
||||
pub struct NonMaxUsize(NonZeroUsize);
|
||||
|
||||
impl NonMaxUsize {
|
||||
/// Create a new `NonMaxUsize` from the given value.
|
||||
///
|
||||
/// This returns `None` only when the given value is equal to `usize::MAX`.
|
||||
#[inline]
|
||||
pub fn new(value: usize) -> Option<NonMaxUsize> {
|
||||
NonZeroUsize::new(value.wrapping_add(1)).map(NonMaxUsize)
|
||||
}
|
||||
|
||||
/// Return the underlying `usize` value. The returned value is guaranteed
|
||||
/// to not equal `usize::MAX`.
|
||||
#[inline]
|
||||
pub fn get(self) -> usize {
|
||||
self.0.get().wrapping_sub(1)
|
||||
}
|
||||
}
|
||||
|
||||
// We provide our own Debug impl because seeing the internal repr can be quite
|
||||
// surprising if you aren't expecting it. e.g., 'NonMaxUsize(5)' vs just '5'.
|
||||
impl core::fmt::Debug for NonMaxUsize {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
write!(f, "{:?}", self.get())
|
||||
}
|
||||
}
|
||||
|
||||
/// A type that represents a "small" index.
|
||||
///
|
||||
/// The main idea of this type is to provide something that can index memory,
|
||||
/// but uses less memory than `usize` on 64-bit systems. Specifically, its
|
||||
/// representation is always a `u32` and has `repr(transparent)` enabled. (So
|
||||
/// it is safe to transmute between a `u32` and a `SmallIndex`.)
|
||||
///
|
||||
/// A small index is typically useful in cases where there is no practical way
|
||||
/// that the index will overflow a 32-bit integer. A good example of this is
|
||||
/// an NFA state. If you could somehow build an NFA with `2^30` states, its
|
||||
/// memory usage would be exorbitant and its runtime execution would be so
|
||||
/// slow as to be completely worthless. Therefore, this crate generally deems
|
||||
/// it acceptable to return an error if it would otherwise build an NFA that
|
||||
/// requires a slice longer than what a 32-bit integer can index. In exchange,
|
||||
/// we can use 32-bit indices instead of 64-bit indices in various places.
|
||||
///
|
||||
/// This type ensures this by providing a constructor that will return an error
|
||||
/// if its argument cannot fit into the type. This makes it much easier to
|
||||
/// handle these sorts of boundary cases that are otherwise extremely subtle.
|
||||
///
|
||||
/// On all targets, this type guarantees that its value will fit in a `u32`,
|
||||
/// `i32`, `usize` and an `isize`. This means that on 16-bit targets, for
|
||||
/// example, this type's maximum value will never overflow an `isize`,
|
||||
/// which means it will never overflow a `i16` even though its internal
|
||||
/// representation is still a `u32`.
|
||||
///
|
||||
/// The purpose for making the type fit into even signed integer types like
|
||||
/// `isize` is to guarantee that the difference between any two small indices
|
||||
/// is itself also a small index. This is useful in certain contexts, e.g.,
|
||||
/// for delta encoding.
|
||||
///
|
||||
/// # Other types
|
||||
///
|
||||
/// The following types wrap `SmallIndex` to provide a more focused use case:
|
||||
///
|
||||
/// * [`PatternID`] is for representing the identifiers of patterns.
|
||||
/// * [`StateID`] is for representing the identifiers of states in finite
|
||||
/// automata. It is used for both NFAs and DFAs.
|
||||
///
|
||||
/// # Representation
|
||||
///
|
||||
/// This type is always represented internally by a `u32` and is marked as
|
||||
/// `repr(transparent)`. Thus, this type always has the same representation as
|
||||
/// a `u32`. It is thus safe to transmute between a `u32` and a `SmallIndex`.
|
||||
///
|
||||
/// # Indexing
|
||||
///
|
||||
/// For convenience, callers may use a `SmallIndex` to index slices.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// While a `SmallIndex` is meant to guarantee that its value fits into `usize`
|
||||
/// without using as much space as a `usize` on all targets, callers must
|
||||
/// not rely on this property for safety. Callers may choose to rely on this
|
||||
/// property for correctness however. For example, creating a `SmallIndex` with
|
||||
/// an invalid value can be done in entirely safe code. This may in turn result
|
||||
/// in panics or silent logical errors.
|
||||
#[derive(
    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
#[repr(transparent)]
// Intended invariant (documented above, but NOT enforced by
// `new_unchecked`): the wrapped value is always <= SmallIndex::MAX, so it
// fits in a `u32`, `i32`, `usize` and `isize` on all supported targets.
pub struct SmallIndex(u32);
|
||||
|
||||
impl SmallIndex {
|
||||
/// The maximum index value.
|
||||
#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
|
||||
pub const MAX: SmallIndex =
|
||||
// FIXME: Use as_usize() once const functions in traits are stable.
|
||||
SmallIndex::new_unchecked(core::i32::MAX as usize - 1);
|
||||
|
||||
/// The maximum index value.
|
||||
#[cfg(target_pointer_width = "16")]
|
||||
pub const MAX: SmallIndex =
|
||||
SmallIndex::new_unchecked(core::isize::MAX - 1);
|
||||
|
||||
/// The total number of values that can be represented as a small index.
|
||||
pub const LIMIT: usize = SmallIndex::MAX.as_usize() + 1;
|
||||
|
||||
/// The zero index value.
|
||||
pub const ZERO: SmallIndex = SmallIndex::new_unchecked(0);
|
||||
|
||||
/// The number of bytes that a single small index uses in memory.
|
||||
pub const SIZE: usize = core::mem::size_of::<SmallIndex>();
|
||||
|
||||
/// Create a new small index.
|
||||
///
|
||||
/// If the given index exceeds [`SmallIndex::MAX`], then this returns
|
||||
/// an error.
|
||||
#[inline]
|
||||
pub fn new(index: usize) -> Result<SmallIndex, SmallIndexError> {
|
||||
SmallIndex::try_from(index)
|
||||
}
|
||||
|
||||
/// Create a new small index without checking whether the given value
|
||||
/// exceeds [`SmallIndex::MAX`].
|
||||
///
|
||||
/// Using this routine with an invalid index value will result in
|
||||
/// unspecified behavior, but *not* undefined behavior. In particular, an
|
||||
/// invalid index value is likely to cause panics or possibly even silent
|
||||
/// logical errors.
|
||||
///
|
||||
/// Callers must never rely on a `SmallIndex` to be within a certain range
|
||||
/// for memory safety.
|
||||
#[inline]
|
||||
pub const fn new_unchecked(index: usize) -> SmallIndex {
|
||||
// FIXME: Use as_u32() once const functions in traits are stable.
|
||||
SmallIndex(index as u32)
|
||||
}
|
||||
|
||||
/// Like [`SmallIndex::new`], but panics if the given index is not valid.
|
||||
#[inline]
|
||||
pub fn must(index: usize) -> SmallIndex {
|
||||
SmallIndex::new(index).expect("invalid small index")
|
||||
}
|
||||
|
||||
/// Return this small index as a `usize`. This is guaranteed to never
|
||||
/// overflow `usize`.
|
||||
#[inline]
|
||||
pub const fn as_usize(&self) -> usize {
|
||||
// FIXME: Use as_usize() once const functions in traits are stable.
|
||||
self.0 as usize
|
||||
}
|
||||
|
||||
/// Return this small index as a `u64`. This is guaranteed to never
|
||||
/// overflow.
|
||||
#[inline]
|
||||
pub const fn as_u64(&self) -> u64 {
|
||||
// FIXME: Use u64::from() once const functions in traits are stable.
|
||||
self.0 as u64
|
||||
}
|
||||
|
||||
/// Return the internal `u32` of this small index. This is guaranteed to
|
||||
/// never overflow `u32`.
|
||||
#[inline]
|
||||
pub const fn as_u32(&self) -> u32 {
|
||||
self.0
|
||||
}
|
||||
|
||||
/// Return the internal `u32` of this small index represented as an `i32`.
|
||||
/// This is guaranteed to never overflow an `i32`.
|
||||
#[inline]
|
||||
pub const fn as_i32(&self) -> i32 {
|
||||
// This is OK because we guarantee that our max value is <= i32::MAX.
|
||||
self.0 as i32
|
||||
}
|
||||
|
||||
/// Returns one more than this small index as a usize.
|
||||
///
|
||||
/// Since a small index has constraints on its maximum value, adding `1` to
|
||||
/// it will always fit in a `usize`, `u32` and a `i32`.
|
||||
#[inline]
|
||||
pub fn one_more(&self) -> usize {
|
||||
self.as_usize() + 1
|
||||
}
|
||||
|
||||
/// Decode this small index from the bytes given using the native endian
|
||||
/// byte order for the current target.
|
||||
///
|
||||
/// If the decoded integer is not representable as a small index for the
|
||||
/// current target, then this returns an error.
|
||||
#[inline]
|
||||
pub fn from_ne_bytes(
|
||||
bytes: [u8; 4],
|
||||
) -> Result<SmallIndex, SmallIndexError> {
|
||||
let id = u32::from_ne_bytes(bytes);
|
||||
if id > SmallIndex::MAX.as_u32() {
|
||||
return Err(SmallIndexError { attempted: u64::from(id) });
|
||||
}
|
||||
Ok(SmallIndex::new_unchecked(id.as_usize()))
|
||||
}
|
||||
|
||||
/// Decode this small index from the bytes given using the native endian
|
||||
/// byte order for the current target.
|
||||
///
|
||||
/// This is analogous to [`SmallIndex::new_unchecked`] in that is does not
|
||||
/// check whether the decoded integer is representable as a small index.
|
||||
#[inline]
|
||||
pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> SmallIndex {
|
||||
SmallIndex::new_unchecked(u32::from_ne_bytes(bytes).as_usize())
|
||||
}
|
||||
|
||||
/// Return the underlying small index integer as raw bytes in native endian
|
||||
/// format.
|
||||
#[inline]
|
||||
pub fn to_ne_bytes(&self) -> [u8; 4] {
|
||||
self.0.to_ne_bytes()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> core::ops::Index<SmallIndex> for [T] {
    type Output = T;

    /// Index a slice directly with a `SmallIndex`, as documented on the
    /// `SmallIndex` type.
    #[inline]
    fn index(&self, index: SmallIndex) -> &T {
        // Widen to `usize` and defer to standard slice indexing.
        let i = index.as_usize();
        &self[i]
    }
}
|
||||
|
||||
impl<T> core::ops::IndexMut<SmallIndex> for [T] {
    /// Mutably index a slice directly with a `SmallIndex`.
    #[inline]
    fn index_mut(&mut self, index: SmallIndex) -> &mut T {
        // Widen to `usize` and defer to standard slice indexing.
        let i = index.as_usize();
        &mut self[i]
    }
}
|
||||
|
||||
#[cfg(feature = "alloc")]
impl<T> core::ops::Index<SmallIndex> for Vec<T> {
    type Output = T;

    /// Index a `Vec` directly with a `SmallIndex` (only when the `alloc`
    /// feature is enabled).
    #[inline]
    fn index(&self, index: SmallIndex) -> &T {
        let i = index.as_usize();
        &self[i]
    }
}
|
||||
|
||||
#[cfg(feature = "alloc")]
impl<T> core::ops::IndexMut<SmallIndex> for Vec<T> {
    /// Mutably index a `Vec` directly with a `SmallIndex` (only when the
    /// `alloc` feature is enabled).
    #[inline]
    fn index_mut(&mut self, index: SmallIndex) -> &mut T {
        let i = index.as_usize();
        &mut self[i]
    }
}
|
||||
|
||||
impl From<u8> for SmallIndex {
|
||||
fn from(index: u8) -> SmallIndex {
|
||||
SmallIndex::new_unchecked(usize::from(index))
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<u16> for SmallIndex {
|
||||
type Error = SmallIndexError;
|
||||
|
||||
fn try_from(index: u16) -> Result<SmallIndex, SmallIndexError> {
|
||||
if u32::from(index) > SmallIndex::MAX.as_u32() {
|
||||
return Err(SmallIndexError { attempted: u64::from(index) });
|
||||
}
|
||||
Ok(SmallIndex::new_unchecked(index.as_usize()))
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<u32> for SmallIndex {
|
||||
type Error = SmallIndexError;
|
||||
|
||||
fn try_from(index: u32) -> Result<SmallIndex, SmallIndexError> {
|
||||
if index > SmallIndex::MAX.as_u32() {
|
||||
return Err(SmallIndexError { attempted: u64::from(index) });
|
||||
}
|
||||
Ok(SmallIndex::new_unchecked(index.as_usize()))
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<u64> for SmallIndex {
|
||||
type Error = SmallIndexError;
|
||||
|
||||
fn try_from(index: u64) -> Result<SmallIndex, SmallIndexError> {
|
||||
if index > SmallIndex::MAX.as_u64() {
|
||||
return Err(SmallIndexError { attempted: index });
|
||||
}
|
||||
Ok(SmallIndex::new_unchecked(index.as_usize()))
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<usize> for SmallIndex {
|
||||
type Error = SmallIndexError;
|
||||
|
||||
fn try_from(index: usize) -> Result<SmallIndex, SmallIndexError> {
|
||||
if index > SmallIndex::MAX.as_usize() {
|
||||
return Err(SmallIndexError { attempted: index.as_u64() });
|
||||
}
|
||||
Ok(SmallIndex::new_unchecked(index))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
impl quickcheck::Arbitrary for SmallIndex {
    /// Generate an arbitrary valid `SmallIndex` for property tests.
    fn arbitrary(gen: &mut quickcheck::Gen) -> SmallIndex {
        use core::cmp::max;

        // Clamp to i32::MIN + 1 before calling abs(), since i32::MIN has
        // no positive counterpart and negating it would overflow.
        let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs();
        // abs() guarantees a non-negative value, so the only remaining
        // invalid case is exceeding MAX, which is clamped to MAX.
        if id > SmallIndex::MAX.as_i32() {
            SmallIndex::MAX
        } else {
            SmallIndex::new(usize::try_from(id).unwrap()).unwrap()
        }
    }
}
|
||||
|
||||
/// This error occurs when a small index could not be constructed.
///
/// This occurs when given an integer exceeding the maximum small index value.
///
/// When the `std` feature is enabled, this implements the `Error` trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct SmallIndexError {
    // The rejected value, widened to a u64 so that any rejected input
    // (u16, u32, u64 or usize) can be reported exactly.
    attempted: u64,
}
|
||||
|
||||
impl SmallIndexError {
    /// Returns the value that could not be converted to a small index.
    ///
    /// The value is reported as a `u64` regardless of which integer type
    /// the failed conversion started from.
    pub fn attempted(&self) -> u64 {
        self.attempted
    }
}
|
||||
|
||||
// `Error` requires `std`; in `no_std` builds the type still implements
// `Display` below.
#[cfg(feature = "std")]
impl std::error::Error for SmallIndexError {}

impl core::fmt::Display for SmallIndexError {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // `{:?}` is used for SmallIndex::MAX since SmallIndex derives
        // Debug but does not implement Display.
        write!(
            f,
            "failed to create small index from {:?}, which exceeds {:?}",
            self.attempted(),
            SmallIndex::MAX,
        )
    }
}
|
||||
|
||||
/// An iterator over a contiguous range of small index values.
///
/// Callers constructing this are expected to ensure that `rng.end` is itself
/// representable as a small index (see the comment in `next`).
#[derive(Clone, Debug)]
pub(crate) struct SmallIndexIter {
    // The half-open range of index values yet to be yielded.
    rng: core::ops::Range<usize>,
}
|
||||
|
||||
impl Iterator for SmallIndexIter {
|
||||
type Item = SmallIndex;
|
||||
|
||||
fn next(&mut self) -> Option<SmallIndex> {
|
||||
if self.rng.start >= self.rng.end {
|
||||
return None;
|
||||
}
|
||||
let next_id = self.rng.start + 1;
|
||||
let id = core::mem::replace(&mut self.rng.start, next_id);
|
||||
// new_unchecked is OK since we asserted that the number of
|
||||
// elements in this iterator will fit in an ID at construction.
|
||||
Some(SmallIndex::new_unchecked(id))
|
||||
}
|
||||
}
|
||||
|
||||
// Generates the public API for a "small index"-backed identifier type
// (e.g. PatternID or StateID): constants, constructors, conversions,
// slice/Vec indexing, its error type and its iterator/enumeration types.
macro_rules! index_type_impls {
    ($name:ident, $err:ident, $iter:ident, $withiter:ident) => {
        impl $name {
            /// The maximum value.
            pub const MAX: $name = $name(SmallIndex::MAX);

            /// The total number of values that can be represented.
            pub const LIMIT: usize = SmallIndex::LIMIT;

            /// The zero value.
            pub const ZERO: $name = $name(SmallIndex::ZERO);

            /// The number of bytes that a single value uses in memory.
            pub const SIZE: usize = SmallIndex::SIZE;

            /// Create a new value that is represented by a "small index."
            ///
            /// If the given index exceeds the maximum allowed value, then this
            /// returns an error.
            #[inline]
            pub fn new(value: usize) -> Result<$name, $err> {
                SmallIndex::new(value).map($name).map_err($err)
            }

            /// Create a new value without checking whether the given argument
            /// exceeds the maximum.
            ///
            /// Using this routine with an invalid value will result in
            /// unspecified behavior, but *not* undefined behavior. In
            /// particular, an invalid ID value is likely to cause panics or
            /// possibly even silent logical errors.
            ///
            /// Callers must never rely on this type to be within a certain
            /// range for memory safety.
            #[inline]
            pub const fn new_unchecked(value: usize) -> $name {
                $name(SmallIndex::new_unchecked(value))
            }

            /// Like `new`, but panics if the given value is not valid.
            #[inline]
            pub fn must(value: usize) -> $name {
                $name::new(value).expect(concat!(
                    "invalid ",
                    stringify!($name),
                    " value"
                ))
            }

            /// Return the internal value as a `usize`. This is guaranteed to
            /// never overflow `usize`.
            #[inline]
            pub const fn as_usize(&self) -> usize {
                self.0.as_usize()
            }

            /// Return the internal value as a `u64`. This is guaranteed to
            /// never overflow.
            #[inline]
            pub const fn as_u64(&self) -> u64 {
                self.0.as_u64()
            }

            /// Return the internal value as a `u32`. This is guaranteed to
            /// never overflow `u32`.
            #[inline]
            pub const fn as_u32(&self) -> u32 {
                self.0.as_u32()
            }

            /// Return the internal value as an `i32`. This is guaranteed to
            /// never overflow an `i32`.
            #[inline]
            pub const fn as_i32(&self) -> i32 {
                self.0.as_i32()
            }

            /// Returns one more than this value as a usize.
            ///
            /// Since values represented by a "small index" have constraints
            /// on their maximum value, adding `1` to it will always fit in a
            /// `usize`, `u32` and a `i32`.
            #[inline]
            pub fn one_more(&self) -> usize {
                self.0.one_more()
            }

            /// Decode this value from the bytes given using the native endian
            /// byte order for the current target.
            ///
            /// If the decoded integer is not representable as a small index
            /// for the current target, then this returns an error.
            #[inline]
            pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<$name, $err> {
                SmallIndex::from_ne_bytes(bytes).map($name).map_err($err)
            }

            /// Decode this value from the bytes given using the native endian
            /// byte order for the current target.
            ///
            /// This is analogous to `new_unchecked` in that is does not check
            /// whether the decoded integer is representable as a small index.
            #[inline]
            pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> $name {
                $name(SmallIndex::from_ne_bytes_unchecked(bytes))
            }

            /// Return the underlying integer as raw bytes in native endian
            /// format.
            #[inline]
            pub fn to_ne_bytes(&self) -> [u8; 4] {
                self.0.to_ne_bytes()
            }

            /// Returns an iterator over all values from 0 up to and not
            /// including the given length.
            ///
            /// If the given length exceeds this type's limit, then this
            /// panics.
            pub(crate) fn iter(len: usize) -> $iter {
                $iter::new(len)
            }
        }

        // We write our own Debug impl so that we get things like PatternID(5)
        // instead of PatternID(SmallIndex(5)).
        impl core::fmt::Debug for $name {
            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
                f.debug_tuple(stringify!($name)).field(&self.as_u32()).finish()
            }
        }

        impl<T> core::ops::Index<$name> for [T] {
            type Output = T;

            #[inline]
            fn index(&self, index: $name) -> &T {
                &self[index.as_usize()]
            }
        }

        impl<T> core::ops::IndexMut<$name> for [T] {
            #[inline]
            fn index_mut(&mut self, index: $name) -> &mut T {
                &mut self[index.as_usize()]
            }
        }

        #[cfg(feature = "alloc")]
        impl<T> core::ops::Index<$name> for Vec<T> {
            type Output = T;

            #[inline]
            fn index(&self, index: $name) -> &T {
                &self[index.as_usize()]
            }
        }

        #[cfg(feature = "alloc")]
        impl<T> core::ops::IndexMut<$name> for Vec<T> {
            #[inline]
            fn index_mut(&mut self, index: $name) -> &mut T {
                &mut self[index.as_usize()]
            }
        }

        impl From<u8> for $name {
            fn from(value: u8) -> $name {
                $name(SmallIndex::from(value))
            }
        }

        impl TryFrom<u16> for $name {
            type Error = $err;

            fn try_from(value: u16) -> Result<$name, $err> {
                SmallIndex::try_from(value).map($name).map_err($err)
            }
        }

        impl TryFrom<u32> for $name {
            type Error = $err;

            fn try_from(value: u32) -> Result<$name, $err> {
                SmallIndex::try_from(value).map($name).map_err($err)
            }
        }

        impl TryFrom<u64> for $name {
            type Error = $err;

            fn try_from(value: u64) -> Result<$name, $err> {
                SmallIndex::try_from(value).map($name).map_err($err)
            }
        }

        impl TryFrom<usize> for $name {
            type Error = $err;

            fn try_from(value: usize) -> Result<$name, $err> {
                SmallIndex::try_from(value).map($name).map_err($err)
            }
        }

        #[cfg(test)]
        impl quickcheck::Arbitrary for $name {
            fn arbitrary(gen: &mut quickcheck::Gen) -> $name {
                $name(SmallIndex::arbitrary(gen))
            }
        }

        /// This error occurs when a value could not be constructed.
        ///
        /// This occurs when given an integer exceeding the maximum allowed
        /// value.
        ///
        /// When the `std` feature is enabled, this implements the `Error`
        /// trait.
        #[derive(Clone, Debug, Eq, PartialEq)]
        pub struct $err(SmallIndexError);

        impl $err {
            /// Returns the value that could not be converted to an ID.
            pub fn attempted(&self) -> u64 {
                self.0.attempted()
            }
        }

        #[cfg(feature = "std")]
        impl std::error::Error for $err {}

        impl core::fmt::Display for $err {
            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
                write!(
                    f,
                    "failed to create {} from {:?}, which exceeds {:?}",
                    stringify!($name),
                    self.attempted(),
                    $name::MAX,
                )
            }
        }

        #[derive(Clone, Debug)]
        pub(crate) struct $iter(SmallIndexIter);

        impl $iter {
            fn new(len: usize) -> $iter {
                assert!(
                    len <= $name::LIMIT,
                    "cannot create iterator for {} when number of \
                     elements exceed {:?}",
                    stringify!($name),
                    $name::LIMIT,
                );
                $iter(SmallIndexIter { rng: 0..len })
            }
        }

        impl Iterator for $iter {
            type Item = $name;

            fn next(&mut self) -> Option<$name> {
                self.0.next().map($name)
            }
        }

        /// An iterator adapter that is like std::iter::Enumerate, but attaches
        /// small index values instead. It requires `ExactSizeIterator`. At
        /// construction, it ensures that the index of each element in the
        /// iterator is representable in the corresponding small index type.
        #[derive(Clone, Debug)]
        pub(crate) struct $withiter<I> {
            it: I,
            ids: $iter,
        }

        impl<I: Iterator + ExactSizeIterator> $withiter<I> {
            fn new(it: I) -> $withiter<I> {
                let ids = $name::iter(it.len());
                $withiter { it, ids }
            }
        }

        impl<I: Iterator + ExactSizeIterator> Iterator for $withiter<I> {
            type Item = ($name, I::Item);

            fn next(&mut self) -> Option<($name, I::Item)> {
                let item = self.it.next()?;
                // Number of elements in this iterator must match, according
                // to contract of ExactSizeIterator.
                let id = self.ids.next().unwrap();
                Some((id, item))
            }
        }
    };
}
|
||||
|
||||
/// The identifier of a regex pattern, represented by a [`SmallIndex`].
///
/// The identifier for a pattern corresponds to its relative position among
/// other patterns in a single finite state machine. Namely, when building
/// a multi-pattern regex engine, one must supply a sequence of patterns to
/// match. The position (starting at 0) of each pattern in that sequence
/// represents its identifier. This identifier is in turn used to identify and
/// report matches of that pattern in various APIs.
///
/// See the [`SmallIndex`] type for more information about what it means for
/// a pattern ID to be a "small index."
///
/// Note that this type is defined in the
/// [`util::primitives`](crate::util::primitives) module, but it is also
/// re-exported at the crate root due to how common it is.
// Debug is deliberately not derived; the `index_type_impls!` macro below
// writes one that prints `PatternID(5)` instead of `PatternID(SmallIndex(5))`.
#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct PatternID(SmallIndex);
|
||||
|
||||
/// The identifier of a finite automaton state, represented by a
/// [`SmallIndex`].
///
/// Most regex engines in this crate are built on top of finite automata. Each
/// state in a finite automaton defines transitions from its state to another.
/// Those transitions point to other states via their identifiers, i.e., a
/// `StateID`. Since finite automata tend to contain many transitions, it is
/// much more memory efficient to define state IDs as small indices.
///
/// See the [`SmallIndex`] type for more information about what it means for
/// a state ID to be a "small index."
// Debug is deliberately not derived; the `index_type_impls!` macro below
// writes one that prints `StateID(5)` instead of `StateID(SmallIndex(5))`.
#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct StateID(SmallIndex);
|
||||
|
||||
// Generate the full APIs for PatternID and StateID, along with their
// companion error, iterator and enumeration-adapter types.
index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter);
index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter);
|
||||
|
||||
/// A utility trait that defines a couple of adapters for making it convenient
/// to access indices as "small index" types. We require ExactSizeIterator so
/// that iterator construction can do a single check to make sure the index of
/// each element is representable by its small index type.
pub(crate) trait IteratorIndexExt: Iterator {
    /// Adapt this iterator to yield `(PatternID, item)` pairs.
    ///
    /// Panics at construction if the iterator's length exceeds
    /// `PatternID::LIMIT` (see the generated `WithPatternIDIter::new`).
    fn with_pattern_ids(self) -> WithPatternIDIter<Self>
    where
        Self: Sized + ExactSizeIterator,
    {
        WithPatternIDIter::new(self)
    }

    /// Adapt this iterator to yield `(StateID, item)` pairs.
    ///
    /// Panics at construction if the iterator's length exceeds
    /// `StateID::LIMIT` (see the generated `WithStateIDIter::new`).
    fn with_state_ids(self) -> WithStateIDIter<Self>
    where
        Self: Sized + ExactSizeIterator,
    {
        WithStateIDIter::new(self)
    }
}

// Blanket impl: every iterator gets the adapters; the length check happens
// in the adapter constructors, which require ExactSizeIterator.
impl<I: Iterator> IteratorIndexExt for I {}
|
||||
1993
third-party/vendor/regex-automata/src/util/search.rs
vendored
Normal file
1993
third-party/vendor/regex-automata/src/util/search.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
239
third-party/vendor/regex-automata/src/util/sparse_set.rs
vendored
Normal file
239
third-party/vendor/regex-automata/src/util/sparse_set.rs
vendored
Normal file
|
|
@ -0,0 +1,239 @@
|
|||
/*!
|
||||
This module defines a sparse set data structure. Its most interesting
|
||||
properties are:
|
||||
|
||||
* They preserve insertion order.
|
||||
* Set membership testing is done in constant time.
|
||||
* Set insertion is done in constant time.
|
||||
* Clearing the set is done in constant time.
|
||||
|
||||
The cost for doing this is that the capacity of the set needs to be known up
|
||||
front, and the elements in the set are limited to state identifiers.
|
||||
|
||||
These sets are principally used when traversing an NFA state graph. This
|
||||
happens at search time, for example, in the PikeVM. It also happens during DFA
|
||||
determinization.
|
||||
*/
|
||||
|
||||
use alloc::{vec, vec::Vec};
|
||||
|
||||
use crate::util::primitives::StateID;
|
||||
|
||||
/// A pair of sparse sets.
///
/// This is useful when one needs to compute NFA epsilon closures from a
/// previous set of states derived from an epsilon closure. One set can be the
/// starting states whereas the other set can be the destination states after
/// following the transitions for a particular byte of input.
///
/// There is no significance to 'set1' or 'set2'. They are both sparse sets of
/// the same size.
///
/// The members of this struct are exposed so that callers may borrow 'set1'
/// and 'set2' individually without being forced to borrow both at the same
/// time.
#[derive(Clone, Debug)]
pub(crate) struct SparseSets {
    pub(crate) set1: SparseSet,
    pub(crate) set2: SparseSet,
}
|
||||
|
||||
impl SparseSets {
|
||||
/// Create a new pair of sparse sets where each set has the given capacity.
|
||||
///
|
||||
/// This panics if the capacity given is bigger than `StateID::LIMIT`.
|
||||
pub(crate) fn new(capacity: usize) -> SparseSets {
|
||||
SparseSets {
|
||||
set1: SparseSet::new(capacity),
|
||||
set2: SparseSet::new(capacity),
|
||||
}
|
||||
}
|
||||
|
||||
/// Resizes these sparse sets to have the new capacity given.
|
||||
///
|
||||
/// The sets are automatically cleared.
|
||||
///
|
||||
/// This panics if the capacity given is bigger than `StateID::LIMIT`.
|
||||
#[inline]
|
||||
pub(crate) fn resize(&mut self, new_capacity: usize) {
|
||||
self.set1.resize(new_capacity);
|
||||
self.set2.resize(new_capacity);
|
||||
}
|
||||
|
||||
/// Clear both sparse sets.
|
||||
pub(crate) fn clear(&mut self) {
|
||||
self.set1.clear();
|
||||
self.set2.clear();
|
||||
}
|
||||
|
||||
/// Swap set1 with set2.
|
||||
pub(crate) fn swap(&mut self) {
|
||||
core::mem::swap(&mut self.set1, &mut self.set2);
|
||||
}
|
||||
|
||||
/// Returns the memory usage, in bytes, used by this pair of sparse sets.
|
||||
pub(crate) fn memory_usage(&self) -> usize {
|
||||
self.set1.memory_usage() + self.set2.memory_usage()
|
||||
}
|
||||
}
|
||||
|
||||
/// A sparse set used for representing ordered NFA states.
///
/// This supports constant time addition and membership testing. Clearing an
/// entire set can also be done in constant time. Iteration yields elements
/// in the order in which they were inserted.
///
/// The data structure is based on: <https://research.swtch.com/sparse>
/// Note though that we don't actually use uninitialized memory. We generally
/// reuse sparse sets, so the initial allocation cost is bearable. However, its
/// other properties listed above are extremely useful.
#[derive(Clone)]
pub(crate) struct SparseSet {
    /// The number of elements currently in this set.
    len: usize,
    /// Dense contains the ids in the order in which they were inserted.
    dense: Vec<StateID>,
    /// Sparse maps ids to their location in dense.
    ///
    /// A state ID is in the set if and only if
    /// sparse[id] < len && id == dense[sparse[id]].
    ///
    /// Note that these are indices into 'dense'. It's a little weird to use
    /// StateID here, but we know our length can never exceed the bounds of
    /// StateID (enforced by 'resize') and StateID will be at most 4 bytes
    /// whereas a usize is likely double that in most cases.
    sparse: Vec<StateID>,
}
|
||||
|
||||
impl SparseSet {
|
||||
/// Create a new sparse set with the given capacity.
|
||||
///
|
||||
/// Sparse sets have a fixed size and they cannot grow. Attempting to
|
||||
/// insert more distinct elements than the total capacity of the set will
|
||||
/// result in a panic.
|
||||
///
|
||||
/// This panics if the capacity given is bigger than `StateID::LIMIT`.
|
||||
#[inline]
|
||||
pub(crate) fn new(capacity: usize) -> SparseSet {
|
||||
let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] };
|
||||
set.resize(capacity);
|
||||
set
|
||||
}
|
||||
|
||||
/// Resizes this sparse set to have the new capacity given.
|
||||
///
|
||||
/// This set is automatically cleared.
|
||||
///
|
||||
/// This panics if the capacity given is bigger than `StateID::LIMIT`.
|
||||
#[inline]
|
||||
pub(crate) fn resize(&mut self, new_capacity: usize) {
|
||||
assert!(
|
||||
new_capacity <= StateID::LIMIT,
|
||||
"sparse set capacity cannot excced {:?}",
|
||||
StateID::LIMIT
|
||||
);
|
||||
self.clear();
|
||||
self.dense.resize(new_capacity, StateID::ZERO);
|
||||
self.sparse.resize(new_capacity, StateID::ZERO);
|
||||
}
|
||||
|
||||
/// Returns the capacity of this set.
|
||||
///
|
||||
/// The capacity represents a fixed limit on the number of distinct
|
||||
/// elements that are allowed in this set. The capacity cannot be changed.
|
||||
#[inline]
|
||||
pub(crate) fn capacity(&self) -> usize {
|
||||
self.dense.len()
|
||||
}
|
||||
|
||||
/// Returns the number of elements in this set.
|
||||
#[inline]
|
||||
pub(crate) fn len(&self) -> usize {
|
||||
self.len
|
||||
}
|
||||
|
||||
/// Returns true if and only if this set is empty.
|
||||
#[inline]
|
||||
pub(crate) fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
/// Insert the state ID value into this set and return true if the given
|
||||
/// state ID was not previously in this set.
|
||||
///
|
||||
/// This operation is idempotent. If the given value is already in this
|
||||
/// set, then this is a no-op.
|
||||
///
|
||||
/// If more than `capacity` ids are inserted, then this panics.
|
||||
///
|
||||
/// This is marked as inline(always) since the compiler won't inline it
|
||||
/// otherwise, and it's a fairly hot piece of code in DFA determinization.
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn insert(&mut self, id: StateID) -> bool {
|
||||
if self.contains(id) {
|
||||
return false;
|
||||
}
|
||||
|
||||
let i = self.len();
|
||||
assert!(
|
||||
i < self.capacity(),
|
||||
"{:?} exceeds capacity of {:?} when inserting {:?}",
|
||||
i,
|
||||
self.capacity(),
|
||||
id,
|
||||
);
|
||||
// OK since i < self.capacity() and self.capacity() is guaranteed to
|
||||
// be <= StateID::LIMIT.
|
||||
let index = StateID::new_unchecked(i);
|
||||
self.dense[index] = id;
|
||||
self.sparse[id] = index;
|
||||
self.len += 1;
|
||||
true
|
||||
}
|
||||
|
||||
/// Returns true if and only if this set contains the given value.
|
||||
#[inline]
|
||||
pub(crate) fn contains(&self, id: StateID) -> bool {
|
||||
let index = self.sparse[id];
|
||||
index.as_usize() < self.len() && self.dense[index] == id
|
||||
}
|
||||
|
||||
/// Clear this set such that it has no members.
|
||||
#[inline]
|
||||
pub(crate) fn clear(&mut self) {
|
||||
self.len = 0;
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn iter(&self) -> SparseSetIter<'_> {
|
||||
SparseSetIter(self.dense[..self.len()].iter())
|
||||
}
|
||||
|
||||
/// Returns the heap memory usage, in bytes, used by this sparse set.
|
||||
#[inline]
|
||||
pub(crate) fn memory_usage(&self) -> usize {
|
||||
self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE
|
||||
}
|
||||
}
|
||||
|
||||
impl core::fmt::Debug for SparseSet {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
let elements: Vec<StateID> = self.iter().collect();
|
||||
f.debug_tuple("SparseSet").field(&elements).finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over all elements in a sparse set.
///
/// The lifetime `'a` refers to the lifetime of the set being iterated over.
// Wraps a slice iterator over the first `len` entries of the set's 'dense'
// vector (see `SparseSet::iter`), so elements come out in insertion order.
#[derive(Debug)]
pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);
|
||||
|
||||
impl<'a> Iterator for SparseSetIter<'a> {
|
||||
type Item = StateID;
|
||||
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
fn next(&mut self) -> Option<StateID> {
|
||||
self.0.next().map(|&id| id)
|
||||
}
|
||||
}
|
||||
479
third-party/vendor/regex-automata/src/util/start.rs
vendored
Normal file
479
third-party/vendor/regex-automata/src/util/start.rs
vendored
Normal file
|
|
@ -0,0 +1,479 @@
|
|||
/*!
|
||||
Provides helpers for dealing with start state configurations in DFAs.
|
||||
*/
|
||||
|
||||
use crate::util::{
|
||||
look::LookMatcher,
|
||||
search::{Anchored, Input},
|
||||
wire::{self, DeserializeError, SerializeError},
|
||||
};
|
||||
|
||||
/// The configuration used to determine a DFA's start state for a search.
|
||||
///
|
||||
/// A DFA has a single starting state in the typical textbook description. That
|
||||
/// is, it corresponds to the set of all starting states for the NFA that built
|
||||
/// it, along with their espsilon closures. In this crate, however, DFAs have
|
||||
/// many possible start states due to a few factors:
|
||||
///
|
||||
/// * DFAs support the ability to run either anchored or unanchored searches.
|
||||
/// Each type of search needs its own start state. For example, an unanchored
|
||||
/// search requires starting at a state corresponding to a regex with a
|
||||
/// `(?s-u:.)*?` prefix, which will match through anything.
|
||||
/// * DFAs also optionally support starting an anchored search for any one
|
||||
/// specific pattern. Each such pattern requires its own start state.
|
||||
/// * If a look-behind assertion like `^` or `\b` is used in the regex, then
|
||||
/// the DFA will need to inspect a single byte immediately before the start of
|
||||
/// the search to choose the correct start state.
|
||||
///
|
||||
/// Indeed, this configuration precisely encapsulates all of the above factors.
|
||||
/// The [`Config::anchored`] method sets which kind of anchored search to
|
||||
/// perform while the [`Config::look_behind`] method provides a way to set
|
||||
/// the byte that occurs immediately before the start of the search.
|
||||
///
|
||||
/// Generally speaking, this type is only useful when you want to run searches
|
||||
/// without using an [`Input`]. In particular, an `Input` wants a haystack
|
||||
/// slice, but callers may not have a contiguous sequence of bytes as a
|
||||
/// haystack in all cases. This type provides a lower level of control such
|
||||
/// that callers can provide their own anchored configuration and look-behind
|
||||
/// byte explicitly.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This shows basic usage that permits running a search with a DFA without
|
||||
/// using the `Input` abstraction.
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{
|
||||
/// dfa::{Automaton, dense},
|
||||
/// util::start,
|
||||
/// Anchored,
|
||||
/// };
|
||||
///
|
||||
/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
|
||||
/// let haystack = "quartz";
|
||||
///
|
||||
/// let config = start::Config::new().anchored(Anchored::Yes);
|
||||
/// let mut state = dfa.start_state(&config)?;
|
||||
/// for &b in haystack.as_bytes().iter() {
|
||||
/// state = dfa.next_state(state, b);
|
||||
/// }
|
||||
/// state = dfa.next_eoi_state(state);
|
||||
/// assert!(dfa.is_match_state(state));
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
///
|
||||
/// This example shows how to correctly run a search that doesn't begin at
|
||||
/// the start of a haystack. Notice how we set the look-behind byte, and as
|
||||
/// a result, the `\b` assertion does not match.
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{
|
||||
/// dfa::{Automaton, dense},
|
||||
/// util::start,
|
||||
/// Anchored,
|
||||
/// };
|
||||
///
|
||||
/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
|
||||
/// let haystack = "quartz";
|
||||
///
|
||||
/// let config = start::Config::new()
|
||||
/// .anchored(Anchored::Yes)
|
||||
/// .look_behind(Some(b'q'));
|
||||
/// let mut state = dfa.start_state(&config)?;
|
||||
/// for &b in haystack.as_bytes().iter().skip(1) {
|
||||
/// state = dfa.next_state(state, b);
|
||||
/// }
|
||||
/// state = dfa.next_eoi_state(state);
|
||||
/// // No match!
|
||||
/// assert!(!dfa.is_match_state(state));
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
///
|
||||
/// If we had instead not set a look-behind byte, then the DFA would assume
|
||||
/// that it was starting at the beginning of the haystack, and thus `\b` should
|
||||
/// match. This in turn would result in erroneously reporting a match:
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{
|
||||
/// dfa::{Automaton, dense},
|
||||
/// util::start,
|
||||
/// Anchored,
|
||||
/// };
|
||||
///
|
||||
/// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?;
|
||||
/// let haystack = "quartz";
|
||||
///
|
||||
/// // Whoops, forgot the look-behind byte...
|
||||
/// let config = start::Config::new().anchored(Anchored::Yes);
|
||||
/// let mut state = dfa.start_state(&config)?;
|
||||
/// for &b in haystack.as_bytes().iter().skip(1) {
|
||||
/// state = dfa.next_state(state, b);
|
||||
/// }
|
||||
/// state = dfa.next_eoi_state(state);
|
||||
/// // And now we get a match unexpectedly.
|
||||
/// assert!(dfa.is_match_state(state));
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Config {
|
||||
look_behind: Option<u8>,
|
||||
anchored: Anchored,
|
||||
}
|
||||
|
||||
impl Config {
|
||||
/// Create a new default start configuration.
|
||||
///
|
||||
/// The default is an unanchored search that starts at the beginning of the
|
||||
/// haystack.
|
||||
pub fn new() -> Config {
|
||||
Config { anchored: Anchored::No, look_behind: None }
|
||||
}
|
||||
|
||||
/// A convenience routine for building a start configuration from an
|
||||
/// [`Input`] for a forward search.
|
||||
///
|
||||
/// This automatically sets the look-behind byte to the byte immediately
|
||||
/// preceding the start of the search. If the start of the search is at
|
||||
/// offset `0`, then no look-behind byte is set.
|
||||
pub fn from_input_forward(input: &Input<'_>) -> Config {
|
||||
let look_behind = input
|
||||
.start()
|
||||
.checked_sub(1)
|
||||
.and_then(|i| input.haystack().get(i).copied());
|
||||
Config { look_behind, anchored: input.get_anchored() }
|
||||
}
|
||||
|
||||
/// A convenience routine for building a start configuration from an
|
||||
/// [`Input`] for a reverse search.
|
||||
///
|
||||
/// This automatically sets the look-behind byte to the byte immediately
|
||||
/// following the end of the search. If the end of the search is at
|
||||
/// offset `haystack.len()`, then no look-behind byte is set.
|
||||
pub fn from_input_reverse(input: &Input<'_>) -> Config {
|
||||
let look_behind = input.haystack().get(input.end()).copied();
|
||||
Config { look_behind, anchored: input.get_anchored() }
|
||||
}
|
||||
|
||||
/// Set the look-behind byte at the start of a search.
|
||||
///
|
||||
/// Unless the search is intended to logically start at the beginning of a
|
||||
/// haystack, this should _always_ be set to the byte immediately preceding
|
||||
/// the start of the search. If no look-behind byte is set, then the start
|
||||
/// configuration will assume it is at the beginning of the haystack. For
|
||||
/// example, the anchor `^` will match.
|
||||
///
|
||||
/// The default is that no look-behind byte is set.
|
||||
pub fn look_behind(mut self, byte: Option<u8>) -> Config {
|
||||
self.look_behind = byte;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the anchored mode of a search.
|
||||
///
|
||||
/// The default is an unanchored search.
|
||||
pub fn anchored(mut self, mode: Anchored) -> Config {
|
||||
self.anchored = mode;
|
||||
self
|
||||
}
|
||||
|
||||
/// Return the look-behind byte in this configuration, if one exists.
|
||||
pub fn get_look_behind(&self) -> Option<u8> {
|
||||
self.look_behind
|
||||
}
|
||||
|
||||
/// Return the anchored mode in this configuration.
|
||||
pub fn get_anchored(&self) -> Anchored {
|
||||
self.anchored
|
||||
}
|
||||
}
|
||||
|
||||
/// A map from every possible byte value to its corresponding starting
|
||||
/// configuration.
|
||||
///
|
||||
/// This map is used in order to lookup the start configuration for a particular
|
||||
/// position in a haystack. This start configuration is then used in
|
||||
/// combination with things like the anchored mode and pattern ID to fully
|
||||
/// determine the start state.
|
||||
///
|
||||
/// Generally speaking, this map is only used for fully compiled DFAs and lazy
|
||||
/// DFAs. For NFAs (including the one-pass DFA), the start state is generally
|
||||
/// selected by virtue of traversing the NFA state graph. DFAs do the same
|
||||
/// thing, but at build time and not search time. (Well, technically the lazy
|
||||
/// DFA does it at search time, but it does enough work to cache the full
|
||||
/// result of the epsilon closure that the NFA engines tend to need to do.)
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct StartByteMap {
|
||||
map: [Start; 256],
|
||||
}
|
||||
|
||||
impl StartByteMap {
|
||||
/// Create a new map from byte values to their corresponding starting
|
||||
/// configurations. The map is determined, in part, by how look-around
|
||||
/// assertions are matched via the matcher given.
|
||||
pub(crate) fn new(lookm: &LookMatcher) -> StartByteMap {
|
||||
let mut map = [Start::NonWordByte; 256];
|
||||
map[usize::from(b'\n')] = Start::LineLF;
|
||||
map[usize::from(b'\r')] = Start::LineCR;
|
||||
map[usize::from(b'_')] = Start::WordByte;
|
||||
|
||||
let mut byte = b'0';
|
||||
while byte <= b'9' {
|
||||
map[usize::from(byte)] = Start::WordByte;
|
||||
byte += 1;
|
||||
}
|
||||
byte = b'A';
|
||||
while byte <= b'Z' {
|
||||
map[usize::from(byte)] = Start::WordByte;
|
||||
byte += 1;
|
||||
}
|
||||
byte = b'a';
|
||||
while byte <= b'z' {
|
||||
map[usize::from(byte)] = Start::WordByte;
|
||||
byte += 1;
|
||||
}
|
||||
|
||||
let lineterm = lookm.get_line_terminator();
|
||||
// If our line terminator is normal, then it is already handled by
|
||||
// the LineLF and LineCR configurations. But if it's weird, then we
|
||||
// overwrite whatever was there before for that terminator with a
|
||||
// special configuration. The trick here is that if the terminator
|
||||
// is, say, a word byte like `a`, then callers seeing this start
|
||||
// configuration need to account for that and build their DFA state as
|
||||
// if it *also* came from a word byte.
|
||||
if lineterm != b'\r' && lineterm != b'\n' {
|
||||
map[usize::from(lineterm)] = Start::CustomLineTerminator;
|
||||
}
|
||||
StartByteMap { map }
|
||||
}
|
||||
|
||||
/// Return the starting configuration for the given look-behind byte.
|
||||
///
|
||||
/// If no look-behind exists, callers should use `Start::Text`.
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn get(&self, byte: u8) -> Start {
|
||||
self.map[usize::from(byte)]
|
||||
}
|
||||
|
||||
/// Deserializes a byte class map from the given slice. If the slice is of
|
||||
/// insufficient length or otherwise contains an impossible mapping, then
|
||||
/// an error is returned. Upon success, the number of bytes read along with
|
||||
/// the map are returned. The number of bytes read is always a multiple of
|
||||
/// 8.
|
||||
pub(crate) fn from_bytes(
|
||||
slice: &[u8],
|
||||
) -> Result<(StartByteMap, usize), DeserializeError> {
|
||||
wire::check_slice_len(slice, 256, "start byte map")?;
|
||||
let mut map = [Start::NonWordByte; 256];
|
||||
for (i, &repr) in slice[..256].iter().enumerate() {
|
||||
map[i] = match Start::from_usize(usize::from(repr)) {
|
||||
Some(start) => start,
|
||||
None => {
|
||||
return Err(DeserializeError::generic(
|
||||
"found invalid starting configuration",
|
||||
))
|
||||
}
|
||||
};
|
||||
}
|
||||
Ok((StartByteMap { map }, 256))
|
||||
}
|
||||
|
||||
/// Writes this map to the given byte buffer. if the given buffer is too
|
||||
/// small, then an error is returned. Upon success, the total number of
|
||||
/// bytes written is returned. The number of bytes written is guaranteed to
|
||||
/// be a multiple of 8.
|
||||
pub(crate) fn write_to(
|
||||
&self,
|
||||
dst: &mut [u8],
|
||||
) -> Result<usize, SerializeError> {
|
||||
let nwrite = self.write_to_len();
|
||||
if dst.len() < nwrite {
|
||||
return Err(SerializeError::buffer_too_small("start byte map"));
|
||||
}
|
||||
for (i, &start) in self.map.iter().enumerate() {
|
||||
dst[i] = start.as_u8();
|
||||
}
|
||||
Ok(nwrite)
|
||||
}
|
||||
|
||||
/// Returns the total number of bytes written by `write_to`.
|
||||
pub(crate) fn write_to_len(&self) -> usize {
|
||||
256
|
||||
}
|
||||
}
|
||||
|
||||
impl core::fmt::Debug for StartByteMap {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
use crate::util::escape::DebugByte;
|
||||
|
||||
write!(f, "StartByteMap{{")?;
|
||||
for byte in 0..=255 {
|
||||
if byte > 0 {
|
||||
write!(f, ", ")?;
|
||||
}
|
||||
let start = self.map[usize::from(byte)];
|
||||
write!(f, "{:?} => {:?}", DebugByte(byte), start)?;
|
||||
}
|
||||
write!(f, "}}")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Represents the six possible starting configurations of a DFA search.
|
||||
///
|
||||
/// The starting configuration is determined by inspecting the the beginning
|
||||
/// of the haystack (up to 1 byte). Ultimately, this along with a pattern ID
|
||||
/// (if specified) and the type of search (anchored or not) is what selects the
|
||||
/// start state to use in a DFA.
|
||||
///
|
||||
/// As one example, if a DFA only supports unanchored searches and does not
|
||||
/// support anchored searches for each pattern, then it will have at most 6
|
||||
/// distinct start states. (Some start states may be reused if determinization
|
||||
/// can determine that they will be equivalent.) If the DFA supports both
|
||||
/// anchored and unanchored searches, then it will have a maximum of 12
|
||||
/// distinct start states. Finally, if the DFA also supports anchored searches
|
||||
/// for each pattern, then it can have up to `12 + (N * 6)` start states, where
|
||||
/// `N` is the number of patterns.
|
||||
///
|
||||
/// Handling each of these starting configurations in the context of DFA
|
||||
/// determinization can be *quite* tricky and subtle. But the code is small
|
||||
/// and can be found at `crate::util::determinize::set_lookbehind_from_start`.
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum Start {
    /// This occurs when the starting position is not any of the ones below.
    NonWordByte = 0,
    /// The byte immediately preceding the start of the search is an ASCII
    /// word byte.
    WordByte = 1,
    /// The starting position of the search corresponds to the beginning of
    /// the haystack.
    Text = 2,
    /// The byte immediately preceding the start of the search is a line
    /// terminator. Specifically, `\n`.
    LineLF = 3,
    /// The byte immediately preceding the start of the search is a line
    /// terminator. Specifically, `\r`.
    LineCR = 4,
    /// A custom line terminator has been set via a `LookMatcher`, and that
    /// line terminator is neither a `\r` nor a `\n`.
    ///
    /// If the custom line terminator is a word byte, then this start
    /// configuration is still selected. DFAs that implement word boundary
    /// assertions will likely need to check whether the custom line
    /// terminator is a word byte, in which case, it should behave as if the
    /// byte satisfies `\b` in addition to multi-line anchors.
    CustomLineTerminator = 5,
}

impl Start {
    /// Return the starting state corresponding to the given integer. If no
    /// starting state exists for the given integer, then None is returned.
    pub(crate) fn from_usize(n: usize) -> Option<Start> {
        // The variants listed in discriminant order; a checked index
        // replaces the previous exhaustive `match` with identical results.
        const ALL: [Start; 6] = [
            Start::NonWordByte,
            Start::WordByte,
            Start::Text,
            Start::LineLF,
            Start::LineCR,
            Start::CustomLineTerminator,
        ];
        ALL.get(n).copied()
    }

    /// Returns the total number of starting state configurations.
    pub(crate) fn len() -> usize {
        6
    }

    /// Return this starting configuration as a `u8` integer. It is
    /// guaranteed to be less than `Start::len()`.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn as_u8(&self) -> u8 {
        // AFAIK, 'as' is the only way to zero-cost convert an int enum to
        // an actual int.
        *self as u8
    }

    /// Return this starting configuration as a `usize` integer. It is
    /// guaranteed to be less than `Start::len()`.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn as_usize(&self) -> usize {
        usize::from(self.as_u8())
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Compute the starting configuration the way the DFA engines do: from
    /// the look-behind byte when one exists, and `Start::Text` otherwise.
    fn start_of(smap: &StartByteMap, config: &Config) -> Start {
        match config.get_look_behind() {
            None => Start::Text,
            Some(byte) => smap.get(byte),
        }
    }

    #[test]
    fn start_fwd_done_range() {
        let smap = StartByteMap::new(&LookMatcher::default());
        let input = Input::new("").range(1..0);
        let config = Config::from_input_forward(&input);
        assert_eq!(Start::Text, start_of(&smap, &config));
    }

    #[test]
    fn start_rev_done_range() {
        let smap = StartByteMap::new(&LookMatcher::default());
        let input = Input::new("").range(1..0);
        let config = Config::from_input_reverse(&input);
        assert_eq!(Start::Text, start_of(&smap, &config));
    }

    #[test]
    fn start_fwd() {
        let f = |haystack, start, end| {
            let smap = StartByteMap::new(&LookMatcher::default());
            let input = Input::new(haystack).range(start..end);
            let config = Config::from_input_forward(&input);
            start_of(&smap, &config)
        };

        assert_eq!(Start::Text, f("", 0, 0));
        assert_eq!(Start::Text, f("abc", 0, 3));
        assert_eq!(Start::Text, f("\nabc", 0, 3));
        assert_eq!(Start::LineLF, f("\nabc", 1, 3));
        assert_eq!(Start::LineCR, f("\rabc", 1, 3));
        assert_eq!(Start::WordByte, f("abc", 1, 3));
        assert_eq!(Start::NonWordByte, f(" abc", 1, 3));
    }

    #[test]
    fn start_rev() {
        let f = |haystack, start, end| {
            let smap = StartByteMap::new(&LookMatcher::default());
            let input = Input::new(haystack).range(start..end);
            let config = Config::from_input_reverse(&input);
            start_of(&smap, &config)
        };

        assert_eq!(Start::Text, f("", 0, 0));
        assert_eq!(Start::Text, f("abc", 0, 3));
        assert_eq!(Start::Text, f("abc\n", 0, 4));
        assert_eq!(Start::LineLF, f("abc\nz", 0, 3));
        assert_eq!(Start::LineCR, f("abc\rz", 0, 3));
        assert_eq!(Start::WordByte, f("abc", 0, 2));
        assert_eq!(Start::NonWordByte, f("abc ", 0, 3));
    }
}
|
||||
482
third-party/vendor/regex-automata/src/util/syntax.rs
vendored
Normal file
482
third-party/vendor/regex-automata/src/util/syntax.rs
vendored
Normal file
|
|
@ -0,0 +1,482 @@
|
|||
/*!
|
||||
Utilities for dealing with the syntax of a regular expression.
|
||||
|
||||
This module currently only exposes a [`Config`] type that
|
||||
itself represents a wrapper around the configuration for a
|
||||
[`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder). The purpose of
|
||||
this wrapper is to make configuring syntax options very similar to how other
|
||||
configuration is done throughout this crate. Namely, instead of duplicating
|
||||
syntax options across every builder (of which there are many), we instead
|
||||
create small config objects like this one that can be passed around and
|
||||
composed.
|
||||
*/
|
||||
|
||||
use alloc::{vec, vec::Vec};
|
||||
|
||||
use regex_syntax::{
|
||||
ast,
|
||||
hir::{self, Hir},
|
||||
Error, ParserBuilder,
|
||||
};
|
||||
|
||||
/// A convenience routine for parsing a pattern into an HIR value with the
|
||||
/// default configuration.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This shows how to parse a pattern into an HIR value:
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::util::syntax;
|
||||
///
|
||||
/// let hir = syntax::parse(r"([a-z]+)|([0-9]+)")?;
|
||||
/// assert_eq!(Some(1), hir.properties().static_explicit_captures_len());
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
pub fn parse(pattern: &str) -> Result<Hir, Error> {
|
||||
parse_with(pattern, &Config::default())
|
||||
}
|
||||
|
||||
/// A convenience routine for parsing many patterns into HIR value with the
|
||||
/// default configuration.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This shows how to parse many patterns into an corresponding HIR values:
|
||||
///
|
||||
/// ```
|
||||
/// use {
|
||||
/// regex_automata::util::syntax,
|
||||
/// regex_syntax::hir::Properties,
|
||||
/// };
|
||||
///
|
||||
/// let hirs = syntax::parse_many(&[
|
||||
/// r"([a-z]+)|([0-9]+)",
|
||||
/// r"foo(A-Z]+)bar",
|
||||
/// ])?;
|
||||
/// let props = Properties::union(hirs.iter().map(|h| h.properties()));
|
||||
/// assert_eq!(Some(1), props.static_explicit_captures_len());
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> {
|
||||
parse_many_with(patterns, &Config::default())
|
||||
}
|
||||
|
||||
/// A convenience routine for parsing a pattern into an HIR value using a
|
||||
/// `Config`.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This shows how to parse a pattern into an HIR value with a non-default
|
||||
/// configuration:
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::util::syntax;
|
||||
///
|
||||
/// let hir = syntax::parse_with(
|
||||
/// r"^[a-z]+$",
|
||||
/// &syntax::Config::new().multi_line(true).crlf(true),
|
||||
/// )?;
|
||||
/// assert!(hir.properties().look_set().contains_anchor_crlf());
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> {
|
||||
let mut builder = ParserBuilder::new();
|
||||
config.apply(&mut builder);
|
||||
builder.build().parse(pattern)
|
||||
}
|
||||
|
||||
/// A convenience routine for parsing many patterns into HIR values using a
|
||||
/// `Config`.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This shows how to parse many patterns into an corresponding HIR values
|
||||
/// with a non-default configuration:
|
||||
///
|
||||
/// ```
|
||||
/// use {
|
||||
/// regex_automata::util::syntax,
|
||||
/// regex_syntax::hir::Properties,
|
||||
/// };
|
||||
///
|
||||
/// let patterns = &[
|
||||
/// r"([a-z]+)|([0-9]+)",
|
||||
/// r"\W",
|
||||
/// r"foo(A-Z]+)bar",
|
||||
/// ];
|
||||
/// let config = syntax::Config::new().unicode(false).utf8(false);
|
||||
/// let hirs = syntax::parse_many_with(patterns, &config)?;
|
||||
/// let props = Properties::union(hirs.iter().map(|h| h.properties()));
|
||||
/// assert!(!props.is_utf8());
|
||||
///
|
||||
/// # Ok::<(), Box<dyn std::error::Error>>(())
|
||||
/// ```
|
||||
pub fn parse_many_with<P: AsRef<str>>(
|
||||
patterns: &[P],
|
||||
config: &Config,
|
||||
) -> Result<Vec<Hir>, Error> {
|
||||
let mut builder = ParserBuilder::new();
|
||||
config.apply(&mut builder);
|
||||
let mut hirs = vec![];
|
||||
for p in patterns.iter() {
|
||||
hirs.push(builder.build().parse(p.as_ref())?);
|
||||
}
|
||||
Ok(hirs)
|
||||
}
|
||||
|
||||
/// A common set of configuration options that apply to the syntax of a regex.
///
/// This represents a group of configuration options that specifically apply
/// to how the concrete syntax of a regular expression is interpreted. In
/// particular, they are generally forwarded to the
/// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html)
/// in the
/// [`regex-syntax`](https://docs.rs/regex-syntax)
/// crate when building a regex from its concrete syntax directly.
///
/// These options are defined as a group since they apply to every regex engine
/// in this crate. Instead of re-defining them on every engine's builder, they
/// are instead provided here as one cohesive unit.
#[derive(Clone, Copy, Debug)]
pub struct Config {
    // Each field mirrors the `regex_syntax::ParserBuilder` option of the
    // same name and has a corresponding builder method on this type.
    case_insensitive: bool,
    multi_line: bool,
    dot_matches_new_line: bool,
    crlf: bool,
    line_terminator: u8,
    swap_greed: bool,
    ignore_whitespace: bool,
    unicode: bool,
    utf8: bool,
    nest_limit: u32,
    octal: bool,
}
|
||||
|
||||
impl Config {
|
||||
/// Return a new default syntax configuration.
|
||||
pub fn new() -> Config {
|
||||
// These defaults match the ones used in regex-syntax.
|
||||
Config {
|
||||
case_insensitive: false,
|
||||
multi_line: false,
|
||||
dot_matches_new_line: false,
|
||||
crlf: false,
|
||||
line_terminator: b'\n',
|
||||
swap_greed: false,
|
||||
ignore_whitespace: false,
|
||||
unicode: true,
|
||||
utf8: true,
|
||||
nest_limit: 250,
|
||||
octal: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Enable or disable the case insensitive flag by default.
|
||||
///
|
||||
/// When Unicode mode is enabled, case insensitivity is Unicode-aware.
|
||||
/// Specifically, it will apply the "simple" case folding rules as
|
||||
/// specified by Unicode.
|
||||
///
|
||||
/// By default this is disabled. It may alternatively be selectively
|
||||
/// enabled in the regular expression itself via the `i` flag.
|
||||
pub fn case_insensitive(mut self, yes: bool) -> Config {
|
||||
self.case_insensitive = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Enable or disable the multi-line matching flag by default.
|
||||
///
|
||||
/// When this is enabled, the `^` and `$` look-around assertions will
|
||||
/// match immediately after and immediately before a new line character,
|
||||
/// respectively. Note that the `\A` and `\z` look-around assertions are
|
||||
/// unaffected by this setting and always correspond to matching at the
|
||||
/// beginning and end of the input.
|
||||
///
|
||||
/// By default this is disabled. It may alternatively be selectively
|
||||
/// enabled in the regular expression itself via the `m` flag.
|
||||
pub fn multi_line(mut self, yes: bool) -> Config {
|
||||
self.multi_line = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Enable or disable the "dot matches any character" flag by default.
|
||||
///
|
||||
/// When this is enabled, `.` will match any character. When it's disabled,
|
||||
/// then `.` will match any character except for a new line character.
|
||||
///
|
||||
/// Note that `.` is impacted by whether the "unicode" setting is enabled
|
||||
/// or not. When Unicode is enabled (the default), `.` will match any UTF-8
|
||||
/// encoding of any Unicode scalar value (sans a new line, depending on
|
||||
/// whether this "dot matches new line" option is enabled). When Unicode
|
||||
/// mode is disabled, `.` will match any byte instead. Because of this,
|
||||
/// when Unicode mode is disabled, `.` can only be used when the "allow
|
||||
/// invalid UTF-8" option is enabled, since `.` could otherwise match
|
||||
/// invalid UTF-8.
|
||||
///
|
||||
/// By default this is disabled. It may alternatively be selectively
|
||||
/// enabled in the regular expression itself via the `s` flag.
|
||||
pub fn dot_matches_new_line(mut self, yes: bool) -> Config {
|
||||
self.dot_matches_new_line = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Enable or disable the "CRLF mode" flag by default.
|
||||
///
|
||||
/// By default this is disabled. It may alternatively be selectively
|
||||
/// enabled in the regular expression itself via the `R` flag.
|
||||
///
|
||||
/// When CRLF mode is enabled, the following happens:
|
||||
///
|
||||
/// * Unless `dot_matches_new_line` is enabled, `.` will match any character
|
||||
/// except for `\r` and `\n`.
|
||||
/// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`,
|
||||
/// `\r` and `\n` as line terminators. And in particular, neither will
|
||||
/// match between a `\r` and a `\n`.
|
||||
pub fn crlf(mut self, yes: bool) -> Config {
|
||||
self.crlf = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
|
||||
///
|
||||
/// Namely, instead of `.` (by default) matching everything except for `\n`,
|
||||
/// this will cause `.` to match everything except for the byte given.
|
||||
///
|
||||
/// If `.` is used in a context where Unicode mode is enabled and this byte
|
||||
/// isn't ASCII, then an error will be returned. When Unicode mode is
|
||||
/// disabled, then any byte is permitted, but will return an error if UTF-8
|
||||
/// mode is enabled and it is a non-ASCII byte.
|
||||
///
|
||||
/// In short, any ASCII value for a line terminator is always okay. But a
|
||||
/// non-ASCII byte might result in an error depending on whether Unicode
|
||||
/// mode or UTF-8 mode are enabled.
|
||||
///
|
||||
/// Note that if `R` mode is enabled then it always takes precedence and
|
||||
/// the line terminator will be treated as `\r` and `\n` simultaneously.
|
||||
///
|
||||
/// Note also that this *doesn't* impact the look-around assertions
|
||||
/// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
|
||||
/// configuration in the regex engine itself.
|
||||
pub fn line_terminator(mut self, byte: u8) -> Config {
|
||||
self.line_terminator = byte;
|
||||
self
|
||||
}
|
||||
|
||||
/// Enable or disable the "swap greed" flag by default.
|
||||
///
|
||||
/// When this is enabled, `.*` (for example) will become ungreedy and `.*?`
|
||||
/// will become greedy.
|
||||
///
|
||||
/// By default this is disabled. It may alternatively be selectively
|
||||
/// enabled in the regular expression itself via the `U` flag.
|
||||
pub fn swap_greed(mut self, yes: bool) -> Config {
|
||||
self.swap_greed = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Enable verbose mode in the regular expression.
|
||||
///
|
||||
/// When enabled, verbose mode permits insigificant whitespace in many
|
||||
/// places in the regular expression, as well as comments. Comments are
|
||||
/// started using `#` and continue until the end of the line.
|
||||
///
|
||||
/// By default, this is disabled. It may be selectively enabled in the
|
||||
/// regular expression by using the `x` flag regardless of this setting.
|
||||
pub fn ignore_whitespace(mut self, yes: bool) -> Config {
|
||||
self.ignore_whitespace = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Enable or disable the Unicode flag (`u`) by default.
|
||||
///
|
||||
/// By default this is **enabled**. It may alternatively be selectively
|
||||
/// disabled in the regular expression itself via the `u` flag.
|
||||
///
|
||||
/// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
|
||||
/// default), a regular expression will fail to parse if Unicode mode is
|
||||
/// disabled and a sub-expression could possibly match invalid UTF-8.
|
||||
///
|
||||
/// **WARNING**: Unicode mode can greatly increase the size of the compiled
|
||||
/// DFA, which can noticeably impact both memory usage and compilation
|
||||
/// time. This is especially noticeable if your regex contains character
|
||||
/// classes like `\w` that are impacted by whether Unicode is enabled or
|
||||
/// not. If Unicode is not necessary, you are encouraged to disable it.
|
||||
pub fn unicode(mut self, yes: bool) -> Config {
|
||||
self.unicode = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// When disabled, the builder will permit the construction of a regular
|
||||
/// expression that may match invalid UTF-8.
|
||||
///
|
||||
/// For example, when [`Config::unicode`] is disabled, then
|
||||
/// expressions like `[^a]` may match invalid UTF-8 since they can match
|
||||
/// any single byte that is not `a`. By default, these sub-expressions
|
||||
/// are disallowed to avoid returning offsets that split a UTF-8
|
||||
/// encoded codepoint. However, in cases where matching at arbitrary
|
||||
/// locations is desired, this option can be disabled to permit all such
|
||||
/// sub-expressions.
|
||||
///
|
||||
/// When enabled (the default), the builder is guaranteed to produce a
|
||||
/// regex that will only ever match valid UTF-8 (otherwise, the builder
|
||||
/// will return an error).
|
||||
pub fn utf8(mut self, yes: bool) -> Config {
|
||||
self.utf8 = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the nesting limit used for the regular expression parser.
|
||||
///
|
||||
/// The nesting limit controls how deep the abstract syntax tree is allowed
|
||||
/// to be. If the AST exceeds the given limit (e.g., with too many nested
|
||||
/// groups), then an error is returned by the parser.
|
||||
///
|
||||
/// The purpose of this limit is to act as a heuristic to prevent stack
|
||||
/// overflow when building a finite automaton from a regular expression's
|
||||
/// abstract syntax tree. In particular, construction currently uses
|
||||
/// recursion. In the future, the implementation may stop using recursion
|
||||
/// and this option will no longer be necessary.
|
||||
///
|
||||
/// This limit is not checked until the entire AST is parsed. Therefore,
|
||||
/// if callers want to put a limit on the amount of heap space used, then
|
||||
/// they should impose a limit on the length, in bytes, of the concrete
|
||||
/// pattern string. In particular, this is viable since the parser will
|
||||
/// limit itself to heap space proportional to the length of the pattern
|
||||
/// string.
|
||||
///
|
||||
/// Note that a nest limit of `0` will return a nest limit error for most
|
||||
/// patterns but not all. For example, a nest limit of `0` permits `a` but
|
||||
/// not `ab`, since `ab` requires a concatenation AST item, which results
|
||||
/// in a nest depth of `1`. In general, a nest limit is not something that
|
||||
/// manifests in an obvious way in the concrete syntax, therefore, it
|
||||
/// should not be used in a granular way.
|
||||
pub fn nest_limit(mut self, limit: u32) -> Config {
|
||||
self.nest_limit = limit;
|
||||
self
|
||||
}
|
||||
|
||||
/// Whether to support octal syntax or not.
|
||||
///
|
||||
/// Octal syntax is a little-known way of uttering Unicode codepoints in
|
||||
/// a regular expression. For example, `a`, `\x61`, `\u0061` and
|
||||
/// `\141` are all equivalent regular expressions, where the last example
|
||||
/// shows octal syntax.
|
||||
///
|
||||
/// While supporting octal syntax isn't in and of itself a problem, it does
|
||||
/// make good error messages harder. That is, in PCRE based regex engines,
|
||||
/// syntax like `\1` invokes a backreference, which is explicitly
|
||||
/// unsupported in Rust's regex engine. However, many users expect it to
|
||||
/// be supported. Therefore, when octal support is disabled, the error
|
||||
/// message will explicitly mention that backreferences aren't supported.
|
||||
///
|
||||
/// Octal syntax is disabled by default.
|
||||
pub fn octal(mut self, yes: bool) -> Config {
|
||||
self.octal = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Returns whether "unicode" mode is enabled.
|
||||
pub fn get_unicode(&self) -> bool {
|
||||
self.unicode
|
||||
}
|
||||
|
||||
/// Returns whether "case insensitive" mode is enabled.
|
||||
pub fn get_case_insensitive(&self) -> bool {
|
||||
self.case_insensitive
|
||||
}
|
||||
|
||||
/// Returns whether "multi line" mode is enabled.
|
||||
pub fn get_multi_line(&self) -> bool {
|
||||
self.multi_line
|
||||
}
|
||||
|
||||
/// Returns whether "dot matches new line" mode is enabled.
|
||||
pub fn get_dot_matches_new_line(&self) -> bool {
|
||||
self.dot_matches_new_line
|
||||
}
|
||||
|
||||
/// Returns whether "CRLF" mode is enabled.
|
||||
pub fn get_crlf(&self) -> bool {
|
||||
self.crlf
|
||||
}
|
||||
|
||||
/// Returns the line terminator in this syntax configuration.
|
||||
pub fn get_line_terminator(&self) -> u8 {
|
||||
self.line_terminator
|
||||
}
|
||||
|
||||
/// Returns whether "swap greed" mode is enabled.
|
||||
pub fn get_swap_greed(&self) -> bool {
|
||||
self.swap_greed
|
||||
}
|
||||
|
||||
/// Returns whether "ignore whitespace" mode is enabled.
|
||||
pub fn get_ignore_whitespace(&self) -> bool {
|
||||
self.ignore_whitespace
|
||||
}
|
||||
|
||||
/// Returns whether UTF-8 mode is enabled.
|
||||
pub fn get_utf8(&self) -> bool {
|
||||
self.utf8
|
||||
}
|
||||
|
||||
/// Returns the "nest limit" setting.
|
||||
pub fn get_nest_limit(&self) -> u32 {
|
||||
self.nest_limit
|
||||
}
|
||||
|
||||
/// Returns whether "octal" mode is enabled.
|
||||
pub fn get_octal(&self) -> bool {
|
||||
self.octal
|
||||
}
|
||||
|
||||
/// Applies this configuration to the given parser.
|
||||
pub(crate) fn apply(&self, builder: &mut ParserBuilder) {
|
||||
builder
|
||||
.unicode(self.unicode)
|
||||
.case_insensitive(self.case_insensitive)
|
||||
.multi_line(self.multi_line)
|
||||
.dot_matches_new_line(self.dot_matches_new_line)
|
||||
.crlf(self.crlf)
|
||||
.line_terminator(self.line_terminator)
|
||||
.swap_greed(self.swap_greed)
|
||||
.ignore_whitespace(self.ignore_whitespace)
|
||||
.utf8(self.utf8)
|
||||
.nest_limit(self.nest_limit)
|
||||
.octal(self.octal);
|
||||
}
|
||||
|
||||
/// Applies this configuration to the given AST parser.
|
||||
pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) {
|
||||
builder
|
||||
.ignore_whitespace(self.ignore_whitespace)
|
||||
.nest_limit(self.nest_limit)
|
||||
.octal(self.octal);
|
||||
}
|
||||
|
||||
/// Applies this configuration to the given AST-to-HIR translator.
|
||||
pub(crate) fn apply_hir(
|
||||
&self,
|
||||
builder: &mut hir::translate::TranslatorBuilder,
|
||||
) {
|
||||
builder
|
||||
.unicode(self.unicode)
|
||||
.case_insensitive(self.case_insensitive)
|
||||
.multi_line(self.multi_line)
|
||||
.crlf(self.crlf)
|
||||
.dot_matches_new_line(self.dot_matches_new_line)
|
||||
.line_terminator(self.line_terminator)
|
||||
.swap_greed(self.swap_greed)
|
||||
.utf8(self.utf8);
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Config {
    // Delegates to `Config::new()`, so `Config::default()` and
    // `Config::new()` are interchangeable.
    fn default() -> Config {
        Config::new()
    }
}
|
||||
17
third-party/vendor/regex-automata/src/util/unicode_data/mod.rs
vendored
Normal file
17
third-party/vendor/regex-automata/src/util/unicode_data/mod.rs
vendored
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
// This cfg should match the one in src/util/look.rs that uses perl_word.
#[cfg(all(
    // We have to explicitly want to support Unicode word boundaries.
    feature = "unicode-word-boundary",
    not(all(
        // If we don't have regex-syntax at all, then we definitely need to
        // bring our own \w data table.
        feature = "syntax",
        // If unicode-perl is enabled, then regex-syntax/unicode-perl is
        // also enabled, which in turn means we can use regex-syntax's
        // is_word_character routine (and thus use its data tables). But if
        // unicode-perl is not enabled, even if syntax is, then we need to
        // bring our own.
        feature = "unicode-perl",
    )),
))]
// Auto-generated table of Perl `\w` codepoint ranges (see perl_word.rs).
pub(crate) mod perl_word;
|
||||
781
third-party/vendor/regex-automata/src/util/unicode_data/perl_word.rs
vendored
Normal file
781
third-party/vendor/regex-automata/src/util/unicode_data/perl_word.rs
vendored
Normal file
|
|
@ -0,0 +1,781 @@
|
|||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
|
||||
//
|
||||
// ucd-generate perl-word tmp/ucd-15.0.0/ --chars
|
||||
//
|
||||
// Unicode version: 15.0.0.
|
||||
//
|
||||
// ucd-generate 0.2.15 is available on crates.io.
|
||||
|
||||
pub const PERL_WORD: &'static [(char, char)] = &[
|
||||
('0', '9'),
|
||||
('A', 'Z'),
|
||||
('_', '_'),
|
||||
('a', 'z'),
|
||||
('ª', 'ª'),
|
||||
('µ', 'µ'),
|
||||
('º', 'º'),
|
||||
('À', 'Ö'),
|
||||
('Ø', 'ö'),
|
||||
('ø', 'ˁ'),
|
||||
('ˆ', 'ˑ'),
|
||||
('ˠ', 'ˤ'),
|
||||
('ˬ', 'ˬ'),
|
||||
('ˮ', 'ˮ'),
|
||||
('\u{300}', 'ʹ'),
|
||||
('Ͷ', 'ͷ'),
|
||||
('ͺ', 'ͽ'),
|
||||
('Ϳ', 'Ϳ'),
|
||||
('Ά', 'Ά'),
|
||||
('Έ', 'Ί'),
|
||||
('Ό', 'Ό'),
|
||||
('Ύ', 'Ρ'),
|
||||
('Σ', 'ϵ'),
|
||||
('Ϸ', 'ҁ'),
|
||||
('\u{483}', 'ԯ'),
|
||||
('Ա', 'Ֆ'),
|
||||
('ՙ', 'ՙ'),
|
||||
('ՠ', 'ֈ'),
|
||||
('\u{591}', '\u{5bd}'),
|
||||
('\u{5bf}', '\u{5bf}'),
|
||||
('\u{5c1}', '\u{5c2}'),
|
||||
('\u{5c4}', '\u{5c5}'),
|
||||
('\u{5c7}', '\u{5c7}'),
|
||||
('א', 'ת'),
|
||||
('ׯ', 'ײ'),
|
||||
('\u{610}', '\u{61a}'),
|
||||
('ؠ', '٩'),
|
||||
('ٮ', 'ۓ'),
|
||||
('ە', '\u{6dc}'),
|
||||
('\u{6df}', '\u{6e8}'),
|
||||
('\u{6ea}', 'ۼ'),
|
||||
('ۿ', 'ۿ'),
|
||||
('ܐ', '\u{74a}'),
|
||||
('ݍ', 'ޱ'),
|
||||
('߀', 'ߵ'),
|
||||
('ߺ', 'ߺ'),
|
||||
('\u{7fd}', '\u{7fd}'),
|
||||
('ࠀ', '\u{82d}'),
|
||||
('ࡀ', '\u{85b}'),
|
||||
('ࡠ', 'ࡪ'),
|
||||
('ࡰ', 'ࢇ'),
|
||||
('ࢉ', 'ࢎ'),
|
||||
('\u{898}', '\u{8e1}'),
|
||||
('\u{8e3}', '\u{963}'),
|
||||
('०', '९'),
|
||||
('ॱ', 'ঃ'),
|
||||
('অ', 'ঌ'),
|
||||
('এ', 'ঐ'),
|
||||
('ও', 'ন'),
|
||||
('প', 'র'),
|
||||
('ল', 'ল'),
|
||||
('শ', 'হ'),
|
||||
('\u{9bc}', '\u{9c4}'),
|
||||
('ে', 'ৈ'),
|
||||
('ো', 'ৎ'),
|
||||
('\u{9d7}', '\u{9d7}'),
|
||||
('ড়', 'ঢ়'),
|
||||
('য়', '\u{9e3}'),
|
||||
('০', 'ৱ'),
|
||||
('ৼ', 'ৼ'),
|
||||
('\u{9fe}', '\u{9fe}'),
|
||||
('\u{a01}', 'ਃ'),
|
||||
('ਅ', 'ਊ'),
|
||||
('ਏ', 'ਐ'),
|
||||
('ਓ', 'ਨ'),
|
||||
('ਪ', 'ਰ'),
|
||||
('ਲ', 'ਲ਼'),
|
||||
('ਵ', 'ਸ਼'),
|
||||
('ਸ', 'ਹ'),
|
||||
('\u{a3c}', '\u{a3c}'),
|
||||
('ਾ', '\u{a42}'),
|
||||
('\u{a47}', '\u{a48}'),
|
||||
('\u{a4b}', '\u{a4d}'),
|
||||
('\u{a51}', '\u{a51}'),
|
||||
('ਖ਼', 'ੜ'),
|
||||
('ਫ਼', 'ਫ਼'),
|
||||
('੦', '\u{a75}'),
|
||||
('\u{a81}', 'ઃ'),
|
||||
('અ', 'ઍ'),
|
||||
('એ', 'ઑ'),
|
||||
('ઓ', 'ન'),
|
||||
('પ', 'ર'),
|
||||
('લ', 'ળ'),
|
||||
('વ', 'હ'),
|
||||
('\u{abc}', '\u{ac5}'),
|
||||
('\u{ac7}', 'ૉ'),
|
||||
('ો', '\u{acd}'),
|
||||
('ૐ', 'ૐ'),
|
||||
('ૠ', '\u{ae3}'),
|
||||
('૦', '૯'),
|
||||
('ૹ', '\u{aff}'),
|
||||
('\u{b01}', 'ଃ'),
|
||||
('ଅ', 'ଌ'),
|
||||
('ଏ', 'ଐ'),
|
||||
('ଓ', 'ନ'),
|
||||
('ପ', 'ର'),
|
||||
('ଲ', 'ଳ'),
|
||||
('ଵ', 'ହ'),
|
||||
('\u{b3c}', '\u{b44}'),
|
||||
('େ', 'ୈ'),
|
||||
('ୋ', '\u{b4d}'),
|
||||
('\u{b55}', '\u{b57}'),
|
||||
('ଡ଼', 'ଢ଼'),
|
||||
('ୟ', '\u{b63}'),
|
||||
('୦', '୯'),
|
||||
('ୱ', 'ୱ'),
|
||||
('\u{b82}', 'ஃ'),
|
||||
('அ', 'ஊ'),
|
||||
('எ', 'ஐ'),
|
||||
('ஒ', 'க'),
|
||||
('ங', 'ச'),
|
||||
('ஜ', 'ஜ'),
|
||||
('ஞ', 'ட'),
|
||||
('ண', 'த'),
|
||||
('ந', 'ப'),
|
||||
('ம', 'ஹ'),
|
||||
('\u{bbe}', 'ூ'),
|
||||
('ெ', 'ை'),
|
||||
('ொ', '\u{bcd}'),
|
||||
('ௐ', 'ௐ'),
|
||||
('\u{bd7}', '\u{bd7}'),
|
||||
('௦', '௯'),
|
||||
('\u{c00}', 'ఌ'),
|
||||
('ఎ', 'ఐ'),
|
||||
('ఒ', 'న'),
|
||||
('ప', 'హ'),
|
||||
('\u{c3c}', 'ౄ'),
|
||||
('\u{c46}', '\u{c48}'),
|
||||
('\u{c4a}', '\u{c4d}'),
|
||||
('\u{c55}', '\u{c56}'),
|
||||
('ౘ', 'ౚ'),
|
||||
('ౝ', 'ౝ'),
|
||||
('ౠ', '\u{c63}'),
|
||||
('౦', '౯'),
|
||||
('ಀ', 'ಃ'),
|
||||
('ಅ', 'ಌ'),
|
||||
('ಎ', 'ಐ'),
|
||||
('ಒ', 'ನ'),
|
||||
('ಪ', 'ಳ'),
|
||||
('ವ', 'ಹ'),
|
||||
('\u{cbc}', 'ೄ'),
|
||||
('\u{cc6}', 'ೈ'),
|
||||
('ೊ', '\u{ccd}'),
|
||||
('\u{cd5}', '\u{cd6}'),
|
||||
('ೝ', 'ೞ'),
|
||||
('ೠ', '\u{ce3}'),
|
||||
('೦', '೯'),
|
||||
('ೱ', 'ೳ'),
|
||||
('\u{d00}', 'ഌ'),
|
||||
('എ', 'ഐ'),
|
||||
('ഒ', '\u{d44}'),
|
||||
('െ', 'ൈ'),
|
||||
('ൊ', 'ൎ'),
|
||||
('ൔ', '\u{d57}'),
|
||||
('ൟ', '\u{d63}'),
|
||||
('൦', '൯'),
|
||||
('ൺ', 'ൿ'),
|
||||
('\u{d81}', 'ඃ'),
|
||||
('අ', 'ඖ'),
|
||||
('ක', 'න'),
|
||||
('ඳ', 'ර'),
|
||||
('ල', 'ල'),
|
||||
('ව', 'ෆ'),
|
||||
('\u{dca}', '\u{dca}'),
|
||||
('\u{dcf}', '\u{dd4}'),
|
||||
('\u{dd6}', '\u{dd6}'),
|
||||
('ෘ', '\u{ddf}'),
|
||||
('෦', '෯'),
|
||||
('ෲ', 'ෳ'),
|
||||
('ก', '\u{e3a}'),
|
||||
('เ', '\u{e4e}'),
|
||||
('๐', '๙'),
|
||||
('ກ', 'ຂ'),
|
||||
('ຄ', 'ຄ'),
|
||||
('ຆ', 'ຊ'),
|
||||
('ຌ', 'ຣ'),
|
||||
('ລ', 'ລ'),
|
||||
('ວ', 'ຽ'),
|
||||
('ເ', 'ໄ'),
|
||||
('ໆ', 'ໆ'),
|
||||
('\u{ec8}', '\u{ece}'),
|
||||
('໐', '໙'),
|
||||
('ໜ', 'ໟ'),
|
||||
('ༀ', 'ༀ'),
|
||||
('\u{f18}', '\u{f19}'),
|
||||
('༠', '༩'),
|
||||
('\u{f35}', '\u{f35}'),
|
||||
('\u{f37}', '\u{f37}'),
|
||||
('\u{f39}', '\u{f39}'),
|
||||
('༾', 'ཇ'),
|
||||
('ཉ', 'ཬ'),
|
||||
('\u{f71}', '\u{f84}'),
|
||||
('\u{f86}', '\u{f97}'),
|
||||
('\u{f99}', '\u{fbc}'),
|
||||
('\u{fc6}', '\u{fc6}'),
|
||||
('က', '၉'),
|
||||
('ၐ', '\u{109d}'),
|
||||
('Ⴀ', 'Ⴥ'),
|
||||
('Ⴧ', 'Ⴧ'),
|
||||
('Ⴭ', 'Ⴭ'),
|
||||
('ა', 'ჺ'),
|
||||
('ჼ', 'ቈ'),
|
||||
('ቊ', 'ቍ'),
|
||||
('ቐ', 'ቖ'),
|
||||
('ቘ', 'ቘ'),
|
||||
('ቚ', 'ቝ'),
|
||||
('በ', 'ኈ'),
|
||||
('ኊ', 'ኍ'),
|
||||
('ነ', 'ኰ'),
|
||||
('ኲ', 'ኵ'),
|
||||
('ኸ', 'ኾ'),
|
||||
('ዀ', 'ዀ'),
|
||||
('ዂ', 'ዅ'),
|
||||
('ወ', 'ዖ'),
|
||||
('ዘ', 'ጐ'),
|
||||
('ጒ', 'ጕ'),
|
||||
('ጘ', 'ፚ'),
|
||||
('\u{135d}', '\u{135f}'),
|
||||
('ᎀ', 'ᎏ'),
|
||||
('Ꭰ', 'Ᏽ'),
|
||||
('ᏸ', 'ᏽ'),
|
||||
('ᐁ', 'ᙬ'),
|
||||
('ᙯ', 'ᙿ'),
|
||||
('ᚁ', 'ᚚ'),
|
||||
('ᚠ', 'ᛪ'),
|
||||
('ᛮ', 'ᛸ'),
|
||||
('ᜀ', '᜕'),
|
||||
('ᜟ', '᜴'),
|
||||
('ᝀ', '\u{1753}'),
|
||||
('ᝠ', 'ᝬ'),
|
||||
('ᝮ', 'ᝰ'),
|
||||
('\u{1772}', '\u{1773}'),
|
||||
('ក', '\u{17d3}'),
|
||||
('ៗ', 'ៗ'),
|
||||
('ៜ', '\u{17dd}'),
|
||||
('០', '៩'),
|
||||
('\u{180b}', '\u{180d}'),
|
||||
('\u{180f}', '᠙'),
|
||||
('ᠠ', 'ᡸ'),
|
||||
('ᢀ', 'ᢪ'),
|
||||
('ᢰ', 'ᣵ'),
|
||||
('ᤀ', 'ᤞ'),
|
||||
('\u{1920}', 'ᤫ'),
|
||||
('ᤰ', '\u{193b}'),
|
||||
('᥆', 'ᥭ'),
|
||||
('ᥰ', 'ᥴ'),
|
||||
('ᦀ', 'ᦫ'),
|
||||
('ᦰ', 'ᧉ'),
|
||||
('᧐', '᧙'),
|
||||
('ᨀ', '\u{1a1b}'),
|
||||
('ᨠ', '\u{1a5e}'),
|
||||
('\u{1a60}', '\u{1a7c}'),
|
||||
('\u{1a7f}', '᪉'),
|
||||
('᪐', '᪙'),
|
||||
('ᪧ', 'ᪧ'),
|
||||
('\u{1ab0}', '\u{1ace}'),
|
||||
('\u{1b00}', 'ᭌ'),
|
||||
('᭐', '᭙'),
|
||||
('\u{1b6b}', '\u{1b73}'),
|
||||
('\u{1b80}', '᯳'),
|
||||
('ᰀ', '\u{1c37}'),
|
||||
('᱀', '᱉'),
|
||||
('ᱍ', 'ᱽ'),
|
||||
('ᲀ', 'ᲈ'),
|
||||
('Ა', 'Ჺ'),
|
||||
('Ჽ', 'Ჿ'),
|
||||
('\u{1cd0}', '\u{1cd2}'),
|
||||
('\u{1cd4}', 'ᳺ'),
|
||||
('ᴀ', 'ἕ'),
|
||||
('Ἐ', 'Ἕ'),
|
||||
('ἠ', 'ὅ'),
|
||||
('Ὀ', 'Ὅ'),
|
||||
('ὐ', 'ὗ'),
|
||||
('Ὑ', 'Ὑ'),
|
||||
('Ὓ', 'Ὓ'),
|
||||
('Ὕ', 'Ὕ'),
|
||||
('Ὗ', 'ώ'),
|
||||
('ᾀ', 'ᾴ'),
|
||||
('ᾶ', 'ᾼ'),
|
||||
('ι', 'ι'),
|
||||
('ῂ', 'ῄ'),
|
||||
('ῆ', 'ῌ'),
|
||||
('ῐ', 'ΐ'),
|
||||
('ῖ', 'Ί'),
|
||||
('ῠ', 'Ῥ'),
|
||||
('ῲ', 'ῴ'),
|
||||
('ῶ', 'ῼ'),
|
||||
('\u{200c}', '\u{200d}'),
|
||||
('‿', '⁀'),
|
||||
('⁔', '⁔'),
|
||||
('ⁱ', 'ⁱ'),
|
||||
('ⁿ', 'ⁿ'),
|
||||
('ₐ', 'ₜ'),
|
||||
('\u{20d0}', '\u{20f0}'),
|
||||
('ℂ', 'ℂ'),
|
||||
('ℇ', 'ℇ'),
|
||||
('ℊ', 'ℓ'),
|
||||
('ℕ', 'ℕ'),
|
||||
('ℙ', 'ℝ'),
|
||||
('ℤ', 'ℤ'),
|
||||
('Ω', 'Ω'),
|
||||
('ℨ', 'ℨ'),
|
||||
('K', 'ℭ'),
|
||||
('ℯ', 'ℹ'),
|
||||
('ℼ', 'ℿ'),
|
||||
('ⅅ', 'ⅉ'),
|
||||
('ⅎ', 'ⅎ'),
|
||||
('Ⅰ', 'ↈ'),
|
||||
('Ⓐ', 'ⓩ'),
|
||||
('Ⰰ', 'ⳤ'),
|
||||
('Ⳬ', 'ⳳ'),
|
||||
('ⴀ', 'ⴥ'),
|
||||
('ⴧ', 'ⴧ'),
|
||||
('ⴭ', 'ⴭ'),
|
||||
('ⴰ', 'ⵧ'),
|
||||
('ⵯ', 'ⵯ'),
|
||||
('\u{2d7f}', 'ⶖ'),
|
||||
('ⶠ', 'ⶦ'),
|
||||
('ⶨ', 'ⶮ'),
|
||||
('ⶰ', 'ⶶ'),
|
||||
('ⶸ', 'ⶾ'),
|
||||
('ⷀ', 'ⷆ'),
|
||||
('ⷈ', 'ⷎ'),
|
||||
('ⷐ', 'ⷖ'),
|
||||
('ⷘ', 'ⷞ'),
|
||||
('\u{2de0}', '\u{2dff}'),
|
||||
('ⸯ', 'ⸯ'),
|
||||
('々', '〇'),
|
||||
('〡', '\u{302f}'),
|
||||
('〱', '〵'),
|
||||
('〸', '〼'),
|
||||
('ぁ', 'ゖ'),
|
||||
('\u{3099}', '\u{309a}'),
|
||||
('ゝ', 'ゟ'),
|
||||
('ァ', 'ヺ'),
|
||||
('ー', 'ヿ'),
|
||||
('ㄅ', 'ㄯ'),
|
||||
('ㄱ', 'ㆎ'),
|
||||
('ㆠ', 'ㆿ'),
|
||||
('ㇰ', 'ㇿ'),
|
||||
('㐀', '䶿'),
|
||||
('一', 'ꒌ'),
|
||||
('ꓐ', 'ꓽ'),
|
||||
('ꔀ', 'ꘌ'),
|
||||
('ꘐ', 'ꘫ'),
|
||||
('Ꙁ', '\u{a672}'),
|
||||
('\u{a674}', '\u{a67d}'),
|
||||
('ꙿ', '\u{a6f1}'),
|
||||
('ꜗ', 'ꜟ'),
|
||||
('Ꜣ', 'ꞈ'),
|
||||
('Ꞌ', 'ꟊ'),
|
||||
('Ꟑ', 'ꟑ'),
|
||||
('ꟓ', 'ꟓ'),
|
||||
('ꟕ', 'ꟙ'),
|
||||
('ꟲ', 'ꠧ'),
|
||||
('\u{a82c}', '\u{a82c}'),
|
||||
('ꡀ', 'ꡳ'),
|
||||
('ꢀ', '\u{a8c5}'),
|
||||
('꣐', '꣙'),
|
||||
('\u{a8e0}', 'ꣷ'),
|
||||
('ꣻ', 'ꣻ'),
|
||||
('ꣽ', '\u{a92d}'),
|
||||
('ꤰ', '꥓'),
|
||||
('ꥠ', 'ꥼ'),
|
||||
('\u{a980}', '꧀'),
|
||||
('ꧏ', '꧙'),
|
||||
('ꧠ', 'ꧾ'),
|
||||
('ꨀ', '\u{aa36}'),
|
||||
('ꩀ', 'ꩍ'),
|
||||
('꩐', '꩙'),
|
||||
('ꩠ', 'ꩶ'),
|
||||
('ꩺ', 'ꫂ'),
|
||||
('ꫛ', 'ꫝ'),
|
||||
('ꫠ', 'ꫯ'),
|
||||
('ꫲ', '\u{aaf6}'),
|
||||
('ꬁ', 'ꬆ'),
|
||||
('ꬉ', 'ꬎ'),
|
||||
('ꬑ', 'ꬖ'),
|
||||
('ꬠ', 'ꬦ'),
|
||||
('ꬨ', 'ꬮ'),
|
||||
('ꬰ', 'ꭚ'),
|
||||
('ꭜ', 'ꭩ'),
|
||||
('ꭰ', 'ꯪ'),
|
||||
('꯬', '\u{abed}'),
|
||||
('꯰', '꯹'),
|
||||
('가', '힣'),
|
||||
('ힰ', 'ퟆ'),
|
||||
('ퟋ', 'ퟻ'),
|
||||
('豈', '舘'),
|
||||
('並', '龎'),
|
||||
('ff', 'st'),
|
||||
('ﬓ', 'ﬗ'),
|
||||
('יִ', 'ﬨ'),
|
||||
('שׁ', 'זּ'),
|
||||
('טּ', 'לּ'),
|
||||
('מּ', 'מּ'),
|
||||
('נּ', 'סּ'),
|
||||
('ףּ', 'פּ'),
|
||||
('צּ', 'ﮱ'),
|
||||
('ﯓ', 'ﴽ'),
|
||||
('ﵐ', 'ﶏ'),
|
||||
('ﶒ', 'ﷇ'),
|
||||
('ﷰ', 'ﷻ'),
|
||||
('\u{fe00}', '\u{fe0f}'),
|
||||
('\u{fe20}', '\u{fe2f}'),
|
||||
('︳', '︴'),
|
||||
('﹍', '﹏'),
|
||||
('ﹰ', 'ﹴ'),
|
||||
('ﹶ', 'ﻼ'),
|
||||
('0', '9'),
|
||||
('A', 'Z'),
|
||||
('_', '_'),
|
||||
('a', 'z'),
|
||||
('ヲ', 'ᄒ'),
|
||||
('ᅡ', 'ᅦ'),
|
||||
('ᅧ', 'ᅬ'),
|
||||
('ᅭ', 'ᅲ'),
|
||||
('ᅳ', 'ᅵ'),
|
||||
('𐀀', '𐀋'),
|
||||
('𐀍', '𐀦'),
|
||||
('𐀨', '𐀺'),
|
||||
('𐀼', '𐀽'),
|
||||
('𐀿', '𐁍'),
|
||||
('𐁐', '𐁝'),
|
||||
('𐂀', '𐃺'),
|
||||
('𐅀', '𐅴'),
|
||||
('\u{101fd}', '\u{101fd}'),
|
||||
('𐊀', '𐊜'),
|
||||
('𐊠', '𐋐'),
|
||||
('\u{102e0}', '\u{102e0}'),
|
||||
('𐌀', '𐌟'),
|
||||
('𐌭', '𐍊'),
|
||||
('𐍐', '\u{1037a}'),
|
||||
('𐎀', '𐎝'),
|
||||
('𐎠', '𐏃'),
|
||||
('𐏈', '𐏏'),
|
||||
('𐏑', '𐏕'),
|
||||
('𐐀', '𐒝'),
|
||||
('𐒠', '𐒩'),
|
||||
('𐒰', '𐓓'),
|
||||
('𐓘', '𐓻'),
|
||||
('𐔀', '𐔧'),
|
||||
('𐔰', '𐕣'),
|
||||
('𐕰', '𐕺'),
|
||||
('𐕼', '𐖊'),
|
||||
('𐖌', '𐖒'),
|
||||
('𐖔', '𐖕'),
|
||||
('𐖗', '𐖡'),
|
||||
('𐖣', '𐖱'),
|
||||
('𐖳', '𐖹'),
|
||||
('𐖻', '𐖼'),
|
||||
('𐘀', '𐜶'),
|
||||
('𐝀', '𐝕'),
|
||||
('𐝠', '𐝧'),
|
||||
('𐞀', '𐞅'),
|
||||
('𐞇', '𐞰'),
|
||||
('𐞲', '𐞺'),
|
||||
('𐠀', '𐠅'),
|
||||
('𐠈', '𐠈'),
|
||||
('𐠊', '𐠵'),
|
||||
('𐠷', '𐠸'),
|
||||
('𐠼', '𐠼'),
|
||||
('𐠿', '𐡕'),
|
||||
('𐡠', '𐡶'),
|
||||
('𐢀', '𐢞'),
|
||||
('𐣠', '𐣲'),
|
||||
('𐣴', '𐣵'),
|
||||
('𐤀', '𐤕'),
|
||||
('𐤠', '𐤹'),
|
||||
('𐦀', '𐦷'),
|
||||
('𐦾', '𐦿'),
|
||||
('𐨀', '\u{10a03}'),
|
||||
('\u{10a05}', '\u{10a06}'),
|
||||
('\u{10a0c}', '𐨓'),
|
||||
('𐨕', '𐨗'),
|
||||
('𐨙', '𐨵'),
|
||||
('\u{10a38}', '\u{10a3a}'),
|
||||
('\u{10a3f}', '\u{10a3f}'),
|
||||
('𐩠', '𐩼'),
|
||||
('𐪀', '𐪜'),
|
||||
('𐫀', '𐫇'),
|
||||
('𐫉', '\u{10ae6}'),
|
||||
('𐬀', '𐬵'),
|
||||
('𐭀', '𐭕'),
|
||||
('𐭠', '𐭲'),
|
||||
('𐮀', '𐮑'),
|
||||
('𐰀', '𐱈'),
|
||||
('𐲀', '𐲲'),
|
||||
('𐳀', '𐳲'),
|
||||
('𐴀', '\u{10d27}'),
|
||||
('𐴰', '𐴹'),
|
||||
('𐺀', '𐺩'),
|
||||
('\u{10eab}', '\u{10eac}'),
|
||||
('𐺰', '𐺱'),
|
||||
('\u{10efd}', '𐼜'),
|
||||
('𐼧', '𐼧'),
|
||||
('𐼰', '\u{10f50}'),
|
||||
('𐽰', '\u{10f85}'),
|
||||
('𐾰', '𐿄'),
|
||||
('𐿠', '𐿶'),
|
||||
('𑀀', '\u{11046}'),
|
||||
('𑁦', '𑁵'),
|
||||
('\u{1107f}', '\u{110ba}'),
|
||||
('\u{110c2}', '\u{110c2}'),
|
||||
('𑃐', '𑃨'),
|
||||
('𑃰', '𑃹'),
|
||||
('\u{11100}', '\u{11134}'),
|
||||
('𑄶', '𑄿'),
|
||||
('𑅄', '𑅇'),
|
||||
('𑅐', '\u{11173}'),
|
||||
('𑅶', '𑅶'),
|
||||
('\u{11180}', '𑇄'),
|
||||
('\u{111c9}', '\u{111cc}'),
|
||||
('𑇎', '𑇚'),
|
||||
('𑇜', '𑇜'),
|
||||
('𑈀', '𑈑'),
|
||||
('𑈓', '\u{11237}'),
|
||||
('\u{1123e}', '\u{11241}'),
|
||||
('𑊀', '𑊆'),
|
||||
('𑊈', '𑊈'),
|
||||
('𑊊', '𑊍'),
|
||||
('𑊏', '𑊝'),
|
||||
('𑊟', '𑊨'),
|
||||
('𑊰', '\u{112ea}'),
|
||||
('𑋰', '𑋹'),
|
||||
('\u{11300}', '𑌃'),
|
||||
('𑌅', '𑌌'),
|
||||
('𑌏', '𑌐'),
|
||||
('𑌓', '𑌨'),
|
||||
('𑌪', '𑌰'),
|
||||
('𑌲', '𑌳'),
|
||||
('𑌵', '𑌹'),
|
||||
('\u{1133b}', '𑍄'),
|
||||
('𑍇', '𑍈'),
|
||||
('𑍋', '𑍍'),
|
||||
('𑍐', '𑍐'),
|
||||
('\u{11357}', '\u{11357}'),
|
||||
('𑍝', '𑍣'),
|
||||
('\u{11366}', '\u{1136c}'),
|
||||
('\u{11370}', '\u{11374}'),
|
||||
('𑐀', '𑑊'),
|
||||
('𑑐', '𑑙'),
|
||||
('\u{1145e}', '𑑡'),
|
||||
('𑒀', '𑓅'),
|
||||
('𑓇', '𑓇'),
|
||||
('𑓐', '𑓙'),
|
||||
('𑖀', '\u{115b5}'),
|
||||
('𑖸', '\u{115c0}'),
|
||||
('𑗘', '\u{115dd}'),
|
||||
('𑘀', '\u{11640}'),
|
||||
('𑙄', '𑙄'),
|
||||
('𑙐', '𑙙'),
|
||||
('𑚀', '𑚸'),
|
||||
('𑛀', '𑛉'),
|
||||
('𑜀', '𑜚'),
|
||||
('\u{1171d}', '\u{1172b}'),
|
||||
('𑜰', '𑜹'),
|
||||
('𑝀', '𑝆'),
|
||||
('𑠀', '\u{1183a}'),
|
||||
('𑢠', '𑣩'),
|
||||
('𑣿', '𑤆'),
|
||||
('𑤉', '𑤉'),
|
||||
('𑤌', '𑤓'),
|
||||
('𑤕', '𑤖'),
|
||||
('𑤘', '𑤵'),
|
||||
('𑤷', '𑤸'),
|
||||
('\u{1193b}', '\u{11943}'),
|
||||
('𑥐', '𑥙'),
|
||||
('𑦠', '𑦧'),
|
||||
('𑦪', '\u{119d7}'),
|
||||
('\u{119da}', '𑧡'),
|
||||
('𑧣', '𑧤'),
|
||||
('𑨀', '\u{11a3e}'),
|
||||
('\u{11a47}', '\u{11a47}'),
|
||||
('𑩐', '\u{11a99}'),
|
||||
('𑪝', '𑪝'),
|
||||
('𑪰', '𑫸'),
|
||||
('𑰀', '𑰈'),
|
||||
('𑰊', '\u{11c36}'),
|
||||
('\u{11c38}', '𑱀'),
|
||||
('𑱐', '𑱙'),
|
||||
('𑱲', '𑲏'),
|
||||
('\u{11c92}', '\u{11ca7}'),
|
||||
('𑲩', '\u{11cb6}'),
|
||||
('𑴀', '𑴆'),
|
||||
('𑴈', '𑴉'),
|
||||
('𑴋', '\u{11d36}'),
|
||||
('\u{11d3a}', '\u{11d3a}'),
|
||||
('\u{11d3c}', '\u{11d3d}'),
|
||||
('\u{11d3f}', '\u{11d47}'),
|
||||
('𑵐', '𑵙'),
|
||||
('𑵠', '𑵥'),
|
||||
('𑵧', '𑵨'),
|
||||
('𑵪', '𑶎'),
|
||||
('\u{11d90}', '\u{11d91}'),
|
||||
('𑶓', '𑶘'),
|
||||
('𑶠', '𑶩'),
|
||||
('𑻠', '𑻶'),
|
||||
('\u{11f00}', '𑼐'),
|
||||
('𑼒', '\u{11f3a}'),
|
||||
('𑼾', '\u{11f42}'),
|
||||
('𑽐', '𑽙'),
|
||||
('𑾰', '𑾰'),
|
||||
('𒀀', '𒎙'),
|
||||
('𒐀', '𒑮'),
|
||||
('𒒀', '𒕃'),
|
||||
('𒾐', '𒿰'),
|
||||
('𓀀', '𓐯'),
|
||||
('\u{13440}', '\u{13455}'),
|
||||
('𔐀', '𔙆'),
|
||||
('𖠀', '𖨸'),
|
||||
('𖩀', '𖩞'),
|
||||
('𖩠', '𖩩'),
|
||||
('𖩰', '𖪾'),
|
||||
('𖫀', '𖫉'),
|
||||
('𖫐', '𖫭'),
|
||||
('\u{16af0}', '\u{16af4}'),
|
||||
('𖬀', '\u{16b36}'),
|
||||
('𖭀', '𖭃'),
|
||||
('𖭐', '𖭙'),
|
||||
('𖭣', '𖭷'),
|
||||
('𖭽', '𖮏'),
|
||||
('𖹀', '𖹿'),
|
||||
('𖼀', '𖽊'),
|
||||
('\u{16f4f}', '𖾇'),
|
||||
('\u{16f8f}', '𖾟'),
|
||||
('𖿠', '𖿡'),
|
||||
('𖿣', '\u{16fe4}'),
|
||||
('𖿰', '𖿱'),
|
||||
('𗀀', '𘟷'),
|
||||
('𘠀', '𘳕'),
|
||||
('𘴀', '𘴈'),
|
||||
('𚿰', '𚿳'),
|
||||
('𚿵', '𚿻'),
|
||||
('𚿽', '𚿾'),
|
||||
('𛀀', '𛄢'),
|
||||
('𛄲', '𛄲'),
|
||||
('𛅐', '𛅒'),
|
||||
('𛅕', '𛅕'),
|
||||
('𛅤', '𛅧'),
|
||||
('𛅰', '𛋻'),
|
||||
('𛰀', '𛱪'),
|
||||
('𛱰', '𛱼'),
|
||||
('𛲀', '𛲈'),
|
||||
('𛲐', '𛲙'),
|
||||
('\u{1bc9d}', '\u{1bc9e}'),
|
||||
('\u{1cf00}', '\u{1cf2d}'),
|
||||
('\u{1cf30}', '\u{1cf46}'),
|
||||
('\u{1d165}', '\u{1d169}'),
|
||||
('𝅭', '\u{1d172}'),
|
||||
('\u{1d17b}', '\u{1d182}'),
|
||||
('\u{1d185}', '\u{1d18b}'),
|
||||
('\u{1d1aa}', '\u{1d1ad}'),
|
||||
('\u{1d242}', '\u{1d244}'),
|
||||
('𝐀', '𝑔'),
|
||||
('𝑖', '𝒜'),
|
||||
('𝒞', '𝒟'),
|
||||
('𝒢', '𝒢'),
|
||||
('𝒥', '𝒦'),
|
||||
('𝒩', '𝒬'),
|
||||
('𝒮', '𝒹'),
|
||||
('𝒻', '𝒻'),
|
||||
('𝒽', '𝓃'),
|
||||
('𝓅', '𝔅'),
|
||||
('𝔇', '𝔊'),
|
||||
('𝔍', '𝔔'),
|
||||
('𝔖', '𝔜'),
|
||||
('𝔞', '𝔹'),
|
||||
('𝔻', '𝔾'),
|
||||
('𝕀', '𝕄'),
|
||||
('𝕆', '𝕆'),
|
||||
('𝕊', '𝕐'),
|
||||
('𝕒', '𝚥'),
|
||||
('𝚨', '𝛀'),
|
||||
('𝛂', '𝛚'),
|
||||
('𝛜', '𝛺'),
|
||||
('𝛼', '𝜔'),
|
||||
('𝜖', '𝜴'),
|
||||
('𝜶', '𝝎'),
|
||||
('𝝐', '𝝮'),
|
||||
('𝝰', '𝞈'),
|
||||
('𝞊', '𝞨'),
|
||||
('𝞪', '𝟂'),
|
||||
('𝟄', '𝟋'),
|
||||
('𝟎', '𝟿'),
|
||||
('\u{1da00}', '\u{1da36}'),
|
||||
('\u{1da3b}', '\u{1da6c}'),
|
||||
('\u{1da75}', '\u{1da75}'),
|
||||
('\u{1da84}', '\u{1da84}'),
|
||||
('\u{1da9b}', '\u{1da9f}'),
|
||||
('\u{1daa1}', '\u{1daaf}'),
|
||||
('𝼀', '𝼞'),
|
||||
('𝼥', '𝼪'),
|
||||
('\u{1e000}', '\u{1e006}'),
|
||||
('\u{1e008}', '\u{1e018}'),
|
||||
('\u{1e01b}', '\u{1e021}'),
|
||||
('\u{1e023}', '\u{1e024}'),
|
||||
('\u{1e026}', '\u{1e02a}'),
|
||||
('𞀰', '𞁭'),
|
||||
('\u{1e08f}', '\u{1e08f}'),
|
||||
('𞄀', '𞄬'),
|
||||
('\u{1e130}', '𞄽'),
|
||||
('𞅀', '𞅉'),
|
||||
('𞅎', '𞅎'),
|
||||
('𞊐', '\u{1e2ae}'),
|
||||
('𞋀', '𞋹'),
|
||||
('𞓐', '𞓹'),
|
||||
('𞟠', '𞟦'),
|
||||
('𞟨', '𞟫'),
|
||||
('𞟭', '𞟮'),
|
||||
('𞟰', '𞟾'),
|
||||
('𞠀', '𞣄'),
|
||||
('\u{1e8d0}', '\u{1e8d6}'),
|
||||
('𞤀', '𞥋'),
|
||||
('𞥐', '𞥙'),
|
||||
('𞸀', '𞸃'),
|
||||
('𞸅', '𞸟'),
|
||||
('𞸡', '𞸢'),
|
||||
('𞸤', '𞸤'),
|
||||
('𞸧', '𞸧'),
|
||||
('𞸩', '𞸲'),
|
||||
('𞸴', '𞸷'),
|
||||
('𞸹', '𞸹'),
|
||||
('𞸻', '𞸻'),
|
||||
('𞹂', '𞹂'),
|
||||
('𞹇', '𞹇'),
|
||||
('𞹉', '𞹉'),
|
||||
('𞹋', '𞹋'),
|
||||
('𞹍', '𞹏'),
|
||||
('𞹑', '𞹒'),
|
||||
('𞹔', '𞹔'),
|
||||
('𞹗', '𞹗'),
|
||||
('𞹙', '𞹙'),
|
||||
('𞹛', '𞹛'),
|
||||
('𞹝', '𞹝'),
|
||||
('𞹟', '𞹟'),
|
||||
('𞹡', '𞹢'),
|
||||
('𞹤', '𞹤'),
|
||||
('𞹧', '𞹪'),
|
||||
('𞹬', '𞹲'),
|
||||
('𞹴', '𞹷'),
|
||||
('𞹹', '𞹼'),
|
||||
('𞹾', '𞹾'),
|
||||
('𞺀', '𞺉'),
|
||||
('𞺋', '𞺛'),
|
||||
('𞺡', '𞺣'),
|
||||
('𞺥', '𞺩'),
|
||||
('𞺫', '𞺻'),
|
||||
('🄰', '🅉'),
|
||||
('🅐', '🅩'),
|
||||
('🅰', '🆉'),
|
||||
('🯰', '🯹'),
|
||||
('𠀀', '𪛟'),
|
||||
('𪜀', '𫜹'),
|
||||
('𫝀', '𫠝'),
|
||||
('𫠠', '𬺡'),
|
||||
('𬺰', '𮯠'),
|
||||
('丽', '𪘀'),
|
||||
('𰀀', '𱍊'),
|
||||
('𱍐', '𲎯'),
|
||||
('\u{e0100}', '\u{e01ef}'),
|
||||
];
|
||||
196
third-party/vendor/regex-automata/src/util/utf8.rs
vendored
Normal file
196
third-party/vendor/regex-automata/src/util/utf8.rs
vendored
Normal file
|
|
@ -0,0 +1,196 @@
|
|||
/*!
|
||||
Utilities for dealing with UTF-8.
|
||||
|
||||
This module provides some UTF-8 related helper routines, including an
|
||||
incremental decoder.
|
||||
*/
|
||||
|
||||
/// Returns true if and only if the given byte is considered a word
/// character. This only applies to ASCII.
///
/// This was copied from regex-syntax so that we can use it to determine the
/// starting DFA state while searching without depending on regex-syntax. The
/// definition is never going to change, so there's no maintenance/bit-rot
/// hazard here.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_word_byte(b: u8) -> bool {
    // Build a 256-entry lookup table at compile time marking '_', '0'-'9',
    // 'A'-'Z' and 'a'-'z' as word bytes.
    const fn make_table() -> [bool; 256] {
        let mut table = [false; 256];
        table[b'_' as usize] = true;
        let ranges: [(u8, u8); 3] = [(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')];
        let mut i = 0;
        while i < ranges.len() {
            let (mut lo, hi) = ranges[i];
            while lo <= hi {
                table[lo as usize] = true;
                lo += 1;
            }
            i += 1;
        }
        table
    }
    const WORD: [bool; 256] = make_table();
    WORD[usize::from(b)]
}
|
||||
|
||||
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
|
||||
///
|
||||
/// If no valid encoding of a codepoint exists at the beginning of the given
|
||||
/// byte slice, then the first byte is returned instead.
|
||||
///
|
||||
/// This returns `None` if and only if `bytes` is empty.
|
||||
///
|
||||
/// This never panics.
|
||||
///
|
||||
/// *WARNING*: This is not designed for performance. If you're looking for a
|
||||
/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
|
||||
/// crate, then please file an issue and discuss your use case.
|
||||
#[cfg_attr(feature = "perf-inline", inline(always))]
|
||||
pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
|
||||
if bytes.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let len = match len(bytes[0]) {
|
||||
None => return Some(Err(bytes[0])),
|
||||
Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
|
||||
Some(1) => return Some(Ok(char::from(bytes[0]))),
|
||||
Some(len) => len,
|
||||
};
|
||||
match core::str::from_utf8(&bytes[..len]) {
|
||||
Ok(s) => Some(Ok(s.chars().next().unwrap())),
|
||||
Err(_) => Some(Err(bytes[0])),
|
||||
}
|
||||
}
|
||||
|
||||
/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the end of the given byte
/// slice, then the last byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> {
    if bytes.is_empty() {
        return None;
    }
    // Scan backwards (at most 3 steps, since a codepoint occupies at most 4
    // bytes) for a byte that could plausibly begin a UTF-8 sequence.
    let mut start = bytes.len() - 1;
    let limit = bytes.len().saturating_sub(4);
    while start > limit && !is_leading_or_invalid_byte(bytes[start]) {
        start -= 1;
    }
    // Delegate to the forward decoder. On error, report the *last* byte of
    // the slice rather than the byte where decoding began.
    match decode(&bytes[start..]) {
        None => None,
        Some(Ok(ch)) => Some(Ok(ch)),
        Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
    }
}
|
||||
|
||||
/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn len(byte: u8) -> Option<usize> {
    match byte {
        // ASCII: a sequence of length one.
        0x00..=0x7F => Some(1),
        // 10xxxxxx is a continuation byte, never a leading byte.
        0x80..=0xBF => None,
        // 110xxxxx, 1110xxxx and 11110xxx lead 2-, 3- and 4-byte
        // sequences, respectively.
        0xC0..=0xDF => Some(2),
        0xE0..=0xEF => Some(3),
        0xF0..=0xF7 => Some(4),
        // 0xF8..=0xFF can never appear in valid UTF-8.
        _ => None,
    }
}
|
||||
|
||||
/// Returns true if and only if the given offset in the given bytes falls on a
/// valid UTF-8 encoded codepoint boundary.
///
/// If `bytes` is not valid UTF-8, then the behavior of this routine is
/// unspecified.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool {
    match bytes.get(i) {
        // Continuation bytes (10xxxxxx, i.e. 0x80..=0xBF) never begin a
        // codepoint. Every other byte value either is ASCII or is a leading
        // byte, and thus marks a boundary.
        Some(&b) => !(0x80..=0xBF).contains(&b),
        // The position just past the end represents the empty suffix, which
        // is a valid boundary; any position beyond that is not.
        None => i == bytes.len(),
    }
}
|
||||
|
||||
/// Returns true if and only if the given byte is either a valid leading UTF-8
/// byte, or is otherwise an invalid byte that can never appear anywhere in a
/// valid UTF-8 sequence.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn is_leading_or_invalid_byte(b: u8) -> bool {
    // Only continuation bytes (10xxxxxx, i.e. 0x80..=0xBF) are excluded:
    // ASCII bytes have the most significant bit clear, leading bytes of
    // 2/3/4-byte sequences have the top two bits set, and every byte that
    // can never appear in valid UTF-8 (\xC0, \xC1, \xF5-\xFF) also has its
    // top two bits set.
    !matches!(b, 0x80..=0xBF)
}
|
||||
|
||||
/*
|
||||
/// Returns the smallest possible index of the next valid UTF-8 sequence
|
||||
/// starting after `i`.
|
||||
///
|
||||
/// For all inputs, including invalid UTF-8 and any value of `i`, the return
|
||||
/// value is guaranteed to be greater than `i`. (If there is no value greater
|
||||
/// than `i` that fits in `usize`, then this panics.)
|
||||
///
|
||||
/// Generally speaking, this should only be called on `text` when it is
|
||||
/// permitted to assume that it is valid UTF-8 and where either `i >=
|
||||
/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
|
||||
///
|
||||
/// NOTE: This method was used in a previous conception of iterators where we
|
||||
/// specifically tried to skip over empty matches that split a codepoint by
|
||||
/// simply requiring that our next search begin at the beginning of codepoint.
|
||||
/// But we ended up changing that technique to always advance by 1 byte and
|
||||
/// then filter out matches that split a codepoint after-the-fact. Thus, we no
|
||||
/// longer use this method. But I've kept it around in case we want to switch
|
||||
/// back to this approach. Its guarantees are a little subtle, so I'd prefer
|
||||
/// not to rebuild it from whole cloth.
|
||||
pub(crate) fn next(text: &[u8], i: usize) -> usize {
|
||||
let b = match text.get(i) {
|
||||
None => return i.checked_add(1).unwrap(),
|
||||
Some(&b) => b,
|
||||
};
|
||||
// For cases where we see an invalid UTF-8 byte, there isn't much we can do
|
||||
// other than just start at the next byte.
|
||||
let inc = len(b).unwrap_or(1);
|
||||
i.checked_add(inc).unwrap()
|
||||
}
|
||||
*/
|
||||
958
third-party/vendor/regex-automata/src/util/wire.rs
vendored
Normal file
958
third-party/vendor/regex-automata/src/util/wire.rs
vendored
Normal file
|
|
@ -0,0 +1,958 @@
|
|||
/*!
|
||||
Types and routines that support the wire format of finite automata.
|
||||
|
||||
Currently, this module just exports a few error types and some small helpers
|
||||
for deserializing [dense DFAs](crate::dfa::dense::DFA) using correct alignment.
|
||||
*/
|
||||
|
||||
/*
|
||||
A collection of helper functions, types and traits for serializing automata.
|
||||
|
||||
This crate defines its own bespoke serialization mechanism for some structures
|
||||
provided in the public API, namely, DFAs. A bespoke mechanism was developed
|
||||
primarily because structures like automata demand a specific binary format.
|
||||
Attempting to encode their rich structure in an existing serialization
|
||||
format is just not feasible. Moreover, the format for each structure is
|
||||
generally designed such that deserialization is cheap. More specifically, that
|
||||
deserialization can be done in constant time. (The idea being that you can
|
||||
embed it into your binary or mmap it, and then use it immediately.)
|
||||
|
||||
In order to achieve this, the dense and sparse DFAs in this crate use an
|
||||
in-memory representation that very closely corresponds to its binary serialized
|
||||
form. This pervades and complicates everything, and in some cases, requires
|
||||
dealing with alignment and reasoning about safety.
|
||||
|
||||
This technique does have major advantages. In particular, it permits doing
|
||||
the potentially costly work of compiling a finite state machine in an offline
|
||||
manner, and then loading it at runtime not only without having to re-compile
|
||||
the regex, but even without the code required to do the compilation. This, for
|
||||
example, permits one to use a pre-compiled DFA not only in environments without
|
||||
Rust's standard library, but also in environments without a heap.
|
||||
|
||||
In the code below, whenever we insert some kind of padding, it's to enforce a
|
||||
4-byte alignment, unless otherwise noted. Namely, u32 is the only state ID type
|
||||
supported. (In a previous version of this library, DFAs were generic over the
|
||||
state ID representation.)
|
||||
|
||||
Also, serialization generally requires the caller to specify endianness,
|
||||
whereas deserialization always assumes native endianness (otherwise cheap
|
||||
deserialization would be impossible). This implies that serializing a structure
|
||||
generally requires serializing both its big-endian and little-endian variants,
|
||||
and then loading the correct one based on the target's endianness.
|
||||
*/
|
||||
|
||||
use core::{cmp, mem::size_of};
|
||||
|
||||
#[cfg(feature = "alloc")]
|
||||
use alloc::{vec, vec::Vec};
|
||||
|
||||
use crate::util::{
|
||||
int::Pointer,
|
||||
primitives::{PatternID, PatternIDError, StateID, StateIDError},
|
||||
};
|
||||
|
||||
/// A hack to align a smaller type `B` with a bigger type `T`.
///
/// The usual use of this is with `B = [u8]` and `T = u32`. That is,
/// it permits aligning a sequence of bytes on a 4-byte boundary. This
/// is useful in contexts where one wants to embed a serialized [dense
/// DFA](crate::dfa::dense::DFA) into a Rust program while guaranteeing the
/// alignment required for the DFA.
///
/// See [`dense::DFA::from_bytes`](crate::dfa::dense::DFA::from_bytes) for an
/// example of how to use this type.
// `repr(C)` fixes the field order, so `bytes` is placed immediately after the
// zero-sized `_align` field and therefore starts at an address satisfying the
// alignment of `T`.
#[repr(C)]
#[derive(Debug)]
pub struct AlignAs<B: ?Sized, T> {
    /// A zero-sized field indicating the alignment we want.
    pub _align: [T; 0],
    /// A possibly non-sized field containing a sequence of bytes.
    pub bytes: B,
}
|
||||
|
||||
/// An error that occurs when serializing an object from this crate.
///
/// Serialization, as used in this crate, universally refers to the process
/// of transforming a structure (like a DFA) into a custom binary format
/// represented by `&[u8]`. To this end, serialization is generally infallible.
/// However, it can fail when caller provided buffer sizes are too small. When
/// that occurs, a serialization error is reported.
///
/// A `SerializeError` provides no introspection capabilities. Its only
/// supported operation is conversion to a human readable error message.
///
/// This error type implements the `std::error::Error` trait only when the
/// `std` feature is enabled. Otherwise, this type is defined in all
/// configurations.
#[derive(Debug)]
pub struct SerializeError {
    /// The name of the thing that a buffer is too small for.
    ///
    /// Currently, the only kind of serialization error is one that is
    /// committed by a caller: providing a destination buffer that is too
    /// small to fit the serialized object. This makes sense conceptually,
    /// since every valid inhabitant of a type should be serializable.
    ///
    /// This is somewhat exposed in the public API of this crate. For example,
    /// the `to_bytes_{big,little}_endian` APIs return a `Vec<u8>` and are
    /// guaranteed to never panic or error. This is only possible because the
    /// implementation guarantees that it will allocate a `Vec<u8>` that is
    /// big enough.
    ///
    /// In summary, if a new serialization error kind needs to be added, then
    /// it will need careful consideration.
    what: &'static str,
}

impl SerializeError {
    /// Creates a serialization error indicating that the caller-provided
    /// destination buffer was too small to hold `what`.
    pub(crate) fn buffer_too_small(what: &'static str) -> SerializeError {
        SerializeError { what }
    }
}

impl core::fmt::Display for SerializeError {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(f, "destination buffer is too small to write {}", self.what)
    }
}

// `std::error::Error` lives in `std`, so this impl only exists when the
// `std` feature is enabled (as documented on the type above).
#[cfg(feature = "std")]
impl std::error::Error for SerializeError {}
||||
|
||||
/// An error that occurs when deserializing an object defined in this crate.
///
/// Serialization, as used in this crate, universally refers to the process
/// of transforming a structure (like a DFA) into a custom binary format
/// represented by `&[u8]`. Deserialization, then, refers to the process of
/// cheaply converting this binary format back to the object's in-memory
/// representation as defined in this crate. To the extent possible,
/// deserialization will report this error whenever this process fails.
///
/// A `DeserializeError` provides no introspection capabilities. Its only
/// supported operation is conversion to a human readable error message.
///
/// This error type implements the `std::error::Error` trait only when the
/// `std` feature is enabled. Otherwise, this type is defined in all
/// configurations.
#[derive(Debug)]
pub struct DeserializeError(DeserializeErrorKind);

/// The set of deserialization failure modes. Each variant carries just enough
/// context to render a useful message in the `Display` impl below.
#[derive(Debug)]
enum DeserializeErrorKind {
    /// A free-form error with a static message.
    Generic { msg: &'static str },
    /// The input slice was too small to read `what`.
    BufferTooSmall { what: &'static str },
    /// A deserialized integer for `what` could not be represented by `usize`.
    InvalidUsize { what: &'static str },
    /// The serialized version number did not match the supported version.
    VersionMismatch { expected: u32, found: u32 },
    /// The endianness check value did not match the expected marker.
    EndianMismatch { expected: u32, found: u32 },
    /// The input slice started at `address`, which is not a multiple of
    /// `alignment`.
    AlignmentMismatch { alignment: usize, address: usize },
    /// The NUL terminated label at the start of the serialized object did
    /// not match `expected`.
    LabelMismatch { expected: &'static str },
    /// Arithmetic overflow occurred while computing `what`.
    ArithmeticOverflow { what: &'static str },
    /// A pattern ID failed validation while reading `what`.
    PatternID { err: PatternIDError, what: &'static str },
    /// A state ID failed validation while reading `what`.
    StateID { err: StateIDError, what: &'static str },
}
|
||||
|
||||
impl DeserializeError {
|
||||
pub(crate) fn generic(msg: &'static str) -> DeserializeError {
|
||||
DeserializeError(DeserializeErrorKind::Generic { msg })
|
||||
}
|
||||
|
||||
pub(crate) fn buffer_too_small(what: &'static str) -> DeserializeError {
|
||||
DeserializeError(DeserializeErrorKind::BufferTooSmall { what })
|
||||
}
|
||||
|
||||
fn invalid_usize(what: &'static str) -> DeserializeError {
|
||||
DeserializeError(DeserializeErrorKind::InvalidUsize { what })
|
||||
}
|
||||
|
||||
fn version_mismatch(expected: u32, found: u32) -> DeserializeError {
|
||||
DeserializeError(DeserializeErrorKind::VersionMismatch {
|
||||
expected,
|
||||
found,
|
||||
})
|
||||
}
|
||||
|
||||
fn endian_mismatch(expected: u32, found: u32) -> DeserializeError {
|
||||
DeserializeError(DeserializeErrorKind::EndianMismatch {
|
||||
expected,
|
||||
found,
|
||||
})
|
||||
}
|
||||
|
||||
fn alignment_mismatch(
|
||||
alignment: usize,
|
||||
address: usize,
|
||||
) -> DeserializeError {
|
||||
DeserializeError(DeserializeErrorKind::AlignmentMismatch {
|
||||
alignment,
|
||||
address,
|
||||
})
|
||||
}
|
||||
|
||||
fn label_mismatch(expected: &'static str) -> DeserializeError {
|
||||
DeserializeError(DeserializeErrorKind::LabelMismatch { expected })
|
||||
}
|
||||
|
||||
fn arithmetic_overflow(what: &'static str) -> DeserializeError {
|
||||
DeserializeError(DeserializeErrorKind::ArithmeticOverflow { what })
|
||||
}
|
||||
|
||||
fn pattern_id_error(
|
||||
err: PatternIDError,
|
||||
what: &'static str,
|
||||
) -> DeserializeError {
|
||||
DeserializeError(DeserializeErrorKind::PatternID { err, what })
|
||||
}
|
||||
|
||||
pub(crate) fn state_id_error(
|
||||
err: StateIDError,
|
||||
what: &'static str,
|
||||
) -> DeserializeError {
|
||||
DeserializeError(DeserializeErrorKind::StateID { err, what })
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
impl std::error::Error for DeserializeError {}
|
||||
|
||||
impl core::fmt::Display for DeserializeError {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        use self::DeserializeErrorKind::*;

        // Render a human readable message for each failure mode. Note that
        // a trailing '\' in a string literal elides both the newline and the
        // leading whitespace of the continuation line, so the messages below
        // are single-line strings at runtime.
        match self.0 {
            Generic { msg } => write!(f, "{}", msg),
            BufferTooSmall { what } => {
                write!(f, "buffer is too small to read {}", what)
            }
            InvalidUsize { what } => {
                write!(f, "{} is too big to fit in a usize", what)
            }
            VersionMismatch { expected, found } => write!(
                f,
                "unsupported version: \
                 expected version {} but found version {}",
                expected, found,
            ),
            EndianMismatch { expected, found } => write!(
                f,
                "endianness mismatch: expected 0x{:X} but got 0x{:X}. \
                 (Are you trying to load an object serialized with a \
                 different endianness?)",
                expected, found,
            ),
            AlignmentMismatch { alignment, address } => write!(
                f,
                "alignment mismatch: slice starts at address \
                 0x{:X}, which is not aligned to a {} byte boundary",
                address, alignment,
            ),
            LabelMismatch { expected } => write!(
                f,
                "label mismatch: start of serialized object should \
                 contain a NUL terminated {:?} label, but a different \
                 label was found",
                expected,
            ),
            ArithmeticOverflow { what } => {
                write!(f, "arithmetic overflow for {}", what)
            }
            PatternID { ref err, what } => {
                write!(f, "failed to read pattern ID for {}: {}", what, err)
            }
            StateID { ref err, what } => {
                write!(f, "failed to read state ID for {}: {}", what, err)
            }
        }
    }
}
|
||||
|
||||
/// Safely converts a `&[u32]` to `&[StateID]` with zero cost.
///
/// The conversion is a pointer cast: no copying or per-element validation
/// takes place.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn u32s_to_state_ids(slice: &[u32]) -> &[StateID] {
    // SAFETY: This is safe because StateID is defined to have the same memory
    // representation as a u32 (it is repr(transparent)). While not every u32
    // is a "valid" StateID, callers are not permitted to rely on the validity
    // of StateIDs for memory safety. It can only lead to logical errors. (This
    // is why StateID::new_unchecked is safe.)
    unsafe {
        core::slice::from_raw_parts(
            slice.as_ptr().cast::<StateID>(),
            slice.len(),
        )
    }
}
|
||||
|
||||
/// Safely converts a `&mut [u32]` to `&mut [StateID]` with zero cost.
|
||||
pub(crate) fn u32s_to_state_ids_mut(slice: &mut [u32]) -> &mut [StateID] {
|
||||
// SAFETY: This is safe because StateID is defined to have the same memory
|
||||
// representation as a u32 (it is repr(transparent)). While not every u32
|
||||
// is a "valid" StateID, callers are not permitted to rely on the validity
|
||||
// of StateIDs for memory safety. It can only lead to logical errors. (This
|
||||
// is why StateID::new_unchecked is safe.)
|
||||
unsafe {
|
||||
core::slice::from_raw_parts_mut(
|
||||
slice.as_mut_ptr().cast::<StateID>(),
|
||||
slice.len(),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Safely converts a `&[u32]` to `&[PatternID]` with zero cost.
///
/// The conversion is a pointer cast: no copying or per-element validation
/// takes place.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn u32s_to_pattern_ids(slice: &[u32]) -> &[PatternID] {
    // SAFETY: This is safe because PatternID is defined to have the same
    // memory representation as a u32 (it is repr(transparent)). While not
    // every u32 is a "valid" PatternID, callers are not permitted to rely
    // on the validity of PatternIDs for memory safety. It can only lead to
    // logical errors. (This is why PatternID::new_unchecked is safe.)
    unsafe {
        core::slice::from_raw_parts(
            slice.as_ptr().cast::<PatternID>(),
            slice.len(),
        )
    }
}
|
||||
|
||||
/// Checks that the given slice has an alignment that matches `T`.
|
||||
///
|
||||
/// This is useful for checking that a slice has an appropriate alignment
|
||||
/// before casting it to a &[T]. Note though that alignment is not itself
|
||||
/// sufficient to perform the cast for any `T`.
|
||||
pub(crate) fn check_alignment<T>(
|
||||
slice: &[u8],
|
||||
) -> Result<(), DeserializeError> {
|
||||
let alignment = core::mem::align_of::<T>();
|
||||
let address = slice.as_ptr().as_usize();
|
||||
if address % alignment == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
Err(DeserializeError::alignment_mismatch(alignment, address))
|
||||
}
|
||||
|
||||
/// Counts the (possibly empty) run of NUL padding bytes, up to 7, at the
/// beginning of the given slice, and returns that count as the number of
/// bytes read.
///
/// Such padding can be theoretically necessary so that a serialized object
/// begins at a correctly aligned address; the padding bytes sit immediately
/// before the label.
pub(crate) fn skip_initial_padding(slice: &[u8]) -> usize {
    // At most 7 padding bytes can precede the label, so cap the scan there;
    // `take` also bounds us by the slice length for free.
    slice.iter().take(7).take_while(|&&b| b == 0).count()
}
|
||||
|
||||
/// Allocate a byte buffer of the given size, along with some initial padding
/// such that `buf[padding..]` has the same alignment as `T`, where the
/// alignment of `T` must be at most `8`. In particular, callers should treat
/// the first N bytes (second return value) as padding bytes that must not be
/// overwritten. In all cases, the following identity holds:
///
/// ```ignore
/// let (buf, padding) = alloc_aligned_buffer::<StateID>(SIZE);
/// assert_eq!(SIZE, buf[padding..].len());
/// ```
///
/// In practice, padding is often zero.
///
/// The requirement for `8` as a maximum here is somewhat arbitrary. In
/// practice, we never need anything bigger in this crate, and so this function
/// does some sanity asserts under the assumption of a max alignment of `8`.
#[cfg(feature = "alloc")]
pub(crate) fn alloc_aligned_buffer<T>(size: usize) -> (Vec<u8>, usize) {
    // NOTE: This is a kludge because there's no easy way to allocate a Vec<u8>
    // with an alignment guaranteed to be greater than 1. We could create a
    // Vec<u32>, but this cannot be safely transmuted to a Vec<u8> without
    // concern, since reallocing or dropping the Vec<u8> is UB (different
    // alignment than the initial allocation). We could define a wrapper type
    // to manage this for us, but it seems like more machinery than it's worth.
    let buf = vec![0; size];
    let align = core::mem::align_of::<T>();
    let address = buf.as_ptr().as_usize();
    // Common case: the allocator already gave us a suitably aligned buffer.
    if address % align == 0 {
        return (buf, 0);
    }
    // Let's try this again. We have to create a totally new alloc with
    // the maximum amount of bytes we might need. We can't just extend our
    // pre-existing 'buf' because that might create a new alloc with a
    // different alignment.
    let extra = align - 1;
    let mut buf = vec![0; size + extra];
    let address = buf.as_ptr().as_usize();
    // The code below handles the case where 'address' is aligned to T, so if
    // we got lucky and 'address' is now aligned to T (when it previously
    // wasn't), then we're done.
    if address % align == 0 {
        buf.truncate(size);
        return (buf, 0);
    }
    // Distance from 'address' up to the next multiple of 'align': round
    // 'address' down to a multiple of 'align' (the mask relies on 'align'
    // being a power of 2), step up by 'align', then subtract 'address'.
    let padding = ((address & !(align - 1)).checked_add(align).unwrap())
        .checked_sub(address)
        .unwrap();
    assert!(padding <= 7, "padding of {} is bigger than 7", padding);
    assert!(
        padding <= extra,
        "padding of {} is bigger than extra {} bytes",
        padding,
        extra
    );
    // Keep exactly 'size' usable bytes after the padding.
    buf.truncate(size + padding);
    assert_eq!(size + padding, buf.len());
    assert_eq!(
        0,
        buf[padding..].as_ptr().as_usize() % align,
        "expected end of initial padding to be aligned to {}",
        align,
    );
    (buf, padding)
}
|
||||
|
||||
/// Reads a NUL terminated label starting at the beginning of the given slice.
|
||||
///
|
||||
/// If a NUL terminated label could not be found, then an error is returned.
|
||||
/// Similarly, if a label is found but doesn't match the expected label, then
|
||||
/// an error is returned.
|
||||
///
|
||||
/// Upon success, the total number of bytes read (including padding bytes) is
|
||||
/// returned.
|
||||
pub(crate) fn read_label(
|
||||
slice: &[u8],
|
||||
expected_label: &'static str,
|
||||
) -> Result<usize, DeserializeError> {
|
||||
// Set an upper bound on how many bytes we scan for a NUL. Since no label
|
||||
// in this crate is longer than 256 bytes, if we can't find one within that
|
||||
// range, then we have corrupted data.
|
||||
let first_nul =
|
||||
slice[..cmp::min(slice.len(), 256)].iter().position(|&b| b == 0);
|
||||
let first_nul = match first_nul {
|
||||
Some(first_nul) => first_nul,
|
||||
None => {
|
||||
return Err(DeserializeError::generic(
|
||||
"could not find NUL terminated label \
|
||||
at start of serialized object",
|
||||
));
|
||||
}
|
||||
};
|
||||
let len = first_nul + padding_len(first_nul);
|
||||
if slice.len() < len {
|
||||
return Err(DeserializeError::generic(
|
||||
"could not find properly sized label at start of serialized object"
|
||||
));
|
||||
}
|
||||
if expected_label.as_bytes() != &slice[..first_nul] {
|
||||
return Err(DeserializeError::label_mismatch(expected_label));
|
||||
}
|
||||
Ok(len)
|
||||
}
|
||||
|
||||
/// Writes the given label to the buffer as a NUL terminated string. The label
|
||||
/// given must not contain NUL, otherwise this will panic. Similarly, the label
|
||||
/// must not be longer than 255 bytes, otherwise this will panic.
|
||||
///
|
||||
/// Additional NUL bytes are written as necessary to ensure that the number of
|
||||
/// bytes written is always a multiple of 4.
|
||||
///
|
||||
/// Upon success, the total number of bytes written (including padding) is
|
||||
/// returned.
|
||||
pub(crate) fn write_label(
|
||||
label: &str,
|
||||
dst: &mut [u8],
|
||||
) -> Result<usize, SerializeError> {
|
||||
let nwrite = write_label_len(label);
|
||||
if dst.len() < nwrite {
|
||||
return Err(SerializeError::buffer_too_small("label"));
|
||||
}
|
||||
dst[..label.len()].copy_from_slice(label.as_bytes());
|
||||
for i in 0..(nwrite - label.len()) {
|
||||
dst[label.len() + i] = 0;
|
||||
}
|
||||
assert_eq!(nwrite % 4, 0);
|
||||
Ok(nwrite)
|
||||
}
|
||||
|
||||
/// Returns the total number of bytes (including padding) that would be written
|
||||
/// for the given label. This panics if the given label contains a NUL byte or
|
||||
/// is longer than 255 bytes. (The size restriction exists so that searching
|
||||
/// for a label during deserialization can be done in small bounded space.)
|
||||
pub(crate) fn write_label_len(label: &str) -> usize {
|
||||
if label.len() > 255 {
|
||||
panic!("label must not be longer than 255 bytes");
|
||||
}
|
||||
if label.as_bytes().iter().position(|&b| b == 0).is_some() {
|
||||
panic!("label must not contain NUL bytes");
|
||||
}
|
||||
let label_len = label.len() + 1; // +1 for the NUL terminator
|
||||
label_len + padding_len(label_len)
|
||||
}
|
||||
|
||||
/// Reads the endianness check from the beginning of the given slice and
|
||||
/// confirms that the endianness of the serialized object matches the expected
|
||||
/// endianness. If the slice is too small or if the endianness check fails,
|
||||
/// this returns an error.
|
||||
///
|
||||
/// Upon success, the total number of bytes read is returned.
|
||||
pub(crate) fn read_endianness_check(
|
||||
slice: &[u8],
|
||||
) -> Result<usize, DeserializeError> {
|
||||
let (n, nr) = try_read_u32(slice, "endianness check")?;
|
||||
assert_eq!(nr, write_endianness_check_len());
|
||||
if n != 0xFEFF {
|
||||
return Err(DeserializeError::endian_mismatch(0xFEFF, n));
|
||||
}
|
||||
Ok(nr)
|
||||
}
|
||||
|
||||
/// Writes 0xFEFF as an integer using the given endianness.
|
||||
///
|
||||
/// This is useful for writing into the header of a serialized object. It can
|
||||
/// be read during deserialization as a sanity check to ensure the proper
|
||||
/// endianness is used.
|
||||
///
|
||||
/// Upon success, the total number of bytes written is returned.
|
||||
pub(crate) fn write_endianness_check<E: Endian>(
|
||||
dst: &mut [u8],
|
||||
) -> Result<usize, SerializeError> {
|
||||
let nwrite = write_endianness_check_len();
|
||||
if dst.len() < nwrite {
|
||||
return Err(SerializeError::buffer_too_small("endianness check"));
|
||||
}
|
||||
E::write_u32(0xFEFF, dst);
|
||||
Ok(nwrite)
|
||||
}
|
||||
|
||||
/// Returns the number of bytes written by the endianness check.
// The marker is serialized as a single u32, so this is always 4.
pub(crate) fn write_endianness_check_len() -> usize {
    size_of::<u32>()
}
|
||||
|
||||
/// Reads a version number from the beginning of the given slice and confirms
|
||||
/// that is matches the expected version number given. If the slice is too
|
||||
/// small or if the version numbers aren't equivalent, this returns an error.
|
||||
///
|
||||
/// Upon success, the total number of bytes read is returned.
|
||||
///
|
||||
/// N.B. Currently, we require that the version number is exactly equivalent.
|
||||
/// In the future, if we bump the version number without a semver bump, then
|
||||
/// we'll need to relax this a bit and support older versions.
|
||||
pub(crate) fn read_version(
|
||||
slice: &[u8],
|
||||
expected_version: u32,
|
||||
) -> Result<usize, DeserializeError> {
|
||||
let (n, nr) = try_read_u32(slice, "version")?;
|
||||
assert_eq!(nr, write_version_len());
|
||||
if n != expected_version {
|
||||
return Err(DeserializeError::version_mismatch(expected_version, n));
|
||||
}
|
||||
Ok(nr)
|
||||
}
|
||||
|
||||
/// Writes the given version number to the beginning of the given slice.
|
||||
///
|
||||
/// This is useful for writing into the header of a serialized object. It can
|
||||
/// be read during deserialization as a sanity check to ensure that the library
|
||||
/// code supports the format of the serialized object.
|
||||
///
|
||||
/// Upon success, the total number of bytes written is returned.
|
||||
pub(crate) fn write_version<E: Endian>(
|
||||
version: u32,
|
||||
dst: &mut [u8],
|
||||
) -> Result<usize, SerializeError> {
|
||||
let nwrite = write_version_len();
|
||||
if dst.len() < nwrite {
|
||||
return Err(SerializeError::buffer_too_small("version number"));
|
||||
}
|
||||
E::write_u32(version, dst);
|
||||
Ok(nwrite)
|
||||
}
|
||||
|
||||
/// Returns the number of bytes written by writing the version number.
// The version is serialized as a single u32, so this is always 4.
pub(crate) fn write_version_len() -> usize {
    size_of::<u32>()
}
|
||||
|
||||
/// Reads a pattern ID from the given slice. If the slice has insufficient
|
||||
/// length, then this panics. If the deserialized integer exceeds the pattern
|
||||
/// ID limit for the current target, then this returns an error.
|
||||
///
|
||||
/// Upon success, this also returns the number of bytes read.
|
||||
pub(crate) fn read_pattern_id(
|
||||
slice: &[u8],
|
||||
what: &'static str,
|
||||
) -> Result<(PatternID, usize), DeserializeError> {
|
||||
let bytes: [u8; PatternID::SIZE] =
|
||||
slice[..PatternID::SIZE].try_into().unwrap();
|
||||
let pid = PatternID::from_ne_bytes(bytes)
|
||||
.map_err(|err| DeserializeError::pattern_id_error(err, what))?;
|
||||
Ok((pid, PatternID::SIZE))
|
||||
}
|
||||
|
||||
/// Reads a pattern ID from the given slice. If the slice has insufficient
|
||||
/// length, then this panics. Otherwise, the deserialized integer is assumed
|
||||
/// to be a valid pattern ID.
|
||||
///
|
||||
/// This also returns the number of bytes read.
|
||||
pub(crate) fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) {
|
||||
let pid = PatternID::from_ne_bytes_unchecked(
|
||||
slice[..PatternID::SIZE].try_into().unwrap(),
|
||||
);
|
||||
(pid, PatternID::SIZE)
|
||||
}
|
||||
|
||||
/// Write the given pattern ID to the beginning of the given slice of bytes
|
||||
/// using the specified endianness. The given slice must have length at least
|
||||
/// `PatternID::SIZE`, or else this panics. Upon success, the total number of
|
||||
/// bytes written is returned.
|
||||
pub(crate) fn write_pattern_id<E: Endian>(
|
||||
pid: PatternID,
|
||||
dst: &mut [u8],
|
||||
) -> usize {
|
||||
E::write_u32(pid.as_u32(), dst);
|
||||
PatternID::SIZE
|
||||
}
|
||||
|
||||
/// Attempts to read a state ID from the given slice. If the slice has an
|
||||
/// insufficient number of bytes or if the state ID exceeds the limit for
|
||||
/// the current target, then this returns an error.
|
||||
///
|
||||
/// Upon success, this also returns the number of bytes read.
|
||||
pub(crate) fn try_read_state_id(
|
||||
slice: &[u8],
|
||||
what: &'static str,
|
||||
) -> Result<(StateID, usize), DeserializeError> {
|
||||
if slice.len() < StateID::SIZE {
|
||||
return Err(DeserializeError::buffer_too_small(what));
|
||||
}
|
||||
read_state_id(slice, what)
|
||||
}
|
||||
|
||||
/// Reads a state ID from the given slice. If the slice has insufficient
|
||||
/// length, then this panics. If the deserialized integer exceeds the state ID
|
||||
/// limit for the current target, then this returns an error.
|
||||
///
|
||||
/// Upon success, this also returns the number of bytes read.
|
||||
pub(crate) fn read_state_id(
|
||||
slice: &[u8],
|
||||
what: &'static str,
|
||||
) -> Result<(StateID, usize), DeserializeError> {
|
||||
let bytes: [u8; StateID::SIZE] =
|
||||
slice[..StateID::SIZE].try_into().unwrap();
|
||||
let sid = StateID::from_ne_bytes(bytes)
|
||||
.map_err(|err| DeserializeError::state_id_error(err, what))?;
|
||||
Ok((sid, StateID::SIZE))
|
||||
}
|
||||
|
||||
/// Reads a state ID from the given slice. If the slice has insufficient
|
||||
/// length, then this panics. Otherwise, the deserialized integer is assumed
|
||||
/// to be a valid state ID.
|
||||
///
|
||||
/// This also returns the number of bytes read.
|
||||
pub(crate) fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) {
|
||||
let sid = StateID::from_ne_bytes_unchecked(
|
||||
slice[..StateID::SIZE].try_into().unwrap(),
|
||||
);
|
||||
(sid, StateID::SIZE)
|
||||
}
|
||||
|
||||
/// Write the given state ID to the beginning of the given slice of bytes
|
||||
/// using the specified endianness. The given slice must have length at least
|
||||
/// `StateID::SIZE`, or else this panics. Upon success, the total number of
|
||||
/// bytes written is returned.
|
||||
pub(crate) fn write_state_id<E: Endian>(
|
||||
sid: StateID,
|
||||
dst: &mut [u8],
|
||||
) -> usize {
|
||||
E::write_u32(sid.as_u32(), dst);
|
||||
StateID::SIZE
|
||||
}
|
||||
|
||||
/// Try to read a u16 as a usize from the beginning of the given slice in
|
||||
/// native endian format. If the slice has fewer than 2 bytes or if the
|
||||
/// deserialized number cannot be represented by usize, then this returns an
|
||||
/// error. The error message will include the `what` description of what is
|
||||
/// being deserialized, for better error messages. `what` should be a noun in
|
||||
/// singular form.
|
||||
///
|
||||
/// Upon success, this also returns the number of bytes read.
|
||||
pub(crate) fn try_read_u16_as_usize(
|
||||
slice: &[u8],
|
||||
what: &'static str,
|
||||
) -> Result<(usize, usize), DeserializeError> {
|
||||
try_read_u16(slice, what).and_then(|(n, nr)| {
|
||||
usize::try_from(n)
|
||||
.map(|n| (n, nr))
|
||||
.map_err(|_| DeserializeError::invalid_usize(what))
|
||||
})
|
||||
}
|
||||
|
||||
/// Try to read a u32 as a usize from the beginning of the given slice in
|
||||
/// native endian format. If the slice has fewer than 4 bytes or if the
|
||||
/// deserialized number cannot be represented by usize, then this returns an
|
||||
/// error. The error message will include the `what` description of what is
|
||||
/// being deserialized, for better error messages. `what` should be a noun in
|
||||
/// singular form.
|
||||
///
|
||||
/// Upon success, this also returns the number of bytes read.
|
||||
pub(crate) fn try_read_u32_as_usize(
|
||||
slice: &[u8],
|
||||
what: &'static str,
|
||||
) -> Result<(usize, usize), DeserializeError> {
|
||||
try_read_u32(slice, what).and_then(|(n, nr)| {
|
||||
usize::try_from(n)
|
||||
.map(|n| (n, nr))
|
||||
.map_err(|_| DeserializeError::invalid_usize(what))
|
||||
})
|
||||
}
|
||||
|
||||
/// Try to read a u16 from the beginning of the given slice in native endian
|
||||
/// format. If the slice has fewer than 2 bytes, then this returns an error.
|
||||
/// The error message will include the `what` description of what is being
|
||||
/// deserialized, for better error messages. `what` should be a noun in
|
||||
/// singular form.
|
||||
///
|
||||
/// Upon success, this also returns the number of bytes read.
|
||||
pub(crate) fn try_read_u16(
|
||||
slice: &[u8],
|
||||
what: &'static str,
|
||||
) -> Result<(u16, usize), DeserializeError> {
|
||||
check_slice_len(slice, size_of::<u16>(), what)?;
|
||||
Ok((read_u16(slice), size_of::<u16>()))
|
||||
}
|
||||
|
||||
/// Try to read a u32 from the beginning of the given slice in native endian
|
||||
/// format. If the slice has fewer than 4 bytes, then this returns an error.
|
||||
/// The error message will include the `what` description of what is being
|
||||
/// deserialized, for better error messages. `what` should be a noun in
|
||||
/// singular form.
|
||||
///
|
||||
/// Upon success, this also returns the number of bytes read.
|
||||
pub(crate) fn try_read_u32(
|
||||
slice: &[u8],
|
||||
what: &'static str,
|
||||
) -> Result<(u32, usize), DeserializeError> {
|
||||
check_slice_len(slice, size_of::<u32>(), what)?;
|
||||
Ok((read_u32(slice), size_of::<u32>()))
|
||||
}
|
||||
|
||||
/// Try to read a u128 from the beginning of the given slice in native endian
|
||||
/// format. If the slice has fewer than 16 bytes, then this returns an error.
|
||||
/// The error message will include the `what` description of what is being
|
||||
/// deserialized, for better error messages. `what` should be a noun in
|
||||
/// singular form.
|
||||
///
|
||||
/// Upon success, this also returns the number of bytes read.
|
||||
pub(crate) fn try_read_u128(
|
||||
slice: &[u8],
|
||||
what: &'static str,
|
||||
) -> Result<(u128, usize), DeserializeError> {
|
||||
check_slice_len(slice, size_of::<u128>(), what)?;
|
||||
Ok((read_u128(slice), size_of::<u128>()))
|
||||
}
|
||||
|
||||
/// Reads a u16 from the start of `slice` in native endian format.
///
/// Panics if `slice` contains fewer than 2 bytes.
///
/// Marked as inline to speed up sparse searching which decodes integers from
/// its automaton at search time.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn read_u16(slice: &[u8]) -> u16 {
    // Copy into a fixed-size array so we can use the infallible
    // from_ne_bytes constructor. (copy_from_slice panics on a short slice,
    // preserving the documented panic behavior.)
    let mut bytes = [0; size_of::<u16>()];
    bytes.copy_from_slice(&slice[..size_of::<u16>()]);
    u16::from_ne_bytes(bytes)
}
|
||||
|
||||
/// Reads a u32 from the start of `slice` in native endian format.
///
/// Panics if `slice` contains fewer than 4 bytes.
///
/// Marked as inline to speed up sparse searching which decodes integers from
/// its automaton at search time.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn read_u32(slice: &[u8]) -> u32 {
    // Copy into a fixed-size array so we can use the infallible
    // from_ne_bytes constructor. (copy_from_slice panics on a short slice,
    // preserving the documented panic behavior.)
    let mut bytes = [0; size_of::<u32>()];
    bytes.copy_from_slice(&slice[..size_of::<u32>()]);
    u32::from_ne_bytes(bytes)
}
|
||||
|
||||
/// Reads a u128 from the start of `slice` in native endian format.
///
/// Panics if `slice` contains fewer than 16 bytes.
pub(crate) fn read_u128(slice: &[u8]) -> u128 {
    // Copy into a fixed-size array so we can use the infallible
    // from_ne_bytes constructor. (copy_from_slice panics on a short slice,
    // preserving the documented panic behavior.)
    let mut bytes = [0; size_of::<u128>()];
    bytes.copy_from_slice(&slice[..size_of::<u128>()]);
    u128::from_ne_bytes(bytes)
}
|
||||
|
||||
/// Checks that the given slice has some minimal length. If it's smaller than
|
||||
/// the bound given, then a "buffer too small" error is returned with `what`
|
||||
/// describing what the buffer represents.
|
||||
pub(crate) fn check_slice_len<T>(
|
||||
slice: &[T],
|
||||
at_least_len: usize,
|
||||
what: &'static str,
|
||||
) -> Result<(), DeserializeError> {
|
||||
if slice.len() < at_least_len {
|
||||
return Err(DeserializeError::buffer_too_small(what));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Multiply the given numbers, and on overflow, return an error that includes
|
||||
/// 'what' in the error message.
|
||||
///
|
||||
/// This is useful when doing arithmetic with untrusted data.
|
||||
pub(crate) fn mul(
|
||||
a: usize,
|
||||
b: usize,
|
||||
what: &'static str,
|
||||
) -> Result<usize, DeserializeError> {
|
||||
match a.checked_mul(b) {
|
||||
Some(c) => Ok(c),
|
||||
None => Err(DeserializeError::arithmetic_overflow(what)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Add the given numbers, and on overflow, return an error that includes
|
||||
/// 'what' in the error message.
|
||||
///
|
||||
/// This is useful when doing arithmetic with untrusted data.
|
||||
pub(crate) fn add(
|
||||
a: usize,
|
||||
b: usize,
|
||||
what: &'static str,
|
||||
) -> Result<usize, DeserializeError> {
|
||||
match a.checked_add(b) {
|
||||
Some(c) => Ok(c),
|
||||
None => Err(DeserializeError::arithmetic_overflow(what)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Shift `a` left by `b`, and on overflow, return an error that includes
|
||||
/// 'what' in the error message.
|
||||
///
|
||||
/// This is useful when doing arithmetic with untrusted data.
|
||||
pub(crate) fn shl(
|
||||
a: usize,
|
||||
b: usize,
|
||||
what: &'static str,
|
||||
) -> Result<usize, DeserializeError> {
|
||||
let amount = u32::try_from(b)
|
||||
.map_err(|_| DeserializeError::arithmetic_overflow(what))?;
|
||||
match a.checked_shl(amount) {
|
||||
Some(c) => Ok(c),
|
||||
None => Err(DeserializeError::arithmetic_overflow(what)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns how many bytes must be appended to a buffer of length
/// `non_padding_len` to round the total length up to a multiple of 4.
///
/// The result is always in the range `0..=3`.
pub(crate) fn padding_len(non_padding_len: usize) -> usize {
    (4 - non_padding_len % 4) % 4
}
|
||||
|
||||
/// A simple trait for writing code generic over endianness.
///
/// This is similar to what byteorder provides, but we only need a very small
/// subset. Implemented by the uninhabited marker types `LE` and `BE` below.
pub(crate) trait Endian {
    /// Writes a u16 to the given destination buffer in a particular
    /// endianness. If the destination buffer has a length smaller than 2, then
    /// this panics.
    fn write_u16(n: u16, dst: &mut [u8]);

    /// Writes a u32 to the given destination buffer in a particular
    /// endianness. If the destination buffer has a length smaller than 4, then
    /// this panics.
    fn write_u32(n: u32, dst: &mut [u8]);

    /// Writes a u128 to the given destination buffer in a particular
    /// endianness. If the destination buffer has a length smaller than 16,
    /// then this panics.
    fn write_u128(n: u128, dst: &mut [u8]);
}
|
||||
|
||||
/// Little endian writing.
///
/// An uninhabited enum: never constructed, it exists only as a type-level
/// marker implementing `Endian`.
pub(crate) enum LE {}
/// Big endian writing.
///
/// Like `LE`, an uninhabited type-level marker implementing `Endian`.
pub(crate) enum BE {}

/// The endianness of the current compilation target, selected at compile
/// time via `cfg(target_endian)`.
#[cfg(target_endian = "little")]
pub(crate) type NE = LE;
#[cfg(target_endian = "big")]
pub(crate) type NE = BE;
|
||||
|
||||
impl Endian for LE {
    // Each writer serializes via to_le_bytes and copies into the prefix of
    // `dst`; copy_from_slice panics if `dst` is shorter than the integer,
    // matching the trait's documented panic behavior.
    fn write_u16(n: u16, dst: &mut [u8]) {
        let bytes = n.to_le_bytes();
        dst[..bytes.len()].copy_from_slice(&bytes);
    }

    fn write_u32(n: u32, dst: &mut [u8]) {
        let bytes = n.to_le_bytes();
        dst[..bytes.len()].copy_from_slice(&bytes);
    }

    fn write_u128(n: u128, dst: &mut [u8]) {
        let bytes = n.to_le_bytes();
        dst[..bytes.len()].copy_from_slice(&bytes);
    }
}
|
||||
|
||||
impl Endian for BE {
    // Each writer serializes via to_be_bytes and copies into the prefix of
    // `dst`; copy_from_slice panics if `dst` is shorter than the integer,
    // matching the trait's documented panic behavior.
    fn write_u16(n: u16, dst: &mut [u8]) {
        let bytes = n.to_be_bytes();
        dst[..bytes.len()].copy_from_slice(&bytes);
    }

    fn write_u32(n: u32, dst: &mut [u8]) {
        let bytes = n.to_be_bytes();
        dst[..bytes.len()].copy_from_slice(&bytes);
    }

    fn write_u128(n: u128, dst: &mut [u8]) {
        let bytes = n.to_be_bytes();
        dst[..bytes.len()].copy_from_slice(&bytes);
    }
}
|
||||
|
||||
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::*;

    #[test]
    fn labels() {
        let mut dst = [0; 1024];

        // A 5-byte label serializes to 8 bytes: the label itself plus a NUL
        // terminator and padding up to a multiple of 4.
        let nwrite = write_label("fooba", &mut dst).unwrap();
        assert_eq!(8, nwrite);
        assert_eq!(&dst[..nwrite], b"fooba\x00\x00\x00");

        let nread = read_label(&dst, "fooba").unwrap();
        assert_eq!(8, nread);
    }

    #[test]
    #[should_panic]
    fn bad_label_interior_nul() {
        // interior NULs are not allowed
        write_label("foo\x00bar", &mut [0; 1024]).unwrap();
    }

    #[test]
    fn bad_label_almost_too_long() {
        // 255 bytes is the maximum permitted label length, so this is ok.
        let label = "z".repeat(255);
        write_label(&label, &mut [0; 1024]).unwrap();
    }

    #[test]
    #[should_panic]
    fn bad_label_too_long() {
        // labels longer than 255 bytes are banned
        let label = "z".repeat(256);
        write_label(&label, &mut [0; 1024]).unwrap();
    }

    #[test]
    fn padding() {
        // Padding depends only on the length mod 4: 0 needs 0, 1 needs 3,
        // 2 needs 2 and 3 needs 1.
        let expected = [0, 3, 2, 1];
        for len in 8..=16 {
            assert_eq!(expected[len % 4], padding_len(len));
        }
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue