Vendor things

parent 5deceec006
commit 977e3c17e5

19434 changed files with 10682014 additions and 0 deletions
third-party/vendor/regex-automata-0.1.10/.cargo-checksum.json (vendored, new file, 1 line)
@@ -0,0 +1 @@
{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"674fda607d585e7a9d1d07e6fee2807e6a1a3709ca8d5a507dac051cac84dcf1","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"34ebd8d165fbd934198653a6d619d62788ff72f0e058139459d4369683423551","TODO":"daea9f7378f543311d657e6ef3d2a09d51e82b9e70d0026140130862c32b3c08","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","data/fowler-tests/LICENSE":"58cf078acc03da3e280a938c2bd9943f554fc9b6ced89ad93ba35ca436872899","data/fowler-tests/README":"45f869e37f798905c773bfbe0ef19a5fb7e585cbf0b7c21b5b5a784e8cec3c14","data/fowler-tests/basic.dat":"3756a5bdd6f387ed34731197fbdaab8521b0ae1726250100ba235756cb42b4b1","data/fowler-tests/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","data/fowler-tests/repetition.dat":"1f7959063015b284b18a4a2c1c8b416d438a2d6c4b1a362da43406b865f50e69","data/tests/crazy.toml":"b6e644a74b990a4344b15e7366da36e5b3f73a183944e249082f74c23ff01e5f","data/tests/flags.toml":"aefd9483c1c9c52c3669a9f2e88cd494c293f2e14c59aecb1d94dbb82546a705","data/tests/fowler/LICENSE":"58cf078acc03da3e280a938c2bd9943f554fc9b6ced89ad93ba35ca436872899","data/tests/fowler/README":"e9f049297023d5a81c5c600280016fe0271e7d0eda898c41399eb61431820404","data/tests/fowler/basic.dat":"3756a5bdd6f387ed34731197fbdaab8521b0ae1726250100ba235756cb42b4b1","data/tests/fowler/basic.toml":"7b043231ca8c89dbd10cef0de3b0be18c9ae442be1e99a657cd412b8b7edec21","data/tests/fowler/fowler-to-toml":"5bb78b924f3b6b1c27278b37baae556115fe03c864c1d33a7c53718b99885515","data/tests/fowler/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","data/tests/fowler/nullsubexpr.toml":"7e4bf9fec1c4a8aca04cc96e74b3f51ed6b8c3f85e4bfc7acc9c74ab95166976","data/tests/fowler/repetition-long.dat":"040b869c8c3a02ed0497943af1860094fd8437892811180fb9624a41eb491f23","data/tests/fowler/repetition-long.toml":"3eb7199d936b3f7eb9863ebc3b0c94648cfc32192f626dcfa33ddf352918c1c0","data/tests/fowler/repetition.dat":"d8fa959c2be524c90f2092f7c41ff3a7b48232b4378cb3b090625e74e76fc625","data/tests/fowler/repetition.toml":"ccf21430a325c4e1dae4eb6c52e3cea5d3c1847559ba6e75466bdb6bbd98204d","data/tests/iter.toml":"99adc397fe0a00c759eb659531d3e69445b43f5ecd5771c549117933b73bd43e","data/tests/no-unicode.toml":"f329ee939c2d07a17e51f0090d9f2431395e47dac8e0b982fb5e16e0555b75e3","data/tests/unicode.toml":"0ff418de5bc238e4595956b66981fe02018938d57d76d11cab840606b9da60ba","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/byteorder.rs":"0827852aa563e3c5b3ffaf484ce8a34537e82719a3606d4b948bc8a1e21d8b18","src/classes.rs":"706c8a8a9bf70260b9c92ff865891fc26de0453495afca7b325afdf5e6a3e242","src/codegen.rs":"5686b97fec69158c7264183a71ad9a1ff8e74db02fa0fcfccaa0a516cbfc7d1d","src/dense.rs":"7561f35019b20642f2ee75fd20365e21a4c8260deb7cee84fa3f8264b9fd9a4b","src/determinize.rs":"876c844d0470854dbbe3eb4386611fd57d95a5a4ae38ee937fbb14676f0a383a","src/dfa.rs":"032f09d187ec8dd06ef09940515690af045ca9f7ef7f819c31a97607df1432e5","src/error.rs":"d07ecdc617e243a43a99e911398b9c37721afd2b9548153c5f359b8c4605c749","src/lib.rs":"520781bdd60d425b16ef72f03330362e7c2aec274338e73f309d730bea4d7ab0","src/minimize.rs":"dfa7b6a6f36bb2dedaee8bfc5c4bb943f59e0cf98cde5358822e70cbdb284a7e","src/nfa/compiler.rs":"f43901929f44efa420e441cbff8687e05059ceae88492a2ed6c49fdd5a6a6b04","src/nfa/map.rs":"b7e2e561d6fe5775716e27eded1ae3e2277a50073a2e182f3dabedcda5c30d27","src
/nfa/mod.rs":"93e7dee804751fcf66d48ca48b3467a4ab5155063461e69c428e46bcf977711d","src/nfa/range_trie.rs":"3a3d2853987619688ab5b61acef575f216d5bdd7b9e15fa508e0ba6f29c641a9","src/regex.rs":"2f3868a3fa52b2a040fd0fb9f12386b1af1f0f650d948e821c7ba83f087826f0","src/sparse.rs":"976540bcd134a225e5d39e1aef688f63b02b3d745249a3a95fec387a7ffb88cc","src/sparse_set.rs":"81bef5057781e26da39855b0f38b02ddfd09183bc62d30cf454ec706885e3a70","src/state_id.rs":"44c4bf1a5d091b97e8c1ce872bafe45d806905b07a73a6f82b1655b7897e7b5f","src/transducer.rs":"28c728ef45a3f6177d5a3ac589f166764c11d6c66bd5d916bcf30ad2be187a0c","tests/collection.rs":"2907cc0a32e5e59ceca4b34fe582f9275c12ee1a8d6e73d689056bdfd5357b9a","tests/regression.rs":"5a9b2654f88b1b07401c5b1fe925f62421bff67be7d80cae7a985eb66ed9886b","tests/suite.rs":"8148247667b34b370855c247ffcc9c6339f8f72d6fe481b79936afbb165dd6bd","tests/tests.rs":"f1b407d3d288a9c2b1500151205f9d0bcc0668b2ab38c5094ee459d6d4893e18","tests/unescape.rs":"67a7c466ba5c873a3c29f7e00649535ddc2921fcc14ac92cb207f43b4b6e461d"},"package":"6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"}
third-party/vendor/regex-automata-0.1.10/COPYING (vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
This project is dual-licensed under the Unlicense and MIT licenses.

You may use this code under the terms of either license.
third-party/vendor/regex-automata-0.1.10/Cargo.toml (vendored, new file, 86 lines)
@@ -0,0 +1,86 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies
#
# If you believe there's an error in this file please file an
# issue against the rust-lang/cargo repository. If you're
# editing this file be aware that the upstream Cargo.toml
# will likely look very different (and much more reasonable)

[package]
name = "regex-automata"
version = "0.1.10"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
exclude = ["/.travis.yml", "/appveyor.yml", "/ci/*", "/scripts/*", "/regex-automata-debug"]
autoexamples = false
autotests = false
description = "Automata construction and matching using regular expressions."
homepage = "https://github.com/BurntSushi/regex-automata"
documentation = "https://docs.rs/regex-automata"
readme = "README.md"
keywords = ["regex", "dfa", "automata", "automaton", "nfa"]
categories = ["text-processing"]
license = "Unlicense/MIT"
repository = "https://github.com/BurntSushi/regex-automata"
[profile.bench]
debug = true

[profile.dev]
opt-level = 3
debug = true

[profile.release]
debug = true

[profile.test]
opt-level = 3
debug = true

[lib]
bench = false

[[test]]
name = "default"
path = "tests/tests.rs"
[dependencies.fst]
version = "0.4.0"
optional = true

[dependencies.regex-syntax]
version = "0.6.16"
optional = true
[dev-dependencies.bstr]
version = "0.2"
features = ["std"]
default-features = false

[dev-dependencies.lazy_static]
version = "1.2.0"

[dev-dependencies.regex]
version = "1.1"

[dev-dependencies.serde]
version = "1.0.82"

[dev-dependencies.serde_bytes]
version = "0.11"

[dev-dependencies.serde_derive]
version = "1.0.82"

[dev-dependencies.toml]
version = "0.4.10"

[features]
default = ["std"]
std = ["regex-syntax"]
transducer = ["std", "fst"]
[badges.appveyor]
repository = "BurntSushi/regex-automata"

[badges.travis-ci]
repository = "BurntSushi/regex-automata"
third-party/vendor/regex-automata-0.1.10/LICENSE-MIT (vendored, new file, 21 lines)
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2015 Andrew Gallant

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
third-party/vendor/regex-automata-0.1.10/README.md (vendored, new file, 223 lines)
@@ -0,0 +1,223 @@
regex-automata
==============
A low level regular expression library that uses deterministic finite automata.
It supports a rich syntax with Unicode support, has extensive options for
configuring the best space vs time trade off for your use case and provides
support for cheap deserialization of automata for use in `no_std` environments.

[](https://github.com/BurntSushi/regex-automata/actions)
[](https://crates.io/crates/regex-automata)


Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/).


### Documentation

https://docs.rs/regex-automata


### Usage

Add this to your `Cargo.toml`:

```toml
[dependencies]
regex-automata = "0.1"
```

and this to your crate root (if you're using Rust 2015):

```rust
extern crate regex_automata;
```


### Example: basic regex searching

This example shows how to compile a regex using the default configuration
and then use it to find matches in a byte string:

```rust
use regex_automata::Regex;

let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
let text = b"2018-12-24 2016-10-08";
let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
assert_eq!(matches, vec![(0, 10), (11, 21)]);
```

For more examples and information about the various knobs that can be turned,
please see the [docs](https://docs.rs/regex-automata).


### Support for `no_std`

This crate comes with a `std` feature that is enabled by default. When the
`std` feature is enabled, the API of this crate will include the facilities
necessary for compiling, serializing, deserializing and searching with regular
expressions. When the `std` feature is disabled, the API of this crate will
shrink such that it only includes the facilities necessary for deserializing
and searching with regular expressions.

The intended workflow for `no_std` environments is thus as follows:

* Write a program with the `std` feature that compiles and serializes a
regular expression. Serialization should only happen after first converting
the DFAs to use a fixed size state identifier instead of the default `usize`.
You may also need to serialize both little and big endian versions of each
DFA. (So that's 4 DFAs in total for each regex.)
* In your `no_std` environment, follow the examples above for deserializing
your previously serialized DFAs into regexes. You can then search with them
as you would any regex.

Deserialization can happen anywhere. For example, with bytes embedded into a
binary or with a file memory mapped at runtime.

Note that the
[`ucd-generate`](https://github.com/BurntSushi/ucd-generate)
tool will do the first step for you with its `dfa` or `regex` sub-commands.


### Cargo features

* `std` - **Enabled** by default. This enables the ability to compile finite
automata. This requires the `regex-syntax` dependency. Without this feature
enabled, finite automata can only be used for searching (using the approach
described above).
* `transducer` - **Disabled** by default. This provides implementations of the
`Automaton` trait found in the `fst` crate. This permits using finite
automata generated by this crate to search finite state transducers. This
requires the `fst` dependency.


### Differences with the regex crate

The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
general purpose regular expression engine. It aims to automatically balance low
compile times, fast search times and low memory usage, while also providing
a convenient API for users. In contrast, this crate provides a lower level
regular expression interface that is a bit less convenient while providing more
explicit control over memory usage and search times.

Here are some specific negative differences:

* **Compilation can take an exponential amount of time and space** in the size
of the regex pattern. While most patterns do not exhibit worst case
exponential time, such patterns do exist. For example, `[01]*1[01]{N}` will
build a DFA with `2^(N+1)` states. For this reason, untrusted patterns should
not be compiled with this library. (In the future, the API may expose an
option to return an error if the DFA gets too big.)
* This crate does not support sub-match extraction, which can be achieved with
the regex crate's "captures" API. This may be added in the future, but is
unlikely.
* While the regex crate doesn't necessarily sport fast compilation times, the
regexes in this crate are almost universally slow to compile, especially when
they contain large Unicode character classes. For example, on my system,
compiling `\w{3}` with byte classes enabled takes just over 1 second and
almost 5MB of memory! (Compiling a sparse regex takes about the same time
but only uses about 500KB of memory.) Conversly, compiling the same regex
without Unicode support, e.g., `(?-u)\w{3}`, takes under 1 millisecond and
less than 5KB of memory. For this reason, you should only use Unicode
character classes if you absolutely need them!
* This crate does not support regex sets.
* This crate does not support zero-width assertions such as `^`, `$`, `\b` or
`\B`.
* As a lower level crate, this library does not do literal optimizations. In
exchange, you get predictable performance regardless of input. The
philosophy here is that literal optimizations should be applied at a higher
level, although there is no easy support for this in the ecosystem yet.
* There is no `&str` API like in the regex crate. In this crate, all APIs
operate on `&[u8]`. By default, match indices are guaranteed to fall on
UTF-8 boundaries, unless `RegexBuilder::allow_invalid_utf8` is enabled.

With some of the downsides out of the way, here are some positive differences:

* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
deserialized. Deserialization always takes constant time since searching can
be performed directly on the raw serialized bytes of a DFA.
* This crate was specifically designed so that the searching phase of a DFA has
minimal runtime requirements, and can therefore be used in `no_std`
environments. While `no_std` environments cannot compile regexes, they can
deserialize pre-compiled regexes.
* Since this crate builds DFAs ahead of time, it will generally out-perform
the `regex` crate on equivalent tasks. The performance difference is likely
not large. However, because of a complex set of optimizations in the regex
crate (like literal optimizations), an accurate performance comparison may be
difficult to do.
* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
performance a small amount, but uses much less storage space. Potentially
even less than what the regex crate uses.
* This crate exposes DFAs directly, such as `DenseDFA` and `SparseDFA`,
which enables one to do less work in some cases. For example, if you only
need the end of a match and not the start of a match, then you can use a DFA
directly without building a `Regex`, which always requires a second DFA to
find the start of a match.
* Aside from choosing between dense and sparse DFAs, there are several options
for configuring the space usage vs search time trade off. These include
things like choosing a smaller state identifier representation, to
premultiplying state identifiers and splitting a DFA's alphabet into
equivalence classes. Finally, DFA minimization is also provided, but can
increase compilation times dramatically.


### Future work

* Look into being smarter about generating NFA states for large Unicode
character classes. These can create a lot of additional work for both the
determinizer and the minimizer, and I suspect this is the key thing we'll
want to improve if we want to make DFA compile times faster. I *believe*
it's possible to potentially build minimal or nearly minimal NFAs for the
special case of Unicode character classes by leveraging Daciuk's algorithms
for building minimal automata in linear time for sets of strings. See
https://blog.burntsushi.net/transducers/#construction for more details. The
key adaptation I think we need to make is to modify the algorithm to operate
on byte ranges instead of enumerating every codepoint in the set. Otherwise,
it might not be worth doing.
* Add support for regex sets. It should be possible to do this by "simply"
introducing more match states. I think we can also report the positions at
each match, similar to how Aho-Corasick works. I think the long pole in the
tent here is probably the API design work and arranging it so that we don't
introduce extra overhead into the non-regex-set case without duplicating a
lot of code. It seems doable.
* Stretch goal: support capturing groups by implementing "tagged" DFA
(transducers). Laurikari's paper is the usual reference here, but Trofimovich
has a much more thorough treatment here:
https://re2c.org/2017_trofimovich_tagged_deterministic_finite_automata_with_lookahead.pdf
I've only read the paper once. I suspect it will require at least a few more
read throughs before I understand it.
See also: https://re2c.org
* Possibly less ambitious goal: can we select a portion of Trofimovich's work
to make small fixed length look-around work? It would be really nice to
support ^, $ and \b, especially the Unicode variant of \b and CRLF aware $.
* Experiment with code generating Rust code. There is an early experiment in
src/codegen.rs that is thoroughly bit-rotted. At the time, I was
experimenting with whether or not codegen would significant decrease the size
of a DFA, since if you squint hard enough, it's kind of like a sparse
representation. However, it didn't shrink as much as I thought it would, so
I gave up. The other problem is that Rust doesn't support gotos, so I don't
even know whether the "match on each state" in a loop thing will be fast
enough. Either way, it's probably a good option to have. For one thing, it
would be endian independent where as the serialization format of the DFAs in
this crate are endian dependent (so you need two versions of every DFA, but
you only need to compile one of them for any given arch).
* Experiment with unrolling the match loops and fill out the benchmarks.
* Add some kind of streaming API. I believe users of the library can already
implement something for this outside of the crate, but it would be good to
provide an official API. The key thing here is figuring out the API. I
suspect we might want to support several variants.
* Make a decision on whether or not there is room for literal optimizations
in this crate. My original intent was to not let this crate sink down into
that very very very deep rabbit hole. But instead, we might want to provide
some way for literal optimizations to hook into the match routines. The right
path forward here is to probably build something outside of the crate and
then see about integrating it. After all, users can implement their own
match routines just as efficiently as what the crate provides.
* A key downside of DFAs is that they can take up a lot of memory and can be
quite costly to build. Their worst case compilation time is O(2^n), where
n is the number of NFA states. A paper by Yang and Prasanna (2011) actually
seems to provide a way to character state blow up such that it is detectable.
If we could know whether a regex will exhibit state explosion or not, then
we could make an intelligent decision about whether to ahead-of-time compile
a DFA.
See: https://www.researchgate.net/profile/Xu-Shutu/publication/229032602_Characterization_of_a_global_germplasm_collection_and_its_potential_utilization_for_analysis_of_complex_quantitative_traits_in_maize/links/02bfe50f914d04c837000000/Characterization-of-a-global-germplasm-collection-and-its-potential-utilization-for-analysis-of-complex-quantitative-traits-in-maize.pdf
third-party/vendor/regex-automata-0.1.10/TODO (vendored, new file, 10 lines)
@@ -0,0 +1,10 @@
* Remove the `empty` constructors for DFAs and replace them with
`never_match` and `always_match` constructors.
* Consider refactoring the NFA representation such that it can be instantly
loaded from a `&[u8]`, just like a sparse DFA. Main downside is that this
could negatively impact using the NFA with deserialization costs. Before
doing this, we should write PikeVM and backtracking implementations so that
they can be benchmarked.
* Add captures and anchors to NFA.
* Once we're happy, re-organize the public API such that NFAs are exported
and usable on their own.
third-party/vendor/regex-automata-0.1.10/UNLICENSE (vendored, new file, 24 lines)
@@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <http://unlicense.org/>
third-party/vendor/regex-automata-0.1.10/data/fowler-tests/LICENSE (vendored, new file, 19 lines)
@@ -0,0 +1,19 @@
The following license covers testregex.c and all associated test data.

Permission is hereby granted, free of charge, to any person obtaining a
copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of the
Software, and to permit persons to whom the Software is furnished to do
so, subject to the following disclaimer:

THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
third-party/vendor/regex-automata-0.1.10/data/fowler-tests/README (vendored, new file, 17 lines)
@@ -0,0 +1,17 @@
Test data was taken from the Go distribution, which was in turn taken from the
testregex test suite:

http://www2.research.att.com/~astopen/testregex/testregex.html

The LICENSE in this directory corresponds to the LICENSE that the data was
released under.

The tests themselves were modified for RE2/Go. A couple were modified further
by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them.
(Yes, it seems like RE2/Go includes failing test cases.) This may or may not
have been a bad idea, but I think being consistent with an established Regex
library is worth something.

Note that these files are read by 'scripts/regex-match-tests.py' and turned
into Rust tests found in 'regex_macros/tests/matches.rs'.

third-party/vendor/regex-automata-0.1.10/data/fowler-tests/basic.dat (vendored, new file, 221 lines)
@@ -0,0 +1,221 @@
NOTE all standard compliant implementations should pass these : 2002-05-31
|
||||
|
||||
BE abracadabra$ abracadabracadabra (7,18)
|
||||
BE a...b abababbb (2,7)
|
||||
BE XXXXXX ..XXXXXX (2,8)
|
||||
E \) () (1,2)
|
||||
BE a] a]a (0,2)
|
||||
B } } (0,1)
|
||||
E \} } (0,1)
|
||||
BE \] ] (0,1)
|
||||
B ] ] (0,1)
|
||||
E ] ] (0,1)
|
||||
B { { (0,1)
|
||||
B } } (0,1)
|
||||
BE ^a ax (0,1)
|
||||
BE \^a a^a (1,3)
|
||||
BE a\^ a^ (0,2)
|
||||
BE a$ aa (1,2)
|
||||
BE a\$ a$ (0,2)
|
||||
BE ^$ NULL (0,0)
|
||||
E $^ NULL (0,0)
|
||||
E a($) aa (1,2)(2,2)
|
||||
E a*(^a) aa (0,1)(0,1)
|
||||
E (..)*(...)* a (0,0)
|
||||
E (..)*(...)* abcd (0,4)(2,4)
|
||||
E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
|
||||
E (ab)c|abc abc (0,3)(0,2)
|
||||
E a{0}b ab (1,2)
|
||||
E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
|
||||
E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
|
||||
E a{9876543210} NULL BADBR
|
||||
E ((a|a)|a) a (0,1)(0,1)(0,1)
|
||||
E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
|
||||
E a*(a.|aa) aaaa (0,4)(2,4)
|
||||
E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
|
||||
E (a|b)?.* b (0,1)(0,1)
|
||||
E (a|b)c|a(b|c) ac (0,2)(0,1)
|
||||
E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
|
||||
E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
|
||||
E (a|b)*c|(a|ab)*c xc (1,2)
|
||||
E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
|
||||
E a?(ab|ba)ab abab (0,4)(0,2)
|
||||
E a?(ac{0}b|ba)ab abab (0,4)(0,2)
|
||||
E ab|abab abbabab (0,2)
|
||||
E aba|bab|bba baaabbbaba (5,8)
|
||||
E aba|bab baaabbbaba (6,9)
|
||||
E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
|
||||
E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
|
||||
E ab|a xabc (1,3)
|
||||
E ab|a xxabc (2,4)
|
||||
Ei (Ab|cD)* aBcD (0,4)(2,4)
|
||||
BE [^-] --a (2,3)
|
||||
BE [a-]* --a (0,3)
|
||||
BE [a-m-]* --amoma-- (0,4)
|
||||
E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
|
||||
E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
|
||||
{E [[:upper:]] A (0,1) [[<element>]] not supported
|
||||
E [[:lower:]]+ `az{ (1,3)
|
||||
E [[:upper:]]+ @AZ[ (1,3)
|
||||
# No collation in Go
|
||||
#BE [[-]] [[-]] (2,4)
|
||||
#BE [[.NIL.]] NULL ECOLLATE
|
||||
#BE [[=aleph=]] NULL ECOLLATE
|
||||
}
|
||||
BE$ \n \n (0,1)
|
||||
BEn$ \n \n (0,1)
|
||||
BE$ [^a] \n (0,1)
|
||||
BE$ \na \na (0,2)
|
||||
E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
|
||||
BE xxx xxx (0,3)
|
||||
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
|
||||
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
|
||||
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
|
||||
E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
|
||||
E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
|
||||
E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
|
||||
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
|
||||
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
|
||||
E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
|
||||
BE$ .* \x01\x7f (0,2)
|
||||
E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
|
||||
L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
|
||||
E a*a*a*a*a*b aaaaaaaaab (0,10)
|
||||
BE ^ NULL (0,0)
|
||||
BE $ NULL (0,0)
|
||||
BE ^$ NULL (0,0)
|
||||
BE ^a$ a (0,1)
|
||||
BE abc abc (0,3)
|
||||
BE abc xabcy (1,4)
|
||||
BE abc ababc (2,5)
|
||||
BE ab*c abc (0,3)
|
||||
BE ab*bc abc (0,3)
|
||||
BE ab*bc abbc (0,4)
|
||||
BE ab*bc abbbbc (0,6)
|
||||
E ab+bc abbc (0,4)
|
||||
E ab+bc abbbbc (0,6)
|
||||
E ab?bc abbc (0,4)
|
||||
E ab?bc abc (0,3)
|
||||
E ab?c abc (0,3)
|
||||
BE ^abc$ abc (0,3)
|
||||
BE ^abc abcc (0,3)
|
||||
BE abc$ aabc (1,4)
|
||||
BE ^ abc (0,0)
|
||||
BE $ abc (3,3)
|
||||
BE a.c abc (0,3)
|
||||
BE a.c axc (0,3)
|
||||
BE a.*c axyzc (0,5)
|
||||
BE a[bc]d abd (0,3)
|
||||
BE a[b-d]e ace (0,3)
|
||||
BE a[b-d] aac (1,3)
|
||||
BE a[-b] a- (0,2)
|
||||
BE a[b-] a- (0,2)
|
||||
BE a] a] (0,2)
|
||||
BE a[]]b a]b (0,3)
|
||||
BE a[^bc]d aed (0,3)
|
||||
BE a[^-b]c adc (0,3)
|
||||
BE a[^]b]c adc (0,3)
|
||||
E ab|cd abc (0,2)
|
||||
E ab|cd abcd (0,2)
|
||||
E a\(b a(b (0,3)
|
||||
E a\(*b ab (0,2)
|
||||
E a\(*b a((b (0,4)
|
||||
E ((a)) abc (0,1)(0,1)(0,1)
|
||||
E (a)b(c) abc (0,3)(0,1)(2,3)
|
||||
E a+b+c aabbabc (4,7)
|
||||
E a* aaa (0,3)
|
||||
#E (a*)* - (0,0)(0,0)
|
||||
E (a*)* - (0,0)(?,?) RE2/Go
|
||||
E (a*)+ - (0,0)(0,0)
|
||||
#E (a*|b)* - (0,0)(0,0)
|
||||
E (a*|b)* - (0,0)(?,?) RE2/Go
|
||||
E (a+|b)* ab (0,2)(1,2)
|
||||
E (a+|b)+ ab (0,2)(1,2)
|
||||
E (a+|b)? ab (0,1)(0,1)
|
||||
BE [^ab]* cde (0,3)
|
||||
#E (^)* - (0,0)(0,0)
|
||||
E (^)* - (0,0)(?,?) RE2/Go
|
||||
BE a* NULL (0,0)
|
||||
E ([abc])*d abbbcd (0,6)(4,5)
|
||||
E ([abc])*bcd abcd (0,4)(0,1)
|
||||
E a|b|c|d|e e (0,1)
|
||||
E (a|b|c|d|e)f ef (0,2)(0,1)
|
||||
#E ((a*|b))* - (0,0)(0,0)(0,0)
|
||||
E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
|
||||
BE abcd*efg abcdefg (0,7)
|
||||
BE ab* xabyabbbz (1,3)
|
||||
BE ab* xayabbbz (1,2)
|
||||
E (ab|cd)e abcde (2,5)(2,4)
|
||||
BE [abhgefdc]ij hij (0,3)
|
||||
E (a|b)c*d abcd (1,4)(1,2)
|
||||
E (ab|ab*)bc abc (0,3)(0,1)
|
||||
E a([bc]*)c* abc (0,3)(1,3)
|
||||
E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
|
||||
E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
|
||||
E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
|
||||
E a[bcd]*dcdcde adcdcde (0,7)
|
||||
E (ab|a)b*c abc (0,3)(0,2)
|
||||
E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
|
||||
BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
|
||||
E ^a(bc+|b[eh])g|.h$ abh (1,3)
|
||||
E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
|
||||
E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
|
||||
E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
|
||||
E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
|
||||
BE multiple words multiple words yeah (0,14)
|
||||
E (.*)c(.*) abcde (0,5)(0,2)(3,5)
|
||||
BE abcd abcd (0,4)
|
||||
E a(bc)d abcd (0,4)(1,3)
|
||||
E a[-]?c ac (0,3)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
|
||||
E a+(b|c)*d+ aabcdd (0,6)(3,4)
|
||||
E ^.+$ vivi (0,4)
|
||||
E ^(.+)$ vivi (0,4)(0,4)
|
||||
E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
|
||||
E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
|
||||
E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
|
||||
E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
|
||||
E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
|
||||
E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
|
||||
E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
|
||||
E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
|
||||
E ((foo)|bar)!bas bar!bas (0,7)(0,3)
|
||||
E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
|
||||
E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
|
||||
E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
|
||||
E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
|
||||
E (foo|(bar))!bas foo!bas (0,7)(0,3)
|
||||
E (foo|bar)!bas bar!bas (0,7)(0,3)
|
||||
E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
|
||||
E (foo|bar)!bas foo!bas (0,7)(0,3)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
|
||||
E .*(/XXX).* /XXX (0,4)(0,4)
|
||||
E .*(\\XXX).* \XXX (0,4)(0,4)
|
||||
E \\XXX \XXX (0,4)
|
||||
E .*(/000).* /000 (0,4)(0,4)
|
||||
E .*(\\000).* \000 (0,4)(0,4)
|
||||
E \\000 \000 (0,4)
third-party/vendor/regex-automata-0.1.10/data/fowler-tests/nullsubexpr.dat (vendored, new file, 79 lines)
@@ -0,0 +1,79 @@
NOTE null subexpression matches : 2002-06-06
|
||||
|
||||
E (a*)* a (0,1)(0,1)
|
||||
#E SAME x (0,0)(0,0)
|
||||
E SAME x (0,0)(?,?) RE2/Go
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E (a*)+ a (0,1)(0,1)
|
||||
E SAME x (0,0)(0,0)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E (a+)* a (0,1)(0,1)
|
||||
E SAME x (0,0)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E (a+)+ a (0,1)(0,1)
|
||||
E SAME x NOMATCH
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
|
||||
E ([a]*)* a (0,1)(0,1)
|
||||
#E SAME x (0,0)(0,0)
|
||||
E SAME x (0,0)(?,?) RE2/Go
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E ([a]*)+ a (0,1)(0,1)
|
||||
E SAME x (0,0)(0,0)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E ([^b]*)* a (0,1)(0,1)
|
||||
#E SAME b (0,0)(0,0)
|
||||
E SAME b (0,0)(?,?) RE2/Go
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaab (0,6)(0,6)
|
||||
E ([ab]*)* a (0,1)(0,1)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME ababab (0,6)(0,6)
|
||||
E SAME bababa (0,6)(0,6)
|
||||
E SAME b (0,1)(0,1)
|
||||
E SAME bbbbbb (0,6)(0,6)
|
||||
E SAME aaaabcde (0,5)(0,5)
|
||||
E ([^a]*)* b (0,1)(0,1)
|
||||
E SAME bbbbbb (0,6)(0,6)
|
||||
#E SAME aaaaaa (0,0)(0,0)
|
||||
E SAME aaaaaa (0,0)(?,?) RE2/Go
|
||||
E ([^ab]*)* ccccxx (0,6)(0,6)
|
||||
#E SAME ababab (0,0)(0,0)
|
||||
E SAME ababab (0,0)(?,?) RE2/Go
|
||||
|
||||
E ((z)+|a)* zabcde (0,2)(1,2)
|
||||
|
||||
#{E a+? aaaaaa (0,1) no *? +? mimimal match ops
|
||||
#E (a) aaa (0,1)(0,1)
|
||||
#E (a*?) aaa (0,0)(0,0)
|
||||
#E (a)*? aaa (0,0)
|
||||
#E (a*?)*? aaa (0,0)
|
||||
#}
|
||||
|
||||
B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
|
||||
B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
|
||||
B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
|
||||
B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
|
||||
B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
|
||||
B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
|
||||
B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
|
||||
B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
|
||||
|
||||
#E (a*)*(x) x (0,1)(0,0)(0,1)
|
||||
E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
|
||||
E (a*)*(x) ax (0,2)(0,1)(1,2)
|
||||
E (a*)*(x) axa (0,2)(0,1)(1,2)
|
||||
|
||||
E (a*)+(x) x (0,1)(0,0)(0,1)
|
||||
E (a*)+(x) ax (0,2)(0,1)(1,2)
|
||||
E (a*)+(x) axa (0,2)(0,1)(1,2)
|
||||
|
||||
E (a*){2}(x) x (0,1)(0,0)(0,1)
|
||||
E (a*){2}(x) ax (0,2)(1,1)(1,2)
|
||||
E (a*){2}(x) axa (0,2)(1,1)(1,2)
third-party/vendor/regex-automata-0.1.10/data/fowler-tests/repetition.dat (vendored, new file, 163 lines)
@@ -0,0 +1,163 @@
NOTE implicit vs. explicit repetitions : 2009-02-02
|
||||
|
||||
# Glenn Fowler <gsf@research.att.com>
|
||||
# conforming matches (column 4) must match one of the following BREs
|
||||
# NOMATCH
|
||||
# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
|
||||
# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
|
||||
# i.e., each 3-tuple has two identical elements and one (?,?)
|
||||
|
||||
E ((..)|(.)) NULL NOMATCH
|
||||
E ((..)|(.))((..)|(.)) NULL NOMATCH
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
|
||||
|
||||
E ((..)|(.)){1} NULL NOMATCH
|
||||
E ((..)|(.)){2} NULL NOMATCH
|
||||
E ((..)|(.)){3} NULL NOMATCH
|
||||
|
||||
E ((..)|(.))* NULL (0,0)
|
||||
|
||||
E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
|
||||
E ((..)|(.))((..)|(.)) a NOMATCH
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
|
||||
|
||||
E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
|
||||
E ((..)|(.)){2} a NOMATCH
|
||||
E ((..)|(.)){3} a NOMATCH
|
||||
|
||||
E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
|
||||
|
||||
E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
|
||||
|
||||
E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
|
||||
E ((..)|(.)){3} aa NOMATCH
|
||||
|
||||
E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
|
||||
|
||||
E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
|
||||
|
||||
E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
|
||||
#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
|
||||
E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
|
||||
E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
|
||||
|
||||
#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
|
||||
E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
|
||||
|
||||
E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
|
||||
|
||||
E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
|
||||
#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
|
||||
E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go
|
||||
|
||||
E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
|
||||
|
||||
E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
|
||||
|
||||
E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
|
||||
#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
|
||||
E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
|
||||
|
||||
#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
|
||||
E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
|
||||
|
||||
E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
|
||||
|
||||
E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
|
||||
|
||||
E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)
|
||||
|
||||
NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
|
||||
|
||||
# These test a bug in OS X / FreeBSD / NetBSD, and libtree.
|
||||
# Linux/GLIBC gets the {8,} and {8,8} wrong.
|
||||
|
||||
:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8)
|
||||
#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8)
|
||||
|
||||
# These test a fixed bug in my regex-tdfa that did not keep the expanded
|
||||
# form properly grouped, so right association did the wrong thing with
|
||||
# these ambiguous patterns (crafted just to test my code when I became
|
||||
# suspicious of my implementation). The first subexpression should use
|
||||
# "ab" then "a" then "bcd".
|
||||
|
||||
# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
|
||||
# results like (0,6)(4,5)(6,6).
|
||||
|
||||
:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1)
|
||||
:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1)
|
||||
:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH
|
||||
:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1)
|
||||
:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1)
|
||||
:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH
|
||||
:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1)
|
||||
:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1)
|
||||
|
||||
# The above worked on Linux/GLIBC but the following often fail.
|
||||
# They also trip up OS X / FreeBSD / NetBSD:
|
||||
|
||||
#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH
|
||||
#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH
|
||||
#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
third-party/vendor/regex-automata-0.1.10/data/tests/crazy.toml (vendored, new file, 177 lines)
@@ -0,0 +1,177 @@
[[tests]]
|
||||
name = "crazy-misc1"
|
||||
pattern = '[-+]?[0-9]*\.?[0-9]+'
|
||||
input = "0.1"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-misc2"
|
||||
pattern = '[-+]?[0-9]*\.?[0-9]+'
|
||||
input = "0.1.2"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-misc3"
|
||||
pattern = '[-+]?[0-9]*\.?[0-9]+'
|
||||
input = "a1.2"
|
||||
matches = [[1, 4]]
|
||||
|
||||
[[tests]]
|
||||
options = ["case-insensitive"]
|
||||
name = "crazy-misc4"
|
||||
pattern = '[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
|
||||
input = "mine is jam.slam@gmail.com "
|
||||
matches = [[8, 26]]
|
||||
|
||||
[[tests]]
|
||||
options = ["case-insensitive"]
|
||||
name = "crazy-misc5"
|
||||
pattern = '[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
|
||||
input = "mine is jam.slam@gmail "
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-misc6"
|
||||
pattern = '''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?'''
|
||||
input = "mine is jam.slam@gmail.com "
|
||||
matches = [[8, 26]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-misc7"
|
||||
pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
|
||||
input = "1900-01-01"
|
||||
matches = [[0, 10]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-misc8"
|
||||
pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
|
||||
input = "1900-00-01"
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-misc9"
|
||||
pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
|
||||
input = "1900-13-01"
|
||||
matches = []
|
||||
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-negclass1"
|
||||
pattern = "[^ac]"
|
||||
input = "acx"
|
||||
matches = [[2, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-negclass2"
|
||||
pattern = "[^a,]"
|
||||
input = "a,x"
|
||||
matches = [[2, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-negclass3"
|
||||
pattern = '[^a\s]'
|
||||
input = "a x"
|
||||
matches = [[2, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-negclass4"
|
||||
pattern = "[^,]"
|
||||
input = ",,x"
|
||||
matches = [[2, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-negclass5"
|
||||
pattern = '[^\s]'
|
||||
input = " a"
|
||||
matches = [[1, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-negclass6"
|
||||
pattern = '[^,\s]'
|
||||
input = ", a"
|
||||
matches = [[2, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-negclass7"
|
||||
pattern = '[^\s,]'
|
||||
input = " ,a"
|
||||
matches = [[2, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-negclass8"
|
||||
pattern = "[^[:alpha:]Z]"
|
||||
input = "A1"
|
||||
matches = [[1, 2]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-empty-repeat1"
|
||||
pattern = "((.*)*?)="
|
||||
input = "a=b"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-empty-repeat2"
|
||||
pattern = "((.?)*?)="
|
||||
input = "a=b"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-empty-repeat3"
|
||||
pattern = "((.*)+?)="
|
||||
input = "a=b"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-empty-repeat4"
|
||||
pattern = "((.?)+?)="
|
||||
input = "a=b"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-empty-repeat5"
|
||||
pattern = "((.*){1,}?)="
|
||||
input = "a=b"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-empty-repeat6"
|
||||
pattern = "((.*){1,2}?)="
|
||||
input = "a=b"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-empty-repeat7"
|
||||
pattern = "((.*)*)="
|
||||
input = "a=b"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-empty-repeat8"
|
||||
pattern = "((.?)*)="
|
||||
input = "a=b"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-empty-repeat9"
|
||||
pattern = "((.*)+)="
|
||||
input = "a=b"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-empty-repeat10"
|
||||
pattern = "((.?)+)="
|
||||
input = "a=b"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-empty-repeat11"
|
||||
pattern = "((.*){1,})="
|
||||
input = "a=b"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "crazy-empty-repeat12"
|
||||
pattern = "((.*){1,2})="
|
||||
input = "a=b"
|
||||
matches = [[0, 2]]
third-party/vendor/regex-automata-0.1.10/data/tests/flags.toml (vendored, new file, 59 lines)
@@ -0,0 +1,59 @@
[[tests]]
|
||||
name = "flags1"
|
||||
pattern = "(?i)abc"
|
||||
input = "ABC"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "flags2"
|
||||
pattern = "(?i)a(?-i)bc"
|
||||
input = "Abc"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "flags3"
|
||||
pattern = "(?i)a(?-i)bc"
|
||||
input = "ABC"
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "flags4"
|
||||
pattern = "(?is)a."
|
||||
input = "A\n"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "flags5"
|
||||
pattern = "(?is)a.(?-is)a."
|
||||
input = "A\nab"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "flags6"
|
||||
pattern = "(?is)a.(?-is)a."
|
||||
input = "A\na\n"
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "flags7"
|
||||
pattern = "(?is)a.(?-is:a.)?"
|
||||
input = "A\na\n"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "flags8"
|
||||
pattern = "(?U)a+"
|
||||
input = "aa"
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "flags9"
|
||||
pattern = "(?U)a+?"
|
||||
input = "aa"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "flags10"
|
||||
pattern = "(?U)(?-U)a+"
|
||||
input = "aa"
|
||||
matches = [[0, 2]]
third-party/vendor/regex-automata-0.1.10/data/tests/fowler/LICENSE (vendored, new file, 19 lines)
@@ -0,0 +1,19 @@
The following license covers testregex.c and all associated test data.

Permission is hereby granted, free of charge, to any person obtaining a
copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of the
Software, and to permit persons to whom the Software is furnished to do
so, subject to the following disclaimer:

THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23
third-party/vendor/regex-automata-0.1.10/data/tests/fowler/README
vendored
Normal file
23
third-party/vendor/regex-automata-0.1.10/data/tests/fowler/README
vendored
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
Test data was taken from the Go distribution, which was in turn taken from the
|
||||
testregex test suite:
|
||||
|
||||
http://www2.research.att.com/~astopen/testregex/testregex.html
|
||||
|
||||
Unfortunately, the above link is now dead, but the test data lives on.
|
||||
|
||||
The LICENSE in this directory corresponds to the LICENSE that the data was
|
||||
originally released under.
|
||||
|
||||
The tests themselves were modified for RE2/Go. A couple were modified further
|
||||
by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them.
|
||||
(Yes, it seems like RE2/Go includes failing test cases.) This may or may not
|
||||
have been a bad idea, but I think being consistent with an established Regex
|
||||
library is worth something.
|
||||
|
||||
After some number of years, these tests were transformed into a TOML format
|
||||
using the fowler-to-toml script in this directory, e.g.,
|
||||
|
||||
./fowler-to-toml basic.dat > basic.toml
|
||||
|
||||
which brings them into a sensible structured format in which other tests can
|
||||
be written.
|
||||
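The generated files below are ordinary TOML, so they are easy to inspect outside the crate's own test harness. A minimal sketch of reading one of them, assuming Python 3.11+ (for the standard-library tomllib) and that basic.toml sits in the current directory:

import tomllib

with open("basic.toml", "rb") as f:
    data = tomllib.load(f)

# Every [[tests]] entry carries a name, a pattern, an input string, the
# expected match spans, and (for these generated files) an options list.
for test in data["tests"][:3]:
    print(test["name"], test["pattern"], repr(test["input"]), test["matches"])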
221
third-party/vendor/regex-automata-0.1.10/data/tests/fowler/basic.dat
vendored
Normal file
|
|
@ -0,0 +1,221 @@
|
|||
NOTE all standard compliant implementations should pass these : 2002-05-31
|
||||
|
||||
BE abracadabra$ abracadabracadabra (7,18)
|
||||
BE a...b abababbb (2,7)
|
||||
BE XXXXXX ..XXXXXX (2,8)
|
||||
E \) () (1,2)
|
||||
BE a] a]a (0,2)
|
||||
B } } (0,1)
|
||||
E \} } (0,1)
|
||||
BE \] ] (0,1)
|
||||
B ] ] (0,1)
|
||||
E ] ] (0,1)
|
||||
B { { (0,1)
|
||||
B } } (0,1)
|
||||
BE ^a ax (0,1)
|
||||
BE \^a a^a (1,3)
|
||||
BE a\^ a^ (0,2)
|
||||
BE a$ aa (1,2)
|
||||
BE a\$ a$ (0,2)
|
||||
BE ^$ NULL (0,0)
|
||||
E $^ NULL (0,0)
|
||||
E a($) aa (1,2)(2,2)
|
||||
E a*(^a) aa (0,1)(0,1)
|
||||
E (..)*(...)* a (0,0)
|
||||
E (..)*(...)* abcd (0,4)(2,4)
|
||||
E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
|
||||
E (ab)c|abc abc (0,3)(0,2)
|
||||
E a{0}b ab (1,2)
|
||||
E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
|
||||
E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
|
||||
E a{9876543210} NULL BADBR
|
||||
E ((a|a)|a) a (0,1)(0,1)(0,1)
|
||||
E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
|
||||
E a*(a.|aa) aaaa (0,4)(2,4)
|
||||
E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
|
||||
E (a|b)?.* b (0,1)(0,1)
|
||||
E (a|b)c|a(b|c) ac (0,2)(0,1)
|
||||
E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
|
||||
E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
|
||||
E (a|b)*c|(a|ab)*c xc (1,2)
|
||||
E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
|
||||
E a?(ab|ba)ab abab (0,4)(0,2)
|
||||
E a?(ac{0}b|ba)ab abab (0,4)(0,2)
|
||||
E ab|abab abbabab (0,2)
|
||||
E aba|bab|bba baaabbbaba (5,8)
|
||||
E aba|bab baaabbbaba (6,9)
|
||||
E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
|
||||
E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
|
||||
E ab|a xabc (1,3)
|
||||
E ab|a xxabc (2,4)
|
||||
Ei (Ab|cD)* aBcD (0,4)(2,4)
|
||||
BE [^-] --a (2,3)
|
||||
BE [a-]* --a (0,3)
|
||||
BE [a-m-]* --amoma-- (0,4)
|
||||
E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
|
||||
E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
|
||||
{E [[:upper:]] A (0,1) [[<element>]] not supported
|
||||
E [[:lower:]]+ `az{ (1,3)
|
||||
E [[:upper:]]+ @AZ[ (1,3)
|
||||
# No collation in Go
|
||||
#BE [[-]] [[-]] (2,4)
|
||||
#BE [[.NIL.]] NULL ECOLLATE
|
||||
#BE [[=aleph=]] NULL ECOLLATE
|
||||
}
|
||||
BE$ \n \n (0,1)
|
||||
BEn$ \n \n (0,1)
|
||||
BE$ [^a] \n (0,1)
|
||||
BE$ \na \na (0,2)
|
||||
E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
|
||||
BE xxx xxx (0,3)
|
||||
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
|
||||
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
|
||||
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
|
||||
E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
|
||||
E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
|
||||
E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
|
||||
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
|
||||
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
|
||||
E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
|
||||
BE$ .* \x01\x7f (0,2)
|
||||
E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
|
||||
L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
|
||||
E a*a*a*a*a*b aaaaaaaaab (0,10)
|
||||
BE ^ NULL (0,0)
|
||||
BE $ NULL (0,0)
|
||||
BE ^$ NULL (0,0)
|
||||
BE ^a$ a (0,1)
|
||||
BE abc abc (0,3)
|
||||
BE abc xabcy (1,4)
|
||||
BE abc ababc (2,5)
|
||||
BE ab*c abc (0,3)
|
||||
BE ab*bc abc (0,3)
|
||||
BE ab*bc abbc (0,4)
|
||||
BE ab*bc abbbbc (0,6)
|
||||
E ab+bc abbc (0,4)
|
||||
E ab+bc abbbbc (0,6)
|
||||
E ab?bc abbc (0,4)
|
||||
E ab?bc abc (0,3)
|
||||
E ab?c abc (0,3)
|
||||
BE ^abc$ abc (0,3)
|
||||
BE ^abc abcc (0,3)
|
||||
BE abc$ aabc (1,4)
|
||||
BE ^ abc (0,0)
|
||||
BE $ abc (3,3)
|
||||
BE a.c abc (0,3)
|
||||
BE a.c axc (0,3)
|
||||
BE a.*c axyzc (0,5)
|
||||
BE a[bc]d abd (0,3)
|
||||
BE a[b-d]e ace (0,3)
|
||||
BE a[b-d] aac (1,3)
|
||||
BE a[-b] a- (0,2)
|
||||
BE a[b-] a- (0,2)
|
||||
BE a] a] (0,2)
|
||||
BE a[]]b a]b (0,3)
|
||||
BE a[^bc]d aed (0,3)
|
||||
BE a[^-b]c adc (0,3)
|
||||
BE a[^]b]c adc (0,3)
|
||||
E ab|cd abc (0,2)
|
||||
E ab|cd abcd (0,2)
|
||||
E a\(b a(b (0,3)
|
||||
E a\(*b ab (0,2)
|
||||
E a\(*b a((b (0,4)
|
||||
E ((a)) abc (0,1)(0,1)(0,1)
|
||||
E (a)b(c) abc (0,3)(0,1)(2,3)
|
||||
E a+b+c aabbabc (4,7)
|
||||
E a* aaa (0,3)
|
||||
#E (a*)* - (0,0)(0,0)
|
||||
E (a*)* - (0,0)(?,?) RE2/Go
|
||||
E (a*)+ - (0,0)(0,0)
|
||||
#E (a*|b)* - (0,0)(0,0)
|
||||
E (a*|b)* - (0,0)(?,?) RE2/Go
|
||||
E (a+|b)* ab (0,2)(1,2)
|
||||
E (a+|b)+ ab (0,2)(1,2)
|
||||
E (a+|b)? ab (0,1)(0,1)
|
||||
BE [^ab]* cde (0,3)
|
||||
#E (^)* - (0,0)(0,0)
|
||||
E (^)* - (0,0)(?,?) RE2/Go
|
||||
BE a* NULL (0,0)
|
||||
E ([abc])*d abbbcd (0,6)(4,5)
|
||||
E ([abc])*bcd abcd (0,4)(0,1)
|
||||
E a|b|c|d|e e (0,1)
|
||||
E (a|b|c|d|e)f ef (0,2)(0,1)
|
||||
#E ((a*|b))* - (0,0)(0,0)(0,0)
|
||||
E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
|
||||
BE abcd*efg abcdefg (0,7)
|
||||
BE ab* xabyabbbz (1,3)
|
||||
BE ab* xayabbbz (1,2)
|
||||
E (ab|cd)e abcde (2,5)(2,4)
|
||||
BE [abhgefdc]ij hij (0,3)
|
||||
E (a|b)c*d abcd (1,4)(1,2)
|
||||
E (ab|ab*)bc abc (0,3)(0,1)
|
||||
E a([bc]*)c* abc (0,3)(1,3)
|
||||
E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
|
||||
E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
|
||||
E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
|
||||
E a[bcd]*dcdcde adcdcde (0,7)
|
||||
E (ab|a)b*c abc (0,3)(0,2)
|
||||
E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
|
||||
BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
|
||||
E ^a(bc+|b[eh])g|.h$ abh (1,3)
|
||||
E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
|
||||
E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
|
||||
E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
|
||||
E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
|
||||
BE multiple words multiple words yeah (0,14)
|
||||
E (.*)c(.*) abcde (0,5)(0,2)(3,5)
|
||||
BE abcd abcd (0,4)
|
||||
E a(bc)d abcd (0,4)(1,3)
|
||||
E a[-]?c ac (0,3)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
|
||||
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
|
||||
E a+(b|c)*d+ aabcdd (0,6)(3,4)
|
||||
E ^.+$ vivi (0,4)
|
||||
E ^(.+)$ vivi (0,4)(0,4)
|
||||
E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
|
||||
E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
|
||||
E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
|
||||
E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
|
||||
E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
|
||||
E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
|
||||
E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
|
||||
E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
|
||||
E ((foo)|bar)!bas bar!bas (0,7)(0,3)
|
||||
E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
|
||||
E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
|
||||
E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
|
||||
E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
|
||||
E (foo|(bar))!bas foo!bas (0,7)(0,3)
|
||||
E (foo|bar)!bas bar!bas (0,7)(0,3)
|
||||
E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
|
||||
E (foo|bar)!bas foo!bas (0,7)(0,3)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
|
||||
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
|
||||
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
|
||||
E .*(/XXX).* /XXX (0,4)(0,4)
|
||||
E .*(\\XXX).* \XXX (0,4)(0,4)
|
||||
E \\XXX \XXX (0,4)
|
||||
E .*(/000).* /000 (0,4)(0,4)
|
||||
E .*(\\000).* \000 (0,4)(0,4)
|
||||
E \\000 \000 (0,4)
|
||||
1428
third-party/vendor/regex-automata-0.1.10/data/tests/fowler/basic.toml
vendored
Normal file
File diff suppressed because it is too large
76
third-party/vendor/regex-automata-0.1.10/data/tests/fowler/fowler-to-toml
vendored
Executable file
|
|
@ -0,0 +1,76 @@
#!/usr/bin/env python

from __future__ import absolute_import, division, print_function
import argparse
import os.path as path


def read_tests(f):
    basename, _ = path.splitext(path.basename(f))
    tests = []
    prev_pattern = None

    for lineno, line in enumerate(open(f), 1):
        fields = list(filter(None, map(str.strip, line.split('\t'))))
        if not (4 <= len(fields) <= 5) \
                or 'E' not in fields[0] or fields[0][0] == '#':
            continue

        terse_opts, pat, text, sgroups = fields[0:4]
        groups = []  # groups as integer ranges
        if sgroups == 'NOMATCH':
            groups = []
        elif ',' in sgroups:
            noparen = map(lambda s: s.strip('()'), sgroups.split(')('))
            for g in noparen:
                s, e = map(str.strip, g.split(','))
                groups.append([int(s), int(e)])
                break
        else:
            # This skips tests that should result in an error.
            # There aren't many, so I think we can just capture those
            # manually. Possibly fix this in future.
            continue

        opts = []
        if text == "NULL":
            text = ""
        if pat == 'SAME':
            pat = prev_pattern
        if '$' in terse_opts:
            pat = pat.encode('utf-8').decode('unicode_escape')
            text = text.encode('utf-8').decode('unicode_escape')
            text = text.encode('unicode_escape').decode('utf-8')
            opts.append('escaped')
        else:
            opts.append('escaped')
            text = text.encode('unicode_escape').decode('utf-8')
        if 'i' in terse_opts:
            opts.append('case-insensitive')

        pat = pat.encode('unicode_escape').decode('utf-8')
        pat = pat.replace('\\\\', '\\')
        tests.append({
            'name': '"%s%d"' % (basename, lineno),
            'options': repr(opts),
            'pattern': "'''%s'''" % pat,
            'input': "'''%s'''" % text,
            'matches': str(groups),
        })
        prev_pattern = pat
    return tests


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Generate match tests from an AT&T POSIX test file.')
    aa = parser.add_argument
    aa('datfile', help='A dat AT&T POSIX test file.')
    args = parser.parse_args()

    tests = read_tests(args.datfile)
    for t in tests:
        print('[[tests]]')
        for k, v in t.items():
            print('%s = %s' % (k, v))
        print('')
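A minimal usage sketch for the script above (the file name tiny.dat and its single test line are invented for illustration; assumes the script is executable and run from this directory). Fields in a .dat line are tab separated, and the script keeps only the overall match span, which is why the generated matches arrays hold a single range:

import subprocess

# One Fowler-style test line: terse options, pattern, input, expected spans.
with open('tiny.dat', 'w') as f:
    f.write('E\tab|cd\tabc\t(0,2)\n')

out = subprocess.run(['./fowler-to-toml', 'tiny.dat'],
                     capture_output=True, text=True, check=True).stdout
print(out)
# Expected output, given the logic of read_tests above:
# [[tests]]
# name = "tiny1"
# options = ['escaped']
# pattern = '''ab|cd'''
# input = '''abc'''
# matches = [[0, 2]]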
79
third-party/vendor/regex-automata-0.1.10/data/tests/fowler/nullsubexpr.dat
vendored
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
NOTE null subexpression matches : 2002-06-06
|
||||
|
||||
E (a*)* a (0,1)(0,1)
|
||||
#E SAME x (0,0)(0,0)
|
||||
E SAME x (0,0)(?,?) RE2/Go
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E (a*)+ a (0,1)(0,1)
|
||||
E SAME x (0,0)(0,0)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E (a+)* a (0,1)(0,1)
|
||||
E SAME x (0,0)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E (a+)+ a (0,1)(0,1)
|
||||
E SAME x NOMATCH
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
|
||||
E ([a]*)* a (0,1)(0,1)
|
||||
#E SAME x (0,0)(0,0)
|
||||
E SAME x (0,0)(?,?) RE2/Go
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E ([a]*)+ a (0,1)(0,1)
|
||||
E SAME x (0,0)(0,0)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaax (0,6)(0,6)
|
||||
E ([^b]*)* a (0,1)(0,1)
|
||||
#E SAME b (0,0)(0,0)
|
||||
E SAME b (0,0)(?,?) RE2/Go
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME aaaaaab (0,6)(0,6)
|
||||
E ([ab]*)* a (0,1)(0,1)
|
||||
E SAME aaaaaa (0,6)(0,6)
|
||||
E SAME ababab (0,6)(0,6)
|
||||
E SAME bababa (0,6)(0,6)
|
||||
E SAME b (0,1)(0,1)
|
||||
E SAME bbbbbb (0,6)(0,6)
|
||||
E SAME aaaabcde (0,5)(0,5)
|
||||
E ([^a]*)* b (0,1)(0,1)
|
||||
E SAME bbbbbb (0,6)(0,6)
|
||||
#E SAME aaaaaa (0,0)(0,0)
|
||||
E SAME aaaaaa (0,0)(?,?) RE2/Go
|
||||
E ([^ab]*)* ccccxx (0,6)(0,6)
|
||||
#E SAME ababab (0,0)(0,0)
|
||||
E SAME ababab (0,0)(?,?) RE2/Go
|
||||
|
||||
E ((z)+|a)* zabcde (0,2)(1,2)
|
||||
|
||||
#{E a+? aaaaaa (0,1) no *? +? mimimal match ops
|
||||
#E (a) aaa (0,1)(0,1)
|
||||
#E (a*?) aaa (0,0)(0,0)
|
||||
#E (a)*? aaa (0,0)
|
||||
#E (a*?)*? aaa (0,0)
|
||||
#}
|
||||
|
||||
B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
|
||||
B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
|
||||
B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
|
||||
B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
|
||||
B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
|
||||
B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
|
||||
B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
|
||||
B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
|
||||
|
||||
#E (a*)*(x) x (0,1)(0,0)(0,1)
|
||||
E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
|
||||
E (a*)*(x) ax (0,2)(0,1)(1,2)
|
||||
E (a*)*(x) axa (0,2)(0,1)(1,2)
|
||||
|
||||
E (a*)+(x) x (0,1)(0,0)(0,1)
|
||||
E (a*)+(x) ax (0,2)(0,1)(1,2)
|
||||
E (a*)+(x) axa (0,2)(0,1)(1,2)
|
||||
|
||||
E (a*){2}(x) x (0,1)(0,0)(0,1)
|
||||
E (a*){2}(x) ax (0,2)(1,1)(1,2)
|
||||
E (a*){2}(x) axa (0,2)(1,1)(1,2)
|
||||
350
third-party/vendor/regex-automata-0.1.10/data/tests/fowler/nullsubexpr.toml
vendored
Normal file
|
|
@ -0,0 +1,350 @@
|
|||
[[tests]]
|
||||
name = "nullsubexpr3"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*)*'''
|
||||
input = '''a'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr5"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*)*'''
|
||||
input = '''x'''
|
||||
matches = [[0, 0]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr6"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*)*'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr7"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*)*'''
|
||||
input = '''aaaaaax'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr8"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*)+'''
|
||||
input = '''a'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr9"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*)+'''
|
||||
input = '''x'''
|
||||
matches = [[0, 0]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr10"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*)+'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr11"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*)+'''
|
||||
input = '''aaaaaax'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr12"
|
||||
options = ['escaped']
|
||||
pattern = '''(a+)*'''
|
||||
input = '''a'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr13"
|
||||
options = ['escaped']
|
||||
pattern = '''(a+)*'''
|
||||
input = '''x'''
|
||||
matches = [[0, 0]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr14"
|
||||
options = ['escaped']
|
||||
pattern = '''(a+)*'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr15"
|
||||
options = ['escaped']
|
||||
pattern = '''(a+)*'''
|
||||
input = '''aaaaaax'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr16"
|
||||
options = ['escaped']
|
||||
pattern = '''(a+)+'''
|
||||
input = '''a'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr17"
|
||||
options = ['escaped']
|
||||
pattern = '''(a+)+'''
|
||||
input = '''x'''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr18"
|
||||
options = ['escaped']
|
||||
pattern = '''(a+)+'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr19"
|
||||
options = ['escaped']
|
||||
pattern = '''(a+)+'''
|
||||
input = '''aaaaaax'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr21"
|
||||
options = ['escaped']
|
||||
pattern = '''([a]*)*'''
|
||||
input = '''a'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr23"
|
||||
options = ['escaped']
|
||||
pattern = '''([a]*)*'''
|
||||
input = '''x'''
|
||||
matches = [[0, 0]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr24"
|
||||
options = ['escaped']
|
||||
pattern = '''([a]*)*'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr25"
|
||||
options = ['escaped']
|
||||
pattern = '''([a]*)*'''
|
||||
input = '''aaaaaax'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr26"
|
||||
options = ['escaped']
|
||||
pattern = '''([a]*)+'''
|
||||
input = '''a'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr27"
|
||||
options = ['escaped']
|
||||
pattern = '''([a]*)+'''
|
||||
input = '''x'''
|
||||
matches = [[0, 0]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr28"
|
||||
options = ['escaped']
|
||||
pattern = '''([a]*)+'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr29"
|
||||
options = ['escaped']
|
||||
pattern = '''([a]*)+'''
|
||||
input = '''aaaaaax'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr30"
|
||||
options = ['escaped']
|
||||
pattern = '''([^b]*)*'''
|
||||
input = '''a'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr32"
|
||||
options = ['escaped']
|
||||
pattern = '''([^b]*)*'''
|
||||
input = '''b'''
|
||||
matches = [[0, 0]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr33"
|
||||
options = ['escaped']
|
||||
pattern = '''([^b]*)*'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr34"
|
||||
options = ['escaped']
|
||||
pattern = '''([^b]*)*'''
|
||||
input = '''aaaaaab'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr35"
|
||||
options = ['escaped']
|
||||
pattern = '''([ab]*)*'''
|
||||
input = '''a'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr36"
|
||||
options = ['escaped']
|
||||
pattern = '''([ab]*)*'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr37"
|
||||
options = ['escaped']
|
||||
pattern = '''([ab]*)*'''
|
||||
input = '''ababab'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr38"
|
||||
options = ['escaped']
|
||||
pattern = '''([ab]*)*'''
|
||||
input = '''bababa'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr39"
|
||||
options = ['escaped']
|
||||
pattern = '''([ab]*)*'''
|
||||
input = '''b'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr40"
|
||||
options = ['escaped']
|
||||
pattern = '''([ab]*)*'''
|
||||
input = '''bbbbbb'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr41"
|
||||
options = ['escaped']
|
||||
pattern = '''([ab]*)*'''
|
||||
input = '''aaaabcde'''
|
||||
matches = [[0, 5]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr42"
|
||||
options = ['escaped']
|
||||
pattern = '''([^a]*)*'''
|
||||
input = '''b'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr43"
|
||||
options = ['escaped']
|
||||
pattern = '''([^a]*)*'''
|
||||
input = '''bbbbbb'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr45"
|
||||
options = ['escaped']
|
||||
pattern = '''([^a]*)*'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 0]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr46"
|
||||
options = ['escaped']
|
||||
pattern = '''([^ab]*)*'''
|
||||
input = '''ccccxx'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr48"
|
||||
options = ['escaped']
|
||||
pattern = '''([^ab]*)*'''
|
||||
input = '''ababab'''
|
||||
matches = [[0, 0]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr50"
|
||||
options = ['escaped']
|
||||
pattern = '''((z)+|a)*'''
|
||||
input = '''zabcde'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr69"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*)*(x)'''
|
||||
input = '''x'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr70"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*)*(x)'''
|
||||
input = '''ax'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr71"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*)*(x)'''
|
||||
input = '''axa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr73"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*)+(x)'''
|
||||
input = '''x'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr74"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*)+(x)'''
|
||||
input = '''ax'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr75"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*)+(x)'''
|
||||
input = '''axa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr77"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*){2}(x)'''
|
||||
input = '''x'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr78"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*){2}(x)'''
|
||||
input = '''ax'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "nullsubexpr79"
|
||||
options = ['escaped']
|
||||
pattern = '''(a*){2}(x)'''
|
||||
input = '''axa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
85
third-party/vendor/regex-automata-0.1.10/data/tests/fowler/repetition-long.dat
vendored
Normal file
|
|
@ -0,0 +1,85 @@
|
|||
NOTE implicit vs. explicit repetitions : 2009-02-02
|
||||
|
||||
# Glenn Fowler <gsf@research.att.com>
|
||||
# conforming matches (column 4) must match one of the following BREs
|
||||
# NOMATCH
|
||||
# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
|
||||
# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
|
||||
# i.e., each 3-tuple has two identical elements and one (?,?)
|
||||
|
||||
NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
|
||||
|
||||
:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8)
|
||||
:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8)
|
||||
#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8)
|
||||
:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go
|
||||
:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8)
|
||||
|
||||
# These test a fixed bug in my regex-tdfa that did not keep the expanded
|
||||
# form properly grouped, so right association did the wrong thing with
|
||||
# these ambiguous patterns (crafted just to test my code when I became
|
||||
# suspicious of my implementation). The first subexpression should use
|
||||
# "ab" then "a" then "bcd".
|
||||
|
||||
# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
|
||||
# results like (0,6)(4,5)(6,6).
|
||||
|
||||
:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1)
|
||||
:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1)
|
||||
:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH
|
||||
:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1)
|
||||
:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1)
|
||||
:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH
|
||||
:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1)
|
||||
:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1)
|
||||
|
||||
# The above worked on Linux/GLIBC but the following often fail.
|
||||
# They also trip up OS X / FreeBSD / NetBSD:
|
||||
|
||||
#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH
|
||||
#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH
|
||||
#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
|
||||
:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
|
||||
294
third-party/vendor/regex-automata-0.1.10/data/tests/fowler/repetition-long.toml
vendored
Normal file
|
|
@ -0,0 +1,294 @@
|
|||
[[tests]]
|
||||
name = "repetition-long12"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){0,}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long13"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){1,}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long14"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){2,}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long15"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){3,}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long16"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){4,}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long17"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){5,}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long18"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){6,}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long19"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){7,}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long20"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){8,}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long22"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){0,8}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long24"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){1,8}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long26"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){2,8}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long28"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){3,8}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long30"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){4,8}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long32"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){5,8}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long34"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){6,8}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long36"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){7,8}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long37"
|
||||
options = ['escaped']
|
||||
pattern = '''X(.?){8,8}Y'''
|
||||
input = '''X1234567Y'''
|
||||
matches = [[0, 9]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long48"
|
||||
options = ['escaped']
|
||||
pattern = '''(a|ab|c|bcd){0,}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long49"
|
||||
options = ['escaped']
|
||||
pattern = '''(a|ab|c|bcd){1,}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long50"
|
||||
options = ['escaped']
|
||||
pattern = '''(a|ab|c|bcd){2,}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long51"
|
||||
options = ['escaped']
|
||||
pattern = '''(a|ab|c|bcd){3,}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long52"
|
||||
options = ['escaped']
|
||||
pattern = '''(a|ab|c|bcd){4,}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long53"
|
||||
options = ['escaped']
|
||||
pattern = '''(a|ab|c|bcd){0,10}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long54"
|
||||
options = ['escaped']
|
||||
pattern = '''(a|ab|c|bcd){1,10}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long55"
|
||||
options = ['escaped']
|
||||
pattern = '''(a|ab|c|bcd){2,10}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long56"
|
||||
options = ['escaped']
|
||||
pattern = '''(a|ab|c|bcd){3,10}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long57"
|
||||
options = ['escaped']
|
||||
pattern = '''(a|ab|c|bcd){4,10}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long58"
|
||||
options = ['escaped']
|
||||
pattern = '''(a|ab|c|bcd)*(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long59"
|
||||
options = ['escaped']
|
||||
pattern = '''(a|ab|c|bcd)+(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long65"
|
||||
options = ['escaped']
|
||||
pattern = '''(ab|a|c|bcd){0,}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long67"
|
||||
options = ['escaped']
|
||||
pattern = '''(ab|a|c|bcd){1,}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long69"
|
||||
options = ['escaped']
|
||||
pattern = '''(ab|a|c|bcd){2,}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long71"
|
||||
options = ['escaped']
|
||||
pattern = '''(ab|a|c|bcd){3,}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long72"
|
||||
options = ['escaped']
|
||||
pattern = '''(ab|a|c|bcd){4,}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long74"
|
||||
options = ['escaped']
|
||||
pattern = '''(ab|a|c|bcd){0,10}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long76"
|
||||
options = ['escaped']
|
||||
pattern = '''(ab|a|c|bcd){1,10}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long78"
|
||||
options = ['escaped']
|
||||
pattern = '''(ab|a|c|bcd){2,10}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long80"
|
||||
options = ['escaped']
|
||||
pattern = '''(ab|a|c|bcd){3,10}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long81"
|
||||
options = ['escaped']
|
||||
pattern = '''(ab|a|c|bcd){4,10}(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long83"
|
||||
options = ['escaped']
|
||||
pattern = '''(ab|a|c|bcd)*(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition-long85"
|
||||
options = ['escaped']
|
||||
pattern = '''(ab|a|c|bcd)+(d*)'''
|
||||
input = '''ababcd'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
83
third-party/vendor/regex-automata-0.1.10/data/tests/fowler/repetition.dat
vendored
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
NOTE implicit vs. explicit repetitions : 2009-02-02
|
||||
|
||||
# Glenn Fowler <gsf@research.att.com>
|
||||
# conforming matches (column 4) must match one of the following BREs
|
||||
# NOMATCH
|
||||
# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
|
||||
# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
|
||||
# i.e., each 3-tuple has two identical elements and one (?,?)
|
||||
|
||||
E ((..)|(.)) NULL NOMATCH
|
||||
E ((..)|(.))((..)|(.)) NULL NOMATCH
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
|
||||
|
||||
E ((..)|(.)){1} NULL NOMATCH
|
||||
E ((..)|(.)){2} NULL NOMATCH
|
||||
E ((..)|(.)){3} NULL NOMATCH
|
||||
|
||||
E ((..)|(.))* NULL (0,0)
|
||||
|
||||
E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
|
||||
E ((..)|(.))((..)|(.)) a NOMATCH
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
|
||||
|
||||
E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
|
||||
E ((..)|(.)){2} a NOMATCH
|
||||
E ((..)|(.)){3} a NOMATCH
|
||||
|
||||
E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
|
||||
|
||||
E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
|
||||
|
||||
E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
|
||||
E ((..)|(.)){3} aa NOMATCH
|
||||
|
||||
E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
|
||||
|
||||
E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
|
||||
|
||||
E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
|
||||
#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
|
||||
E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
|
||||
E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
|
||||
|
||||
#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
|
||||
E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
|
||||
|
||||
E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
|
||||
|
||||
E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
|
||||
#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
|
||||
E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go
|
||||
|
||||
E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
|
||||
|
||||
E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
|
||||
|
||||
E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
|
||||
#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
|
||||
E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
|
||||
|
||||
#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
|
||||
E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
|
||||
|
||||
E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
|
||||
|
||||
E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
|
||||
E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
|
||||
E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
|
||||
|
||||
E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)
|
||||
343
third-party/vendor/regex-automata-0.1.10/data/tests/fowler/repetition.toml
vendored
Normal file
|
|
@ -0,0 +1,343 @@
|
|||
[[tests]]
|
||||
name = "repetition10"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))'''
|
||||
input = ''''''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition11"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))((..)|(.))'''
|
||||
input = ''''''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition12"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
|
||||
input = ''''''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition14"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){1}'''
|
||||
input = ''''''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition15"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){2}'''
|
||||
input = ''''''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition16"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){3}'''
|
||||
input = ''''''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition18"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))*'''
|
||||
input = ''''''
|
||||
matches = [[0, 0]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition20"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))'''
|
||||
input = '''a'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition21"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))((..)|(.))'''
|
||||
input = '''a'''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition22"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
|
||||
input = '''a'''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition24"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){1}'''
|
||||
input = '''a'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition25"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){2}'''
|
||||
input = '''a'''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition26"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){3}'''
|
||||
input = '''a'''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition28"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))*'''
|
||||
input = '''a'''
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition30"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))'''
|
||||
input = '''aa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition31"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))((..)|(.))'''
|
||||
input = '''aa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition32"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
|
||||
input = '''aa'''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition34"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){1}'''
|
||||
input = '''aa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition35"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){2}'''
|
||||
input = '''aa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition36"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){3}'''
|
||||
input = '''aa'''
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "repetition38"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))*'''
|
||||
input = '''aa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition40"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))'''
|
||||
input = '''aaa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition41"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))((..)|(.))'''
|
||||
input = '''aaa'''
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition42"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
|
||||
input = '''aaa'''
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition44"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){1}'''
|
||||
input = '''aaa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition46"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){2}'''
|
||||
input = '''aaa'''
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition47"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){3}'''
|
||||
input = '''aaa'''
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition50"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))*'''
|
||||
input = '''aaa'''
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition52"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))'''
|
||||
input = '''aaaa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition53"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))((..)|(.))'''
|
||||
input = '''aaaa'''
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition54"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
|
||||
input = '''aaaa'''
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition56"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){1}'''
|
||||
input = '''aaaa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition57"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){2}'''
|
||||
input = '''aaaa'''
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition59"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){3}'''
|
||||
input = '''aaaa'''
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition61"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))*'''
|
||||
input = '''aaaa'''
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition63"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))'''
|
||||
input = '''aaaaa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition64"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))((..)|(.))'''
|
||||
input = '''aaaaa'''
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition65"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
|
||||
input = '''aaaaa'''
|
||||
matches = [[0, 5]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition67"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){1}'''
|
||||
input = '''aaaaa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition68"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){2}'''
|
||||
input = '''aaaaa'''
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition70"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){3}'''
|
||||
input = '''aaaaa'''
|
||||
matches = [[0, 5]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition73"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))*'''
|
||||
input = '''aaaaa'''
|
||||
matches = [[0, 5]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition75"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition76"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))((..)|(.))'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition77"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition79"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){1}'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition80"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){2}'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition81"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.)){3}'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
[[tests]]
|
||||
name = "repetition83"
|
||||
options = ['escaped']
|
||||
pattern = '''((..)|(.))*'''
|
||||
input = '''aaaaaa'''
|
||||
matches = [[0, 6]]
|
||||
|
||||
92
third-party/vendor/regex-automata-0.1.10/data/tests/iter.toml
vendored
Normal file
|
|
@ -0,0 +1,92 @@
|
|||
[[tests]]
|
||||
name = "iter1"
|
||||
pattern = "a"
|
||||
input = "aaa"
|
||||
matches = [[0, 1], [1, 2], [2, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "iter2"
|
||||
pattern = "a"
|
||||
input = "aba"
|
||||
matches = [[0, 1], [2, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "iter-empty1"
|
||||
pattern = ''
|
||||
input = ''
|
||||
matches = [[0, 0]]
|
||||
|
||||
[[tests]]
|
||||
name = "iter-empty2"
|
||||
pattern = ''
|
||||
input = 'abc'
|
||||
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "iter-empty3"
|
||||
pattern = '()'
|
||||
input = 'abc'
|
||||
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "iter-empty4"
|
||||
pattern = '()*'
|
||||
input = 'abc'
|
||||
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "iter-empty5"
|
||||
pattern = '()+'
|
||||
input = 'abc'
|
||||
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "iter-empty6"
|
||||
pattern = '()?'
|
||||
input = 'abc'
|
||||
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "iter-empty7"
|
||||
pattern = '()()'
|
||||
input = 'abc'
|
||||
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "iter-empty8"
|
||||
pattern = '()+|z'
|
||||
input = 'abc'
|
||||
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "iter-empty9"
|
||||
pattern = 'z|()+'
|
||||
input = 'abc'
|
||||
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "iter-empty10"
|
||||
pattern = '()+|b'
|
||||
input = 'abc'
|
||||
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "iter-empty11"
|
||||
pattern = 'b|()+'
|
||||
input = 'abc'
|
||||
matches = [[0, 0], [1, 2], [3, 3]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
options = ["anchored"]
|
||||
name = "iter-anchored1"
|
||||
pattern = "a"
|
||||
input = "a"
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
options = ["anchored"]
|
||||
name = "iter-anchored2"
|
||||
pattern = "a"
|
||||
input = "aa"
|
||||
matches = [[0, 1]]
|
||||
138
third-party/vendor/regex-automata-0.1.10/data/tests/no-unicode.toml
vendored
Normal file
|
|
@ -0,0 +1,138 @@
|
|||
[[tests]]
|
||||
name = "invalid-utf8-literal1"
|
||||
options = ["escaped", "invalid-utf8", "no-unicode"]
|
||||
pattern = '\xFF'
|
||||
input = '\xFF'
|
||||
matches = [[0, 1]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode-mixed"
|
||||
options = ["escaped", "invalid-utf8"]
|
||||
pattern = '(.+)(?-u)(.+)'
|
||||
input = '\xCE\x93\xCE\x94\xFF'
|
||||
matches = [[0, 5]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode-case1"
|
||||
options = ["case-insensitive", "no-unicode"]
|
||||
pattern = "a"
|
||||
input = "A"
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode-case2"
|
||||
options = ["case-insensitive", "no-unicode"]
|
||||
pattern = "[a-z]+"
|
||||
input = "AaAaA"
|
||||
matches = [[0, 5]]
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode-case3"
|
||||
options = ["case-insensitive"]
|
||||
pattern = "[a-z]+"
|
||||
input = "aA\u212AaA"
|
||||
matches = [[0, 7]]
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode-case4"
|
||||
options = ["case-insensitive", "no-unicode"]
|
||||
pattern = "[a-z]+"
|
||||
input = "aA\u212AaA"
|
||||
matches = [[0, 2]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode-negate1"
|
||||
options = []
|
||||
pattern = "[^a]"
|
||||
input = "δ"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode-negate2"
|
||||
options = ["no-unicode", "invalid-utf8"]
|
||||
pattern = "[^a]"
|
||||
input = "δ"
|
||||
matches = [[0, 1]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode-dotstar-prefix1"
|
||||
options = ["escaped", "no-unicode", "invalid-utf8"]
|
||||
pattern = "a"
|
||||
input = '\xFFa'
|
||||
matches = [[1, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode-dotstar-prefix2"
|
||||
options = ["escaped", "invalid-utf8"]
|
||||
pattern = "a"
|
||||
input = '\xFFa'
|
||||
matches = [[1, 2]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode-null-bytes1"
|
||||
options = ["escaped", "no-unicode", "invalid-utf8"]
|
||||
pattern = '[^\x00]+\x00'
|
||||
input = 'foo\x00'
|
||||
matches = [[0, 4]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode1"
|
||||
options = ["no-unicode"]
|
||||
pattern = '\w+'
|
||||
input = "aδ"
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode2"
|
||||
options = []
|
||||
pattern = '\w+'
|
||||
input = "aδ"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode3"
|
||||
options = ["no-unicode"]
|
||||
pattern = '\d+'
|
||||
input = "1२३9"
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode4"
|
||||
pattern = '\d+'
|
||||
input = "1२३9"
|
||||
matches = [[0, 8]]
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode5"
|
||||
options = ["no-unicode"]
|
||||
pattern = '\s+'
|
||||
input = " \u1680"
|
||||
matches = [[0, 1]]
|
||||
|
||||
[[tests]]
|
||||
name = "no-unicode6"
|
||||
pattern = '\s+'
|
||||
input = " \u1680"
|
||||
matches = [[0, 4]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
# See: https://github.com/rust-lang/regex/issues/484
|
||||
name = "no-unicode-iter1"
|
||||
pattern = ''
|
||||
input = "☃"
|
||||
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
|
||||
|
||||
[[tests]]
|
||||
# See: https://github.com/rust-lang/regex/issues/484
|
||||
options = ['escaped']
|
||||
name = "no-unicode-iter2"
|
||||
pattern = ''
|
||||
input = 'b\xFFr'
|
||||
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
|
||||
489
third-party/vendor/regex-automata-0.1.10/data/tests/unicode.toml
vendored
Normal file
|
|
@ -0,0 +1,489 @@
|
|||
[[tests]]
|
||||
name = "unicode-literal1"
|
||||
pattern = '☃'
|
||||
input = "☃"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-literal2"
|
||||
pattern = '☃+'
|
||||
input = "☃"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-literal3"
|
||||
options = ["case-insensitive"]
|
||||
pattern = '☃+'
|
||||
input = "☃"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-literal4"
|
||||
options = ["case-insensitive"]
|
||||
pattern = 'Δ'
|
||||
input = "δ"
|
||||
matches = [[0, 2]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class1"
|
||||
pattern = '[☃Ⅰ]+'
|
||||
input = "☃"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class2"
|
||||
pattern = '\pN'
|
||||
input = "Ⅰ"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class3"
|
||||
pattern = '\pN+'
|
||||
input = "Ⅰ1Ⅱ2"
|
||||
matches = [[0, 8]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class4"
|
||||
pattern = '\PN+'
|
||||
input = "abⅠ"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class5"
|
||||
pattern = '[\PN]+'
|
||||
input = "abⅠ"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class6"
|
||||
pattern = '[^\PN]+'
|
||||
input = "abⅠ"
|
||||
matches = [[2, 5]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class7"
|
||||
pattern = '\p{Lu}+'
|
||||
input = "ΛΘΓΔα"
|
||||
matches = [[0, 8]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class8"
|
||||
options = ["case-insensitive"]
|
||||
pattern = '\p{Lu}+'
|
||||
input = "ΛΘΓΔα"
|
||||
matches = [[0, 10]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class9"
|
||||
pattern = '\pL+'
|
||||
input = "ΛΘΓΔα"
|
||||
matches = [[0, 10]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class10"
|
||||
pattern = '\p{Ll}+'
|
||||
input = "ΛΘΓΔα"
|
||||
matches = [[8, 10]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-perl1"
|
||||
pattern = '\w+'
|
||||
input = "dδd"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-perl2"
|
||||
pattern = '\w+'
|
||||
input = "⥡"
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-perl3"
|
||||
pattern = '\W+'
|
||||
input = "⥡"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-perl4"
|
||||
pattern = '\d+'
|
||||
input = "1२३9"
|
||||
matches = [[0, 8]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-perl5"
|
||||
pattern = '\d+'
|
||||
input = "Ⅱ"
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-perl6"
|
||||
pattern = '\D+'
|
||||
input = "Ⅱ"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-perl7"
|
||||
pattern = '\s+'
|
||||
input = " "
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-perl8"
|
||||
pattern = '\s+'
|
||||
input = "☃"
|
||||
matches = []
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-perl9"
|
||||
pattern = '\S+'
|
||||
input = "☃"
|
||||
matches = [[0, 3]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat1"
|
||||
pattern = '\p{Cased_Letter}'
|
||||
input = "A"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat2"
|
||||
pattern = '\p{Close_Punctuation}'
|
||||
input = "❯"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat3"
|
||||
pattern = '\p{Connector_Punctuation}'
|
||||
input = "⁀"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat4"
|
||||
pattern = '\p{Control}'
|
||||
input = "\u009F"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat5"
|
||||
pattern = '\p{Currency_Symbol}'
|
||||
input = "£"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat6"
|
||||
pattern = '\p{Dash_Punctuation}'
|
||||
input = "〰"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat7"
|
||||
pattern = '\p{Decimal_Number}'
|
||||
input = "𑓙"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat8"
|
||||
pattern = '\p{Enclosing_Mark}'
|
||||
input = "\uA672"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat9"
|
||||
pattern = '\p{Final_Punctuation}'
|
||||
input = "⸡"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat10"
|
||||
pattern = '\p{Format}'
|
||||
input = "\U000E007F"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat11"
|
||||
pattern = '\p{Initial_Punctuation}'
|
||||
input = "⸜"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat12"
|
||||
pattern = '\p{Letter}'
|
||||
input = "Έ"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat13"
|
||||
pattern = '\p{Letter_Number}'
|
||||
input = "ↂ"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat14"
|
||||
pattern = '\p{Line_Separator}'
|
||||
input = "\u2028"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat15"
|
||||
pattern = '\p{Lowercase_Letter}'
|
||||
input = "ϛ"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat16"
|
||||
pattern = '\p{Mark}'
|
||||
input = "\U000E01EF"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat17"
|
||||
pattern = '\p{Math}'
|
||||
input = "⋿"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat18"
|
||||
pattern = '\p{Modifier_Letter}'
|
||||
input = "𖭃"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat19"
|
||||
pattern = '\p{Modifier_Symbol}'
|
||||
input = "🏿"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat20"
|
||||
pattern = '\p{Nonspacing_Mark}'
|
||||
input = "\U0001E94A"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat21"
|
||||
pattern = '\p{Number}'
|
||||
input = "⓿"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat22"
|
||||
pattern = '\p{Open_Punctuation}'
|
||||
input = "⦅"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat23"
|
||||
pattern = '\p{Other}'
|
||||
input = "\u0BC9"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat24"
|
||||
pattern = '\p{Other_Letter}'
|
||||
input = "ꓷ"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat25"
|
||||
pattern = '\p{Other_Number}'
|
||||
input = "㉏"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat26"
|
||||
pattern = '\p{Other_Punctuation}'
|
||||
input = "𞥞"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat27"
|
||||
pattern = '\p{Other_Symbol}'
|
||||
input = "⅌"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat28"
|
||||
pattern = '\p{Paragraph_Separator}'
|
||||
input = "\u2029"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat29"
|
||||
pattern = '\p{Private_Use}'
|
||||
input = "\U0010FFFD"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat30"
|
||||
pattern = '\p{Punctuation}'
|
||||
input = "𑁍"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat31"
|
||||
pattern = '\p{Separator}'
|
||||
input = "\u3000"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat32"
|
||||
pattern = '\p{Space_Separator}'
|
||||
input = "\u205F"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat33"
|
||||
pattern = '\p{Spacing_Mark}'
|
||||
input = "\U00016F7E"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat34"
|
||||
pattern = '\p{Symbol}'
|
||||
input = "⯈"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat35"
|
||||
pattern = '\p{Titlecase_Letter}'
|
||||
input = "ῼ"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat36"
|
||||
pattern = '\p{Unassigned}'
|
||||
input = "\U0010FFFF"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gencat37"
|
||||
pattern = '\p{Uppercase_Letter}'
|
||||
input = "Ꝋ"
|
||||
matches = [[0, 3]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-emoji1"
|
||||
pattern = '\p{Emoji}'
|
||||
input = "\u23E9"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-emoji2"
|
||||
pattern = '\p{emoji}'
|
||||
input = "\U0001F21A"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-emoji3"
|
||||
pattern = '\p{extendedpictographic}'
|
||||
input = "\U0001FA6E"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-emoji4"
|
||||
pattern = '\p{extendedpictographic}'
|
||||
input = "\U0001FFFD"
|
||||
matches = [[0, 4]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gcb1"
|
||||
pattern = '\p{grapheme_cluster_break=prepend}'
|
||||
input = "\U00011D46"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gcb2"
|
||||
pattern = '\p{gcb=regional_indicator}'
|
||||
input = "\U0001F1E6"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gcb3"
|
||||
pattern = '\p{gcb=ri}'
|
||||
input = "\U0001F1E7"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gcb4"
|
||||
pattern = '\p{regionalindicator}'
|
||||
input = "\U0001F1FF"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gcb5"
|
||||
pattern = '\p{gcb=lvt}'
|
||||
input = "\uC989"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-gcb6"
|
||||
pattern = '\p{gcb=zwj}'
|
||||
input = "\u200D"
|
||||
matches = [[0, 3]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-word-break1"
|
||||
pattern = '\p{word_break=Hebrew_Letter}'
|
||||
input = "\uFB46"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-word-break2"
|
||||
pattern = '\p{wb=hebrewletter}'
|
||||
input = "\uFB46"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-word-break3"
|
||||
pattern = '\p{wb=ExtendNumLet}'
|
||||
input = "\uFF3F"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-word-break4"
|
||||
pattern = '\p{wb=WSegSpace}'
|
||||
input = "\u3000"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-word-break5"
|
||||
pattern = '\p{wb=numeric}'
|
||||
input = "\U0001E950"
|
||||
matches = [[0, 4]]
|
||||
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-sentence-break1"
|
||||
pattern = '\p{sentence_break=Lower}'
|
||||
input = "\u0469"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-sentence-break2"
|
||||
pattern = '\p{sb=lower}'
|
||||
input = "\u0469"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-sentence-break3"
|
||||
pattern = '\p{sb=Close}'
|
||||
input = "\uFF60"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-sentence-break4"
|
||||
pattern = '\p{sb=Close}'
|
||||
input = "\U0001F677"
|
||||
matches = [[0, 4]]
|
||||
|
||||
[[tests]]
|
||||
name = "unicode-class-sentence-break5"
|
||||
pattern = '\p{sb=SContinue}'
|
||||
input = "\uFF64"
|
||||
matches = [[0, 3]]
|
||||
2
third-party/vendor/regex-automata-0.1.10/rustfmt.toml
vendored
Normal file
@@ -0,0 +1,2 @@
max_width = 79
use_small_heuristics = "max"
76
third-party/vendor/regex-automata-0.1.10/src/byteorder.rs
vendored
Normal file
@@ -0,0 +1,76 @@
use core::convert::TryInto;

pub trait ByteOrder {
    fn read_u16(buf: &[u8]) -> u16;
    fn read_u32(buf: &[u8]) -> u32;
    fn read_u64(buf: &[u8]) -> u64;
    fn read_uint(buf: &[u8], nbytes: usize) -> u64;
    fn write_u16(buf: &mut [u8], n: u16);
    fn write_u32(buf: &mut [u8], n: u32);
    fn write_u64(buf: &mut [u8], n: u64);
    fn write_uint(buf: &mut [u8], n: u64, nbytes: usize);
}

pub enum BigEndian {}
pub enum LittleEndian {}
pub enum NativeEndian {}

macro_rules! impl_endian {
    ($t:ty, $from_endian:ident, $to_endian:ident) => {
        impl ByteOrder for $t {
            #[inline]
            fn read_u16(buf: &[u8]) -> u16 {
                u16::$from_endian(buf[0..2].try_into().unwrap())
            }

            #[inline]
            fn read_u32(buf: &[u8]) -> u32 {
                u32::$from_endian(buf[0..4].try_into().unwrap())
            }

            #[inline]
            fn read_u64(buf: &[u8]) -> u64 {
                u64::$from_endian(buf[0..8].try_into().unwrap())
            }

            #[inline]
            fn read_uint(buf: &[u8], nbytes: usize) -> u64 {
                let mut dst = [0u8; 8];
                dst[..nbytes].copy_from_slice(&buf[..nbytes]);
                u64::$from_endian(dst)
            }

            #[inline]
            fn write_u16(buf: &mut [u8], n: u16) {
                buf[0..2].copy_from_slice(&n.$to_endian()[..]);
            }

            #[inline]
            fn write_u32(buf: &mut [u8], n: u32) {
                buf[0..4].copy_from_slice(&n.$to_endian()[..]);
            }

            #[inline]
            fn write_u64(buf: &mut [u8], n: u64) {
                buf[0..8].copy_from_slice(&n.$to_endian()[..]);
            }

            #[inline]
            fn write_uint(buf: &mut [u8], n: u64, nbytes: usize) {
                buf[..nbytes].copy_from_slice(&n.$to_endian()[..nbytes]);
            }
        }
    };
}

impl_endian! {
    BigEndian, from_be_bytes, to_be_bytes
}

impl_endian! {
    LittleEndian, from_le_bytes, to_le_bytes
}

impl_endian! {
    NativeEndian, from_ne_bytes, to_ne_bytes
}
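// A minimal usage sketch, not part of the vendored file: the ByteOrder trait
// above provides fixed-width reads and writes against a byte buffer, generic
// over endianness.
fn roundtrip_u32<B: ByteOrder>(n: u32) -> u32 {
    let mut buf = [0u8; 4];
    B::write_u32(&mut buf, n);
    B::read_u32(&buf)
}
// For example, roundtrip_u32::<BigEndian>(0xDEADBEEF) == 0xDEADBEEF, with the
// intermediate buffer holding [0xDE, 0xAD, 0xBE, 0xEF].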
271
third-party/vendor/regex-automata-0.1.10/src/classes.rs
vendored
Normal file
@@ -0,0 +1,271 @@
use core::fmt;
|
||||
|
||||
/// A representation of byte oriented equivalence classes.
|
||||
///
|
||||
/// This is used in a DFA to reduce the size of the transition table. This can
|
||||
/// have a particularly large impact not only on the total size of a dense DFA,
|
||||
/// but also on compile times.
|
||||
#[derive(Clone, Copy)]
|
||||
pub struct ByteClasses([u8; 256]);
|
||||
|
||||
impl ByteClasses {
|
||||
/// Creates a new set of equivalence classes where all bytes are mapped to
|
||||
/// the same class.
|
||||
pub fn empty() -> ByteClasses {
|
||||
ByteClasses([0; 256])
|
||||
}
|
||||
|
||||
/// Creates a new set of equivalence classes where each byte belongs to
|
||||
/// its own equivalence class.
|
||||
pub fn singletons() -> ByteClasses {
|
||||
let mut classes = ByteClasses::empty();
|
||||
for i in 0..256 {
|
||||
classes.set(i as u8, i as u8);
|
||||
}
|
||||
classes
|
||||
}
|
||||
|
||||
/// Copies the byte classes given. The given slice must have length 0 or
|
||||
/// length 256. Slices of length 0 are treated as singletons (every byte
|
||||
/// is its own class).
|
||||
pub fn from_slice(slice: &[u8]) -> ByteClasses {
|
||||
assert!(slice.is_empty() || slice.len() == 256);
|
||||
|
||||
if slice.is_empty() {
|
||||
ByteClasses::singletons()
|
||||
} else {
|
||||
let mut classes = ByteClasses::empty();
|
||||
for (b, &class) in slice.iter().enumerate() {
|
||||
classes.set(b as u8, class);
|
||||
}
|
||||
classes
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the equivalence class for the given byte.
|
||||
#[inline]
|
||||
pub fn set(&mut self, byte: u8, class: u8) {
|
||||
self.0[byte as usize] = class;
|
||||
}
|
||||
|
||||
/// Get the equivalence class for the given byte.
|
||||
#[inline]
|
||||
pub fn get(&self, byte: u8) -> u8 {
|
||||
self.0[byte as usize]
|
||||
}
|
||||
|
||||
/// Get the equivalence class for the given byte while forcefully
|
||||
/// eliding bounds checks.
|
||||
#[inline]
|
||||
pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
|
||||
*self.0.get_unchecked(byte as usize)
|
||||
}
|
||||
|
||||
/// Return the total number of elements in the alphabet represented by
|
||||
/// these equivalence classes. Equivalently, this returns the total number
|
||||
/// of equivalence classes.
|
||||
#[inline]
|
||||
pub fn alphabet_len(&self) -> usize {
|
||||
self.0[255] as usize + 1
|
||||
}
|
||||
|
||||
/// Returns true if and only if every byte in this class maps to its own
|
||||
/// equivalence class. Equivalently, there are 256 equivalence classes
|
||||
/// and each class contains exactly one byte.
|
||||
#[inline]
|
||||
pub fn is_singleton(&self) -> bool {
|
||||
self.alphabet_len() == 256
|
||||
}
|
||||
|
||||
/// Returns an iterator over a sequence of representative bytes from each
|
||||
/// equivalence class. Namely, this yields exactly N items, where N is
|
||||
/// equivalent to the number of equivalence classes. Each item is an
|
||||
/// arbitrary byte drawn from each equivalence class.
|
||||
///
|
||||
/// This is useful when one is determinizing an NFA and the NFA's alphabet
|
||||
/// hasn't been converted to equivalence classes yet. Picking an arbitrary
|
||||
/// byte from each equivalence class then permits a full exploration of
|
||||
/// the NFA instead of using every possible byte value.
|
||||
#[cfg(feature = "std")]
|
||||
pub fn representatives(&self) -> ByteClassRepresentatives {
|
||||
ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
|
||||
}
|
||||
|
||||
/// Returns all of the bytes in the given equivalence class.
|
||||
///
|
||||
/// The second element in the tuple indicates the number of elements in
|
||||
/// the array.
|
||||
fn elements(&self, equiv: u8) -> ([u8; 256], usize) {
|
||||
let (mut array, mut len) = ([0; 256], 0);
|
||||
for b in 0..256 {
|
||||
if self.get(b as u8) == equiv {
|
||||
array[len] = b as u8;
|
||||
len += 1;
|
||||
}
|
||||
}
|
||||
(array, len)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for ByteClasses {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
if self.is_singleton() {
|
||||
write!(f, "ByteClasses({{singletons}})")
|
||||
} else {
|
||||
write!(f, "ByteClasses(")?;
|
||||
for equiv in 0..self.alphabet_len() {
|
||||
let (members, len) = self.elements(equiv as u8);
|
||||
write!(f, "{} => {:?}", equiv, &members[..len])?;
|
||||
}
|
||||
write!(f, ")")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over representative bytes from each equivalence class.
|
||||
#[cfg(feature = "std")]
|
||||
#[derive(Debug)]
|
||||
pub struct ByteClassRepresentatives<'a> {
|
||||
classes: &'a ByteClasses,
|
||||
byte: usize,
|
||||
last_class: Option<u8>,
|
||||
}
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
impl<'a> Iterator for ByteClassRepresentatives<'a> {
|
||||
type Item = u8;
|
||||
|
||||
fn next(&mut self) -> Option<u8> {
|
||||
while self.byte < 256 {
|
||||
let byte = self.byte as u8;
|
||||
let class = self.classes.get(byte);
|
||||
self.byte += 1;
|
||||
|
||||
if self.last_class != Some(class) {
|
||||
self.last_class = Some(class);
|
||||
return Some(byte);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// A byte class set keeps track of an *approximation* of equivalence classes
|
||||
/// of bytes during NFA construction. That is, every byte in an equivalence
|
||||
/// class cannot discriminate between a match and a non-match.
|
||||
///
|
||||
/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
|
||||
/// same equivalence class because it never matters whether an `a` or a `b` is
|
||||
/// seen, and no combination of `a`s and `b`s in the text can discriminate
|
||||
/// a match.
|
||||
///
|
||||
/// Note though that this does not compute the minimal set of equivalence
|
||||
/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
|
||||
/// same equivalence class for the same reason that `a` and `b` are in the
|
||||
/// same equivalence class in the aforementioned regex. However, in this
|
||||
/// implementation, `a` and `c` are put into distinct equivalence classes.
|
||||
/// The reason for this is implementation complexity. In the future, we should
|
||||
/// endeavor to compute the minimal equivalence classes since they can have a
|
||||
/// rather large impact on the size of the DFA.
|
||||
///
|
||||
/// The representation here is 256 booleans, all initially set to false. Each
|
||||
/// boolean maps to its corresponding byte based on position. A `true` value
|
||||
/// indicates the end of an equivalence class, where its corresponding byte
|
||||
/// and all of the bytes corresponding to all previous contiguous `false`
|
||||
/// values are in the same equivalence class.
|
||||
///
|
||||
/// This particular representation only permits contiguous ranges of bytes to
|
||||
/// be in the same equivalence class, which means that we can never discover
|
||||
/// the true minimal set of equivalence classes.
|
||||
#[cfg(feature = "std")]
|
||||
#[derive(Debug)]
|
||||
pub struct ByteClassSet(Vec<bool>);
|
||||
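// Illustrative sketch only (not part of the vendored file), tying the doc
// comment above to concrete numbers: for a pattern like `[a-z]+`, the bytes
// `a..=z` land in one equivalence class and the remaining bytes fall into the
// two neighbouring classes, so a dense DFA needs 3 transition-table columns
// instead of 256. It uses the methods defined in the impl block below.
#[cfg(feature = "std")]
fn byte_class_sketch() {
    let mut set = ByteClassSet::new();
    set.set_range(b'a', b'z');
    let classes = set.byte_classes();
    assert_eq!(classes.alphabet_len(), 3);
    assert_eq!(classes.get(b'a'), classes.get(b'z')); // same class
    assert_ne!(classes.get(b'a'), classes.get(b'A')); // different classes
}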
|
||||
#[cfg(feature = "std")]
|
||||
impl ByteClassSet {
|
||||
/// Create a new set of byte classes where all bytes are part of the same
|
||||
/// equivalence class.
|
||||
pub fn new() -> Self {
|
||||
ByteClassSet(vec![false; 256])
|
||||
}
|
||||
|
||||
/// Indicate the the range of byte given (inclusive) can discriminate a
|
||||
/// match between it and all other bytes outside of the range.
|
||||
pub fn set_range(&mut self, start: u8, end: u8) {
|
||||
debug_assert!(start <= end);
|
||||
if start > 0 {
|
||||
self.0[start as usize - 1] = true;
|
||||
}
|
||||
self.0[end as usize] = true;
|
||||
}
|
||||
|
||||
/// Convert this boolean set to a map that maps all byte values to their
|
||||
/// corresponding equivalence class. The last mapping indicates the largest
|
||||
/// equivalence class identifier (which is never bigger than 255).
|
||||
pub fn byte_classes(&self) -> ByteClasses {
|
||||
let mut classes = ByteClasses::empty();
|
||||
let mut class = 0u8;
|
||||
let mut i = 0;
|
||||
loop {
|
||||
classes.set(i as u8, class as u8);
|
||||
if i >= 255 {
|
||||
break;
|
||||
}
|
||||
if self.0[i] {
|
||||
class = class.checked_add(1).unwrap();
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
classes
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
#[cfg(feature = "std")]
|
||||
#[test]
|
||||
fn byte_classes() {
|
||||
use super::ByteClassSet;
|
||||
|
||||
let mut set = ByteClassSet::new();
|
||||
set.set_range(b'a', b'z');
|
||||
|
||||
let classes = set.byte_classes();
|
||||
assert_eq!(classes.get(0), 0);
|
||||
assert_eq!(classes.get(1), 0);
|
||||
assert_eq!(classes.get(2), 0);
|
||||
assert_eq!(classes.get(b'a' - 1), 0);
|
||||
assert_eq!(classes.get(b'a'), 1);
|
||||
assert_eq!(classes.get(b'm'), 1);
|
||||
assert_eq!(classes.get(b'z'), 1);
|
||||
assert_eq!(classes.get(b'z' + 1), 2);
|
||||
assert_eq!(classes.get(254), 2);
|
||||
assert_eq!(classes.get(255), 2);
|
||||
|
||||
let mut set = ByteClassSet::new();
|
||||
set.set_range(0, 2);
|
||||
set.set_range(4, 6);
|
||||
let classes = set.byte_classes();
|
||||
assert_eq!(classes.get(0), 0);
|
||||
assert_eq!(classes.get(1), 0);
|
||||
assert_eq!(classes.get(2), 0);
|
||||
assert_eq!(classes.get(3), 1);
|
||||
assert_eq!(classes.get(4), 2);
|
||||
assert_eq!(classes.get(5), 2);
|
||||
assert_eq!(classes.get(6), 2);
|
||||
assert_eq!(classes.get(7), 3);
|
||||
assert_eq!(classes.get(255), 3);
|
||||
}
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
#[test]
|
||||
fn full_byte_classes() {
|
||||
use super::ByteClassSet;
|
||||
|
||||
let mut set = ByteClassSet::new();
|
||||
for i in 0..256u16 {
|
||||
set.set_range(i as u8, i as u8);
|
||||
}
|
||||
assert_eq!(set.byte_classes().alphabet_len(), 256);
|
||||
}
|
||||
}
|
||||
104
third-party/vendor/regex-automata-0.1.10/src/codegen.rs
vendored
Normal file
@@ -0,0 +1,104 @@
// This module is unused. It was written as an experiment to get a ballpark
|
||||
// idea of what state machines look like when translated to Rust code, and
|
||||
// in particular, an idea of how much code it generates. The implementation
|
||||
// below isn't optimal with respect to size, but the result wasn't exactly
|
||||
// small. At some point, we should pursue building this out beyond
|
||||
// experimentation, and in particular, probably provide a command line tool
|
||||
// and/or a macro. It's a fair bit of work, so I abandoned it for the initial
|
||||
// release. ---AG
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::io::Write;
|
||||
|
||||
use dense::DFA;
|
||||
use state_id::StateID;
|
||||
|
||||
macro_rules! wstr {
|
||||
($($tt:tt)*) => { write!($($tt)*).unwrap() }
|
||||
}
|
||||
|
||||
macro_rules! wstrln {
|
||||
($($tt:tt)*) => { writeln!($($tt)*).unwrap() }
|
||||
}
|
||||
|
||||
pub fn is_match_forward<S: StateID>(dfa: &DFA<S>) -> String {
|
||||
let names = state_variant_names(dfa);
|
||||
|
||||
let mut buf = vec![];
|
||||
wstrln!(buf, "pub fn is_match(input: &[u8]) -> bool {{");
|
||||
if dfa.is_match_state(dfa.start()) {
|
||||
wstrln!(buf, " return true;");
|
||||
wstrln!(buf, "}}");
|
||||
return String::from_utf8(buf).unwrap();
|
||||
}
|
||||
|
||||
wstrln!(buf, "{}", state_enum_def(dfa, &names));
|
||||
|
||||
wstrln!(buf, " let mut state = {};", names[&dfa.start()]);
|
||||
wstrln!(buf, " for &b in input.iter() {{");
|
||||
wstrln!(buf, " state = match state {{");
|
||||
for (id, s) in dfa.iter() {
|
||||
if dfa.is_match_state(id) {
|
||||
continue;
|
||||
}
|
||||
|
||||
wstrln!(buf, " {} => {{", &names[&id]);
|
||||
wstrln!(buf, " match b {{");
|
||||
for (start, end, next_id) in s.sparse_transitions() {
|
||||
if dfa.is_match_state(next_id) {
|
||||
wstrln!(buf, " {:?}...{:?} => return true,", start, end);
|
||||
} else {
|
||||
if start == end {
|
||||
wstrln!(buf, " {:?} => {},", start, &names[&next_id]);
|
||||
} else {
|
||||
wstrln!(buf, " {:?}...{:?} => {},", start, end, &names[&next_id]);
|
||||
}
|
||||
}
|
||||
}
|
||||
wstrln!(buf, " _ => S::S0,");
|
||||
wstrln!(buf, " }}");
|
||||
wstrln!(buf, " }}");
|
||||
}
|
||||
wstrln!(buf, " }};");
|
||||
wstrln!(buf, " }}");
|
||||
|
||||
wstrln!(buf, " false");
|
||||
wstrln!(buf, "}}");
|
||||
String::from_utf8(buf).unwrap()
|
||||
}
|
||||
|
||||
fn state_enum_def<S: StateID>(
|
||||
dfa: &DFA<S>,
|
||||
variant_names: &HashMap<S, String>,
|
||||
) -> String {
|
||||
let mut buf = vec![];
|
||||
wstrln!(buf, " #[derive(Clone, Copy)]");
|
||||
wstr!(buf, " enum S {{");
|
||||
|
||||
let mut i = 0;
|
||||
for (id, _) in dfa.iter() {
|
||||
if dfa.is_match_state(id) {
|
||||
continue;
|
||||
}
|
||||
if i % 10 == 0 {
|
||||
wstr!(buf, "\n ");
|
||||
}
|
||||
let name = format!("S{}", id.to_usize());
|
||||
wstr!(buf, " {},", name);
|
||||
i += 1;
|
||||
}
|
||||
wstr!(buf, "\n");
|
||||
wstrln!(buf, " }}");
|
||||
String::from_utf8(buf).unwrap()
|
||||
}
|
||||
|
||||
fn state_variant_names<S: StateID>(dfa: &DFA<S>) -> HashMap<S, String> {
|
||||
let mut variants = HashMap::new();
|
||||
for (id, _) in dfa.iter() {
|
||||
if dfa.is_match_state(id) {
|
||||
continue;
|
||||
}
|
||||
variants.insert(id, format!("S::S{}", id.to_usize()));
|
||||
}
|
||||
variants
|
||||
}
|
||||
2332
third-party/vendor/regex-automata-0.1.10/src/dense.rs
vendored
Normal file
File diff suppressed because it is too large
286
third-party/vendor/regex-automata-0.1.10/src/determinize.rs
vendored
Normal file
@@ -0,0 +1,286 @@
use std::collections::HashMap;
|
||||
use std::mem;
|
||||
use std::rc::Rc;
|
||||
|
||||
use dense;
|
||||
use error::Result;
|
||||
use nfa::{self, NFA};
|
||||
use sparse_set::SparseSet;
|
||||
use state_id::{dead_id, StateID};
|
||||
|
||||
type DFARepr<S> = dense::Repr<Vec<S>, S>;
|
||||
|
||||
/// A determinizer converts an NFA to a DFA.
|
||||
///
|
||||
/// This determinizer follows the typical powerset construction, where each
|
||||
/// DFA state is comprised of one or more NFA states. In the worst case, there
|
||||
/// is one DFA state for every possible combination of NFA states. In practice,
|
||||
/// this only happens in certain conditions, typically when there are bounded
|
||||
/// repetitions.
|
||||
///
|
||||
/// The type variable `S` refers to the chosen state identifier representation
|
||||
/// used for the DFA.
|
||||
///
|
||||
/// The lifetime variable `'a` refers to the lifetime of the NFA being
|
||||
/// converted to a DFA.
|
||||
#[derive(Debug)]
|
||||
pub(crate) struct Determinizer<'a, S: StateID> {
|
||||
/// The NFA we're converting into a DFA.
|
||||
nfa: &'a NFA,
|
||||
/// The DFA we're building.
|
||||
dfa: DFARepr<S>,
|
||||
/// Each DFA state being built is defined as an *ordered* set of NFA
|
||||
/// states, along with a flag indicating whether the state is a match
|
||||
/// state or not.
|
||||
///
|
||||
/// This is never empty. The first state is always a dummy state such that
|
||||
/// a state id == 0 corresponds to a dead state.
|
||||
builder_states: Vec<Rc<State>>,
|
||||
/// A cache of DFA states that already exist and can be easily looked up
|
||||
/// via ordered sets of NFA states.
|
||||
cache: HashMap<Rc<State>, S>,
|
||||
/// Scratch space for a stack of NFA states to visit, for depth first
|
||||
/// visiting without recursion.
|
||||
stack: Vec<nfa::StateID>,
|
||||
/// Scratch space for storing an ordered sequence of NFA states, for
|
||||
/// amortizing allocation.
|
||||
scratch_nfa_states: Vec<nfa::StateID>,
|
||||
/// Whether to build a DFA that finds the longest possible match.
|
||||
longest_match: bool,
|
||||
}
|
||||
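// A self-contained sketch (not part of the vendored crate) of the powerset
// construction described in the doc comment above, run on a toy NFA with no
// epsilon transitions and no match states: each DFA state is the set of NFA
// states reachable on the input consumed so far. HashMap is already imported
// at the top of this file.
use std::collections::{BTreeSet, VecDeque};

fn toy_determinize(
    // nfa_trans[s] maps an input byte to the NFA states reachable from `s`.
    nfa_trans: &[HashMap<u8, Vec<usize>>],
    start: usize,
    alphabet: &[u8],
) -> (Vec<BTreeSet<usize>>, HashMap<(usize, u8), usize>) {
    let start_set: BTreeSet<usize> = [start].into_iter().collect();
    let mut dfa_states = vec![start_set.clone()];
    let mut seen: HashMap<BTreeSet<usize>, usize> = HashMap::new();
    seen.insert(start_set, 0);
    let mut dfa_trans = HashMap::new();
    let mut queue = VecDeque::from(vec![0usize]);
    while let Some(id) = queue.pop_front() {
        for &b in alphabet {
            // The next DFA state is the union of all NFA states reachable
            // from any member of the current set on byte `b`.
            let mut next = BTreeSet::new();
            for &n in &dfa_states[id] {
                if let Some(dsts) = nfa_trans[n].get(&b) {
                    next.extend(dsts.iter().copied());
                }
            }
            let next_id = match seen.get(&next) {
                Some(&existing) => existing,
                None => {
                    let new_id = dfa_states.len();
                    dfa_states.push(next.clone());
                    seen.insert(next, new_id);
                    queue.push_back(new_id);
                    new_id
                }
            };
            dfa_trans.insert((id, b), next_id);
        }
    }
    (dfa_states, dfa_trans)
}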
|
||||
/// An intermediate representation for a DFA state during determinization.
|
||||
#[derive(Debug, Eq, Hash, PartialEq)]
|
||||
struct State {
|
||||
/// Whether this state is a match state or not.
|
||||
is_match: bool,
|
||||
/// An ordered sequence of NFA states that make up this DFA state.
|
||||
nfa_states: Vec<nfa::StateID>,
|
||||
}
|
||||
|
||||
impl<'a, S: StateID> Determinizer<'a, S> {
|
||||
/// Create a new determinizer for converting the given NFA to a DFA.
|
||||
pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
|
||||
let dead = Rc::new(State::dead());
|
||||
let mut cache = HashMap::default();
|
||||
cache.insert(dead.clone(), dead_id());
|
||||
|
||||
Determinizer {
|
||||
nfa,
|
||||
dfa: DFARepr::empty().anchored(nfa.is_anchored()),
|
||||
builder_states: vec![dead],
|
||||
cache,
|
||||
stack: vec![],
|
||||
scratch_nfa_states: vec![],
|
||||
longest_match: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Instruct the determinizer to use equivalence classes as the transition
|
||||
/// alphabet instead of all possible byte values.
|
||||
pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
|
||||
let byte_classes = self.nfa.byte_classes().clone();
|
||||
self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
|
||||
.anchored(self.nfa.is_anchored());
|
||||
self
|
||||
}
|
||||
|
||||
/// Instruct the determinizer to build a DFA that recognizes the longest
|
||||
/// possible match instead of the leftmost first match. This is useful when
|
||||
/// constructing reverse DFAs for finding the start of a match.
|
||||
pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
|
||||
self.longest_match = yes;
|
||||
self
|
||||
}
|
||||
|
||||
/// Build the DFA. If there was a problem constructing the DFA (e.g., if
|
||||
/// the chosen state identifier representation is too small), then an error
|
||||
/// is returned.
|
||||
pub fn build(mut self) -> Result<DFARepr<S>> {
|
||||
let representative_bytes: Vec<u8> =
|
||||
self.dfa.byte_classes().representatives().collect();
|
||||
let mut sparse = self.new_sparse_set();
|
||||
let mut uncompiled = vec![self.add_start(&mut sparse)?];
|
||||
while let Some(dfa_id) = uncompiled.pop() {
|
||||
for &b in &representative_bytes {
|
||||
let (next_dfa_id, is_new) =
|
||||
self.cached_state(dfa_id, b, &mut sparse)?;
|
||||
self.dfa.add_transition(dfa_id, b, next_dfa_id);
|
||||
if is_new {
|
||||
uncompiled.push(next_dfa_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// At this point, we shuffle the matching states in the final DFA to
|
||||
// the beginning. This permits a DFA's match loop to detect a match
|
||||
// condition by merely inspecting the current state's identifier, and
|
||||
// avoids the need for any additional auxiliary storage.
|
||||
let is_match: Vec<bool> =
|
||||
self.builder_states.iter().map(|s| s.is_match).collect();
|
||||
self.dfa.shuffle_match_states(&is_match);
|
||||
Ok(self.dfa)
|
||||
}
|
||||
|
||||
/// Return the identifier for the next DFA state given an existing DFA
|
||||
/// state and an input byte. If the next DFA state already exists, then
|
||||
/// return its identifier from the cache. Otherwise, build the state, cache
|
||||
/// it and return its identifier.
|
||||
///
|
||||
/// The given sparse set is used for scratch space. It must have a capacity
|
||||
/// equivalent to the total number of NFA states, but its contents are
|
||||
/// otherwise unspecified.
|
||||
///
|
||||
/// This routine returns a boolean indicating whether a new state was
|
||||
/// built. If a new state is built, then the caller needs to add it to its
|
||||
/// frontier of uncompiled DFA states to compute transitions for.
|
||||
fn cached_state(
|
||||
&mut self,
|
||||
dfa_id: S,
|
||||
b: u8,
|
||||
sparse: &mut SparseSet,
|
||||
) -> Result<(S, bool)> {
|
||||
sparse.clear();
|
||||
// Compute the set of all reachable NFA states, including epsilons.
|
||||
self.next(dfa_id, b, sparse);
|
||||
// Build a candidate state and check if it has already been built.
|
||||
let state = self.new_state(sparse);
|
||||
if let Some(&cached_id) = self.cache.get(&state) {
|
||||
// Since we have a cached state, put the constructed state's
|
||||
// memory back into our scratch space, so that it can be reused.
|
||||
let _ =
|
||||
mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
|
||||
return Ok((cached_id, false));
|
||||
}
|
||||
// Nothing was in the cache, so add this state to the cache.
|
||||
self.add_state(state).map(|s| (s, true))
|
||||
}
|
||||
|
||||
/// Compute the set of all reachable NFA states, including the full epsilon
|
||||
/// closure, from a DFA state for a single byte of input.
|
||||
fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
|
||||
next_nfa_states.clear();
|
||||
for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
|
||||
let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
|
||||
match *self.nfa.state(nfa_id) {
|
||||
nfa::State::Union { .. }
|
||||
| nfa::State::Fail
|
||||
| nfa::State::Match => {}
|
||||
nfa::State::Range { range: ref r } => {
|
||||
if r.start <= b && b <= r.end {
|
||||
self.epsilon_closure(r.next, next_nfa_states);
|
||||
}
|
||||
}
|
||||
nfa::State::Sparse { ref ranges } => {
|
||||
for r in ranges.iter() {
|
||||
if r.start > b {
|
||||
break;
|
||||
} else if r.start <= b && b <= r.end {
|
||||
self.epsilon_closure(r.next, next_nfa_states);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the epsilon closure for the given NFA state.
|
||||
fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
|
||||
if !self.nfa.state(start).is_epsilon() {
|
||||
set.insert(start);
|
||||
return;
|
||||
}
|
||||
|
||||
self.stack.push(start);
|
||||
while let Some(mut id) = self.stack.pop() {
|
||||
loop {
|
||||
if set.contains(id) {
|
||||
break;
|
||||
}
|
||||
set.insert(id);
|
||||
match *self.nfa.state(id) {
|
||||
nfa::State::Range { .. }
|
||||
| nfa::State::Sparse { .. }
|
||||
| nfa::State::Fail
|
||||
| nfa::State::Match => break,
|
||||
nfa::State::Union { ref alternates } => {
|
||||
id = match alternates.get(0) {
|
||||
None => break,
|
||||
Some(&id) => id,
|
||||
};
|
||||
self.stack.extend(alternates[1..].iter().rev());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the initial DFA state and return its identifier.
|
||||
///
|
||||
/// The sparse set given is used for scratch space, and must have capacity
|
||||
/// equal to the total number of NFA states. Its contents are unspecified.
|
||||
fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
|
||||
sparse.clear();
|
||||
self.epsilon_closure(self.nfa.start(), sparse);
|
||||
let state = self.new_state(&sparse);
|
||||
let id = self.add_state(state)?;
|
||||
self.dfa.set_start_state(id);
|
||||
Ok(id)
|
||||
}
|
||||
|
||||
/// Add the given state to the DFA and make it available in the cache.
|
||||
///
|
||||
/// The state initially has no transitions. That is, it transitions to the
|
||||
/// dead state for all possible inputs.
|
||||
fn add_state(&mut self, state: State) -> Result<S> {
|
||||
let id = self.dfa.add_empty_state()?;
|
||||
let rstate = Rc::new(state);
|
||||
self.builder_states.push(rstate.clone());
|
||||
self.cache.insert(rstate, id);
|
||||
Ok(id)
|
||||
}
|
||||
|
||||
/// Convert the given set of ordered NFA states to a DFA state.
|
||||
fn new_state(&mut self, set: &SparseSet) -> State {
|
||||
let mut state = State {
|
||||
is_match: false,
|
||||
nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
|
||||
};
|
||||
state.nfa_states.clear();
|
||||
|
||||
for &id in set {
|
||||
match *self.nfa.state(id) {
|
||||
nfa::State::Range { .. } => {
|
||||
state.nfa_states.push(id);
|
||||
}
|
||||
nfa::State::Sparse { .. } => {
|
||||
state.nfa_states.push(id);
|
||||
}
|
||||
nfa::State::Fail => {
|
||||
break;
|
||||
}
|
||||
nfa::State::Match => {
|
||||
state.is_match = true;
|
||||
if !self.longest_match {
|
||||
break;
|
||||
}
|
||||
}
|
||||
nfa::State::Union { .. } => {}
|
||||
}
|
||||
}
|
||||
state
|
||||
}
|
||||
|
||||
/// Create a new sparse set with enough capacity to hold all NFA states.
|
||||
fn new_sparse_set(&self) -> SparseSet {
|
||||
SparseSet::new(self.nfa.len())
|
||||
}
|
||||
}
|
||||
|
||||
impl State {
|
||||
/// Create a new empty dead state.
|
||||
fn dead() -> State {
|
||||
State { nfa_states: vec![], is_match: false }
|
||||
}
|
||||
}
|
||||
363
third-party/vendor/regex-automata-0.1.10/src/dfa.rs
vendored
Normal file
@@ -0,0 +1,363 @@
use state_id::StateID;
|
||||
|
||||
/// A trait describing the interface of a deterministic finite automaton (DFA).
|
||||
///
|
||||
/// Every DFA has exactly one start state and at least one dead state (which
|
||||
/// may be the same, as in the case of an empty DFA). In all cases, a state
|
||||
/// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)`
|
||||
/// always returns `true`.
|
||||
///
|
||||
/// Every DFA also has zero or more match states, such that
|
||||
/// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to
|
||||
/// a match state.
|
||||
///
|
||||
/// In general, users of this trait likely will only need to use the search
|
||||
/// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other
|
||||
/// methods are lower level and are used for walking the transitions of a DFA
|
||||
/// manually. In particular, the aforementioned search routines are implemented
|
||||
/// generically in terms of the lower level transition walking routines.
|
||||
pub trait DFA {
|
||||
/// The representation used for state identifiers in this DFA.
|
||||
///
|
||||
/// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
|
||||
type ID: StateID;
|
||||
|
||||
/// Return the identifier of this DFA's start state.
|
||||
fn start_state(&self) -> Self::ID;
|
||||
|
||||
/// Returns true if and only if the given identifier corresponds to a match
|
||||
/// state.
|
||||
fn is_match_state(&self, id: Self::ID) -> bool;
|
||||
|
||||
/// Returns true if and only if the given identifier corresponds to a dead
|
||||
/// state. When a DFA enters a dead state, it is impossible to leave and
|
||||
/// thus can never lead to a match.
|
||||
fn is_dead_state(&self, id: Self::ID) -> bool;
|
||||
|
||||
/// Returns true if and only if the given identifier corresponds to either
|
||||
/// a dead state or a match state, such that one of `is_match_state(id)`
|
||||
/// or `is_dead_state(id)` must return true.
|
||||
///
|
||||
/// Depending on the implementation of the DFA, this routine can be used
|
||||
/// to save a branch in the core matching loop. Nevertheless,
|
||||
/// `is_match_state(id) || is_dead_state(id)` is always a valid
|
||||
/// implementation.
|
||||
fn is_match_or_dead_state(&self, id: Self::ID) -> bool;
|
||||
|
||||
/// Returns true if and only if this DFA is anchored.
|
||||
///
|
||||
/// When a DFA is anchored, it is only allowed to report matches that
|
||||
/// start at index `0`.
|
||||
fn is_anchored(&self) -> bool;
|
||||
|
||||
/// Given the current state that this DFA is in and the next input byte,
|
||||
/// this method returns the identifier of the next state. The identifier
|
||||
/// returned is always valid, but it may correspond to a dead state.
|
||||
fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
|
||||
|
||||
/// Like `next_state`, but its implementation may look up the next state
|
||||
/// without memory safety checks such as bounds checks. As such, callers
|
||||
/// must ensure that the given identifier corresponds to a valid DFA
|
||||
/// state. Implementors must, in turn, ensure that this routine is safe
|
||||
/// for all valid state identifiers and for all possible `u8` values.
|
||||
unsafe fn next_state_unchecked(
|
||||
&self,
|
||||
current: Self::ID,
|
||||
input: u8,
|
||||
) -> Self::ID;
|
||||
|
||||
/// Returns true if and only if the given bytes match this DFA.
|
||||
///
|
||||
/// This routine may short circuit if it knows that scanning future input
|
||||
/// will never lead to a different result. In particular, if a DFA enters
|
||||
/// a match state or a dead state, then this routine will return `true` or
|
||||
/// `false`, respectively, without inspecting any future input.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how to use this method with a
|
||||
/// [`DenseDFA`](enum.DenseDFA.html).
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{DFA, DenseDFA};
|
||||
///
|
||||
/// # fn example() -> Result<(), regex_automata::Error> {
|
||||
/// let dfa = DenseDFA::new("foo[0-9]+bar")?;
|
||||
/// assert_eq!(true, dfa.is_match(b"foo12345bar"));
|
||||
/// assert_eq!(false, dfa.is_match(b"foobar"));
|
||||
/// # Ok(()) }; example().unwrap()
|
||||
/// ```
|
||||
#[inline]
|
||||
fn is_match(&self, bytes: &[u8]) -> bool {
|
||||
self.is_match_at(bytes, 0)
|
||||
}
|
||||
|
||||
/// Returns the first position at which a match is found.
|
||||
///
|
||||
/// This routine stops scanning input in precisely the same circumstances
|
||||
/// as `is_match`. The key difference is that this routine returns the
|
||||
/// position at which it stopped scanning input if and only if a match
|
||||
/// was found. If no match is found, then `None` is returned.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how to use this method with a
|
||||
/// [`DenseDFA`](enum.DenseDFA.html).
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{DFA, DenseDFA};
|
||||
///
|
||||
/// # fn example() -> Result<(), regex_automata::Error> {
|
||||
/// let dfa = DenseDFA::new("foo[0-9]+")?;
|
||||
/// assert_eq!(Some(4), dfa.shortest_match(b"foo12345"));
|
||||
///
|
||||
/// // Normally, the end of the leftmost first match here would be 3,
|
||||
/// // but the shortest match semantics detect a match earlier.
|
||||
/// let dfa = DenseDFA::new("abc|a")?;
|
||||
/// assert_eq!(Some(1), dfa.shortest_match(b"abc"));
|
||||
/// # Ok(()) }; example().unwrap()
|
||||
/// ```
|
||||
#[inline]
|
||||
fn shortest_match(&self, bytes: &[u8]) -> Option<usize> {
|
||||
self.shortest_match_at(bytes, 0)
|
||||
}
|
||||
|
||||
/// Returns the end offset of the longest match. If no match exists,
|
||||
/// then `None` is returned.
|
||||
///
|
||||
/// Implementors of this trait are not required to implement any particular
|
||||
/// match semantics (such as leftmost-first), which are instead manifest in
|
||||
/// the DFA's topology itself.
|
||||
///
|
||||
/// In particular, this method must continue searching even after it
|
||||
/// enters a match state. The search should only terminate once it has
|
||||
/// reached the end of the input or when it has entered a dead state. Upon
|
||||
/// termination, the position of the last byte seen while still in a match
|
||||
/// state is returned.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how to use this method with a
|
||||
/// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses
|
||||
/// "leftmost first" match semantics.
|
||||
///
|
||||
/// Leftmost first match semantics corresponds to the match with the
|
||||
/// smallest starting offset, but where the end offset is determined by
|
||||
/// preferring earlier branches in the original regular expression. For
|
||||
/// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
|
||||
/// will match `Samwise` in `Samwise`.
|
||||
///
|
||||
/// Generally speaking, the "leftmost first" match is how most backtracking
|
||||
/// regular expressions tend to work. This is in contrast to POSIX-style
|
||||
/// regular expressions that yield "leftmost longest" matches. Namely,
|
||||
/// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
|
||||
/// leftmost longest semantics.
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{DFA, DenseDFA};
|
||||
///
|
||||
/// # fn example() -> Result<(), regex_automata::Error> {
|
||||
/// let dfa = DenseDFA::new("foo[0-9]+")?;
|
||||
/// assert_eq!(Some(8), dfa.find(b"foo12345"));
|
||||
///
|
||||
/// // Even though a match is found after reading the first byte (`a`),
|
||||
/// // the leftmost first match semantics demand that we find the earliest
|
||||
/// // match that prefers earlier parts of the pattern over latter parts.
|
||||
/// let dfa = DenseDFA::new("abc|a")?;
|
||||
/// assert_eq!(Some(3), dfa.find(b"abc"));
|
||||
/// # Ok(()) }; example().unwrap()
|
||||
/// ```
|
||||
#[inline]
|
||||
fn find(&self, bytes: &[u8]) -> Option<usize> {
|
||||
self.find_at(bytes, 0)
|
||||
}
|
||||
|
||||
/// Returns the start offset of the longest match in reverse, by searching
|
||||
/// from the end of the input towards the start of the input. If no match
|
||||
/// exists, then `None` is returned. In other words, this has the same
|
||||
/// match semantics as `find`, but in reverse.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example shows how to use this method with a
|
||||
/// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine
|
||||
/// is principally useful when used in conjunction with the
|
||||
/// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse)
|
||||
/// configuration knob. In general, it's unlikely to be correct to use both
|
||||
/// `find` and `rfind` with the same DFA since any particular DFA will only
|
||||
/// support searching in one direction.
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::{dense, DFA};
|
||||
///
|
||||
/// # fn example() -> Result<(), regex_automata::Error> {
|
||||
/// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?;
|
||||
/// assert_eq!(Some(0), dfa.rfind(b"foo12345"));
|
||||
/// # Ok(()) }; example().unwrap()
|
||||
/// ```
|
||||
#[inline]
|
||||
fn rfind(&self, bytes: &[u8]) -> Option<usize> {
|
||||
self.rfind_at(bytes, bytes.len())
|
||||
}
|
||||
|
||||
/// Returns the same as `is_match`, but starts the search at the given
|
||||
/// offset.
|
||||
///
|
||||
/// The significance of the starting point is that it takes the surrounding
|
||||
/// context into consideration. For example, if the DFA is anchored, then
|
||||
/// a match can only occur when `start == 0`.
|
||||
#[inline]
|
||||
fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
|
||||
if self.is_anchored() && start > 0 {
|
||||
return false;
|
||||
}
|
||||
|
||||
let mut state = self.start_state();
|
||||
if self.is_match_or_dead_state(state) {
|
||||
return self.is_match_state(state);
|
||||
}
|
||||
for &b in bytes[start..].iter() {
|
||||
state = unsafe { self.next_state_unchecked(state, b) };
|
||||
if self.is_match_or_dead_state(state) {
|
||||
return self.is_match_state(state);
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
/// Returns the same as `shortest_match`, but starts the search at the
|
||||
/// given offset.
|
||||
///
|
||||
/// The significance of the starting point is that it takes the surrounding
|
||||
/// context into consideration. For example, if the DFA is anchored, then
|
||||
/// a match can only occur when `start == 0`.
|
||||
#[inline]
|
||||
fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
|
||||
if self.is_anchored() && start > 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut state = self.start_state();
|
||||
if self.is_match_or_dead_state(state) {
|
||||
return if self.is_dead_state(state) { None } else { Some(start) };
|
||||
}
|
||||
for (i, &b) in bytes[start..].iter().enumerate() {
|
||||
state = unsafe { self.next_state_unchecked(state, b) };
|
||||
if self.is_match_or_dead_state(state) {
|
||||
return if self.is_dead_state(state) {
|
||||
None
|
||||
} else {
|
||||
Some(start + i + 1)
|
||||
};
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Returns the same as `find`, but starts the search at the given
|
||||
/// offset.
|
||||
///
|
||||
/// The significance of the starting point is that it takes the surrounding
|
||||
/// context into consideration. For example, if the DFA is anchored, then
|
||||
/// a match can only occur when `start == 0`.
|
||||
#[inline]
|
||||
fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
|
||||
if self.is_anchored() && start > 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut state = self.start_state();
|
||||
let mut last_match = if self.is_dead_state(state) {
|
||||
return None;
|
||||
} else if self.is_match_state(state) {
|
||||
Some(start)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
for (i, &b) in bytes[start..].iter().enumerate() {
|
||||
state = unsafe { self.next_state_unchecked(state, b) };
|
||||
if self.is_match_or_dead_state(state) {
|
||||
if self.is_dead_state(state) {
|
||||
return last_match;
|
||||
}
|
||||
last_match = Some(start + i + 1);
|
||||
}
|
||||
}
|
||||
last_match
|
||||
}
|
||||
|
||||
/// Returns the same as `rfind`, but starts the search at the given
|
||||
/// offset.
|
||||
///
|
||||
/// The significance of the starting point is that it takes the surrounding
|
||||
/// context into consideration. For example, if the DFA is anchored, then
|
||||
/// a match can only occur when `start == bytes.len()`.
|
||||
#[inline(never)]
|
||||
fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
|
||||
if self.is_anchored() && start < bytes.len() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut state = self.start_state();
|
||||
let mut last_match = if self.is_dead_state(state) {
|
||||
return None;
|
||||
} else if self.is_match_state(state) {
|
||||
Some(start)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
for (i, &b) in bytes[..start].iter().enumerate().rev() {
|
||||
state = unsafe { self.next_state_unchecked(state, b) };
|
||||
if self.is_match_or_dead_state(state) {
|
||||
if self.is_dead_state(state) {
|
||||
return last_match;
|
||||
}
|
||||
last_match = Some(i);
|
||||
}
|
||||
}
|
||||
last_match
|
||||
}
|
||||
}
|
||||
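// A standalone consumer-side sketch, not part of the vendored file, showing
// how the search routines above reduce to the low-level transition API of the
// trait: only `start_state`, `next_state`, `is_match_state` and
// `is_dead_state` are used. The imports assume the public API of
// regex-automata 0.1 as defined in this crate.
use regex_automata::{DenseDFA, DFA};

fn first_match_end<D: DFA>(dfa: &D, haystack: &[u8]) -> Option<usize> {
    let mut state = dfa.start_state();
    if dfa.is_match_state(state) {
        return Some(0);
    }
    for (i, &b) in haystack.iter().enumerate() {
        state = dfa.next_state(state, b);
        if dfa.is_match_state(state) {
            // Shortest-match semantics: stop at the first match state seen.
            return Some(i + 1);
        }
        if dfa.is_dead_state(state) {
            return None;
        }
    }
    None
}

fn main() {
    let dfa = DenseDFA::new("foo[0-9]+").unwrap();
    assert_eq!(Some(4), first_match_end(&dfa, b"foo12345"));
}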
|
||||
impl<'a, T: DFA> DFA for &'a T {
|
||||
type ID = T::ID;
|
||||
|
||||
#[inline]
|
||||
fn start_state(&self) -> Self::ID {
|
||||
(**self).start_state()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_match_state(&self, id: Self::ID) -> bool {
|
||||
(**self).is_match_state(id)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
|
||||
(**self).is_match_or_dead_state(id)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_dead_state(&self, id: Self::ID) -> bool {
|
||||
(**self).is_dead_state(id)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_anchored(&self) -> bool {
|
||||
(**self).is_anchored()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next_state(&self, current: Self::ID, input: u8) -> Self::ID {
|
||||
(**self).next_state(current, input)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
unsafe fn next_state_unchecked(
|
||||
&self,
|
||||
current: Self::ID,
|
||||
input: u8,
|
||||
) -> Self::ID {
|
||||
(**self).next_state_unchecked(current, input)
|
||||
}
|
||||
}
|
||||
150
third-party/vendor/regex-automata-0.1.10/src/error.rs
vendored
Normal file
@@ -0,0 +1,150 @@
use std::error;
|
||||
use std::fmt;
|
||||
use std::result;
|
||||
|
||||
use regex_syntax;
|
||||
|
||||
pub type Result<T> = result::Result<T, Error>;
|
||||
|
||||
/// An error that occurred during the construction of a DFA.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Error {
|
||||
kind: ErrorKind,
|
||||
}
|
||||
|
||||
/// The kind of error that occurred.
|
||||
#[derive(Clone, Debug)]
|
||||
pub enum ErrorKind {
|
||||
/// An error that occurred while parsing a regular expression. Note that
|
||||
/// this error may be printed over multiple lines, and is generally
|
||||
/// intended to be end user readable on its own.
|
||||
Syntax(String),
|
||||
/// An error that occurred because an unsupported regex feature was used.
|
||||
/// The message string describes which unsupported feature was used.
|
||||
///
|
||||
/// The primary regex features that are unsupported are those that require
|
||||
/// look-around, such as the `^` and `$` anchors and the word boundary
|
||||
/// assertion `\b`. These may be supported in the future.
|
||||
Unsupported(String),
|
||||
/// An error that occurred when attempting to serialize a DFA to bytes.
|
||||
Serialize(String),
|
||||
/// An error that occurs when constructing a DFA would require the use of
|
||||
/// a state ID that overflows the chosen state ID representation. For
|
||||
/// example, if one is using `u8` for state IDs and builds a DFA with
|
||||
/// 257 states, then the last state's ID will be `256` which cannot be
|
||||
/// represented with `u8`.
|
||||
///
|
||||
/// Typically, this error occurs in the determinization process of building
|
||||
/// a DFA (the conversion step from NFA to DFA). It can also occur when
|
||||
/// trying to build a smaller DFA from an existing one.
|
||||
StateIDOverflow {
|
||||
/// The maximum possible state ID.
|
||||
max: usize,
|
||||
},
|
||||
/// An error that occurs when premultiplication of state IDs is requested,
|
||||
/// but doing so would overflow the chosen state ID representation.
|
||||
///
|
||||
/// When `max == requested_max`, then the state ID would overflow `usize`.
|
||||
PremultiplyOverflow {
|
||||
/// The maximum possible state id.
|
||||
max: usize,
|
||||
/// The maximum ID required by premultiplication.
|
||||
requested_max: usize,
|
||||
},
|
||||
}
|
||||
|
||||
impl Error {
|
||||
/// Return the kind of this error.
|
||||
pub fn kind(&self) -> &ErrorKind {
|
||||
&self.kind
|
||||
}
|
||||
|
||||
pub(crate) fn syntax(err: regex_syntax::Error) -> Error {
|
||||
Error { kind: ErrorKind::Syntax(err.to_string()) }
|
||||
}
|
||||
|
||||
pub(crate) fn unsupported_anchor() -> Error {
|
||||
let msg = r"anchors such as ^, $, \A and \z are not supported";
|
||||
Error { kind: ErrorKind::Unsupported(msg.to_string()) }
|
||||
}
|
||||
|
||||
pub(crate) fn unsupported_word() -> Error {
|
||||
let msg = r"word boundary assertions (\b and \B) are not supported";
|
||||
Error { kind: ErrorKind::Unsupported(msg.to_string()) }
|
||||
}
|
||||
|
||||
pub(crate) fn unsupported_longest_match() -> Error {
|
||||
let msg = "unachored searches with longest match \
|
||||
semantics are not supported";
|
||||
Error { kind: ErrorKind::Unsupported(msg.to_string()) }
|
||||
}
|
||||
|
||||
pub(crate) fn serialize(message: &str) -> Error {
|
||||
Error { kind: ErrorKind::Serialize(message.to_string()) }
|
||||
}
|
||||
|
||||
pub(crate) fn state_id_overflow(max: usize) -> Error {
|
||||
Error { kind: ErrorKind::StateIDOverflow { max } }
|
||||
}
|
||||
|
||||
pub(crate) fn premultiply_overflow(
|
||||
max: usize,
|
||||
requested_max: usize,
|
||||
) -> Error {
|
||||
Error { kind: ErrorKind::PremultiplyOverflow { max, requested_max } }
|
||||
}
|
||||
}
|
||||
|
||||
impl error::Error for Error {
|
||||
fn description(&self) -> &str {
|
||||
match self.kind {
|
||||
ErrorKind::Syntax(_) => "syntax error",
|
||||
ErrorKind::Unsupported(_) => "unsupported syntax",
|
||||
ErrorKind::Serialize(_) => "serialization error",
|
||||
ErrorKind::StateIDOverflow { .. } => {
|
||||
"state id representation too small"
|
||||
}
|
||||
ErrorKind::PremultiplyOverflow { .. } => {
|
||||
"state id representation too small for premultiplication"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self.kind {
|
||||
ErrorKind::Syntax(ref msg) => write!(f, "{}", msg),
|
||||
ErrorKind::Unsupported(ref msg) => write!(f, "{}", msg),
|
||||
ErrorKind::Serialize(ref msg) => {
|
||||
write!(f, "DFA serialization error: {}", msg)
|
||||
}
|
||||
ErrorKind::StateIDOverflow { max } => write!(
|
||||
f,
|
||||
"building the DFA failed because it required building \
|
||||
more states than can be identified, where the maximum \
|
||||
ID for the chosen representation is {}",
|
||||
max,
|
||||
),
|
||||
ErrorKind::PremultiplyOverflow { max, requested_max } => {
|
||||
if max == requested_max {
|
||||
write!(
|
||||
f,
|
||||
"premultiplication of states requires the ability to \
|
||||
represent a state ID greater than what can fit on \
|
||||
this platform's usize, which is {}",
|
||||
::std::usize::MAX,
|
||||
)
|
||||
} else {
|
||||
write!(
|
||||
f,
|
||||
"premultiplication of states requires the ability to \
|
||||
represent at least a state ID of {}, but the chosen \
|
||||
representation only permits a maximum state ID of {}",
|
||||
requested_max, max,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
360
third-party/vendor/regex-automata-0.1.10/src/lib.rs
vendored
Normal file
|
|
@@ -0,0 +1,360 @@
|
|||
/*!
|
||||
A low level regular expression library that uses deterministic finite automata.
|
||||
It supports a rich syntax with Unicode support, has extensive options for
|
||||
configuring the best space vs time trade off for your use case and provides
|
||||
support for cheap deserialization of automata for use in `no_std` environments.
|
||||
|
||||
# Overview
|
||||
|
||||
This section gives a brief overview of the primary types in this crate:
|
||||
|
||||
* A [`Regex`](struct.Regex.html) provides a way to search for matches of a
|
||||
regular expression. This includes iterating over matches with both the start
|
||||
and end positions of each match.
|
||||
* A [`RegexBuilder`](struct.RegexBuilder.html) provides a way to configure many
|
||||
compilation options for a regex.
|
||||
* A [`DenseDFA`](enum.DenseDFA.html) provides low level access to a DFA that
|
||||
uses a dense representation (uses lots of space, but fast searching).
|
||||
* A [`SparseDFA`](enum.SparseDFA.html) provides the same API as a `DenseDFA`,
|
||||
but uses a sparse representation (uses less space, but slower matching).
|
||||
* A [`DFA`](trait.DFA.html) trait that defines an interface that all DFAs must
|
||||
implement.
|
||||
* Both dense DFAs and sparse DFAs support
|
||||
[serialization to raw bytes](enum.DenseDFA.html#method.to_bytes_little_endian)
|
||||
and
|
||||
[cheap deserialization](enum.DenseDFA.html#method.from_bytes).
|
||||
|
||||
# Example: basic regex searching
|
||||
|
||||
This example shows how to compile a regex using the default configuration
|
||||
and then use it to find matches in a byte string:
|
||||
|
||||
```
|
||||
use regex_automata::Regex;
|
||||
|
||||
let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
|
||||
let text = b"2018-12-24 2016-10-08";
|
||||
let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![(0, 10), (11, 21)]);
|
||||
```
|
||||
|
||||
# Example: use sparse DFAs
|
||||
|
||||
By default, compiling a regex will use dense DFAs internally. This uses more
|
||||
memory, but executes searches more quickly. If you can abide slower searches
|
||||
(somewhere around 3-5x), then sparse DFAs might make more sense since they can
|
||||
use significantly less space.
|
||||
|
||||
Using sparse DFAs is as easy as using `Regex::new_sparse` instead of
|
||||
`Regex::new`:
|
||||
|
||||
```
|
||||
use regex_automata::Regex;
|
||||
|
||||
# fn example() -> Result<(), regex_automata::Error> {
|
||||
let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
|
||||
let text = b"2018-12-24 2016-10-08";
|
||||
let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![(0, 10), (11, 21)]);
|
||||
# Ok(()) }; example().unwrap()
|
||||
```
|
||||
|
||||
If you already have dense DFAs for some reason, they can be converted to sparse
|
||||
DFAs and used to build a new `Regex`. For example:
|
||||
|
||||
```
|
||||
use regex_automata::Regex;
|
||||
|
||||
# fn example() -> Result<(), regex_automata::Error> {
|
||||
let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
|
||||
let sparse_re = Regex::from_dfas(
|
||||
dense_re.forward().to_sparse()?,
|
||||
dense_re.reverse().to_sparse()?,
|
||||
);
|
||||
let text = b"2018-12-24 2016-10-08";
|
||||
let matches: Vec<(usize, usize)> = sparse_re.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![(0, 10), (11, 21)]);
|
||||
# Ok(()) }; example().unwrap()
|
||||
```
|
||||
|
||||
# Example: deserialize a DFA
|
||||
|
||||
This shows how to first serialize a DFA into raw bytes, and then deserialize
|
||||
those raw bytes back into a DFA. While this particular example is a bit
|
||||
contrived, this same technique can be used in your program to deserialize a
|
||||
DFA at start up time or by memory mapping a file. In particular,
|
||||
deserialization is guaranteed to be cheap because it will always be a constant
|
||||
time operation.
|
||||
|
||||
```
|
||||
use regex_automata::{DenseDFA, Regex};
|
||||
|
||||
# fn example() -> Result<(), regex_automata::Error> {
|
||||
let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
|
||||
// serialize both the forward and reverse DFAs, see note below
|
||||
let fwd_bytes = re1.forward().to_u16()?.to_bytes_native_endian()?;
|
||||
let rev_bytes = re1.reverse().to_u16()?.to_bytes_native_endian()?;
|
||||
// now deserialize both---we need to specify the correct type!
|
||||
let fwd: DenseDFA<&[u16], u16> = unsafe { DenseDFA::from_bytes(&fwd_bytes) };
|
||||
let rev: DenseDFA<&[u16], u16> = unsafe { DenseDFA::from_bytes(&rev_bytes) };
|
||||
// finally, reconstruct our regex
|
||||
let re2 = Regex::from_dfas(fwd, rev);
|
||||
|
||||
// we can use it like normal
|
||||
let text = b"2018-12-24 2016-10-08";
|
||||
let matches: Vec<(usize, usize)> = re2.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![(0, 10), (11, 21)]);
|
||||
# Ok(()) }; example().unwrap()
|
||||
```
|
||||
|
||||
There are a few points worth noting here:
|
||||
|
||||
* We need to extract the raw DFAs used by the regex and serialize those. You
|
||||
can build the DFAs manually yourself using
|
||||
[`dense::Builder`](dense/struct.Builder.html), but using the DFAs from a
|
||||
`Regex` guarantees that the DFAs are built correctly.
|
||||
* We specifically convert the dense DFA to a representation that uses `u16`
|
||||
for its state identifiers using
|
||||
[`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16). While this isn't
|
||||
strictly necessary, if we skipped this step, then the serialized bytes would
|
||||
use `usize` for state identifiers, which does not have a fixed size. Using
|
||||
`u16` ensures that we can deserialize this DFA even on platforms with a
|
||||
smaller pointer size. If our DFA is too big for `u16` state identifiers, then
|
||||
one can use `u32` or `u64`.
|
||||
* To convert the DFA to raw bytes, we use the `to_bytes_native_endian`
|
||||
method. In practice, you'll want to use either
|
||||
[`DenseDFA::to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian)
|
||||
or
|
||||
[`DenseDFA::to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian),
|
||||
depending on which platform you're deserializing your DFA from. If you intend
|
||||
to deserialize on either platform, then you'll need to serialize both and
|
||||
deserialize the right one depending on your target's endianness.
|
||||
* Deserializing a DFA requires the use of `unsafe` because the raw bytes must
|
||||
be *trusted*. In particular, while some sanity checks are
|
||||
performed, nothing guarantees the integrity of the DFA's transition table
|
||||
since deserialization is a constant time operation. Since searching with a
|
||||
DFA must be able to follow transitions blindly for performance reasons,
|
||||
giving incorrect bytes to the deserialization API can result in memory
|
||||
unsafety.
|
||||
|
||||
The same process can be achieved with sparse DFAs as well:
|
||||
|
||||
```
|
||||
use regex_automata::{SparseDFA, Regex};
|
||||
|
||||
# fn example() -> Result<(), regex_automata::Error> {
|
||||
let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
|
||||
// serialize both
|
||||
let fwd_bytes = re1.forward().to_u16()?.to_sparse()?.to_bytes_native_endian()?;
|
||||
let rev_bytes = re1.reverse().to_u16()?.to_sparse()?.to_bytes_native_endian()?;
|
||||
// now deserialize both---we need to specify the correct type!
|
||||
let fwd: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(&fwd_bytes) };
|
||||
let rev: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(&rev_bytes) };
|
||||
// finally, reconstruct our regex
|
||||
let re2 = Regex::from_dfas(fwd, rev);
|
||||
|
||||
// we can use it like normal
|
||||
let text = b"2018-12-24 2016-10-08";
|
||||
let matches: Vec<(usize, usize)> = re2.find_iter(text).collect();
|
||||
assert_eq!(matches, vec![(0, 10), (11, 21)]);
|
||||
# Ok(()) }; example().unwrap()
|
||||
```
|
||||
|
||||
Note that unlike dense DFAs, sparse DFAs have no alignment requirements.
|
||||
Conversely, dense DFAs must be aligned to the same alignment as their
|
||||
state identifier representation.
|
||||
|
||||
# Support for `no_std`
|
||||
|
||||
This crate comes with a `std` feature that is enabled by default. When the
|
||||
`std` feature is enabled, the API of this crate will include the facilities
|
||||
necessary for compiling, serializing, deserializing and searching with regular
|
||||
expressions. When the `std` feature is disabled, the API of this crate will
|
||||
shrink such that it only includes the facilities necessary for deserializing
|
||||
and searching with regular expressions.
|
||||
|
||||
The intended workflow for `no_std` environments is thus as follows:
|
||||
|
||||
* Write a program with the `std` feature that compiles and serializes a
|
||||
regular expression. Serialization should only happen after first converting
|
||||
the DFAs to use a fixed size state identifier instead of the default `usize`.
|
||||
You may also need to serialize both little and big endian versions of each
|
||||
DFA. (So that's 4 DFAs in total for each regex.)
|
||||
* In your `no_std` environment, follow the examples above for deserializing
|
||||
your previously serialized DFAs into regexes. You can then search with them
|
||||
as you would any regex.
|
||||
|
||||
Deserialization can happen anywhere. For example, with bytes embedded into a
|
||||
binary or with a file memory mapped at runtime.
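For instance, a `no_std` program might embed previously serialized sparse DFAs
directly into its binary. This is only a sketch: the file names are
hypothetical, and the bytes must have been produced with the matching
endianness and state ID representation (`u16` here).

```ignore
use regex_automata::{Regex, SparseDFA};

// Hypothetical files produced ahead of time by a `std`-enabled build.
static FWD: &[u8] = include_bytes!("date.fwd.bigendian.dfa");
static REV: &[u8] = include_bytes!("date.rev.bigendian.dfa");

fn date_regex() -> Regex<SparseDFA<&'static [u8], u16>> {
    let fwd: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(FWD) };
    let rev: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(REV) };
    Regex::from_dfas(fwd, rev)
}
```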
|
||||
|
||||
Note that the
|
||||
[`ucd-generate`](https://github.com/BurntSushi/ucd-generate)
|
||||
tool will do the first step for you with its `dfa` or `regex` sub-commands.
|
||||
|
||||
# Syntax
|
||||
|
||||
This crate supports the same syntax as the `regex` crate, since they share the
|
||||
same parser. You can find an exhaustive list of supported syntax in the
|
||||
[documentation for the `regex` crate](https://docs.rs/regex/1.1/regex/#syntax).
|
||||
|
||||
Currently, there are a couple limitations. In general, this crate does not
|
||||
support zero-width assertions, although they may be added in the future. This
|
||||
includes:
|
||||
|
||||
* Anchors such as `^`, `$`, `\A` and `\z`.
|
||||
* Word boundary assertions such as `\b` and `\B`.
|
||||
|
||||
It is possible to run a search that is anchored at the beginning of the input.
|
||||
To do that, set the
|
||||
[`RegexBuilder::anchored`](struct.RegexBuilder.html#method.anchored)
|
||||
option when building a regex. By default, all searches are unanchored.
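For example, this sketch builds an anchored regex (the pattern and haystacks
are only illustrative):

```
use regex_automata::RegexBuilder;

# fn example() -> Result<(), regex_automata::Error> {
let re = RegexBuilder::new().anchored(true).build(r"[0-9]+")?;
// Only matches starting at position 0 are reported.
assert!(re.is_match(b"123abc"));
assert!(!re.is_match(b"abc123"));
# Ok(()) }; example().unwrap()
```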
|
||||
|
||||
# Differences with the regex crate
|
||||
|
||||
The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
|
||||
general purpose regular expression engine. It aims to automatically balance low
|
||||
compile times, fast search times and low memory usage, while also providing
|
||||
a convenient API for users. In contrast, this crate provides a lower level
|
||||
regular expression interface that is a bit less convenient while providing more
|
||||
explicit control over memory usage and search times.
|
||||
|
||||
Here are some specific negative differences:
|
||||
|
||||
* **Compilation can take an exponential amount of time and space** in the size
|
||||
of the regex pattern. While most patterns do not exhibit worst case
|
||||
exponential time, such patterns do exist. For example, `[01]*1[01]{N}` will
|
||||
build a DFA with `2^(N+1)` states. For this reason, untrusted patterns should
|
||||
not be compiled with this library. (In the future, the API may expose an
|
||||
option to return an error if the DFA gets too big.)
|
||||
* This crate does not support sub-match extraction, which can be achieved with
|
||||
the regex crate's "captures" API. This may be added in the future, but is
|
||||
unlikely.
|
||||
* While the regex crate doesn't necessarily sport fast compilation times, the
|
||||
regexes in this crate are almost universally slow to compile, especially when
|
||||
they contain large Unicode character classes. For example, on my system,
|
||||
compiling `\w{3}` with byte classes enabled takes just over 1 second and
|
||||
almost 5MB of memory! (Compiling a sparse regex takes about the same time
|
||||
but only uses about 500KB of memory.) Conversely, compiling the same regex
|
||||
without Unicode support, e.g., `(?-u)\w{3}`, takes under 1 millisecond and
|
||||
less than 5KB of memory. For this reason, you should only use Unicode
|
||||
character classes if you absolutely need them!
|
||||
* This crate does not support regex sets.
|
||||
* This crate does not support zero-width assertions such as `^`, `$`, `\b` or
|
||||
`\B`.
|
||||
* As a lower level crate, this library does not do literal optimizations. In
|
||||
exchange, you get predictable performance regardless of input. The
|
||||
philosophy here is that literal optimizations should be applied at a higher
|
||||
level, although there is no easy support for this in the ecosystem yet.
|
||||
* There is no `&str` API like in the regex crate. In this crate, all APIs
|
||||
operate on `&[u8]`. By default, match indices are guaranteed to fall on
|
||||
UTF-8 boundaries, unless
|
||||
[`RegexBuilder::allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8)
|
||||
is enabled.
|
||||
|
||||
With some of the downsides out of the way, here are some positive differences:
|
||||
|
||||
* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
|
||||
deserialized. Deserialization always takes constant time since searching can
|
||||
be performed directly on the raw serialized bytes of a DFA.
|
||||
* This crate was specifically designed so that the searching phase of a DFA has
|
||||
minimal runtime requirements, and can therefore be used in `no_std`
|
||||
environments. While `no_std` environments cannot compile regexes, they can
|
||||
deserialize pre-compiled regexes.
|
||||
* Since this crate builds DFAs ahead of time, it will generally out-perform
|
||||
the `regex` crate on equivalent tasks. The performance difference is likely
|
||||
not large. However, because of a complex set of optimizations in the regex
|
||||
crate (like literal optimizations), an accurate performance comparison may be
|
||||
difficult to do.
|
||||
* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
|
||||
performance a small amount, but uses much less storage space. Potentially
|
||||
even less than what the regex crate uses.
|
||||
* This crate exposes DFAs directly, such as
|
||||
[`DenseDFA`](enum.DenseDFA.html) and [`SparseDFA`](enum.SparseDFA.html),
|
||||
which enables one to do less work in some cases. For example, if you only
|
||||
need the end of a match and not the start of a match, then you can use a DFA
|
||||
directly without building a `Regex`, which always requires a second DFA to
|
||||
find the start of a match.
|
||||
* Aside from choosing between dense and sparse DFAs, there are several options
|
||||
for configuring the space usage vs search time trade off. These include
|
||||
things like choosing a smaller state identifier representation,
|
||||
premultiplying state identifiers and splitting a DFA's alphabet into
|
||||
equivalence classes. Finally, DFA minimization is also provided, but can
|
||||
increase compilation times dramatically.
|
||||
*/
|
||||
|
||||
#![deny(missing_docs)]
|
||||
#![cfg_attr(not(feature = "std"), no_std)]
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
extern crate core;
|
||||
|
||||
#[cfg(all(test, feature = "transducer"))]
|
||||
extern crate bstr;
|
||||
#[cfg(feature = "transducer")]
|
||||
extern crate fst;
|
||||
#[cfg(feature = "std")]
|
||||
extern crate regex_syntax;
|
||||
|
||||
pub use dense::DenseDFA;
|
||||
pub use dfa::DFA;
|
||||
#[cfg(feature = "std")]
|
||||
pub use error::{Error, ErrorKind};
|
||||
pub use regex::Regex;
|
||||
#[cfg(feature = "std")]
|
||||
pub use regex::RegexBuilder;
|
||||
pub use sparse::SparseDFA;
|
||||
pub use state_id::StateID;
|
||||
|
||||
mod byteorder;
|
||||
mod classes;
|
||||
#[path = "dense.rs"]
|
||||
mod dense_imp;
|
||||
#[cfg(feature = "std")]
|
||||
mod determinize;
|
||||
mod dfa;
|
||||
#[cfg(feature = "std")]
|
||||
mod error;
|
||||
#[cfg(feature = "std")]
|
||||
mod minimize;
|
||||
#[cfg(feature = "std")]
|
||||
#[doc(hidden)]
|
||||
pub mod nfa;
|
||||
mod regex;
|
||||
#[path = "sparse.rs"]
|
||||
mod sparse_imp;
|
||||
#[cfg(feature = "std")]
|
||||
mod sparse_set;
|
||||
mod state_id;
|
||||
#[cfg(feature = "transducer")]
|
||||
mod transducer;
|
||||
|
||||
/// Types and routines specific to dense DFAs.
|
||||
///
|
||||
/// This module is the home of [`DenseDFA`](enum.DenseDFA.html) and each of its
|
||||
/// corresponding variant DFA types, such as [`Standard`](struct.Standard.html)
|
||||
/// and [`ByteClass`](struct.ByteClass.html).
|
||||
///
|
||||
/// This module also contains a [builder](struct.Builder.html) for
|
||||
/// configuring the construction of a dense DFA.
|
||||
pub mod dense {
|
||||
pub use dense_imp::*;
|
||||
}
|
||||
|
||||
/// Types and routines specific to sparse DFAs.
|
||||
///
|
||||
/// This module is the home of [`SparseDFA`](enum.SparseDFA.html) and each of
|
||||
/// its corresponding variant DFA types, such as
|
||||
/// [`Standard`](struct.Standard.html) and
|
||||
/// [`ByteClass`](struct.ByteClass.html).
|
||||
///
|
||||
/// Unlike the [`dense`](../dense/index.html) module, this module does not
|
||||
/// contain a builder specific for sparse DFAs. Instead, the intended way to
|
||||
/// build a sparse DFA is either by using a default configuration with its
|
||||
/// [constructor](enum.SparseDFA.html#method.new),
|
||||
/// or by first
|
||||
/// [configuring the construction of a dense DFA](../dense/struct.Builder.html)
|
||||
/// and then calling
|
||||
/// [`DenseDFA::to_sparse`](../enum.DenseDFA.html#method.to_sparse).
|
||||
pub mod sparse {
|
||||
pub use sparse_imp::*;
|
||||
}
|
||||
373
third-party/vendor/regex-automata-0.1.10/src/minimize.rs
vendored
Normal file
|
|
@@ -0,0 +1,373 @@
|
|||
use std::cell::RefCell;
|
||||
use std::fmt;
|
||||
use std::mem;
|
||||
use std::rc::Rc;
|
||||
|
||||
use dense;
|
||||
use state_id::{dead_id, StateID};
|
||||
|
||||
type DFARepr<S> = dense::Repr<Vec<S>, S>;
|
||||
|
||||
/// An implementation of Hopcroft's algorithm for minimizing DFAs.
|
||||
///
|
||||
/// The algorithm implemented here is mostly taken from Wikipedia:
|
||||
/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
|
||||
///
|
||||
/// This code has had some light optimization attention paid to it,
|
||||
/// particularly in the form of reducing allocation as much as possible.
|
||||
/// However, it is still generally slow. Future optimization work should
|
||||
/// probably focus on the bigger picture rather than micro-optimizations. For
|
||||
/// example:
|
||||
///
|
||||
/// 1. Figure out how to more intelligently create initial partitions. That is,
|
||||
/// Hopcroft's algorithm starts by creating two partitions of DFA states
|
||||
/// that are known to NOT be equivalent: match states and non-match states.
|
||||
/// The algorithm proceeds by progressively refining these partitions into
|
||||
/// smaller partitions. If we could start with more partitions, then we
|
||||
/// could reduce the amount of work that Hopcroft's algorithm needs to do.
|
||||
/// 2. For every partition that we visit, we find all incoming transitions to
|
||||
/// every state in the partition for *every* element in the alphabet. (This
|
||||
/// is why using byte classes can significantly decrease minimization times,
|
||||
/// since byte classes shrink the alphabet.) This is quite costly and there
|
||||
/// is perhaps some redundant work being performed depending on the specific
|
||||
/// states in the set. For example, we might be able to only visit some
|
||||
/// elements of the alphabet based on the transitions.
|
||||
/// 3. Move parts of minimization into determinization. If minimization has
|
||||
/// fewer states to deal with, then it should run faster. A prime example
|
||||
/// of this might be large Unicode classes, which are generated in a way that
|
||||
/// can create a lot of redundant states. (Some work has been done on this
|
||||
/// point during NFA compilation via the algorithm described in the
|
||||
/// "Incremental Construction of MinimalAcyclic Finite-State Automata"
|
||||
/// paper.)
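///
/// The minimizer is consumed by calling `run`, which rewrites the DFA
/// representation in place. A rough sketch of the internal call pattern
/// (`repr` is illustrative and would normally come from determinization):
///
/// ```ignore
/// // `repr` is a dense::Repr produced by determinization.
/// Minimizer::new(&mut repr).run();
/// // `repr` now describes an equivalent DFA with a minimal number of states.
/// ```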
|
||||
pub(crate) struct Minimizer<'a, S: 'a> {
|
||||
dfa: &'a mut DFARepr<S>,
|
||||
in_transitions: Vec<Vec<Vec<S>>>,
|
||||
partitions: Vec<StateSet<S>>,
|
||||
waiting: Vec<StateSet<S>>,
|
||||
}
|
||||
|
||||
impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
f.debug_struct("Minimizer")
|
||||
.field("dfa", &self.dfa)
|
||||
.field("in_transitions", &self.in_transitions)
|
||||
.field("partitions", &self.partitions)
|
||||
.field("waiting", &self.waiting)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// A set of states. A state set makes up a single partition in Hopcroft's
|
||||
/// algorithm.
|
||||
///
|
||||
/// It is represented by an ordered set of state identifiers. We use shared
|
||||
/// ownership so that a single state set can be in both the set of partitions
|
||||
/// and in the set of waiting sets simultaneously without an additional
|
||||
/// allocation. Generally, once a state set is built, it becomes immutable.
|
||||
///
|
||||
/// We use this representation because it avoids the overhead of more
|
||||
/// traditional set data structures (HashSet/BTreeSet), and also because
|
||||
/// computing intersection/subtraction on this representation is especially
|
||||
/// fast.
|
||||
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
|
||||
struct StateSet<S>(Rc<RefCell<Vec<S>>>);
|
||||
|
||||
impl<'a, S: StateID> Minimizer<'a, S> {
|
||||
pub fn new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S> {
|
||||
let in_transitions = Minimizer::incoming_transitions(dfa);
|
||||
let partitions = Minimizer::initial_partitions(dfa);
|
||||
let waiting = vec![partitions[0].clone()];
|
||||
|
||||
Minimizer { dfa, in_transitions, partitions, waiting }
|
||||
}
|
||||
|
||||
pub fn run(mut self) {
|
||||
let mut incoming = StateSet::empty();
|
||||
let mut scratch1 = StateSet::empty();
|
||||
let mut scratch2 = StateSet::empty();
|
||||
let mut newparts = vec![];
|
||||
|
||||
while let Some(set) = self.waiting.pop() {
|
||||
for b in (0..self.dfa.alphabet_len()).map(|b| b as u8) {
|
||||
self.find_incoming_to(b, &set, &mut incoming);
|
||||
|
||||
for p in 0..self.partitions.len() {
|
||||
self.partitions[p].intersection(&incoming, &mut scratch1);
|
||||
if scratch1.is_empty() {
|
||||
newparts.push(self.partitions[p].clone());
|
||||
continue;
|
||||
}
|
||||
|
||||
self.partitions[p].subtract(&incoming, &mut scratch2);
|
||||
if scratch2.is_empty() {
|
||||
newparts.push(self.partitions[p].clone());
|
||||
continue;
|
||||
}
|
||||
|
||||
let (x, y) =
|
||||
(scratch1.deep_clone(), scratch2.deep_clone());
|
||||
newparts.push(x.clone());
|
||||
newparts.push(y.clone());
|
||||
match self.find_waiting(&self.partitions[p]) {
|
||||
Some(i) => {
|
||||
self.waiting[i] = x;
|
||||
self.waiting.push(y);
|
||||
}
|
||||
None => {
|
||||
if x.len() <= y.len() {
|
||||
self.waiting.push(x);
|
||||
} else {
|
||||
self.waiting.push(y);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
newparts = mem::replace(&mut self.partitions, newparts);
|
||||
newparts.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// At this point, we now have a minimal partitioning of states, where
|
||||
// each partition is an equivalence class of DFA states. Now we need to
|
||||
// use this partitioning to update the DFA to only contain one state for
|
||||
// each partition.
|
||||
|
||||
// Create a map from DFA state ID to the representative ID of the
|
||||
// equivalence class to which it belongs. The representative ID of an
|
||||
// equivalence class of states is the minimum ID in that class.
|
||||
let mut state_to_part = vec![dead_id(); self.dfa.state_count()];
|
||||
for p in &self.partitions {
|
||||
p.iter(|id| state_to_part[id.to_usize()] = p.min());
|
||||
}
|
||||
|
||||
// Generate a new contiguous sequence of IDs for minimal states, and
|
||||
// create a map from equivalence IDs to the new IDs. Thus, the new
|
||||
// minimal ID of *any* state in the unminimized DFA can be obtained
|
||||
// with minimals_ids[state_to_part[old_id]].
|
||||
let mut minimal_ids = vec![dead_id(); self.dfa.state_count()];
|
||||
let mut new_id = S::from_usize(0);
|
||||
for (id, _) in self.dfa.states() {
|
||||
if state_to_part[id.to_usize()] == id {
|
||||
minimal_ids[id.to_usize()] = new_id;
|
||||
new_id = S::from_usize(new_id.to_usize() + 1);
|
||||
}
|
||||
}
|
||||
// The total number of states in the minimal DFA.
|
||||
let minimal_count = new_id.to_usize();
|
||||
|
||||
// Re-map this DFA in place such that the only states remaining
|
||||
// correspond to the representative states of every equivalence class.
|
||||
for id in (0..self.dfa.state_count()).map(S::from_usize) {
|
||||
// If this state isn't a representative for an equivalence class,
|
||||
// then we skip it since it won't appear in the minimal DFA.
|
||||
if state_to_part[id.to_usize()] != id {
|
||||
continue;
|
||||
}
|
||||
for (_, next) in self.dfa.get_state_mut(id).iter_mut() {
|
||||
*next = minimal_ids[state_to_part[next.to_usize()].to_usize()];
|
||||
}
|
||||
self.dfa.swap_states(id, minimal_ids[id.to_usize()]);
|
||||
}
|
||||
// Trim off all unused states from the pre-minimized DFA. This
|
||||
// represents all states that were merged into a non-singleton
|
||||
// equivalence class of states, and appeared after the first state
|
||||
// in each such class. (Because the state with the smallest ID in each
|
||||
// equivalence class is its representative ID.)
|
||||
self.dfa.truncate_states(minimal_count);
|
||||
|
||||
// Update the new start state, which is now just the minimal ID of
|
||||
// whatever state the old start state was collapsed into.
|
||||
let old_start = self.dfa.start_state();
|
||||
self.dfa.set_start_state(
|
||||
minimal_ids[state_to_part[old_start.to_usize()].to_usize()],
|
||||
);
|
||||
|
||||
// In order to update the ID of the maximum match state, we need to
|
||||
// find the maximum ID among all of the match states in the minimized
|
||||
// DFA. This is not necessarily the new ID of the unminimized maximum
|
||||
// match state, since that could have been collapsed with a much
|
||||
// earlier match state. Therefore, to find the new max match state,
|
||||
// we iterate over all previous match states, find their corresponding
|
||||
// new minimal ID, and take the maximum of those.
|
||||
let old_max = self.dfa.max_match_state();
|
||||
self.dfa.set_max_match_state(dead_id());
|
||||
for id in (0..(old_max.to_usize() + 1)).map(S::from_usize) {
|
||||
let part = state_to_part[id.to_usize()];
|
||||
let new_id = minimal_ids[part.to_usize()];
|
||||
if new_id > self.dfa.max_match_state() {
|
||||
self.dfa.set_max_match_state(new_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn find_waiting(&self, set: &StateSet<S>) -> Option<usize> {
|
||||
self.waiting.iter().position(|s| s == set)
|
||||
}
|
||||
|
||||
fn find_incoming_to(
|
||||
&self,
|
||||
b: u8,
|
||||
set: &StateSet<S>,
|
||||
incoming: &mut StateSet<S>,
|
||||
) {
|
||||
incoming.clear();
|
||||
set.iter(|id| {
|
||||
for &inid in &self.in_transitions[id.to_usize()][b as usize] {
|
||||
incoming.add(inid);
|
||||
}
|
||||
});
|
||||
incoming.canonicalize();
|
||||
}
|
||||
|
||||
fn initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>> {
|
||||
let mut is_match = StateSet::empty();
|
||||
let mut no_match = StateSet::empty();
|
||||
for (id, _) in dfa.states() {
|
||||
if dfa.is_match_state(id) {
|
||||
is_match.add(id);
|
||||
} else {
|
||||
no_match.add(id);
|
||||
}
|
||||
}
|
||||
|
||||
let mut sets = vec![is_match];
|
||||
if !no_match.is_empty() {
|
||||
sets.push(no_match);
|
||||
}
|
||||
sets.sort_by_key(|s| s.len());
|
||||
sets
|
||||
}
|
||||
|
||||
fn incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>> {
|
||||
let mut incoming = vec![];
|
||||
for _ in dfa.states() {
|
||||
incoming.push(vec![vec![]; dfa.alphabet_len()]);
|
||||
}
|
||||
for (id, state) in dfa.states() {
|
||||
for (b, next) in state.transitions() {
|
||||
incoming[next.to_usize()][b as usize].push(id);
|
||||
}
|
||||
}
|
||||
incoming
|
||||
}
|
||||
}
|
||||
|
||||
impl<S: StateID> StateSet<S> {
|
||||
fn empty() -> StateSet<S> {
|
||||
StateSet(Rc::new(RefCell::new(vec![])))
|
||||
}
|
||||
|
||||
fn add(&mut self, id: S) {
|
||||
self.0.borrow_mut().push(id);
|
||||
}
|
||||
|
||||
fn min(&self) -> S {
|
||||
self.0.borrow()[0]
|
||||
}
|
||||
|
||||
fn canonicalize(&mut self) {
|
||||
self.0.borrow_mut().sort();
|
||||
self.0.borrow_mut().dedup();
|
||||
}
|
||||
|
||||
fn clear(&mut self) {
|
||||
self.0.borrow_mut().clear();
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.0.borrow().len()
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
fn deep_clone(&self) -> StateSet<S> {
|
||||
let ids = self.0.borrow().iter().cloned().collect();
|
||||
StateSet(Rc::new(RefCell::new(ids)))
|
||||
}
|
||||
|
||||
fn iter<F: FnMut(S)>(&self, mut f: F) {
|
||||
for &id in self.0.borrow().iter() {
|
||||
f(id);
|
||||
}
|
||||
}
|
||||
|
||||
fn intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
|
||||
dest.clear();
|
||||
if self.is_empty() || other.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let (seta, setb) = (self.0.borrow(), other.0.borrow());
|
||||
let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
|
||||
let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
|
||||
loop {
|
||||
if a == b {
|
||||
dest.add(a);
|
||||
a = match ita.next() {
|
||||
None => break,
|
||||
Some(a) => a,
|
||||
};
|
||||
b = match itb.next() {
|
||||
None => break,
|
||||
Some(b) => b,
|
||||
};
|
||||
} else if a < b {
|
||||
a = match ita.next() {
|
||||
None => break,
|
||||
Some(a) => a,
|
||||
};
|
||||
} else {
|
||||
b = match itb.next() {
|
||||
None => break,
|
||||
Some(b) => b,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
|
||||
dest.clear();
|
||||
if self.is_empty() || other.is_empty() {
|
||||
self.iter(|s| dest.add(s));
|
||||
return;
|
||||
}
|
||||
|
||||
let (seta, setb) = (self.0.borrow(), other.0.borrow());
|
||||
let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
|
||||
let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
|
||||
loop {
|
||||
if a == b {
|
||||
a = match ita.next() {
|
||||
None => break,
|
||||
Some(a) => a,
|
||||
};
|
||||
b = match itb.next() {
|
||||
None => {
|
||||
dest.add(a);
|
||||
break;
|
||||
}
|
||||
Some(b) => b,
|
||||
};
|
||||
} else if a < b {
|
||||
dest.add(a);
|
||||
a = match ita.next() {
|
||||
None => break,
|
||||
Some(a) => a,
|
||||
};
|
||||
} else {
|
||||
b = match itb.next() {
|
||||
None => {
|
||||
dest.add(a);
|
||||
break;
|
||||
}
|
||||
Some(b) => b,
|
||||
};
|
||||
}
|
||||
}
|
||||
for a in ita {
|
||||
dest.add(a);
|
||||
}
|
||||
}
|
||||
}
|
||||
1193
third-party/vendor/regex-automata-0.1.10/src/nfa/compiler.rs
vendored
Normal file
File diff suppressed because it is too large
282
third-party/vendor/regex-automata-0.1.10/src/nfa/map.rs
vendored
Normal file
|
|
@@ -0,0 +1,282 @@
|
|||
// This module contains a couple simple and purpose built hash maps. The key
|
||||
// trade off they make is that they serve as caches rather than true maps. That
|
||||
// is, inserting a new entry may cause eviction of another entry. This gives
|
||||
// us two things. First, there's less overhead associated with inserts and
|
||||
// lookups. Secondly, it lets us control our memory usage.
|
||||
//
|
||||
// These maps are used in some fairly hot code when generating NFA states for
|
||||
// large Unicode character classes.
|
||||
//
|
||||
// Instead of exposing a rich hashmap entry API, we just permit the caller
|
||||
// to produce a hash of the key directly. The hash can then be reused for both
|
||||
// lookups and insertions at the cost of leaking things a bit. But these are
|
||||
// for internal use only, so it's fine.
|
||||
//
|
||||
// The Utf8BoundedMap is used for Daciuk's algorithm for constructing an
|
||||
// (almost) minimal DFA for large Unicode character classes in linear time.
|
||||
// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
|
||||
// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
|
||||
// since there's a bit more expense in the reverse direction.)
|
||||
//
|
||||
// The Utf8SuffixMap is used when compiling large Unicode character classes for
|
||||
// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
|
||||
// construction of UTF-8 automata by caching common suffixes. This doesn't
|
||||
// get the same space savings as Daciuk's algorithm, but it's basically as
|
||||
// fast as the naive approach and typically winds up using less memory (since
|
||||
// it generates smaller NFAs) despite the presence of the cache.
|
||||
//
|
||||
// These maps effectively represent caching mechanisms for CState::Sparse and
|
||||
// CState::Range, respectively. The former represents a single NFA state with
|
||||
// many transitions of equivalent priority while the latter represents a single
|
||||
// NFA state with a single transition. (Neither state ever has or is an
|
||||
// epsilon transition.) Thus, they have different key types. It's likely we
|
||||
// could make one generic map, but the machinery didn't seem worth it. They
|
||||
// are simple enough.
|
||||
|
||||
use nfa::{StateID, Transition};
|
||||
|
||||
// Basic FNV-1a hash constants as described in:
|
||||
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
|
||||
const PRIME: u64 = 1099511628211;
|
||||
const INIT: u64 = 14695981039346656037;
|
||||
|
||||
/// A bounded hash map where the key is a sequence of NFA transitions and the
|
||||
/// value is a pre-existing NFA state ID.
|
||||
///
|
||||
/// std's hashmap can be used for this; however, this map has two important
|
||||
/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
|
||||
/// control our memory usage by limiting the number of slots. In general, the
|
||||
/// cost here is that this map acts as a cache. That is, inserting a new entry
|
||||
/// may remove an old entry. We are okay with this, since it does not impact
|
||||
/// correctness in the cases where it is used. The only effect that dropping
|
||||
/// states from the cache has is that the resulting NFA generated may be bigger
|
||||
/// than it otherwise would be.
|
||||
///
|
||||
/// This improves benchmarks that compile large Unicode character classes,
|
||||
/// since it makes the generation of an (almost) minimal UTF-8 automaton faster.
|
||||
/// Specifically, one could observe the difference with std's hashmap via
|
||||
/// something like the following benchmark:
|
||||
///
|
||||
/// hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
|
||||
///
|
||||
/// But to observe that difference, you'd have to modify the code to use
|
||||
/// std's hashmap.
|
||||
///
|
||||
/// It is quite possible that there is a better way to approach this problem.
|
||||
/// For example, if there happens to be a very common state that collides with
|
||||
/// a lot of less frequent states, then we could wind up with very poor caching
|
||||
/// behavior. Alas, the effectiveness of this cache has not been measured.
|
||||
/// Instead, ad hoc experiments suggest that it is "good enough." Additional
|
||||
/// smarts (such as an LRU eviction policy) have to be weighed against the
|
||||
/// amount of extra time they cost.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Utf8BoundedMap {
|
||||
/// The current version of this map. Only entries with matching versions
|
||||
/// are considered during lookups. If an entry is found with a mismatched
|
||||
/// version, then the map behaves as if the entry does not exist.
|
||||
version: u16,
|
||||
/// The total number of entries this map can store.
|
||||
capacity: usize,
|
||||
/// The actual entries, keyed by hash. Collisions between different states
|
||||
/// result in the old state being dropped.
|
||||
map: Vec<Utf8BoundedEntry>,
|
||||
}
|
||||
|
||||
/// An entry in this map.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct Utf8BoundedEntry {
|
||||
/// The version of the map used to produce this entry. If this entry's
|
||||
/// version does not match the current version of the map, then the map
|
||||
/// should behave as if this entry does not exist.
|
||||
version: u16,
|
||||
/// The key, which is a sorted sequence of non-overlapping NFA transitions.
|
||||
key: Vec<Transition>,
|
||||
/// The state ID corresponding to the state containing the transitions in
|
||||
/// this entry.
|
||||
val: StateID,
|
||||
}
|
||||
|
||||
impl Utf8BoundedMap {
|
||||
/// Create a new bounded map with the given capacity. The map will never
|
||||
/// grow beyond the given size.
|
||||
///
|
||||
/// Note that this does not allocate. Instead, callers must call `clear`
|
||||
/// before using this map. `clear` will allocate space if necessary.
|
||||
///
|
||||
/// This avoids the need to pay for the allocation of this map when
|
||||
/// compiling regexes that lack large Unicode character classes.
|
||||
pub fn new(capacity: usize) -> Utf8BoundedMap {
|
||||
assert!(capacity > 0);
|
||||
Utf8BoundedMap { version: 0, capacity, map: vec![] }
|
||||
}
|
||||
|
||||
/// Clear this map of all entries, but permit the reuse of allocation
|
||||
/// if possible.
|
||||
///
|
||||
/// This must be called before the map can be used.
|
||||
pub fn clear(&mut self) {
|
||||
if self.map.is_empty() {
|
||||
self.map = vec![Utf8BoundedEntry::default(); self.capacity];
|
||||
} else {
|
||||
self.version = self.version.wrapping_add(1);
|
||||
if self.version == 0 {
|
||||
self.map = vec![Utf8BoundedEntry::default(); self.capacity];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Return a hash of the given transitions.
|
||||
pub fn hash(&self, key: &[Transition]) -> usize {
|
||||
let mut h = INIT;
|
||||
for t in key {
|
||||
h = (h ^ (t.start as u64)).wrapping_mul(PRIME);
|
||||
h = (h ^ (t.end as u64)).wrapping_mul(PRIME);
|
||||
h = (h ^ (t.next as u64)).wrapping_mul(PRIME);
|
||||
}
|
||||
(h as usize) % self.map.len()
|
||||
}
|
||||
|
||||
/// Retrieve the cached state ID corresponding to the given key. The hash
|
||||
/// given must have been computed with `hash` using the same key value.
|
||||
///
|
||||
/// If there is no cached state with the given transitions, then None is
|
||||
/// returned.
|
||||
pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
|
||||
let entry = &self.map[hash];
|
||||
if entry.version != self.version {
|
||||
return None;
|
||||
}
|
||||
// There may be a hash collision, so we need to confirm real equality.
|
||||
if entry.key != key {
|
||||
return None;
|
||||
}
|
||||
Some(entry.val)
|
||||
}
|
||||
|
||||
/// Add a cached state to this map with the given key. Callers should
|
||||
/// ensure that `state_id` points to a state that contains precisely the
|
||||
/// NFA transitions given.
|
||||
///
|
||||
/// `hash` must have been computed using the `hash` method with the same
|
||||
/// key.
|
||||
pub fn set(
|
||||
&mut self,
|
||||
key: Vec<Transition>,
|
||||
hash: usize,
|
||||
state_id: StateID,
|
||||
) {
|
||||
self.map[hash] =
|
||||
Utf8BoundedEntry { version: self.version, key, val: state_id };
|
||||
}
|
||||
}
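// A rough usage sketch of the map above (internal API; `add_sparse_state` is
// a hypothetical stand-in for however the caller creates a new NFA state):
//
//     let mut map = Utf8BoundedMap::new(10_000);
//     map.clear(); // must be called before first use
//     let hash = map.hash(&transitions);
//     let id = match map.get(&transitions, hash) {
//         Some(cached) => cached,
//         None => {
//             let id = add_sparse_state(&transitions);
//             map.set(transitions, hash, id);
//             id
//         }
//     };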
|
||||
|
||||
/// A cache of suffixes used to modestly compress UTF-8 automata for large
|
||||
/// Unicode character classes.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Utf8SuffixMap {
|
||||
/// The current version of this map. Only entries with matching versions
|
||||
/// are considered during lookups. If an entry is found with a mismatched
|
||||
/// version, then the map behaves as if the entry does not exist.
|
||||
version: u16,
|
||||
/// The total number of entries this map can store.
|
||||
capacity: usize,
|
||||
/// The actual entries, keyed by hash. Collisions between different states
|
||||
/// result in the old state being dropped.
|
||||
map: Vec<Utf8SuffixEntry>,
|
||||
}
|
||||
|
||||
/// A key that uniquely identifies an NFA state. It is a triple that represents
|
||||
/// a transition from one state for a particular byte range.
|
||||
#[derive(Clone, Debug, Default, Eq, PartialEq)]
|
||||
pub struct Utf8SuffixKey {
|
||||
pub from: StateID,
|
||||
pub start: u8,
|
||||
pub end: u8,
|
||||
}
|
||||
|
||||
/// An entry in this map.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct Utf8SuffixEntry {
|
||||
/// The version of the map used to produce this entry. If this entry's
|
||||
/// version does not match the current version of the map, then the map
|
||||
/// should behave as if this entry does not exist.
|
||||
version: u16,
|
||||
/// The key, which consists of a transition in a particular state.
|
||||
key: Utf8SuffixKey,
|
||||
/// The identifier that the transition in the key maps to.
|
||||
val: StateID,
|
||||
}
|
||||
|
||||
impl Utf8SuffixMap {
|
||||
/// Create a new bounded map with the given capacity. The map will never
|
||||
/// grow beyond the given size.
|
||||
///
|
||||
/// Note that this does not allocate. Instead, callers must call `clear`
|
||||
/// before using this map. `clear` will allocate space if necessary.
|
||||
///
|
||||
/// This avoids the need to pay for the allocation of this map when
|
||||
/// compiling regexes that lack large Unicode character classes.
|
||||
pub fn new(capacity: usize) -> Utf8SuffixMap {
|
||||
assert!(capacity > 0);
|
||||
Utf8SuffixMap { version: 0, capacity, map: vec![] }
|
||||
}
|
||||
|
||||
/// Clear this map of all entries, but permit the reuse of allocation
|
||||
/// if possible.
|
||||
///
|
||||
/// This must be called before the map can be used.
|
||||
pub fn clear(&mut self) {
|
||||
if self.map.is_empty() {
|
||||
self.map = vec![Utf8SuffixEntry::default(); self.capacity];
|
||||
} else {
|
||||
self.version = self.version.wrapping_add(1);
|
||||
if self.version == 0 {
|
||||
self.map = vec![Utf8SuffixEntry::default(); self.capacity];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Return a hash of the given transition.
|
||||
pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
|
||||
// Basic FNV-1a hash as described:
|
||||
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
|
||||
const PRIME: u64 = 1099511628211;
|
||||
const INIT: u64 = 14695981039346656037;
|
||||
|
||||
let mut h = INIT;
|
||||
h = (h ^ (key.from as u64)).wrapping_mul(PRIME);
|
||||
h = (h ^ (key.start as u64)).wrapping_mul(PRIME);
|
||||
h = (h ^ (key.end as u64)).wrapping_mul(PRIME);
|
||||
(h as usize) % self.map.len()
|
||||
}
|
||||
|
||||
/// Retrieve the cached state ID corresponding to the given key. The hash
|
||||
/// given must have been computed with `hash` using the same key value.
|
||||
///
|
||||
/// If there is no cached state with the given key, then None is returned.
|
||||
pub fn get(
|
||||
&mut self,
|
||||
key: &Utf8SuffixKey,
|
||||
hash: usize,
|
||||
) -> Option<StateID> {
|
||||
let entry = &self.map[hash];
|
||||
if entry.version != self.version {
|
||||
return None;
|
||||
}
|
||||
if key != &entry.key {
|
||||
return None;
|
||||
}
|
||||
Some(entry.val)
|
||||
}
|
||||
|
||||
/// Add a cached state to this map with the given key. Callers should
|
||||
/// ensure that `state_id` points to a state that contains precisely the
|
||||
/// NFA transition given.
|
||||
///
|
||||
/// `hash` must have been computed using the `hash` method with the same
|
||||
/// key.
|
||||
pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
|
||||
self.map[hash] =
|
||||
Utf8SuffixEntry { version: self.version, key, val: state_id };
|
||||
}
|
||||
}
|
||||
252
third-party/vendor/regex-automata-0.1.10/src/nfa/mod.rs
vendored
Normal file
|
|
@@ -0,0 +1,252 @@
|
|||
use std::fmt;
|
||||
|
||||
use classes::ByteClasses;
|
||||
pub use nfa::compiler::Builder;
|
||||
|
||||
mod compiler;
|
||||
mod map;
|
||||
mod range_trie;
|
||||
|
||||
/// The representation for an NFA state identifier.
|
||||
pub type StateID = usize;
|
||||
|
||||
/// A final compiled NFA.
|
||||
///
|
||||
/// The states of the NFA are indexed by state IDs, which are how transitions
|
||||
/// are expressed.
|
||||
#[derive(Clone)]
|
||||
pub struct NFA {
|
||||
/// Whether this NFA can only match at the beginning of input or not.
|
||||
///
|
||||
/// When true, a match should only be reported if it begins at the 0th
|
||||
/// index of the haystack.
|
||||
anchored: bool,
|
||||
/// The starting state of this NFA.
|
||||
start: StateID,
|
||||
/// The state list. This list is guaranteed to be indexable by the starting
|
||||
/// state ID, and it is also guaranteed to contain exactly one `Match`
|
||||
/// state.
|
||||
states: Vec<State>,
|
||||
/// A mapping from any byte value to its corresponding equivalence class
|
||||
/// identifier. Two bytes in the same equivalence class cannot discriminate
|
||||
/// between a match or a non-match. This map can be used to shrink the
|
||||
/// total size of a DFA's transition table with a small match-time cost.
|
||||
///
|
||||
/// Note that the NFA's transitions are *not* defined in terms of these
|
||||
/// equivalence classes. The NFA's transitions are defined on the original
|
||||
/// byte values. For the most part, this is because they wouldn't really
|
||||
/// help the NFA much since the NFA already uses a sparse representation
|
||||
/// to represent transitions. Byte classes are most effective in a dense
|
||||
/// representation.
|
||||
byte_classes: ByteClasses,
|
||||
}
|
||||
|
||||
impl NFA {
|
||||
/// Returns an NFA that always matches at every position.
|
||||
pub fn always_match() -> NFA {
|
||||
NFA {
|
||||
anchored: false,
|
||||
start: 0,
|
||||
states: vec![State::Match],
|
||||
byte_classes: ByteClasses::empty(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an NFA that never matches at any position.
|
||||
pub fn never_match() -> NFA {
|
||||
NFA {
|
||||
anchored: false,
|
||||
start: 0,
|
||||
states: vec![State::Fail],
|
||||
byte_classes: ByteClasses::empty(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if and only if this NFA is anchored.
|
||||
pub fn is_anchored(&self) -> bool {
|
||||
self.anchored
|
||||
}
|
||||
|
||||
/// Return the number of states in this NFA.
|
||||
pub fn len(&self) -> usize {
|
||||
self.states.len()
|
||||
}
|
||||
|
||||
/// Return the ID of the initial state of this NFA.
|
||||
pub fn start(&self) -> StateID {
|
||||
self.start
|
||||
}
|
||||
|
||||
/// Return the NFA state corresponding to the given ID.
|
||||
pub fn state(&self, id: StateID) -> &State {
|
||||
&self.states[id]
|
||||
}
|
||||
|
||||
/// Return the set of equivalence classes for this NFA. The slice returned
|
||||
/// always has length 256 and maps each possible byte value to its
|
||||
/// corresponding equivalence class ID (which is never more than 255).
|
||||
pub fn byte_classes(&self) -> &ByteClasses {
|
||||
&self.byte_classes
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for NFA {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
for (i, state) in self.states.iter().enumerate() {
|
||||
let status = if i == self.start { '>' } else { ' ' };
|
||||
writeln!(f, "{}{:06}: {:?}", status, i, state)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A state in a final compiled NFA.
|
||||
#[derive(Clone, Eq, PartialEq)]
|
||||
pub enum State {
|
||||
/// A state that transitions to `next` if and only if the current input
|
||||
/// byte is in the range `[start, end]` (inclusive).
|
||||
///
|
||||
/// This is a special case of Sparse in that it encodes only one transition
|
||||
/// (and therefore avoids the allocation).
|
||||
Range { range: Transition },
|
||||
/// A state with possibly many transitions, represented in a sparse
|
||||
/// fashion. Transitions are ordered lexicographically by input range.
|
||||
/// As such, this may only be used when every transition has equal
|
||||
/// priority. (In practice, this is only used for encoding large UTF-8
|
||||
/// automata.)
|
||||
Sparse { ranges: Box<[Transition]> },
|
||||
/// An alternation such that there exists an epsilon transition to all
|
||||
/// states in `alternates`, where matches found via earlier transitions
|
||||
/// are preferred over later transitions.
|
||||
Union { alternates: Box<[StateID]> },
|
||||
/// A fail state. When encountered, the automaton is guaranteed to never
|
||||
/// reach a match state.
|
||||
Fail,
|
||||
/// A match state. There is exactly one such occurrence of this state in
|
||||
/// an NFA.
|
||||
Match,
|
||||
}
|
||||
|
||||
/// A transition to another state, only if the given byte falls in the
|
||||
/// inclusive range specified.
|
||||
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
|
||||
pub struct Transition {
|
||||
pub start: u8,
|
||||
pub end: u8,
|
||||
pub next: StateID,
|
||||
}
|
||||
|
||||
impl State {
|
||||
/// Returns true if and only if this state contains one or more epsilon
|
||||
/// transitions.
|
||||
pub fn is_epsilon(&self) -> bool {
|
||||
match *self {
|
||||
State::Range { .. }
|
||||
| State::Sparse { .. }
|
||||
| State::Fail
|
||||
| State::Match => false,
|
||||
State::Union { .. } => true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Remap the transitions in this state using the given map. Namely, the
|
||||
/// given map should be indexed according to the transitions currently
|
||||
/// in this state.
|
||||
///
|
||||
/// This is used during the final phase of the NFA compiler, which turns
|
||||
/// its intermediate NFA into the final NFA.
|
||||
fn remap(&mut self, remap: &[StateID]) {
|
||||
match *self {
|
||||
State::Range { ref mut range } => range.next = remap[range.next],
|
||||
State::Sparse { ref mut ranges } => {
|
||||
for r in ranges.iter_mut() {
|
||||
r.next = remap[r.next];
|
||||
}
|
||||
}
|
||||
State::Union { ref mut alternates } => {
|
||||
for alt in alternates.iter_mut() {
|
||||
*alt = remap[*alt];
|
||||
}
|
||||
}
|
||||
State::Fail => {}
|
||||
State::Match => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for State {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match *self {
|
||||
State::Range { ref range } => range.fmt(f),
|
||||
State::Sparse { ref ranges } => {
|
||||
let rs = ranges
|
||||
.iter()
|
||||
.map(|t| format!("{:?}", t))
|
||||
.collect::<Vec<String>>()
|
||||
.join(", ");
|
||||
write!(f, "sparse({})", rs)
|
||||
}
|
||||
State::Union { ref alternates } => {
|
||||
let alts = alternates
|
||||
.iter()
|
||||
.map(|id| format!("{}", id))
|
||||
.collect::<Vec<String>>()
|
||||
.join(", ");
|
||||
write!(f, "alt({})", alts)
|
||||
}
|
||||
State::Fail => write!(f, "FAIL"),
|
||||
State::Match => write!(f, "MATCH"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Transition {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
let Transition { start, end, next } = *self;
|
||||
if self.start == self.end {
|
||||
write!(f, "{} => {}", escape(start), next)
|
||||
} else {
|
||||
write!(f, "{}-{} => {}", escape(start), escape(end), next)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the given byte as its escaped string form.
|
||||
fn escape(b: u8) -> String {
|
||||
use std::ascii;
|
||||
|
||||
String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use dense;
|
||||
use dfa::DFA;
|
||||
|
||||
#[test]
|
||||
fn always_match() {
|
||||
let nfa = NFA::always_match();
|
||||
let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
|
||||
|
||||
assert_eq!(Some(0), dfa.find_at(b"", 0));
|
||||
assert_eq!(Some(0), dfa.find_at(b"a", 0));
|
||||
assert_eq!(Some(1), dfa.find_at(b"a", 1));
|
||||
assert_eq!(Some(0), dfa.find_at(b"ab", 0));
|
||||
assert_eq!(Some(1), dfa.find_at(b"ab", 1));
|
||||
assert_eq!(Some(2), dfa.find_at(b"ab", 2));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn never_match() {
|
||||
let nfa = NFA::never_match();
|
||||
let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
|
||||
|
||||
assert_eq!(None, dfa.find_at(b"", 0));
|
||||
assert_eq!(None, dfa.find_at(b"a", 0));
|
||||
assert_eq!(None, dfa.find_at(b"a", 1));
|
||||
assert_eq!(None, dfa.find_at(b"ab", 0));
|
||||
assert_eq!(None, dfa.find_at(b"ab", 1));
|
||||
assert_eq!(None, dfa.find_at(b"ab", 2));
|
||||
}
|
||||
}
|
||||
1048
third-party/vendor/regex-automata-0.1.10/src/nfa/range_trie.rs
vendored
Normal file
File diff suppressed because it is too large
771
third-party/vendor/regex-automata-0.1.10/src/regex.rs
vendored
Normal file
|
|
@@ -0,0 +1,771 @@
|
|||
#[cfg(feature = "std")]
|
||||
use dense::{self, DenseDFA};
|
||||
use dfa::DFA;
|
||||
#[cfg(feature = "std")]
|
||||
use error::Result;
|
||||
#[cfg(feature = "std")]
|
||||
use sparse::SparseDFA;
|
||||
#[cfg(feature = "std")]
|
||||
use state_id::StateID;
|
||||
|
||||
/// A regular expression that uses deterministic finite automata for fast
|
||||
/// searching.
|
||||
///
|
||||
/// A regular expression is comprised of two DFAs, a "forward" DFA and a
|
||||
/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
|
||||
/// match while the reverse DFA is responsible for detecting the start of a
|
||||
/// match. Thus, in order to find the bounds of any given match, a forward
|
||||
/// search must first be run followed by a reverse search. A match found by
|
||||
/// the forward DFA guarantees that the reverse DFA will also find a match.
|
||||
///
|
||||
/// The type of the DFA used by a `Regex` corresponds to the `D` type
|
||||
/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
|
||||
/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
|
||||
/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
|
||||
/// search faster, while sparse DFAs use less memory but search more slowly.
|
||||
///
|
||||
/// By default, a regex's DFA type parameter is set to
|
||||
/// `DenseDFA<Vec<usize>, usize>`. For most in-memory work loads, this is the
|
||||
/// most convenient type that gives the best search performance.
|
||||
///
|
||||
/// # Sparse DFAs
|
||||
///
|
||||
/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
|
||||
/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
|
||||
/// enough to build corresponding sparse DFAs, and then build a regex from
|
||||
/// them:
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::Regex;
|
||||
///
|
||||
/// # fn example() -> Result<(), regex_automata::Error> {
|
||||
/// // First, build a regex that uses dense DFAs.
|
||||
/// let dense_re = Regex::new("foo[0-9]+")?;
|
||||
///
|
||||
/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
|
||||
/// let fwd = dense_re.forward().to_sparse()?;
|
||||
/// let rev = dense_re.reverse().to_sparse()?;
|
||||
///
|
||||
/// // Third, build a new regex from the constituent sparse DFAs.
|
||||
/// let sparse_re = Regex::from_dfas(fwd, rev);
|
||||
///
|
||||
/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
|
||||
/// assert_eq!(true, sparse_re.is_match(b"foo123"));
|
||||
/// # Ok(()) }; example().unwrap()
|
||||
/// ```
|
||||
#[cfg(feature = "std")]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Regex<D: DFA = DenseDFA<Vec<usize>, usize>> {
|
||||
forward: D,
|
||||
reverse: D,
|
||||
}
|
||||
|
||||
/// A regular expression that uses deterministic finite automata for fast
|
||||
/// searching.
|
||||
///
|
||||
/// A regular expression is comprised of two DFAs, a "forward" DFA and a
|
||||
/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
|
||||
/// match while the reverse DFA is responsible for detecting the start of a
|
||||
/// match. Thus, in order to find the bounds of any given match, a forward
|
||||
/// search must first be run followed by a reverse search. A match found by
|
||||
/// the forward DFA guarantees that the reverse DFA will also find a match.
|
||||
///
|
||||
/// The type of the DFA used by a `Regex` corresponds to the `D` type
|
||||
/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
|
||||
/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
|
||||
/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
|
||||
/// search faster, while sparse DFAs use less memory but search more slowly.
|
||||
///
|
||||
/// When using this crate without the standard library, the `Regex` type has
|
||||
/// no default type parameter.
|
||||
///
|
||||
/// # Sparse DFAs
|
||||
///
|
||||
/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
|
||||
/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
|
||||
/// enough to build corresponding sparse DFAs, and then build a regex from
|
||||
/// them:
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::Regex;
|
||||
///
|
||||
/// # fn example() -> Result<(), regex_automata::Error> {
|
||||
/// // First, build a regex that uses dense DFAs.
|
||||
/// let dense_re = Regex::new("foo[0-9]+")?;
|
||||
///
|
||||
/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
|
||||
/// let fwd = dense_re.forward().to_sparse()?;
|
||||
/// let rev = dense_re.reverse().to_sparse()?;
|
||||
///
|
||||
/// // Third, build a new regex from the constituent sparse DFAs.
|
||||
/// let sparse_re = Regex::from_dfas(fwd, rev);
|
||||
///
|
||||
/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
|
||||
/// assert_eq!(true, sparse_re.is_match(b"foo123"));
|
||||
/// # Ok(()) }; example().unwrap()
|
||||
/// ```
|
||||
#[cfg(not(feature = "std"))]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Regex<D> {
|
||||
forward: D,
|
||||
reverse: D,
|
||||
}
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
impl Regex {
|
||||
/// Parse the given regular expression using a default configuration and
|
||||
/// return the corresponding regex.
|
||||
///
|
||||
/// The default configuration uses `usize` for state IDs, premultiplies
|
||||
/// them and reduces the alphabet size by splitting bytes into equivalence
|
||||
/// classes. The underlying DFAs are *not* minimized.
|
||||
///
|
||||
/// If you want a non-default configuration, then use the
|
||||
/// [`RegexBuilder`](struct.RegexBuilder.html)
|
||||
/// to set your own configuration.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::Regex;
|
||||
///
|
||||
/// # fn example() -> Result<(), regex_automata::Error> {
|
||||
/// let re = Regex::new("foo[0-9]+bar")?;
|
||||
/// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
|
||||
/// # Ok(()) }; example().unwrap()
|
||||
/// ```
|
||||
pub fn new(pattern: &str) -> Result<Regex> {
|
||||
RegexBuilder::new().build(pattern)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
impl Regex<SparseDFA<Vec<u8>, usize>> {
|
||||
/// Parse the given regular expression using a default configuration and
|
||||
/// return the corresponding regex using sparse DFAs.
|
||||
///
|
||||
/// The default configuration uses `usize` for state IDs and reduces the
|
||||
/// alphabet size by splitting bytes into equivalence classes. The
|
||||
/// underlying DFAs are *not* minimized.
|
||||
///
|
||||
/// If you want a non-default configuration, then use the
|
||||
/// [`RegexBuilder`](struct.RegexBuilder.html)
|
||||
/// to set your own configuration.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::Regex;
|
||||
///
|
||||
/// # fn example() -> Result<(), regex_automata::Error> {
|
||||
/// let re = Regex::new_sparse("foo[0-9]+bar")?;
|
||||
/// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
|
||||
/// # Ok(()) }; example().unwrap()
|
||||
/// ```
|
||||
pub fn new_sparse(
|
||||
pattern: &str,
|
||||
) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
|
||||
RegexBuilder::new().build_sparse(pattern)
|
||||
}
|
||||
}
|
||||
|
||||
impl<D: DFA> Regex<D> {
|
||||
/// Returns true if and only if the given bytes match.
|
||||
///
|
||||
/// This routine may short circuit if it knows that scanning future input
|
||||
/// will never lead to a different result. In particular, if the underlying
|
||||
/// DFA enters a match state or a dead state, then this routine will return
|
||||
/// `true` or `false`, respectively, without inspecting any future input.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::Regex;
|
||||
///
|
||||
/// # fn example() -> Result<(), regex_automata::Error> {
|
||||
/// let re = Regex::new("foo[0-9]+bar")?;
|
||||
/// assert_eq!(true, re.is_match(b"foo12345bar"));
|
||||
/// assert_eq!(false, re.is_match(b"foobar"));
|
||||
/// # Ok(()) }; example().unwrap()
|
||||
/// ```
|
||||
pub fn is_match(&self, input: &[u8]) -> bool {
|
||||
self.is_match_at(input, 0)
|
||||
}
|
||||
|
||||
/// Returns the first position at which a match is found.
|
||||
///
|
||||
/// This routine stops scanning input in precisely the same circumstances
|
||||
/// as `is_match`. The key difference is that this routine returns the
|
||||
/// position at which it stopped scanning input if and only if a match
|
||||
/// was found. If no match is found, then `None` is returned.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::Regex;
|
||||
///
|
||||
/// # fn example() -> Result<(), regex_automata::Error> {
|
||||
/// let re = Regex::new("foo[0-9]+")?;
|
||||
/// assert_eq!(Some(4), re.shortest_match(b"foo12345"));
|
||||
///
|
||||
/// // Normally, the end of the leftmost first match here would be 3,
|
||||
/// // but the shortest match semantics detect a match earlier.
|
||||
/// let re = Regex::new("abc|a")?;
|
||||
/// assert_eq!(Some(1), re.shortest_match(b"abc"));
|
||||
/// # Ok(()) }; example().unwrap()
|
||||
/// ```
|
||||
pub fn shortest_match(&self, input: &[u8]) -> Option<usize> {
|
||||
self.shortest_match_at(input, 0)
|
||||
}
|
||||
|
||||
/// Returns the start and end offset of the leftmost first match. If no
|
||||
/// match exists, then `None` is returned.
|
||||
///
|
||||
/// The "leftmost first" match corresponds to the match with the smallest
|
||||
/// starting offset, but where the end offset is determined by preferring
|
||||
/// earlier branches in the original regular expression. For example,
|
||||
/// `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` will
|
||||
/// match `Samwise` in `Samwise`.
|
||||
///
|
||||
/// Generally speaking, the "leftmost first" match is how most backtracking
|
||||
/// regular expressions tend to work. This is in contrast to POSIX-style
|
||||
/// regular expressions that yield "leftmost longest" matches. Namely,
|
||||
/// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
|
||||
/// leftmost longest semantics.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::Regex;
|
||||
///
|
||||
/// # fn example() -> Result<(), regex_automata::Error> {
|
||||
/// let re = Regex::new("foo[0-9]+")?;
|
||||
/// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz"));
|
||||
///
|
||||
/// // Even though a match is found after reading the first byte (`a`),
|
||||
/// // the leftmost first match semantics demand that we find the earliest
|
||||
/// // match that prefers earlier parts of the pattern over later parts.
|
||||
/// let re = Regex::new("abc|a")?;
|
||||
/// assert_eq!(Some((0, 3)), re.find(b"abc"));
|
||||
/// # Ok(()) }; example().unwrap()
|
||||
/// ```
|
||||
pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> {
|
||||
self.find_at(input, 0)
|
||||
}
|
||||
|
||||
/// Returns the same as `is_match`, but starts the search at the given
|
||||
/// offset.
|
||||
///
|
||||
/// The significance of the starting point is that it takes the surrounding
|
||||
/// context into consideration. For example, if the DFA is anchored, then
|
||||
/// a match can only occur when `start == 0`.
|
||||
pub fn is_match_at(&self, input: &[u8], start: usize) -> bool {
|
||||
self.forward().is_match_at(input, start)
|
||||
}
|
||||
|
||||
/// Returns the same as `shortest_match`, but starts the search at the
|
||||
/// given offset.
|
||||
///
|
||||
/// The significance of the starting point is that it takes the surrounding
|
||||
/// context into consideration. For example, if the DFA is anchored, then
|
||||
/// a match can only occur when `start == 0`.
|
||||
pub fn shortest_match_at(
|
||||
&self,
|
||||
input: &[u8],
|
||||
start: usize,
|
||||
) -> Option<usize> {
|
||||
self.forward().shortest_match_at(input, start)
|
||||
}
|
||||
|
||||
/// Returns the same as `find`, but starts the search at the given
|
||||
/// offset.
|
||||
///
|
||||
/// The significance of the starting point is that it takes the surrounding
|
||||
/// context into consideration. For example, if the DFA is anchored, then
|
||||
/// a match can only occur when `start == 0`.
|
||||
pub fn find_at(
|
||||
&self,
|
||||
input: &[u8],
|
||||
start: usize,
|
||||
) -> Option<(usize, usize)> {
|
||||
let end = match self.forward().find_at(input, start) {
|
||||
None => return None,
|
||||
Some(end) => end,
|
||||
};
|
||||
let start = self
|
||||
.reverse()
|
||||
.rfind(&input[start..end])
|
||||
.map(|i| start + i)
|
||||
.expect("reverse search must match if forward search does");
|
||||
Some((start, end))
|
||||
}
|
||||
|
||||
/// Returns an iterator over all non-overlapping leftmost first matches
|
||||
/// in the given bytes. If no match exists, then the iterator yields no
|
||||
/// elements.
|
||||
///
|
||||
/// Note that if the regex can match the empty string, then it is
|
||||
/// possible for the iterator to yield a zero-width match at a location
|
||||
/// that is not a valid UTF-8 boundary (for example, between the code units
|
||||
/// of a UTF-8 encoded codepoint). This can happen regardless of whether
|
||||
/// [`allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8)
|
||||
/// was enabled or not.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::Regex;
|
||||
///
|
||||
/// # fn example() -> Result<(), regex_automata::Error> {
|
||||
/// let re = Regex::new("foo[0-9]+")?;
|
||||
/// let text = b"foo1 foo12 foo123";
|
||||
/// let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
|
||||
/// assert_eq!(matches, vec![(0, 4), (5, 10), (11, 17)]);
|
||||
/// # Ok(()) }; example().unwrap()
|
||||
/// ```
|
||||
pub fn find_iter<'r, 't>(&'r self, input: &'t [u8]) -> Matches<'r, 't, D> {
|
||||
Matches::new(self, input)
|
||||
}
|
||||
|
||||
/// Build a new regex from its constituent forward and reverse DFAs.
|
||||
///
|
||||
/// This is useful when deserializing a regex from some arbitrary
|
||||
/// memory region. This is also useful for building regexes from other
|
||||
/// types of DFAs.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This example is a bit contrived. The usual use of these methods
|
||||
/// would involve serializing `initial_re` somewhere and then deserializing
|
||||
/// it later to build a regex.
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::Regex;
|
||||
///
|
||||
/// # fn example() -> Result<(), regex_automata::Error> {
|
||||
/// let initial_re = Regex::new("foo[0-9]+")?;
|
||||
/// assert_eq!(true, initial_re.is_match(b"foo123"));
|
||||
///
|
||||
/// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
|
||||
/// let re = Regex::from_dfas(fwd, rev);
|
||||
/// assert_eq!(true, re.is_match(b"foo123"));
|
||||
/// # Ok(()) }; example().unwrap()
|
||||
/// ```
|
||||
///
|
||||
/// This example shows how you might build smaller DFAs, and then use those
|
||||
/// smaller DFAs to build a new regex.
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::Regex;
|
||||
///
|
||||
/// # fn example() -> Result<(), regex_automata::Error> {
|
||||
/// let initial_re = Regex::new("foo[0-9]+")?;
|
||||
/// assert_eq!(true, initial_re.is_match(b"foo123"));
|
||||
///
|
||||
/// let fwd = initial_re.forward().to_u16()?;
|
||||
/// let rev = initial_re.reverse().to_u16()?;
|
||||
/// let re = Regex::from_dfas(fwd, rev);
|
||||
/// assert_eq!(true, re.is_match(b"foo123"));
|
||||
/// # Ok(()) }; example().unwrap()
|
||||
/// ```
|
||||
///
|
||||
/// This example shows how to build a `Regex` that uses sparse DFAs instead
|
||||
/// of dense DFAs:
|
||||
///
|
||||
/// ```
|
||||
/// use regex_automata::Regex;
|
||||
///
|
||||
/// # fn example() -> Result<(), regex_automata::Error> {
|
||||
/// let initial_re = Regex::new("foo[0-9]+")?;
|
||||
/// assert_eq!(true, initial_re.is_match(b"foo123"));
|
||||
///
|
||||
/// let fwd = initial_re.forward().to_sparse()?;
|
||||
/// let rev = initial_re.reverse().to_sparse()?;
|
||||
/// let re = Regex::from_dfas(fwd, rev);
|
||||
/// assert_eq!(true, re.is_match(b"foo123"));
|
||||
/// # Ok(()) }; example().unwrap()
|
||||
/// ```
|
||||
pub fn from_dfas(forward: D, reverse: D) -> Regex<D> {
|
||||
Regex { forward, reverse }
|
||||
}
|
||||
|
||||
/// Return the underlying DFA responsible for forward matching.
|
||||
pub fn forward(&self) -> &D {
|
||||
&self.forward
|
||||
}
|
||||
|
||||
/// Return the underlying DFA responsible for reverse matching.
|
||||
pub fn reverse(&self) -> &D {
|
||||
&self.reverse
|
||||
}
|
||||
}
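// Illustrative sketch, not part of the vendored crate: a small test showing
// the offset-aware search routines documented above. It assumes the default
// (unanchored) configuration and the crate's `std` feature, since it uses
// `Regex::new`.
#[cfg(all(test, feature = "std"))]
#[test]
fn example_find_at_reports_absolute_offsets() {
    let re = Regex::new("foo[0-9]+").unwrap();
    let haystack = b"foo123 foo456";
    // Offsets are reported relative to the start of the haystack, not to
    // the `start` position handed to `find_at`.
    assert_eq!(Some((0, 6)), re.find_at(haystack, 0));
    assert_eq!(Some((7, 13)), re.find_at(haystack, 7));
}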
|
||||
|
||||
/// An iterator over all non-overlapping matches for a particular search.
|
||||
///
|
||||
/// The iterator yields a `(usize, usize)` value until no more matches can be
|
||||
/// found. The first `usize` is the start of the match (inclusive) while the
|
||||
/// second `usize` is the end of the match (exclusive).
|
||||
///
|
||||
/// `D` is the type of the DFA used by the underlying
|
||||
/// regex. The lifetime variables are as follows:
|
||||
///
|
||||
/// * `'r` is the lifetime of the regular expression value itself.
|
||||
/// * `'t` is the lifetime of the text being searched.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Matches<'r, 't, D: DFA + 'r> {
|
||||
re: &'r Regex<D>,
|
||||
text: &'t [u8],
|
||||
last_end: usize,
|
||||
last_match: Option<usize>,
|
||||
}
|
||||
|
||||
impl<'r, 't, D: DFA> Matches<'r, 't, D> {
|
||||
fn new(re: &'r Regex<D>, text: &'t [u8]) -> Matches<'r, 't, D> {
|
||||
Matches { re, text, last_end: 0, last_match: None }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'r, 't, D: DFA> Iterator for Matches<'r, 't, D> {
|
||||
type Item = (usize, usize);
|
||||
|
||||
fn next(&mut self) -> Option<(usize, usize)> {
|
||||
if self.last_end > self.text.len() {
|
||||
return None;
|
||||
}
|
||||
let (s, e) = match self.re.find_at(self.text, self.last_end) {
|
||||
None => return None,
|
||||
Some((s, e)) => (s, e),
|
||||
};
|
||||
if s == e {
|
||||
// This is an empty match. To ensure we make progress, start
|
||||
// the next search at the smallest possible starting position
|
||||
// of the next match following this one.
|
||||
self.last_end = e + 1;
|
||||
// Don't accept empty matches immediately following a match.
|
||||
// Just move on to the next match.
|
||||
if Some(e) == self.last_match {
|
||||
return self.next();
|
||||
}
|
||||
} else {
|
||||
self.last_end = e;
|
||||
}
|
||||
self.last_match = Some(e);
|
||||
Some((s, e))
|
||||
}
|
||||
}
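// Illustrative sketch, not part of the vendored crate: the empty-match
// handling above means an empty match that immediately follows another
// match is skipped. It assumes the default configuration and the `std`
// feature.
#[cfg(all(test, feature = "std"))]
#[test]
fn example_find_iter_skips_adjacent_empty_matches() {
    let re = Regex::new("a*").unwrap();
    let got: Vec<(usize, usize)> = re.find_iter(b"aba").collect();
    // The empty matches at offsets 1 and 3 directly follow the previous
    // match's end, so only the two non-empty matches are yielded.
    assert_eq!(vec![(0, 1), (2, 3)], got);
}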
|
||||
|
||||
/// A builder for a regex based on deterministic finite automatons.
|
||||
///
|
||||
/// This builder permits configuring several aspects of the construction
|
||||
/// process such as case insensitivity, Unicode support and various options
|
||||
/// that impact the size of the underlying DFAs. In some cases, options (like
|
||||
/// performing DFA minimization) can come with a substantial additional cost.
|
||||
///
|
||||
/// This builder generally constructs two DFAs, where one is responsible for
|
||||
/// finding the end of a match and the other is responsible for finding the
|
||||
/// start of a match. If you only need to detect whether something matched,
|
||||
/// or only the end of a match, then you should use a
|
||||
/// [`dense::Builder`](dense/struct.Builder.html)
|
||||
/// to construct a single DFA, which is cheaper than building two DFAs.
|
||||
#[cfg(feature = "std")]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RegexBuilder {
|
||||
dfa: dense::Builder,
|
||||
}
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
impl RegexBuilder {
|
||||
/// Create a new regex builder with the default configuration.
|
||||
pub fn new() -> RegexBuilder {
|
||||
RegexBuilder { dfa: dense::Builder::new() }
|
||||
}
|
||||
|
||||
/// Build a regex from the given pattern.
|
||||
///
|
||||
/// If there was a problem parsing or compiling the pattern, then an error
|
||||
/// is returned.
|
||||
pub fn build(&self, pattern: &str) -> Result<Regex> {
|
||||
self.build_with_size::<usize>(pattern)
|
||||
}
|
||||
|
||||
/// Build a regex from the given pattern using sparse DFAs.
|
||||
///
|
||||
/// If there was a problem parsing or compiling the pattern, then an error
|
||||
/// is returned.
|
||||
pub fn build_sparse(
|
||||
&self,
|
||||
pattern: &str,
|
||||
) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
|
||||
self.build_with_size_sparse::<usize>(pattern)
|
||||
}
|
||||
|
||||
/// Build a regex from the given pattern using a specific representation
|
||||
/// for the underlying DFA state IDs.
|
||||
///
|
||||
/// If there was a problem parsing or compiling the pattern, then an error
|
||||
/// is returned.
|
||||
///
|
||||
/// The representation of state IDs is determined by the `S` type
|
||||
/// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64`
|
||||
/// or `usize`, where `usize` is the default used for `build`. The purpose
|
||||
/// of specifying a representation for state IDs is to reduce the memory
|
||||
/// footprint of the underlying DFAs.
|
||||
///
|
||||
/// When using this routine, the chosen state ID representation will be
|
||||
/// used throughout determinization and minimization, if minimization was
|
||||
/// requested. Even if the minimized DFAs can fit into the chosen state ID
|
||||
/// representation but the initial determinized DFA cannot, then this will
|
||||
/// still return an error. To get a minimized DFA with a smaller state ID
|
||||
/// representation, first build it with a bigger state ID representation,
|
||||
/// and then shrink the sizes of the DFAs using one of its conversion
|
||||
/// routines, such as [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16).
|
||||
/// Finally, reconstitute the regex via
|
||||
/// [`Regex::from_dfas`](struct.Regex.html#method.from_dfas).
|
||||
pub fn build_with_size<S: StateID>(
|
||||
&self,
|
||||
pattern: &str,
|
||||
) -> Result<Regex<DenseDFA<Vec<S>, S>>> {
|
||||
let forward = self.dfa.build_with_size(pattern)?;
|
||||
let reverse = self
|
||||
.dfa
|
||||
.clone()
|
||||
.anchored(true)
|
||||
.reverse(true)
|
||||
.longest_match(true)
|
||||
.build_with_size(pattern)?;
|
||||
Ok(Regex::from_dfas(forward, reverse))
|
||||
}
|
||||
|
||||
/// Build a regex from the given pattern using a specific representation
|
||||
/// for the underlying DFA state IDs using sparse DFAs.
|
||||
pub fn build_with_size_sparse<S: StateID>(
|
||||
&self,
|
||||
pattern: &str,
|
||||
) -> Result<Regex<SparseDFA<Vec<u8>, S>>> {
|
||||
let re = self.build_with_size(pattern)?;
|
||||
let fwd = re.forward().to_sparse()?;
|
||||
let rev = re.reverse().to_sparse()?;
|
||||
Ok(Regex::from_dfas(fwd, rev))
|
||||
}
|
||||
|
||||
/// Set whether matching must be anchored at the beginning of the input.
|
||||
///
|
||||
/// When enabled, a match must begin at the start of the input. When
|
||||
/// disabled, the regex will act as if the pattern started with a `.*?`,
|
||||
/// which enables a match to appear anywhere.
|
||||
///
|
||||
/// By default this is disabled.
|
||||
pub fn anchored(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.dfa.anchored(yes);
|
||||
self
|
||||
}
|
||||
|
||||
/// Enable or disable the case insensitive flag by default.
|
||||
///
|
||||
/// By default this is disabled. It may alternatively be selectively
|
||||
/// enabled in the regular expression itself via the `i` flag.
|
||||
pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.dfa.case_insensitive(yes);
|
||||
self
|
||||
}
|
||||
|
||||
/// Enable verbose mode in the regular expression.
|
||||
///
|
||||
/// When enabled, verbose mode permits insignificant whitespace in many
|
||||
/// places in the regular expression, as well as comments. Comments are
|
||||
/// started using `#` and continue until the end of the line.
|
||||
///
|
||||
/// By default, this is disabled. It may be selectively enabled in the
|
||||
/// regular expression by using the `x` flag regardless of this setting.
|
||||
pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.dfa.ignore_whitespace(yes);
|
||||
self
|
||||
}
|
||||
|
||||
/// Enable or disable the "dot matches any character" flag by default.
|
||||
///
|
||||
/// By default this is disabled. It may alternatively be selectively
|
||||
/// enabled in the regular expression itself via the `s` flag.
|
||||
pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.dfa.dot_matches_new_line(yes);
|
||||
self
|
||||
}
|
||||
|
||||
/// Enable or disable the "swap greed" flag by default.
|
||||
///
|
||||
/// By default this is disabled. It may alternatively be selectively
|
||||
/// enabled in the regular expression itself via the `U` flag.
|
||||
pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.dfa.swap_greed(yes);
|
||||
self
|
||||
}
|
||||
|
||||
/// Enable or disable the Unicode flag (`u`) by default.
|
||||
///
|
||||
/// By default this is **enabled**. It may alternatively be selectively
|
||||
/// disabled in the regular expression itself via the `u` flag.
|
||||
///
|
||||
/// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
|
||||
/// default), a regular expression will fail to parse if Unicode mode is
|
||||
/// disabled and a sub-expression could possibly match invalid UTF-8.
|
||||
pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.dfa.unicode(yes);
|
||||
self
|
||||
}
|
||||
|
||||
/// When enabled, the builder will permit the construction of a regular
|
||||
/// expression that may match invalid UTF-8.
|
||||
///
|
||||
/// When disabled (the default), the builder is guaranteed to produce a
|
||||
/// regex that will only ever match valid UTF-8 (otherwise, the builder
|
||||
/// will return an error).
|
||||
pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.dfa.allow_invalid_utf8(yes);
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the nesting limit used for the regular expression parser.
|
||||
///
|
||||
/// The nesting limit controls how deep the abstract syntax tree is allowed
|
||||
/// to be. If the AST exceeds the given limit (e.g., with too many nested
|
||||
/// groups), then an error is returned by the parser.
|
||||
///
|
||||
/// The purpose of this limit is to act as a heuristic to prevent stack
|
||||
/// overflow when building a finite automaton from a regular expression's
|
||||
/// abstract syntax tree. In particular, construction currently uses
|
||||
/// recursion. In the future, the implementation may stop using recursion
|
||||
/// and this option will no longer be necessary.
|
||||
///
|
||||
/// This limit is not checked until the entire AST is parsed. Therefore,
|
||||
/// if callers want to put a limit on the amount of heap space used, then
|
||||
/// they should impose a limit on the length, in bytes, of the concrete
|
||||
/// pattern string. In particular, this is viable since the parser will
|
||||
/// limit itself to heap space proportional to the length of the pattern
|
||||
/// string.
|
||||
///
|
||||
/// Note that a nest limit of `0` will return a nest limit error for most
|
||||
/// patterns but not all. For example, a nest limit of `0` permits `a` but
|
||||
/// not `ab`, since `ab` requires a concatenation AST item, which results
|
||||
/// in a nest depth of `1`. In general, a nest limit is not something that
|
||||
/// manifests in an obvious way in the concrete syntax; therefore, it
|
||||
/// should not be used in a granular way.
|
||||
pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
|
||||
self.dfa.nest_limit(limit);
|
||||
self
|
||||
}
|
||||
|
||||
/// Minimize the underlying DFAs.
|
||||
///
|
||||
/// When enabled, the DFAs powering the resulting regex will be minimized
|
||||
/// such that they are as small as possible.
|
||||
///
|
||||
/// Whether one enables minimization or not depends on the types of costs
|
||||
/// you're willing to pay and how much you care about its benefits. In
|
||||
/// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)`
|
||||
/// space, where `n` is the number of DFA states and `k` is the alphabet
|
||||
/// size. In practice, minimization can be quite costly in terms of both
|
||||
/// space and time, so it should only be done if you're willing to wait
|
||||
/// longer to produce a DFA. In general, you might want a minimal DFA in
|
||||
/// the following circumstances:
|
||||
///
|
||||
/// 1. You would like to optimize for the size of the automaton. This can
|
||||
/// manifest in one of two ways. Firstly, if you're converting the
|
||||
/// DFA into Rust code (or a table embedded in the code), then a minimal
|
||||
/// DFA will translate into a corresponding reduction in code size, and
|
||||
/// thus, also the final compiled binary size. Secondly, if you are
|
||||
/// building many DFAs and putting them on the heap, you'll be able to
|
||||
/// fit more if they are smaller. Note though that building a minimal
|
||||
/// DFA itself requires additional space; you only realize the space
|
||||
/// savings once the minimal DFA is constructed (at which point, the
|
||||
/// space used for minimization is freed).
|
||||
/// 2. You've observed that a smaller DFA results in faster match
|
||||
/// performance. Naively, this isn't guaranteed since there is no
|
||||
/// inherent difference between matching with a bigger-than-minimal
|
||||
/// DFA and a minimal DFA. However, a smaller DFA may make use of your
|
||||
/// CPU's cache more efficiently.
|
||||
/// 3. You are trying to establish an equivalence between regular
|
||||
/// languages. The standard method for this is to build a minimal DFA
|
||||
/// for each language and then compare them. If the DFAs are equivalent
|
||||
/// (up to state renaming), then the languages are equivalent.
|
||||
///
|
||||
/// This option is disabled by default.
|
||||
pub fn minimize(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.dfa.minimize(yes);
|
||||
self
|
||||
}
|
||||
|
||||
/// Premultiply state identifiers in the underlying DFA transition tables.
|
||||
///
|
||||
/// When enabled, state identifiers are premultiplied to point to their
|
||||
/// corresponding row in the DFA's transition table. That is, given the
|
||||
/// `i`th state, its corresponding premultiplied identifier is `i * k`
|
||||
/// where `k` is the alphabet size of the DFA. (The alphabet size is at
|
||||
/// most 256, but is in practice smaller if byte classes are enabled.)
|
||||
///
|
||||
/// When state identifiers are not premultiplied, then the identifier of
|
||||
/// the `i`th state is `i`.
|
||||
///
|
||||
/// The advantage of premultiplying state identifiers is that it saves
|
||||
/// a multiplication instruction per byte when searching with the DFA.
|
||||
/// This has been observed to lead to a 20% performance benefit in
|
||||
/// micro-benchmarks.
|
||||
///
|
||||
/// The primary disadvantage of premultiplying state identifiers is
|
||||
/// that they require a larger integer size to represent. For example,
|
||||
/// if your DFA has 200 states, then its premultiplied form requires
|
||||
/// 16 bits to represent every possible state identifier, whereas its
|
||||
/// non-premultiplied form only requires 8 bits.
|
||||
///
|
||||
/// This option is enabled by default.
|
||||
pub fn premultiply(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.dfa.premultiply(yes);
|
||||
self
|
||||
}
|
||||
|
||||
/// Shrink the size of the underlying DFA alphabet by mapping bytes to
|
||||
/// their equivalence classes.
|
||||
///
|
||||
/// When enabled, each DFA will use a map from all possible bytes to their
|
||||
/// corresponding equivalence class. Each equivalence class represents a
|
||||
/// set of bytes that does not discriminate between a match and a non-match
|
||||
/// in the DFA. For example, the pattern `[ab]+` has at least two
|
||||
/// equivalence classes: a set containing `a` and `b` and a set containing
|
||||
/// every byte except for `a` and `b`. `a` and `b` are in the same
|
||||
/// equivalence class because they never discriminate between a match
|
||||
/// and a non-match.
|
||||
///
|
||||
/// The advantage of this map is that the size of the transition table can
|
||||
/// be reduced drastically from `#states * 256 * sizeof(id)` to
|
||||
/// `#states * k * sizeof(id)` where `k` is the number of equivalence
|
||||
/// classes. As a result, total space usage can decrease substantially.
|
||||
/// Moreover, since a smaller alphabet is used, compilation becomes faster
|
||||
/// as well.
|
||||
///
|
||||
/// The disadvantage of this map is that every byte searched must be
|
||||
/// passed through this map before it can be used to determine the next
|
||||
/// transition. This has a small match time performance cost.
|
||||
///
|
||||
/// This option is enabled by default.
|
||||
pub fn byte_classes(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.dfa.byte_classes(yes);
|
||||
self
|
||||
}
|
||||
|
||||
/// Apply best effort heuristics to shrink the NFA at the expense of more
|
||||
/// time/memory.
|
||||
///
|
||||
/// This may be exposed in the future, but for now is exported for use in
|
||||
/// the `regex-automata-debug` tool.
|
||||
#[doc(hidden)]
|
||||
pub fn shrink(&mut self, yes: bool) -> &mut RegexBuilder {
|
||||
self.dfa.shrink(yes);
|
||||
self
|
||||
}
|
||||
}
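// Illustrative sketches, not part of the vendored crate: the representation
// options above (premultiplication, byte classes) only change the internal
// layout of the DFAs, never which bytes match, and the nest limit only
// constrains parsing. Both assume the `std` feature.
#[cfg(all(test, feature = "std"))]
#[test]
fn example_representation_options_do_not_change_matches() {
    let plain = RegexBuilder::new()
        .premultiply(false)
        .byte_classes(false)
        .build("foo[0-9]+")
        .unwrap();
    let fancy = RegexBuilder::new().build("foo[0-9]+").unwrap();
    assert_eq!(Some((3, 11)), plain.find(b"zzzfoo12345zzz"));
    assert_eq!(fancy.find(b"zzzfoo12345zzz"), plain.find(b"zzzfoo12345zzz"));
}

#[cfg(all(test, feature = "std"))]
#[test]
fn example_nest_limit_is_about_ast_depth() {
    // Per the documentation above: a nest limit of 0 permits `a` but not
    // `ab`, since concatenation adds one level of nesting.
    assert!(RegexBuilder::new().nest_limit(0).build("a").is_ok());
    assert!(RegexBuilder::new().nest_limit(0).build("ab").is_err());
}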
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
impl Default for RegexBuilder {
|
||||
fn default() -> RegexBuilder {
|
||||
RegexBuilder::new()
|
||||
}
|
||||
}
|
||||
1256 third-party/vendor/regex-automata-0.1.10/src/sparse.rs vendored Normal file (file diff suppressed because it is too large)
60 third-party/vendor/regex-automata-0.1.10/src/sparse_set.rs vendored Normal file
@@ -0,0 +1,60 @@
|
|||
use std::slice;
|
||||
|
||||
/// A sparse set used for representing ordered NFA states.
|
||||
///
|
||||
/// This supports constant time addition and membership testing. Clearing an
|
||||
/// entire set can also be done in constant time. Iteration yields elements
|
||||
/// in the order in which they were inserted.
|
||||
///
|
||||
/// The data structure is based on: https://research.swtch.com/sparse
|
||||
/// Note though that we don't actually use uninitialized memory. We generally
|
||||
/// reuse sparse sets, so the initial allocation cost is bearable. However, its
|
||||
/// other properties listed above are extremely useful.
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct SparseSet {
|
||||
/// Dense contains the instruction pointers in the order in which they
|
||||
/// were inserted.
|
||||
dense: Vec<usize>,
|
||||
/// Sparse maps instruction pointers to their location in dense.
|
||||
///
|
||||
/// An instruction pointer is in the set if and only if
|
||||
/// sparse[ip] < dense.len() && ip == dense[sparse[ip]].
|
||||
sparse: Box<[usize]>,
|
||||
}
|
||||
|
||||
impl SparseSet {
|
||||
pub fn new(size: usize) -> SparseSet {
|
||||
SparseSet {
|
||||
dense: Vec::with_capacity(size),
|
||||
sparse: vec![0; size].into_boxed_slice(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.dense.len()
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, value: usize) {
|
||||
let i = self.len();
|
||||
assert!(i < self.dense.capacity());
|
||||
self.dense.push(value);
|
||||
self.sparse[value] = i;
|
||||
}
|
||||
|
||||
pub fn contains(&self, value: usize) -> bool {
|
||||
let i = self.sparse[value];
|
||||
self.dense.get(i) == Some(&value)
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.dense.clear();
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> IntoIterator for &'a SparseSet {
|
||||
type Item = &'a usize;
|
||||
type IntoIter = slice::Iter<'a, usize>;
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.dense.iter()
|
||||
}
|
||||
}
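// Illustrative sketch, not part of the vendored crate: basic usage that
// mirrors the invariants documented above (constant time insert/contains,
// insertion-order iteration, constant time clear).
#[cfg(test)]
#[test]
fn example_sparse_set_usage() {
    let mut set = SparseSet::new(10);
    set.insert(3);
    set.insert(7);
    assert!(set.contains(3) && set.contains(7));
    assert!(!set.contains(5));
    // Iteration yields elements in the order they were inserted.
    let in_order: Vec<usize> = (&set).into_iter().cloned().collect();
    assert_eq!(vec![3, 7], in_order);
    set.clear();
    assert_eq!(0, set.len());
}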
|
||||
291 third-party/vendor/regex-automata-0.1.10/src/state_id.rs vendored Normal file
@@ -0,0 +1,291 @@
|
|||
use core::fmt::Debug;
|
||||
use core::hash::Hash;
|
||||
use core::mem::size_of;
|
||||
|
||||
use byteorder::{ByteOrder, NativeEndian};
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
pub use self::std::*;
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
mod std {
|
||||
use byteorder::ByteOrder;
|
||||
use core::mem::size_of;
|
||||
use error::{Error, Result};
|
||||
|
||||
use super::StateID;
|
||||
|
||||
/// Check that the premultiplication of the given state identifier can
|
||||
/// fit into the representation indicated by `S`. If it cannot, or if it
|
||||
/// overflows `usize` itself, then an error is returned.
|
||||
pub fn premultiply_overflow_error<S: StateID>(
|
||||
last_state: S,
|
||||
alphabet_len: usize,
|
||||
) -> Result<()> {
|
||||
let requested = match last_state.to_usize().checked_mul(alphabet_len) {
|
||||
Some(requested) => requested,
|
||||
None => return Err(Error::premultiply_overflow(0, 0)),
|
||||
};
|
||||
if requested > S::max_id() {
|
||||
return Err(Error::premultiply_overflow(S::max_id(), requested));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Allocate the next sequential identifier for a fresh state given
|
||||
/// the previously constructed state identified by `current`. If the
|
||||
/// next sequential identifier would overflow `usize` or the chosen
|
||||
/// representation indicated by `S`, then an error is returned.
|
||||
pub fn next_state_id<S: StateID>(current: S) -> Result<S> {
|
||||
let next = match current.to_usize().checked_add(1) {
|
||||
Some(next) => next,
|
||||
None => return Err(Error::state_id_overflow(::std::usize::MAX)),
|
||||
};
|
||||
if next > S::max_id() {
|
||||
return Err(Error::state_id_overflow(S::max_id()));
|
||||
}
|
||||
Ok(S::from_usize(next))
|
||||
}
|
||||
|
||||
/// Convert the given `usize` to the chosen state identifier
|
||||
/// representation. If the given value cannot fit in the chosen
|
||||
/// representation, then an error is returned.
|
||||
pub fn usize_to_state_id<S: StateID>(value: usize) -> Result<S> {
|
||||
if value > S::max_id() {
|
||||
Err(Error::state_id_overflow(S::max_id()))
|
||||
} else {
|
||||
Ok(S::from_usize(value))
|
||||
}
|
||||
}
|
||||
|
||||
/// Write the given identifier to the given slice of bytes using the
|
||||
/// specified endianness. The given slice must have length at least
|
||||
/// `size_of::<S>()`.
|
||||
///
|
||||
/// The given state identifier representation must have size 1, 2, 4 or 8.
|
||||
pub fn write_state_id_bytes<E: ByteOrder, S: StateID>(
|
||||
slice: &mut [u8],
|
||||
id: S,
|
||||
) {
|
||||
assert!(
|
||||
1 == size_of::<S>()
|
||||
|| 2 == size_of::<S>()
|
||||
|| 4 == size_of::<S>()
|
||||
|| 8 == size_of::<S>()
|
||||
);
|
||||
|
||||
match size_of::<S>() {
|
||||
1 => slice[0] = id.to_usize() as u8,
|
||||
2 => E::write_u16(slice, id.to_usize() as u16),
|
||||
4 => E::write_u32(slice, id.to_usize() as u32),
|
||||
8 => E::write_u64(slice, id.to_usize() as u64),
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the unique identifier for a DFA's dead state in the chosen
|
||||
/// representation indicated by `S`.
|
||||
pub fn dead_id<S: StateID>() -> S {
|
||||
S::from_usize(0)
|
||||
}
|
||||
|
||||
/// A trait describing the representation of a DFA's state identifier.
|
||||
///
|
||||
/// The purpose of this trait is to safely express both the possible state
|
||||
/// identifier representations that can be used in a DFA and to convert between
|
||||
/// state identifier representations and types that can be used to efficiently
|
||||
/// index memory (such as `usize`).
|
||||
///
|
||||
/// In general, one should not need to implement this trait explicitly. In
|
||||
/// particular, this crate provides implementations for `u8`, `u16`, `u32`,
|
||||
/// `u64` and `usize`. (`u32` and `u64` are only provided for targets that can
|
||||
/// represent all corresponding values in a `usize`.)
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// This trait is unsafe because the correctness of its implementations may be
|
||||
/// relied upon by other unsafe code. For example, one possible way to
|
||||
/// implement this trait incorrectly would be to return a maximum identifier
|
||||
/// in `max_id` that is greater than the real maximum identifier. This will
|
||||
/// likely result in wrap-on-overflow semantics in release mode, which can in
|
||||
/// turn produce incorrect state identifiers. Those state identifiers may then
|
||||
/// in turn access out-of-bounds memory in a DFA's search routine, where bounds
|
||||
/// checks are explicitly elided for performance reasons.
|
||||
pub unsafe trait StateID:
|
||||
Clone + Copy + Debug + Eq + Hash + PartialEq + PartialOrd + Ord
|
||||
{
|
||||
/// Convert from a `usize` to this implementation's representation.
|
||||
///
|
||||
/// Implementors may assume that `n <= Self::max_id`. That is, implementors
|
||||
/// do not need to check whether `n` can fit inside this implementation's
|
||||
/// representation.
|
||||
fn from_usize(n: usize) -> Self;
|
||||
|
||||
/// Convert this implementation's representation to a `usize`.
|
||||
///
|
||||
/// Implementors must not return a `usize` value greater than
|
||||
/// `Self::max_id` and must not permit overflow when converting between the
|
||||
/// implementor's representation and `usize`. In general, the preferred
|
||||
/// way for implementors to achieve this is to simply not provide
|
||||
/// implementations of `StateID` that cannot fit into the target platform's
|
||||
/// `usize`.
|
||||
fn to_usize(self) -> usize;
|
||||
|
||||
/// Return the maximum state identifier supported by this representation.
|
||||
///
|
||||
/// Implementors must return a correct bound. Doing otherwise may result
|
||||
/// in memory unsafety.
|
||||
fn max_id() -> usize;
|
||||
|
||||
/// Read a single state identifier from the given slice of bytes in native
|
||||
/// endian format.
|
||||
///
|
||||
/// Implementors may assume that the given slice has length at least
|
||||
/// `size_of::<Self>()`.
|
||||
fn read_bytes(slice: &[u8]) -> Self;
|
||||
|
||||
/// Write this state identifier to the given slice of bytes in native
|
||||
/// endian format.
|
||||
///
|
||||
/// Implementors may assume that the given slice has length at least
|
||||
/// `size_of::<Self>()`.
|
||||
fn write_bytes(self, slice: &mut [u8]);
|
||||
}
|
||||
|
||||
unsafe impl StateID for usize {
|
||||
#[inline]
|
||||
fn from_usize(n: usize) -> usize {
|
||||
n
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn to_usize(self) -> usize {
|
||||
self
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn max_id() -> usize {
|
||||
::core::usize::MAX
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn read_bytes(slice: &[u8]) -> Self {
|
||||
NativeEndian::read_uint(slice, size_of::<usize>()) as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn write_bytes(self, slice: &mut [u8]) {
|
||||
NativeEndian::write_uint(slice, self as u64, size_of::<usize>())
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl StateID for u8 {
|
||||
#[inline]
|
||||
fn from_usize(n: usize) -> u8 {
|
||||
n as u8
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn to_usize(self) -> usize {
|
||||
self as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn max_id() -> usize {
|
||||
::core::u8::MAX as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn read_bytes(slice: &[u8]) -> Self {
|
||||
slice[0]
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn write_bytes(self, slice: &mut [u8]) {
|
||||
slice[0] = self;
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl StateID for u16 {
|
||||
#[inline]
|
||||
fn from_usize(n: usize) -> u16 {
|
||||
n as u16
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn to_usize(self) -> usize {
|
||||
self as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn max_id() -> usize {
|
||||
::core::u16::MAX as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn read_bytes(slice: &[u8]) -> Self {
|
||||
NativeEndian::read_u16(slice)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn write_bytes(self, slice: &mut [u8]) {
|
||||
NativeEndian::write_u16(slice, self)
|
||||
}
|
||||
}
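// Illustrative sketch, not part of the vendored crate: round-tripping a state
// identifier through its native-endian byte form and exercising the fallible
// conversions from the `std` helpers above. Assumes the `std` feature.
#[cfg(all(test, feature = "std"))]
#[test]
fn example_state_id_round_trip_and_bounds() {
    let id: u16 = 1234;
    let mut buf = [0u8; 2];
    id.write_bytes(&mut buf);
    assert_eq!(1234u16, u16::read_bytes(&buf));

    // Conversions that do not fit the chosen representation are rejected.
    assert_eq!(77u8, usize_to_state_id::<u8>(77).unwrap());
    assert!(usize_to_state_id::<u8>(300).is_err());
    assert!(next_state_id::<u8>(255).is_err());
}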
|
||||
|
||||
#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
|
||||
unsafe impl StateID for u32 {
|
||||
#[inline]
|
||||
fn from_usize(n: usize) -> u32 {
|
||||
n as u32
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn to_usize(self) -> usize {
|
||||
self as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn max_id() -> usize {
|
||||
::core::u32::MAX as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn read_bytes(slice: &[u8]) -> Self {
|
||||
NativeEndian::read_u32(slice)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn write_bytes(self, slice: &mut [u8]) {
|
||||
NativeEndian::write_u32(slice, self)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(target_pointer_width = "64")]
|
||||
unsafe impl StateID for u64 {
|
||||
#[inline]
|
||||
fn from_usize(n: usize) -> u64 {
|
||||
n as u64
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn to_usize(self) -> usize {
|
||||
self as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn max_id() -> usize {
|
||||
::core::u64::MAX as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn read_bytes(slice: &[u8]) -> Self {
|
||||
NativeEndian::read_u64(slice)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn write_bytes(self, slice: &mut [u8]) {
|
||||
NativeEndian::write_u64(slice, self)
|
||||
}
|
||||
}
|
||||
107 third-party/vendor/regex-automata-0.1.10/src/transducer.rs vendored Normal file
@@ -0,0 +1,107 @@
|
|||
use fst::Automaton;
|
||||
|
||||
use crate::{StateID, DFA};
|
||||
|
||||
macro_rules! imp {
|
||||
($ty:ty, $id:ty) => {
|
||||
impl<T: AsRef<[$id]>, S: StateID> Automaton for $ty {
|
||||
type State = S;
|
||||
|
||||
#[inline]
|
||||
fn start(&self) -> S {
|
||||
self.start_state()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_match(&self, state: &S) -> bool {
|
||||
self.is_match_state(*state)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn accept(&self, state: &S, byte: u8) -> S {
|
||||
self.next_state(*state, byte)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn can_match(&self, state: &S) -> bool {
|
||||
!self.is_dead_state(*state)
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
imp!(crate::dense::DenseDFA<T, S>, S);
|
||||
imp!(crate::dense::Standard<T, S>, S);
|
||||
imp!(crate::dense::ByteClass<T, S>, S);
|
||||
imp!(crate::dense::Premultiplied<T, S>, S);
|
||||
imp!(crate::dense::PremultipliedByteClass<T, S>, S);
|
||||
imp!(crate::sparse::SparseDFA<T, S>, u8);
|
||||
imp!(crate::sparse::Standard<T, S>, u8);
|
||||
imp!(crate::sparse::ByteClass<T, S>, u8);
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use bstr::BString;
|
||||
use fst::{Automaton, IntoStreamer, Set, Streamer};
|
||||
|
||||
use crate::dense::{self, DenseDFA};
|
||||
use crate::sparse::SparseDFA;
|
||||
|
||||
fn search<A: Automaton, D: AsRef<[u8]>>(
|
||||
set: &Set<D>,
|
||||
aut: A,
|
||||
) -> Vec<BString> {
|
||||
let mut stream = set.search(aut).into_stream();
|
||||
|
||||
let mut results = vec![];
|
||||
while let Some(key) = stream.next() {
|
||||
results.push(BString::from(key));
|
||||
}
|
||||
results
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dense_anywhere() {
|
||||
let set =
|
||||
Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
|
||||
.unwrap();
|
||||
let dfa = DenseDFA::new("ba.*").unwrap();
|
||||
let got = search(&set, &dfa);
|
||||
assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn dense_anchored() {
|
||||
let set =
|
||||
Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
|
||||
.unwrap();
|
||||
let dfa = dense::Builder::new().anchored(true).build("ba.*").unwrap();
|
||||
let got = search(&set, &dfa);
|
||||
assert_eq!(got, vec!["bar", "baz"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sparse_anywhere() {
|
||||
let set =
|
||||
Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
|
||||
.unwrap();
|
||||
let dfa = SparseDFA::new("ba.*").unwrap();
|
||||
let got = search(&set, &dfa);
|
||||
assert_eq!(got, vec!["bar", "baz", "xba", "xbax"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sparse_anchored() {
|
||||
let set =
|
||||
Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
|
||||
.unwrap();
|
||||
let dfa = dense::Builder::new()
|
||||
.anchored(true)
|
||||
.build("ba.*")
|
||||
.unwrap()
|
||||
.to_sparse()
|
||||
.unwrap();
|
||||
let got = search(&set, &dfa);
|
||||
assert_eq!(got, vec!["bar", "baz"]);
|
||||
}
|
||||
}
|
||||
461 third-party/vendor/regex-automata-0.1.10/tests/collection.rs vendored Normal file
@@ -0,0 +1,461 @@
|
|||
use std::collections::BTreeMap;
|
||||
use std::env;
|
||||
use std::fmt::{self, Write};
|
||||
use std::thread;
|
||||
|
||||
use regex;
|
||||
use regex_automata::{DenseDFA, ErrorKind, Regex, RegexBuilder, StateID, DFA};
|
||||
use serde_bytes;
|
||||
use toml;
|
||||
|
||||
macro_rules! load {
|
||||
($col:ident, $path:expr) => {
|
||||
$col.extend(RegexTests::load(
|
||||
concat!("../data/tests/", $path),
|
||||
include_bytes!(concat!("../data/tests/", $path)),
|
||||
));
|
||||
};
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
pub static ref SUITE: RegexTestCollection = {
|
||||
let mut col = RegexTestCollection::new();
|
||||
load!(col, "fowler/basic.toml");
|
||||
load!(col, "fowler/nullsubexpr.toml");
|
||||
load!(col, "fowler/repetition.toml");
|
||||
load!(col, "fowler/repetition-long.toml");
|
||||
load!(col, "crazy.toml");
|
||||
load!(col, "flags.toml");
|
||||
load!(col, "iter.toml");
|
||||
load!(col, "no-unicode.toml");
|
||||
load!(col, "unicode.toml");
|
||||
col
|
||||
};
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct RegexTestCollection {
|
||||
pub by_name: BTreeMap<String, RegexTest>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
pub struct RegexTests {
|
||||
pub tests: Vec<RegexTest>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize)]
|
||||
pub struct RegexTest {
|
||||
pub name: String,
|
||||
#[serde(default)]
|
||||
pub options: Vec<RegexTestOption>,
|
||||
pub pattern: String,
|
||||
#[serde(with = "serde_bytes")]
|
||||
pub input: Vec<u8>,
|
||||
#[serde(rename = "matches")]
|
||||
pub matches: Vec<Match>,
|
||||
#[serde(default)]
|
||||
pub captures: Vec<Option<Match>>,
|
||||
#[serde(default)]
|
||||
pub fowler_line_number: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
|
||||
#[serde(rename_all = "kebab-case")]
|
||||
pub enum RegexTestOption {
|
||||
Anchored,
|
||||
CaseInsensitive,
|
||||
NoUnicode,
|
||||
Escaped,
|
||||
#[serde(rename = "invalid-utf8")]
|
||||
InvalidUTF8,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Deserialize, Eq, PartialEq)]
|
||||
pub struct Match {
|
||||
pub start: usize,
|
||||
pub end: usize,
|
||||
}
|
||||
|
||||
impl RegexTestCollection {
|
||||
fn new() -> RegexTestCollection {
|
||||
RegexTestCollection { by_name: BTreeMap::new() }
|
||||
}
|
||||
|
||||
fn extend(&mut self, tests: RegexTests) {
|
||||
for test in tests.tests {
|
||||
let name = test.name.clone();
|
||||
if self.by_name.contains_key(&name) {
|
||||
panic!("found duplicate test {}", name);
|
||||
}
|
||||
self.by_name.insert(name, test);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn tests(&self) -> Vec<&RegexTest> {
|
||||
self.by_name.values().collect()
|
||||
}
|
||||
}
|
||||
|
||||
impl RegexTests {
|
||||
fn load(path: &str, slice: &[u8]) -> RegexTests {
|
||||
let mut data: RegexTests = toml::from_slice(slice)
|
||||
.expect(&format!("failed to load {}", path));
|
||||
for test in &mut data.tests {
|
||||
if test.options.contains(&RegexTestOption::Escaped) {
|
||||
test.input = unescape_bytes(&test.input);
|
||||
}
|
||||
}
|
||||
data
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct RegexTester {
|
||||
asserted: bool,
|
||||
results: RegexTestResults,
|
||||
skip_expensive: bool,
|
||||
whitelist: Vec<regex::Regex>,
|
||||
blacklist: Vec<regex::Regex>,
|
||||
}
|
||||
|
||||
impl Drop for RegexTester {
|
||||
fn drop(&mut self) {
|
||||
// If we haven't asserted yet, then the test is probably buggy, so
|
||||
// fail it. But if we're already panicking (e.g., a bug in the regex
|
||||
// engine), then don't double-panic, which causes an immediate abort.
|
||||
if !thread::panicking() && !self.asserted {
|
||||
panic!("must call RegexTester::assert at end of test");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl RegexTester {
|
||||
pub fn new() -> RegexTester {
|
||||
let mut tester = RegexTester {
|
||||
asserted: false,
|
||||
results: RegexTestResults::default(),
|
||||
skip_expensive: false,
|
||||
whitelist: vec![],
|
||||
blacklist: vec![],
|
||||
};
|
||||
for x in env::var("REGEX_TEST").unwrap_or("".to_string()).split(",") {
|
||||
let x = x.trim();
|
||||
if x.is_empty() {
|
||||
continue;
|
||||
}
|
||||
if x.starts_with("-") {
|
||||
tester = tester.blacklist(&x[1..]);
|
||||
} else {
|
||||
tester = tester.whitelist(x);
|
||||
}
|
||||
}
|
||||
tester
|
||||
}
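// Illustrative sketch, not part of the vendored crate: the whitelist and
// blacklist built above are driven by the REGEX_TEST environment variable,
// as the failure message at the bottom of this file explains, e.g.
//
//     REGEX_TEST=crazy-misc,-crazy-misc2 cargo test
//
// runs every test whose name matches `crazy-misc` but skips those matching
// `crazy-misc2`.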
|
||||
|
||||
pub fn skip_expensive(mut self) -> RegexTester {
|
||||
self.skip_expensive = true;
|
||||
self
|
||||
}
|
||||
|
||||
pub fn whitelist(mut self, name: &str) -> RegexTester {
|
||||
self.whitelist.push(regex::Regex::new(name).unwrap());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn blacklist(mut self, name: &str) -> RegexTester {
|
||||
self.blacklist.push(regex::Regex::new(name).unwrap());
|
||||
self
|
||||
}
|
||||
|
||||
pub fn assert(&mut self) {
|
||||
self.asserted = true;
|
||||
self.results.assert();
|
||||
}
|
||||
|
||||
pub fn build_regex<S: StateID>(
|
||||
&self,
|
||||
mut builder: RegexBuilder,
|
||||
test: &RegexTest,
|
||||
) -> Option<Regex<DenseDFA<Vec<S>, S>>> {
|
||||
if self.skip(test) {
|
||||
return None;
|
||||
}
|
||||
self.apply_options(test, &mut builder);
|
||||
|
||||
match builder.build_with_size::<S>(&test.pattern) {
|
||||
Ok(re) => Some(re),
|
||||
Err(err) => {
|
||||
if let ErrorKind::Unsupported(_) = *err.kind() {
|
||||
None
|
||||
} else {
|
||||
panic!(
|
||||
"failed to build {:?} with pattern '{:?}': {}",
|
||||
test.name, test.pattern, err
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn test_all<'a, I, T>(&mut self, builder: RegexBuilder, tests: I)
|
||||
where
|
||||
I: IntoIterator<IntoIter = T, Item = &'a RegexTest>,
|
||||
T: Iterator<Item = &'a RegexTest>,
|
||||
{
|
||||
for test in tests {
|
||||
let builder = builder.clone();
|
||||
let re: Regex = match self.build_regex(builder, test) {
|
||||
None => continue,
|
||||
Some(re) => re,
|
||||
};
|
||||
self.test(test, &re);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn test<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
|
||||
self.test_is_match(test, re);
|
||||
self.test_find(test, re);
|
||||
// Some tests (namely, fowler) are designed only to detect the
|
||||
        // first match even if there are more subsequent matches. To that
        // end, we only test match iteration when the number of matches
        // expected is not 1, or if the test name has 'iter' in it.
        if test.name.contains("iter") || test.matches.len() != 1 {
            self.test_find_iter(test, re);
        }
    }

    pub fn test_is_match<'a, D: DFA>(
        &mut self,
        test: &RegexTest,
        re: &Regex<D>,
    ) {
        self.asserted = false;

        let got = re.is_match(&test.input);
        let expected = test.matches.len() >= 1;
        if got == expected {
            self.results.succeeded.push(test.clone());
            return;
        }
        self.results.failed.push(RegexTestFailure {
            test: test.clone(),
            kind: RegexTestFailureKind::IsMatch,
        });
    }

    pub fn test_find<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
        self.asserted = false;

        let got =
            re.find(&test.input).map(|(start, end)| Match { start, end });
        if got == test.matches.get(0).map(|&m| m) {
            self.results.succeeded.push(test.clone());
            return;
        }
        self.results.failed.push(RegexTestFailure {
            test: test.clone(),
            kind: RegexTestFailureKind::Find { got },
        });
    }

    pub fn test_find_iter<'a, D: DFA>(
        &mut self,
        test: &RegexTest,
        re: &Regex<D>,
    ) {
        self.asserted = false;

        let got: Vec<Match> = re
            .find_iter(&test.input)
            .map(|(start, end)| Match { start, end })
            .collect();
        if got == test.matches {
            self.results.succeeded.push(test.clone());
            return;
        }
        self.results.failed.push(RegexTestFailure {
            test: test.clone(),
            kind: RegexTestFailureKind::FindIter { got },
        });
    }

    fn skip(&self, test: &RegexTest) -> bool {
        if self.skip_expensive {
            if test.name.starts_with("repetition-long") {
                return true;
            }
        }
        if !self.blacklist.is_empty() {
            if self.blacklist.iter().any(|re| re.is_match(&test.name)) {
                return true;
            }
        }
        if !self.whitelist.is_empty() {
            if !self.whitelist.iter().any(|re| re.is_match(&test.name)) {
                return true;
            }
        }
        false
    }

    fn apply_options(&self, test: &RegexTest, builder: &mut RegexBuilder) {
        for opt in &test.options {
            match *opt {
                RegexTestOption::Anchored => {
                    builder.anchored(true);
                }
                RegexTestOption::CaseInsensitive => {
                    builder.case_insensitive(true);
                }
                RegexTestOption::NoUnicode => {
                    builder.unicode(false);
                }
                RegexTestOption::Escaped => {}
                RegexTestOption::InvalidUTF8 => {
                    builder.allow_invalid_utf8(true);
                }
            }
        }
    }
}

#[derive(Clone, Debug, Default)]
pub struct RegexTestResults {
    /// Tests that succeeded.
    pub succeeded: Vec<RegexTest>,
    /// Failed tests, indexed by group name.
    pub failed: Vec<RegexTestFailure>,
}

#[derive(Clone, Debug)]
pub struct RegexTestFailure {
    test: RegexTest,
    kind: RegexTestFailureKind,
}

#[derive(Clone, Debug)]
pub enum RegexTestFailureKind {
    IsMatch,
    Find { got: Option<Match> },
    FindIter { got: Vec<Match> },
}

impl RegexTestResults {
    pub fn assert(&self) {
        if self.failed.is_empty() {
            return;
        }
        let failures = self
            .failed
            .iter()
            .map(|f| f.to_string())
            .collect::<Vec<String>>()
            .join("\n\n");
        panic!(
            "found {} failures:\n{}\n{}\n{}\n\n\
             Set the REGEX_TEST environment variable to filter tests, \n\
             e.g., REGEX_TEST=crazy-misc,-crazy-misc2 runs every test \n\
             whose name contains crazy-misc but not crazy-misc2\n\n",
            self.failed.len(),
            "~".repeat(79),
            failures.trim(),
            "~".repeat(79)
        )
    }
}

impl fmt::Display for RegexTestFailure {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "{}: {}\n \
             options: {:?}\n \
             pattern: {}\n \
             pattern (escape): {}\n \
             input: {}\n \
             input (escape): {}\n \
             input (hex): {}",
            self.test.name,
            self.kind.fmt(&self.test)?,
            self.test.options,
            self.test.pattern,
            escape_default(&self.test.pattern),
            nice_raw_bytes(&self.test.input),
            escape_bytes(&self.test.input),
            hex_bytes(&self.test.input)
        )
    }
}

impl RegexTestFailureKind {
    fn fmt(&self, test: &RegexTest) -> Result<String, fmt::Error> {
        let mut buf = String::new();
        match *self {
            RegexTestFailureKind::IsMatch => {
                if let Some(&m) = test.matches.get(0) {
                    write!(buf, "expected match (at {}), but none found", m)?
                } else {
                    write!(buf, "expected no match, but found a match")?
                }
            }
            RegexTestFailureKind::Find { got } => write!(
                buf,
                "expected {:?}, but found {:?}",
                test.matches.get(0),
                got
            )?,
            RegexTestFailureKind::FindIter { ref got } => write!(
                buf,
                "expected {:?}, but found {:?}",
                test.matches, got
            )?,
        }
        Ok(buf)
    }
}

impl fmt::Display for Match {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "({}, {})", self.start, self.end)
    }
}

impl fmt::Debug for Match {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "({}, {})", self.start, self.end)
    }
}

fn nice_raw_bytes(bytes: &[u8]) -> String {
    use std::str;

    match str::from_utf8(bytes) {
        Ok(s) => s.to_string(),
        Err(_) => escape_bytes(bytes),
    }
}

fn escape_bytes(bytes: &[u8]) -> String {
    use std::ascii;

    let escaped = bytes
        .iter()
        .flat_map(|&b| ascii::escape_default(b))
        .collect::<Vec<u8>>();
    String::from_utf8(escaped).unwrap()
}

fn hex_bytes(bytes: &[u8]) -> String {
    bytes.iter().map(|&b| format!(r"\x{:02X}", b)).collect()
}

fn escape_default(s: &str) -> String {
    s.chars().flat_map(|c| c.escape_default()).collect()
}

fn unescape_bytes(bytes: &[u8]) -> Vec<u8> {
    use std::str;
    use unescape::unescape;

    unescape(&str::from_utf8(bytes).expect("all input must be valid UTF-8"))
}
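The failure reporter above renders each test input three ways (raw, escaped, hex) via the small helpers at the bottom of the file. The snippet below is a standalone sketch, not part of the vendored file: it copies the bodies of `hex_bytes` and `escape_bytes` verbatim and shows (via a hypothetical `main`) what each produces for an input containing a non-ASCII byte.

// Standalone illustration only; helper bodies copied from the file above.
fn hex_bytes(bytes: &[u8]) -> String {
    bytes.iter().map(|&b| format!(r"\x{:02X}", b)).collect()
}

fn escape_bytes(bytes: &[u8]) -> String {
    use std::ascii;
    let escaped = bytes
        .iter()
        .flat_map(|&b| ascii::escape_default(b))
        .collect::<Vec<u8>>();
    String::from_utf8(escaped).unwrap()
}

fn main() {
    let input = b"a\xFFz";
    // The hex rendering is uppercase and covers every byte.
    assert_eq!(hex_bytes(input), r"\x61\xFF\x7A");
    // The escaped rendering keeps printable ASCII and escapes the rest.
    assert_eq!(escape_bytes(input), r"a\xffz");
}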
42
third-party/vendor/regex-automata-0.1.10/tests/regression.rs
vendored
Normal file

@@ -0,0 +1,42 @@
use regex_automata::{dense, DFA};

// A regression test for checking that minimization correctly translates
// whether a state is a match state or not. Previously, it was possible for
// minimization to mark a non-matching state as matching.
#[test]
fn minimize_sets_correct_match_states() {
    let pattern =
        // This is a subset of the grapheme matching regex. I couldn't seem
        // to get a repro any smaller than this unfortunately.
        r"(?x)
            (?:
                \p{gcb=Prepend}*
                (?:
                    (?:
                        (?:
                            \p{gcb=L}*
                            (?:\p{gcb=V}+|\p{gcb=LV}\p{gcb=V}*|\p{gcb=LVT})
                            \p{gcb=T}*
                        )
                        |
                        \p{gcb=L}+
                        |
                        \p{gcb=T}+
                    )
                    |
                    \p{Extended_Pictographic}
                    (?:\p{gcb=Extend}*\p{gcb=ZWJ}\p{Extended_Pictographic})*
                    |
                    [^\p{gcb=Control}\p{gcb=CR}\p{gcb=LF}]
                )
                [\p{gcb=Extend}\p{gcb=ZWJ}\p{gcb=SpacingMark}]*
            )
        ";

    let dfa = dense::Builder::new()
        .minimize(true)
        .anchored(true)
        .build(pattern)
        .unwrap();
    assert_eq!(None, dfa.find(b"\xE2"));
}
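For context, the sketch below (illustrative only, not part of the vendored tests; pattern and inputs are invented) shows the same `dense::Builder` and `DFA::find` API on a trivial pattern: `find` reports the end offset of the leftmost match, and an anchored DFA only matches starting at offset 0. In the regression test above, the lone byte `\xE2` is an incomplete UTF-8 sequence, so no match is expected at all; the bug was that a minimized DFA could report one anyway.

// Illustrative only; assumes the regex-automata 0.1 API used above.
use regex_automata::{dense, DFA};

fn main() {
    let dfa = dense::Builder::new()
        .minimize(true)
        .anchored(true)
        .build(r"[0-9]{4}")
        .unwrap();
    // `DFA::find` reports the end offset (in bytes) of the match, if any.
    assert_eq!(Some(4), dfa.find(b"2024"));
    // Anchored DFAs only match at offset 0, so this input yields no match.
    assert_eq!(None, dfa.find(b"x2024"));
}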
250
third-party/vendor/regex-automata-0.1.10/tests/suite.rs
vendored
Normal file

@@ -0,0 +1,250 @@
use regex_automata::{DenseDFA, Regex, RegexBuilder, SparseDFA};

use collection::{RegexTester, SUITE};

#[test]
fn unminimized_standard() {
    let mut builder = RegexBuilder::new();
    builder.minimize(false).premultiply(false).byte_classes(false);

    let mut tester = RegexTester::new().skip_expensive();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}

#[test]
fn unminimized_premultiply() {
    let mut builder = RegexBuilder::new();
    builder.minimize(false).premultiply(true).byte_classes(false);

    let mut tester = RegexTester::new().skip_expensive();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}

#[test]
fn unminimized_byte_class() {
    let mut builder = RegexBuilder::new();
    builder.minimize(false).premultiply(false).byte_classes(true);

    let mut tester = RegexTester::new();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}

#[test]
fn unminimized_premultiply_byte_class() {
    let mut builder = RegexBuilder::new();
    builder.minimize(false).premultiply(true).byte_classes(true);

    let mut tester = RegexTester::new();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}

#[test]
fn unminimized_standard_no_nfa_shrink() {
    let mut builder = RegexBuilder::new();
    builder
        .minimize(false)
        .premultiply(false)
        .byte_classes(false)
        .shrink(false);

    let mut tester = RegexTester::new().skip_expensive();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}

#[test]
fn minimized_standard() {
    let mut builder = RegexBuilder::new();
    builder.minimize(true).premultiply(false).byte_classes(false);

    let mut tester = RegexTester::new().skip_expensive();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}

#[test]
fn minimized_premultiply() {
    let mut builder = RegexBuilder::new();
    builder.minimize(true).premultiply(true).byte_classes(false);

    let mut tester = RegexTester::new().skip_expensive();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}

#[test]
fn minimized_byte_class() {
    let mut builder = RegexBuilder::new();
    builder.minimize(true).premultiply(false).byte_classes(true);

    let mut tester = RegexTester::new();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}

#[test]
fn minimized_premultiply_byte_class() {
    let mut builder = RegexBuilder::new();
    builder.minimize(true).premultiply(true).byte_classes(true);

    let mut tester = RegexTester::new();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}

#[test]
fn minimized_standard_no_nfa_shrink() {
    let mut builder = RegexBuilder::new();
    builder
        .minimize(true)
        .premultiply(false)
        .byte_classes(false)
        .shrink(false);

    let mut tester = RegexTester::new().skip_expensive();
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}

// A basic sanity test that checks we can convert a regex to a smaller
// representation and that the resulting regex still passes our tests.
//
// If tests grow minimal regexes that cannot be represented in 16 bits, then
// we'll either want to skip those or increase the size to test to u32.
#[test]
fn u16() {
    let mut builder = RegexBuilder::new();
    builder.minimize(true).premultiply(false).byte_classes(true);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let builder = builder.clone();
        let re: Regex = match tester.build_regex(builder, test) {
            None => continue,
            Some(re) => re,
        };
        let small_re = Regex::from_dfas(
            re.forward().to_u16().unwrap(),
            re.reverse().to_u16().unwrap(),
        );

        tester.test(test, &small_re);
    }
    tester.assert();
}

// Test that sparse DFAs work using the standard configuration.
#[test]
fn sparse_unminimized_standard() {
    let mut builder = RegexBuilder::new();
    builder.minimize(false).premultiply(false).byte_classes(false);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let builder = builder.clone();
        let re: Regex = match tester.build_regex(builder, test) {
            None => continue,
            Some(re) => re,
        };
        let fwd = re.forward().to_sparse().unwrap();
        let rev = re.reverse().to_sparse().unwrap();
        let sparse_re = Regex::from_dfas(fwd, rev);

        tester.test(test, &sparse_re);
    }
    tester.assert();
}

// Test that sparse DFAs work after converting them to a different state ID
// representation.
#[test]
fn sparse_u16() {
    let mut builder = RegexBuilder::new();
    builder.minimize(true).premultiply(false).byte_classes(false);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let builder = builder.clone();
        let re: Regex = match tester.build_regex(builder, test) {
            None => continue,
            Some(re) => re,
        };
        let fwd = re.forward().to_sparse().unwrap().to_u16().unwrap();
        let rev = re.reverse().to_sparse().unwrap().to_u16().unwrap();
        let sparse_re = Regex::from_dfas(fwd, rev);

        tester.test(test, &sparse_re);
    }
    tester.assert();
}

// Another basic sanity test that checks we can serialize and then deserialize
// a regex, and that the resulting regex can be used for searching correctly.
#[test]
fn serialization_roundtrip() {
    let mut builder = RegexBuilder::new();
    builder.premultiply(false).byte_classes(true);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let builder = builder.clone();
        let re: Regex = match tester.build_regex(builder, test) {
            None => continue,
            Some(re) => re,
        };

        let fwd_bytes = re.forward().to_bytes_native_endian().unwrap();
        let rev_bytes = re.reverse().to_bytes_native_endian().unwrap();
        let fwd: DenseDFA<&[usize], usize> =
            unsafe { DenseDFA::from_bytes(&fwd_bytes) };
        let rev: DenseDFA<&[usize], usize> =
            unsafe { DenseDFA::from_bytes(&rev_bytes) };
        let re = Regex::from_dfas(fwd, rev);

        tester.test(test, &re);
    }
    tester.assert();
}

// A basic sanity test that checks we can serialize and then deserialize a
// regex using sparse DFAs, and that the resulting regex can be used for
// searching correctly.
#[test]
fn sparse_serialization_roundtrip() {
    let mut builder = RegexBuilder::new();
    builder.byte_classes(true);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let builder = builder.clone();
        let re: Regex = match tester.build_regex(builder, test) {
            None => continue,
            Some(re) => re,
        };

        let fwd_bytes = re
            .forward()
            .to_sparse()
            .unwrap()
            .to_bytes_native_endian()
            .unwrap();
        let rev_bytes = re
            .reverse()
            .to_sparse()
            .unwrap()
            .to_bytes_native_endian()
            .unwrap();
        let fwd: SparseDFA<&[u8], usize> =
            unsafe { SparseDFA::from_bytes(&fwd_bytes) };
        let rev: SparseDFA<&[u8], usize> =
            unsafe { SparseDFA::from_bytes(&rev_bytes) };
        let re = Regex::from_dfas(fwd, rev);

        tester.test(test, &re);
    }
    tester.assert();
}
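Outside the harness, the state-ID shrinking exercised by the `u16` test above reduces to the short sketch below (illustrative only, not vendored code; the pattern and input are invented, and the regex-automata 0.1 API shown in the tests above is assumed): both underlying dense DFAs are converted to 16-bit state identifiers and recombined into a `Regex` with the same match semantics.

// Illustrative sketch, not part of the vendored test suite.
use regex_automata::Regex;

fn main() {
    let re = Regex::new(r"[a-z]+").unwrap();
    // Shrink the forward and reverse DFAs to u16 state IDs, then
    // reassemble them into a regex that searches exactly like the original.
    let small = Regex::from_dfas(
        re.forward().to_u16().unwrap(),
        re.reverse().to_u16().unwrap(),
    );
    assert_eq!(Some((0, 5)), small.find(b"hello"));
}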
25
third-party/vendor/regex-automata-0.1.10/tests/tests.rs
vendored
Normal file

@@ -0,0 +1,25 @@
#[cfg(feature = "std")]
#[macro_use]
extern crate lazy_static;
#[cfg(feature = "std")]
extern crate regex;
#[cfg(feature = "std")]
extern crate regex_automata;
#[cfg(feature = "std")]
extern crate serde;
#[cfg(feature = "std")]
extern crate serde_bytes;
#[cfg(feature = "std")]
#[macro_use]
extern crate serde_derive;
#[cfg(feature = "std")]
extern crate toml;

#[cfg(feature = "std")]
mod collection;
#[cfg(feature = "std")]
mod regression;
#[cfg(feature = "std")]
mod suite;
#[cfg(feature = "std")]
mod unescape;
84
third-party/vendor/regex-automata-0.1.10/tests/unescape.rs
vendored
Normal file

@@ -0,0 +1,84 @@
#[derive(Clone, Copy, Eq, PartialEq)]
enum State {
    /// The state after seeing a `\`.
    Escape,
    /// The state after seeing a `\x`.
    HexFirst,
    /// The state after seeing a `\x[0-9A-Fa-f]`.
    HexSecond(char),
    /// Default state.
    Literal,
}

pub fn unescape(s: &str) -> Vec<u8> {
    use self::State::*;

    let mut bytes = vec![];
    let mut state = Literal;
    for c in s.chars() {
        match state {
            Escape => match c {
                '\\' => {
                    bytes.push(b'\\');
                    state = Literal;
                }
                'n' => {
                    bytes.push(b'\n');
                    state = Literal;
                }
                'r' => {
                    bytes.push(b'\r');
                    state = Literal;
                }
                't' => {
                    bytes.push(b'\t');
                    state = Literal;
                }
                'x' => {
                    state = HexFirst;
                }
                c => {
                    bytes.extend(format!(r"\{}", c).into_bytes());
                    state = Literal;
                }
            },
            HexFirst => match c {
                '0'..='9' | 'A'..='F' | 'a'..='f' => {
                    state = HexSecond(c);
                }
                c => {
                    bytes.extend(format!(r"\x{}", c).into_bytes());
                    state = Literal;
                }
            },
            HexSecond(first) => match c {
                '0'..='9' | 'A'..='F' | 'a'..='f' => {
                    let ordinal = format!("{}{}", first, c);
                    let byte = u8::from_str_radix(&ordinal, 16).unwrap();
                    bytes.push(byte);
                    state = Literal;
                }
                c => {
                    let original = format!(r"\x{}{}", first, c);
                    bytes.extend(original.into_bytes());
                    state = Literal;
                }
            },
            Literal => match c {
                '\\' => {
                    state = Escape;
                }
                c => {
                    bytes.extend(c.to_string().as_bytes());
                }
            },
        }
    }
    match state {
        Escape => bytes.push(b'\\'),
        HexFirst => bytes.extend(b"\\x"),
        HexSecond(c) => bytes.extend(format!("\\x{}", c).into_bytes()),
        Literal => {}
    }
    bytes
}
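A few worked inputs for the state machine above, written as a hypothetical extra test rather than vendored code: simple escapes map to single bytes, a `\xNN` pair becomes one byte, and an escape left incomplete at the end of the input falls through to the final `match state` block and is emitted literally.

// Hypothetical example test, not part of the vendored file; it calls the
// `unescape` function defined above.
#[test]
fn unescape_worked_examples() {
    assert_eq!(unescape(r"a\nb"), vec![b'a', b'\n', b'b']);
    assert_eq!(unescape(r"\x61\xFF"), vec![0x61, 0xFF]);
    // Trailing, incomplete escape: emitted as the literal bytes `\x6`.
    assert_eq!(unescape(r"\x6"), b"\\x6".to_vec());
}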