Vendor things

This commit is contained in:
John Doty 2024-03-08 11:03:01 -08:00
parent 5deceec006
commit 977e3c17e5
19434 changed files with 10682014 additions and 0 deletions

View file

@ -0,0 +1 @@
{"files":{"COPYING":"01c266bced4a434da0051174d6bee16a4c82cf634e2679b6155d40d75012390f","Cargo.toml":"674fda607d585e7a9d1d07e6fee2807e6a1a3709ca8d5a507dac051cac84dcf1","LICENSE-MIT":"0f96a83840e146e43c0ec96a22ec1f392e0680e6c1226e6f3ba87e0740af850f","README.md":"34ebd8d165fbd934198653a6d619d62788ff72f0e058139459d4369683423551","TODO":"daea9f7378f543311d657e6ef3d2a09d51e82b9e70d0026140130862c32b3c08","UNLICENSE":"7e12e5df4bae12cb21581ba157ced20e1986a0508dd10d0e8a4ab9a4cf94e85c","data/fowler-tests/LICENSE":"58cf078acc03da3e280a938c2bd9943f554fc9b6ced89ad93ba35ca436872899","data/fowler-tests/README":"45f869e37f798905c773bfbe0ef19a5fb7e585cbf0b7c21b5b5a784e8cec3c14","data/fowler-tests/basic.dat":"3756a5bdd6f387ed34731197fbdaab8521b0ae1726250100ba235756cb42b4b1","data/fowler-tests/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","data/fowler-tests/repetition.dat":"1f7959063015b284b18a4a2c1c8b416d438a2d6c4b1a362da43406b865f50e69","data/tests/crazy.toml":"b6e644a74b990a4344b15e7366da36e5b3f73a183944e249082f74c23ff01e5f","data/tests/flags.toml":"aefd9483c1c9c52c3669a9f2e88cd494c293f2e14c59aecb1d94dbb82546a705","data/tests/fowler/LICENSE":"58cf078acc03da3e280a938c2bd9943f554fc9b6ced89ad93ba35ca436872899","data/tests/fowler/README":"e9f049297023d5a81c5c600280016fe0271e7d0eda898c41399eb61431820404","data/tests/fowler/basic.dat":"3756a5bdd6f387ed34731197fbdaab8521b0ae1726250100ba235756cb42b4b1","data/tests/fowler/basic.toml":"7b043231ca8c89dbd10cef0de3b0be18c9ae442be1e99a657cd412b8b7edec21","data/tests/fowler/fowler-to-toml":"5bb78b924f3b6b1c27278b37baae556115fe03c864c1d33a7c53718b99885515","data/tests/fowler/nullsubexpr.dat":"496ac0278eec3b6d9170faace14554569032dd3d909618364d9326156de39ecf","data/tests/fowler/nullsubexpr.toml":"7e4bf9fec1c4a8aca04cc96e74b3f51ed6b8c3f85e4bfc7acc9c74ab95166976","data/tests/fowler/repetition-long.dat":"040b869c8c3a02ed0497943af1860094fd8437892811180fb9624a41eb491f23","data/tests/fowler/repetition-long.toml":"3eb71
99d936b3f7eb9863ebc3b0c94648cfc32192f626dcfa33ddf352918c1c0","data/tests/fowler/repetition.dat":"d8fa959c2be524c90f2092f7c41ff3a7b48232b4378cb3b090625e74e76fc625","data/tests/fowler/repetition.toml":"ccf21430a325c4e1dae4eb6c52e3cea5d3c1847559ba6e75466bdb6bbd98204d","data/tests/iter.toml":"99adc397fe0a00c759eb659531d3e69445b43f5ecd5771c549117933b73bd43e","data/tests/no-unicode.toml":"f329ee939c2d07a17e51f0090d9f2431395e47dac8e0b982fb5e16e0555b75e3","data/tests/unicode.toml":"0ff418de5bc238e4595956b66981fe02018938d57d76d11cab840606b9da60ba","rustfmt.toml":"1ca600239a27401c4a43f363cf3f38183a212affc1f31bff3ae93234bbaec228","src/byteorder.rs":"0827852aa563e3c5b3ffaf484ce8a34537e82719a3606d4b948bc8a1e21d8b18","src/classes.rs":"706c8a8a9bf70260b9c92ff865891fc26de0453495afca7b325afdf5e6a3e242","src/codegen.rs":"5686b97fec69158c7264183a71ad9a1ff8e74db02fa0fcfccaa0a516cbfc7d1d","src/dense.rs":"7561f35019b20642f2ee75fd20365e21a4c8260deb7cee84fa3f8264b9fd9a4b","src/determinize.rs":"876c844d0470854dbbe3eb4386611fd57d95a5a4ae38ee937fbb14676f0a383a","src/dfa.rs":"032f09d187ec8dd06ef09940515690af045ca9f7ef7f819c31a97607df1432e5","src/error.rs":"d07ecdc617e243a43a99e911398b9c37721afd2b9548153c5f359b8c4605c749","src/lib.rs":"520781bdd60d425b16ef72f03330362e7c2aec274338e73f309d730bea4d7ab0","src/minimize.rs":"dfa7b6a6f36bb2dedaee8bfc5c4bb943f59e0cf98cde5358822e70cbdb284a7e","src/nfa/compiler.rs":"f43901929f44efa420e441cbff8687e05059ceae88492a2ed6c49fdd5a6a6b04","src/nfa/map.rs":"b7e2e561d6fe5775716e27eded1ae3e2277a50073a2e182f3dabedcda5c30d27","src/nfa/mod.rs":"93e7dee804751fcf66d48ca48b3467a4ab5155063461e69c428e46bcf977711d","src/nfa/range_trie.rs":"3a3d2853987619688ab5b61acef575f216d5bdd7b9e15fa508e0ba6f29c641a9","src/regex.rs":"2f3868a3fa52b2a040fd0fb9f12386b1af1f0f650d948e821c7ba83f087826f0","src/sparse.rs":"976540bcd134a225e5d39e1aef688f63b02b3d745249a3a95fec387a7ffb88cc","src/sparse_set.rs":"81bef5057781e26da39855b0f38b02ddfd09183bc62d30cf454ec706885e3a70","src/state_id.rs":"44c
4bf1a5d091b97e8c1ce872bafe45d806905b07a73a6f82b1655b7897e7b5f","src/transducer.rs":"28c728ef45a3f6177d5a3ac589f166764c11d6c66bd5d916bcf30ad2be187a0c","tests/collection.rs":"2907cc0a32e5e59ceca4b34fe582f9275c12ee1a8d6e73d689056bdfd5357b9a","tests/regression.rs":"5a9b2654f88b1b07401c5b1fe925f62421bff67be7d80cae7a985eb66ed9886b","tests/suite.rs":"8148247667b34b370855c247ffcc9c6339f8f72d6fe481b79936afbb165dd6bd","tests/tests.rs":"f1b407d3d288a9c2b1500151205f9d0bcc0668b2ab38c5094ee459d6d4893e18","tests/unescape.rs":"67a7c466ba5c873a3c29f7e00649535ddc2921fcc14ac92cb207f43b4b6e461d"},"package":"6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"}

View file

@ -0,0 +1,3 @@
This project is dual-licensed under the Unlicense and MIT licenses.
You may use this code under the terms of either license.

View file

@ -0,0 +1,86 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies
#
# If you believe there's an error in this file please file an
# issue against the rust-lang/cargo repository. If you're
# editing this file be aware that the upstream Cargo.toml
# will likely look very different (and much more reasonable)
[package]
name = "regex-automata"
version = "0.1.10"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
exclude = ["/.travis.yml", "/appveyor.yml", "/ci/*", "/scripts/*", "/regex-automata-debug"]
autoexamples = false
autotests = false
description = "Automata construction and matching using regular expressions."
homepage = "https://github.com/BurntSushi/regex-automata"
documentation = "https://docs.rs/regex-automata"
readme = "README.md"
keywords = ["regex", "dfa", "automata", "automaton", "nfa"]
categories = ["text-processing"]
license = "Unlicense/MIT"
repository = "https://github.com/BurntSushi/regex-automata"
[profile.bench]
debug = true
[profile.dev]
opt-level = 3
debug = true
[profile.release]
debug = true
[profile.test]
opt-level = 3
debug = true
[lib]
bench = false
[[test]]
name = "default"
path = "tests/tests.rs"
[dependencies.fst]
version = "0.4.0"
optional = true
[dependencies.regex-syntax]
version = "0.6.16"
optional = true
[dev-dependencies.bstr]
version = "0.2"
features = ["std"]
default-features = false
[dev-dependencies.lazy_static]
version = "1.2.0"
[dev-dependencies.regex]
version = "1.1"
[dev-dependencies.serde]
version = "1.0.82"
[dev-dependencies.serde_bytes]
version = "0.11"
[dev-dependencies.serde_derive]
version = "1.0.82"
[dev-dependencies.toml]
version = "0.4.10"
[features]
default = ["std"]
std = ["regex-syntax"]
transducer = ["std", "fst"]
[badges.appveyor]
repository = "BurntSushi/regex-automata"
[badges.travis-ci]
repository = "BurntSushi/regex-automata"

View file

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2015 Andrew Gallant
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

View file

@ -0,0 +1,223 @@
regex-automata
==============
A low level regular expression library that uses deterministic finite automata.
It supports a rich syntax with Unicode support, has extensive options for
configuring the best space vs time trade off for your use case and provides
support for cheap deserialization of automata for use in `no_std` environments.
[![Build status](https://github.com/BurntSushi/regex-automata/workflows/ci/badge.svg)](https://github.com/BurntSushi/regex-automata/actions)
[![on crates.io](https://meritbadge.herokuapp.com/regex-automata)](https://crates.io/crates/regex-automata)
![Minimum Supported Rust Version 1.41](https://img.shields.io/badge/rustc-1.41-green)
Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/).
### Documentation
https://docs.rs/regex-automata
### Usage
Add this to your `Cargo.toml`:
```toml
[dependencies]
regex-automata = "0.1"
```
and this to your crate root (if you're using Rust 2015):
```rust
extern crate regex_automata;
```
### Example: basic regex searching
This example shows how to compile a regex using the default configuration
and then use it to find matches in a byte string:
```rust
use regex_automata::Regex;
let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
let text = b"2018-12-24 2016-10-08";
let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
assert_eq!(matches, vec![(0, 10), (11, 21)]);
```
For more examples and information about the various knobs that can be turned,
please see the [docs](https://docs.rs/regex-automata).
### Support for `no_std`
This crate comes with a `std` feature that is enabled by default. When the
`std` feature is enabled, the API of this crate will include the facilities
necessary for compiling, serializing, deserializing and searching with regular
expressions. When the `std` feature is disabled, the API of this crate will
shrink such that it only includes the facilities necessary for deserializing
and searching with regular expressions.
The intended workflow for `no_std` environments is thus as follows:
* Write a program with the `std` feature that compiles and serializes a
regular expression. Serialization should only happen after first converting
the DFAs to use a fixed size state identifier instead of the default `usize`.
You may also need to serialize both little and big endian versions of each
DFA. (So that's 4 DFAs in total for each regex.)
* In your `no_std` environment, follow the examples above for deserializing
your previously serialized DFAs into regexes. You can then search with them
as you would any regex.
Deserialization can happen anywhere. For example, with bytes embedded into a
binary or with a file memory mapped at runtime.
Note that the
[`ucd-generate`](https://github.com/BurntSushi/ucd-generate)
tool will do the first step for you with its `dfa` or `regex` sub-commands.
### Cargo features
* `std` - **Enabled** by default. This enables the ability to compile finite
automata. This requires the `regex-syntax` dependency. Without this feature
enabled, finite automata can only be used for searching (using the approach
described above).
* `transducer` - **Disabled** by default. This provides implementations of the
`Automaton` trait found in the `fst` crate. This permits using finite
automata generated by this crate to search finite state transducers. This
requires the `fst` dependency.
### Differences with the regex crate
The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
general purpose regular expression engine. It aims to automatically balance low
compile times, fast search times and low memory usage, while also providing
a convenient API for users. In contrast, this crate provides a lower level
regular expression interface that is a bit less convenient while providing more
explicit control over memory usage and search times.
Here are some specific negative differences:
* **Compilation can take an exponential amount of time and space** in the size
of the regex pattern. While most patterns do not exhibit worst case
exponential time, such patterns do exist. For example, `[01]*1[01]{N}` will
build a DFA with `2^(N+1)` states. For this reason, untrusted patterns should
not be compiled with this library. (In the future, the API may expose an
option to return an error if the DFA gets too big.)
* This crate does not support sub-match extraction, which can be achieved with
the regex crate's "captures" API. This may be added in the future, but is
unlikely.
* While the regex crate doesn't necessarily sport fast compilation times, the
regexes in this crate are almost universally slow to compile, especially when
they contain large Unicode character classes. For example, on my system,
compiling `\w{3}` with byte classes enabled takes just over 1 second and
almost 5MB of memory! (Compiling a sparse regex takes about the same time
but only uses about 500KB of memory.) Conversely, compiling the same regex
without Unicode support, e.g., `(?-u)\w{3}`, takes under 1 millisecond and
less than 5KB of memory. For this reason, you should only use Unicode
character classes if you absolutely need them!
* This crate does not support regex sets.
* This crate does not support zero-width assertions such as `^`, `$`, `\b` or
`\B`.
* As a lower level crate, this library does not do literal optimizations. In
exchange, you get predictable performance regardless of input. The
philosophy here is that literal optimizations should be applied at a higher
level, although there is no easy support for this in the ecosystem yet.
* There is no `&str` API like in the regex crate. In this crate, all APIs
operate on `&[u8]`. By default, match indices are guaranteed to fall on
UTF-8 boundaries, unless `RegexBuilder::allow_invalid_utf8` is enabled.
With some of the downsides out of the way, here are some positive differences:
* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
deserialized. Deserialization always takes constant time since searching can
be performed directly on the raw serialized bytes of a DFA.
* This crate was specifically designed so that the searching phase of a DFA has
minimal runtime requirements, and can therefore be used in `no_std`
environments. While `no_std` environments cannot compile regexes, they can
deserialize pre-compiled regexes.
* Since this crate builds DFAs ahead of time, it will generally out-perform
the `regex` crate on equivalent tasks. The performance difference is likely
not large. However, because of a complex set of optimizations in the regex
crate (like literal optimizations), an accurate performance comparison may be
difficult to do.
* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
performance a small amount, but uses much less storage space. Potentially
even less than what the regex crate uses.
* This crate exposes DFAs directly, such as `DenseDFA` and `SparseDFA`,
which enables one to do less work in some cases. For example, if you only
need the end of a match and not the start of a match, then you can use a DFA
directly without building a `Regex`, which always requires a second DFA to
find the start of a match.
* Aside from choosing between dense and sparse DFAs, there are several options
for configuring the space usage vs search time trade off. These include
things like choosing a smaller state identifier representation, to
premultiplying state identifiers and splitting a DFA's alphabet into
equivalence classes. Finally, DFA minimization is also provided, but can
increase compilation times dramatically.
### Future work
* Look into being smarter about generating NFA states for large Unicode
character classes. These can create a lot of additional work for both the
determinizer and the minimizer, and I suspect this is the key thing we'll
want to improve if we want to make DFA compile times faster. I *believe*
it's possible to potentially build minimal or nearly minimal NFAs for the
special case of Unicode character classes by leveraging Daciuk's algorithms
for building minimal automata in linear time for sets of strings. See
https://blog.burntsushi.net/transducers/#construction for more details. The
key adaptation I think we need to make is to modify the algorithm to operate
on byte ranges instead of enumerating every codepoint in the set. Otherwise,
it might not be worth doing.
* Add support for regex sets. It should be possible to do this by "simply"
introducing more match states. I think we can also report the positions at
each match, similar to how Aho-Corasick works. I think the long pole in the
tent here is probably the API design work and arranging it so that we don't
introduce extra overhead into the non-regex-set case without duplicating a
lot of code. It seems doable.
* Stretch goal: support capturing groups by implementing "tagged" DFA
(transducers). Laurikari's paper is the usual reference here, but Trofimovich
has a much more thorough treatment here:
https://re2c.org/2017_trofimovich_tagged_deterministic_finite_automata_with_lookahead.pdf
I've only read the paper once. I suspect it will require at least a few more
read throughs before I understand it.
See also: https://re2c.org
* Possibly less ambitious goal: can we select a portion of Trofimovich's work
to make small fixed length look-around work? It would be really nice to
support ^, $ and \b, especially the Unicode variant of \b and CRLF aware $.
* Experiment with code generating Rust code. There is an early experiment in
src/codegen.rs that is thoroughly bit-rotted. At the time, I was
experimenting with whether or not codegen would significantly decrease the size
of a DFA, since if you squint hard enough, it's kind of like a sparse
representation. However, it didn't shrink as much as I thought it would, so
I gave up. The other problem is that Rust doesn't support gotos, so I don't
even know whether the "match on each state" in a loop thing will be fast
enough. Either way, it's probably a good option to have. For one thing, it
would be endian independent whereas the serialization format of the DFAs in
this crate are endian dependent (so you need two versions of every DFA, but
you only need to compile one of them for any given arch).
* Experiment with unrolling the match loops and fill out the benchmarks.
* Add some kind of streaming API. I believe users of the library can already
implement something for this outside of the crate, but it would be good to
provide an official API. The key thing here is figuring out the API. I
suspect we might want to support several variants.
* Make a decision on whether or not there is room for literal optimizations
in this crate. My original intent was to not let this crate sink down into
that very very very deep rabbit hole. But instead, we might want to provide
some way for literal optimizations to hook into the match routines. The right
path forward here is to probably build something outside of the crate and
then see about integrating it. After all, users can implement their own
match routines just as efficiently as what the crate provides.
* A key downside of DFAs is that they can take up a lot of memory and can be
quite costly to build. Their worst case compilation time is O(2^n), where
n is the number of NFA states. A paper by Yang and Prasanna (2011) actually
seems to provide a way to characterize state blow up such that it is detectable.
If we could know whether a regex will exhibit state explosion or not, then
we could make an intelligent decision about whether to ahead-of-time compile
a DFA.
See: https://www.researchgate.net/profile/Xu-Shutu/publication/229032602_Characterization_of_a_global_germplasm_collection_and_its_potential_utilization_for_analysis_of_complex_quantitative_traits_in_maize/links/02bfe50f914d04c837000000/Characterization-of-a-global-germplasm-collection-and-its-potential-utilization-for-analysis-of-complex-quantitative-traits-in-maize.pdf

View file

@ -0,0 +1,10 @@
* Remove the `empty` constructors for DFAs and replace them with
`never_match` and `always_match` constructors.
* Consider refactoring the NFA representation such that it can be instantly
loaded from a `&[u8]`, just like a sparse DFA. Main downside is that this
could negatively impact using the NFA with deserialization costs. Before
doing this, we should write PikeVM and backtracking implementations so that
they can be benchmarked.
* Add captures and anchors to NFA.
* Once we're happy, re-organize the public API such that NFAs are exported
and usable on their own.

View file

@ -0,0 +1,24 @@
This is free and unencumbered software released into the public domain.
Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.
In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
For more information, please refer to <http://unlicense.org/>

View file

@ -0,0 +1,19 @@
The following license covers testregex.c and all associated test data.
Permission is hereby granted, free of charge, to any person obtaining a
copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of the
Software, and to permit persons to whom the Software is furnished to do
so, subject to the following disclaimer:
THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View file

@ -0,0 +1,17 @@
Test data was taken from the Go distribution, which was in turn taken from the
testregex test suite:
http://www2.research.att.com/~astopen/testregex/testregex.html
The LICENSE in this directory corresponds to the LICENSE that the data was
released under.
The tests themselves were modified for RE2/Go. A couple were modified further
by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them.
(Yes, it seems like RE2/Go includes failing test cases.) This may or may not
have been a bad idea, but I think being consistent with an established Regex
library is worth something.
Note that these files are read by 'scripts/regex-match-tests.py' and turned
into Rust tests found in 'regex_macros/tests/matches.rs'.

View file

@ -0,0 +1,221 @@
NOTE all standard compliant implementations should pass these : 2002-05-31
BE abracadabra$ abracadabracadabra (7,18)
BE a...b abababbb (2,7)
BE XXXXXX ..XXXXXX (2,8)
E \) () (1,2)
BE a] a]a (0,2)
B } } (0,1)
E \} } (0,1)
BE \] ] (0,1)
B ] ] (0,1)
E ] ] (0,1)
B { { (0,1)
B } } (0,1)
BE ^a ax (0,1)
BE \^a a^a (1,3)
BE a\^ a^ (0,2)
BE a$ aa (1,2)
BE a\$ a$ (0,2)
BE ^$ NULL (0,0)
E $^ NULL (0,0)
E a($) aa (1,2)(2,2)
E a*(^a) aa (0,1)(0,1)
E (..)*(...)* a (0,0)
E (..)*(...)* abcd (0,4)(2,4)
E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
E (ab)c|abc abc (0,3)(0,2)
E a{0}b ab (1,2)
E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
E a{9876543210} NULL BADBR
E ((a|a)|a) a (0,1)(0,1)(0,1)
E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
E a*(a.|aa) aaaa (0,4)(2,4)
E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
E (a|b)?.* b (0,1)(0,1)
E (a|b)c|a(b|c) ac (0,2)(0,1)
E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
E (a|b)*c|(a|ab)*c xc (1,2)
E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
E a?(ab|ba)ab abab (0,4)(0,2)
E a?(ac{0}b|ba)ab abab (0,4)(0,2)
E ab|abab abbabab (0,2)
E aba|bab|bba baaabbbaba (5,8)
E aba|bab baaabbbaba (6,9)
E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
E ab|a xabc (1,3)
E ab|a xxabc (2,4)
Ei (Ab|cD)* aBcD (0,4)(2,4)
BE [^-] --a (2,3)
BE [a-]* --a (0,3)
BE [a-m-]* --amoma-- (0,4)
E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
{E [[:upper:]] A (0,1) [[<element>]] not supported
E [[:lower:]]+ `az{ (1,3)
E [[:upper:]]+ @AZ[ (1,3)
# No collation in Go
#BE [[-]] [[-]] (2,4)
#BE [[.NIL.]] NULL ECOLLATE
#BE [[=aleph=]] NULL ECOLLATE
}
BE$ \n \n (0,1)
BEn$ \n \n (0,1)
BE$ [^a] \n (0,1)
BE$ \na \na (0,2)
E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
BE xxx xxx (0,3)
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
BE$ .* \x01\x7f (0,2)
E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
E a*a*a*a*a*b aaaaaaaaab (0,10)
BE ^ NULL (0,0)
BE $ NULL (0,0)
BE ^$ NULL (0,0)
BE ^a$ a (0,1)
BE abc abc (0,3)
BE abc xabcy (1,4)
BE abc ababc (2,5)
BE ab*c abc (0,3)
BE ab*bc abc (0,3)
BE ab*bc abbc (0,4)
BE ab*bc abbbbc (0,6)
E ab+bc abbc (0,4)
E ab+bc abbbbc (0,6)
E ab?bc abbc (0,4)
E ab?bc abc (0,3)
E ab?c abc (0,3)
BE ^abc$ abc (0,3)
BE ^abc abcc (0,3)
BE abc$ aabc (1,4)
BE ^ abc (0,0)
BE $ abc (3,3)
BE a.c abc (0,3)
BE a.c axc (0,3)
BE a.*c axyzc (0,5)
BE a[bc]d abd (0,3)
BE a[b-d]e ace (0,3)
BE a[b-d] aac (1,3)
BE a[-b] a- (0,2)
BE a[b-] a- (0,2)
BE a] a] (0,2)
BE a[]]b a]b (0,3)
BE a[^bc]d aed (0,3)
BE a[^-b]c adc (0,3)
BE a[^]b]c adc (0,3)
E ab|cd abc (0,2)
E ab|cd abcd (0,2)
E a\(b a(b (0,3)
E a\(*b ab (0,2)
E a\(*b a((b (0,4)
E ((a)) abc (0,1)(0,1)(0,1)
E (a)b(c) abc (0,3)(0,1)(2,3)
E a+b+c aabbabc (4,7)
E a* aaa (0,3)
#E (a*)* - (0,0)(0,0)
E (a*)* - (0,0)(?,?) RE2/Go
E (a*)+ - (0,0)(0,0)
#E (a*|b)* - (0,0)(0,0)
E (a*|b)* - (0,0)(?,?) RE2/Go
E (a+|b)* ab (0,2)(1,2)
E (a+|b)+ ab (0,2)(1,2)
E (a+|b)? ab (0,1)(0,1)
BE [^ab]* cde (0,3)
#E (^)* - (0,0)(0,0)
E (^)* - (0,0)(?,?) RE2/Go
BE a* NULL (0,0)
E ([abc])*d abbbcd (0,6)(4,5)
E ([abc])*bcd abcd (0,4)(0,1)
E a|b|c|d|e e (0,1)
E (a|b|c|d|e)f ef (0,2)(0,1)
#E ((a*|b))* - (0,0)(0,0)(0,0)
E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
BE abcd*efg abcdefg (0,7)
BE ab* xabyabbbz (1,3)
BE ab* xayabbbz (1,2)
E (ab|cd)e abcde (2,5)(2,4)
BE [abhgefdc]ij hij (0,3)
E (a|b)c*d abcd (1,4)(1,2)
E (ab|ab*)bc abc (0,3)(0,1)
E a([bc]*)c* abc (0,3)(1,3)
E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
E a[bcd]*dcdcde adcdcde (0,7)
E (ab|a)b*c abc (0,3)(0,2)
E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
E ^a(bc+|b[eh])g|.h$ abh (1,3)
E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
BE multiple words multiple words yeah (0,14)
E (.*)c(.*) abcde (0,5)(0,2)(3,5)
BE abcd abcd (0,4)
E a(bc)d abcd (0,4)(1,3)
E a[-]?c ac (0,3)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
E a+(b|c)*d+ aabcdd (0,6)(3,4)
E ^.+$ vivi (0,4)
E ^(.+)$ vivi (0,4)(0,4)
E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
E ((foo)|bar)!bas bar!bas (0,7)(0,3)
E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
E (foo|(bar))!bas foo!bas (0,7)(0,3)
E (foo|bar)!bas bar!bas (0,7)(0,3)
E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
E (foo|bar)!bas foo!bas (0,7)(0,3)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
E .*(/XXX).* /XXX (0,4)(0,4)
E .*(\\XXX).* \XXX (0,4)(0,4)
E \\XXX \XXX (0,4)
E .*(/000).* /000 (0,4)(0,4)
E .*(\\000).* \000 (0,4)(0,4)
E \\000 \000 (0,4)

View file

@ -0,0 +1,79 @@
NOTE null subexpression matches : 2002-06-06
E (a*)* a (0,1)(0,1)
#E SAME x (0,0)(0,0)
E SAME x (0,0)(?,?) RE2/Go
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a*)+ a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a+)* a (0,1)(0,1)
E SAME x (0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a+)+ a (0,1)(0,1)
E SAME x NOMATCH
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([a]*)* a (0,1)(0,1)
#E SAME x (0,0)(0,0)
E SAME x (0,0)(?,?) RE2/Go
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([a]*)+ a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([^b]*)* a (0,1)(0,1)
#E SAME b (0,0)(0,0)
E SAME b (0,0)(?,?) RE2/Go
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaab (0,6)(0,6)
E ([ab]*)* a (0,1)(0,1)
E SAME aaaaaa (0,6)(0,6)
E SAME ababab (0,6)(0,6)
E SAME bababa (0,6)(0,6)
E SAME b (0,1)(0,1)
E SAME bbbbbb (0,6)(0,6)
E SAME aaaabcde (0,5)(0,5)
E ([^a]*)* b (0,1)(0,1)
E SAME bbbbbb (0,6)(0,6)
#E SAME aaaaaa (0,0)(0,0)
E SAME aaaaaa (0,0)(?,?) RE2/Go
E ([^ab]*)* ccccxx (0,6)(0,6)
#E SAME ababab (0,0)(0,0)
E SAME ababab (0,0)(?,?) RE2/Go
E ((z)+|a)* zabcde (0,2)(1,2)
#{E a+? aaaaaa (0,1) no *? +? minimal match ops
#E (a) aaa (0,1)(0,1)
#E (a*?) aaa (0,0)(0,0)
#E (a)*? aaa (0,0)
#E (a*?)*? aaa (0,0)
#}
B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
#E (a*)*(x) x (0,1)(0,0)(0,1)
E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
E (a*)*(x) ax (0,2)(0,1)(1,2)
E (a*)*(x) axa (0,2)(0,1)(1,2)
E (a*)+(x) x (0,1)(0,0)(0,1)
E (a*)+(x) ax (0,2)(0,1)(1,2)
E (a*)+(x) axa (0,2)(0,1)(1,2)
E (a*){2}(x) x (0,1)(0,0)(0,1)
E (a*){2}(x) ax (0,2)(1,1)(1,2)
E (a*){2}(x) axa (0,2)(1,1)(1,2)

View file

@ -0,0 +1,163 @@
NOTE implicit vs. explicit repetitions : 2009-02-02
# Glenn Fowler <gsf@research.att.com>
# conforming matches (column 4) must match one of the following BREs
# NOMATCH
# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
# i.e., each 3-tuple has two identical elements and one (?,?)
E ((..)|(.)) NULL NOMATCH
E ((..)|(.))((..)|(.)) NULL NOMATCH
E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
E ((..)|(.)){1} NULL NOMATCH
E ((..)|(.)){2} NULL NOMATCH
E ((..)|(.)){3} NULL NOMATCH
E ((..)|(.))* NULL (0,0)
E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
E ((..)|(.))((..)|(.)) a NOMATCH
E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
E ((..)|(.)){2} a NOMATCH
E ((..)|(.)){3} a NOMATCH
E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
E ((..)|(.)){3} aa NOMATCH
E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go
E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)
NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
# These test a bug in OS X / FreeBSD / NetBSD, and libtree.
# Linux/GLIBC gets the {8,} and {8,8} wrong.
:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8)
:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8)
:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8)
:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8)
:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8)
:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8)
:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8)
:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8)
:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8)
#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8)
:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8)
:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8)
:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8)
:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8)
:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8)
:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8)
:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8)
:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go
:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8)
# These test a fixed bug in my regex-tdfa that did not keep the expanded
# form properly grouped, so right association did the wrong thing with
# these ambiguous patterns (crafted just to test my code when I became
# suspicious of my implementation). The first subexpression should use
# "ab" then "a" then "bcd".
# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
# results like (0,6)(4,5)(6,6).
:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1)
:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1)
:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH
:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1)
:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1)
:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH
:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1)
:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1)
# The above worked on Linux/GLIBC but the following often fail.
# They also trip up OS X / FreeBSD / NetBSD:
#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH
#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH
#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go

View file

@ -0,0 +1,177 @@
[[tests]]
name = "crazy-misc1"
pattern = '[-+]?[0-9]*\.?[0-9]+'
input = "0.1"
matches = [[0, 3]]
[[tests]]
name = "crazy-misc2"
pattern = '[-+]?[0-9]*\.?[0-9]+'
input = "0.1.2"
matches = [[0, 3]]
[[tests]]
name = "crazy-misc3"
pattern = '[-+]?[0-9]*\.?[0-9]+'
input = "a1.2"
matches = [[1, 4]]
[[tests]]
options = ["case-insensitive"]
name = "crazy-misc4"
pattern = '[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
input = "mine is jam.slam@gmail.com "
matches = [[8, 26]]
[[tests]]
options = ["case-insensitive"]
name = "crazy-misc5"
pattern = '[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
input = "mine is jam.slam@gmail "
matches = []
[[tests]]
name = "crazy-misc6"
pattern = '''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?'''
input = "mine is jam.slam@gmail.com "
matches = [[8, 26]]
[[tests]]
name = "crazy-misc7"
pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
input = "1900-01-01"
matches = [[0, 10]]
[[tests]]
name = "crazy-misc8"
pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
input = "1900-00-01"
matches = []
[[tests]]
name = "crazy-misc9"
pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])'
input = "1900-13-01"
matches = []
[[tests]]
name = "crazy-negclass1"
pattern = "[^ac]"
input = "acx"
matches = [[2, 3]]
[[tests]]
name = "crazy-negclass2"
pattern = "[^a,]"
input = "a,x"
matches = [[2, 3]]
[[tests]]
name = "crazy-negclass3"
pattern = '[^a\s]'
input = "a x"
matches = [[2, 3]]
[[tests]]
name = "crazy-negclass4"
pattern = "[^,]"
input = ",,x"
matches = [[2, 3]]
[[tests]]
name = "crazy-negclass5"
pattern = '[^\s]'
input = " a"
matches = [[1, 2]]
[[tests]]
name = "crazy-negclass6"
pattern = '[^,\s]'
input = ", a"
matches = [[2, 3]]
[[tests]]
name = "crazy-negclass7"
pattern = '[^\s,]'
input = " ,a"
matches = [[2, 3]]
[[tests]]
name = "crazy-negclass8"
pattern = "[^[:alpha:]Z]"
input = "A1"
matches = [[1, 2]]
[[tests]]
name = "crazy-empty-repeat1"
pattern = "((.*)*?)="
input = "a=b"
matches = [[0, 2]]
[[tests]]
name = "crazy-empty-repeat2"
pattern = "((.?)*?)="
input = "a=b"
matches = [[0, 2]]
[[tests]]
name = "crazy-empty-repeat3"
pattern = "((.*)+?)="
input = "a=b"
matches = [[0, 2]]
[[tests]]
name = "crazy-empty-repeat4"
pattern = "((.?)+?)="
input = "a=b"
matches = [[0, 2]]
[[tests]]
name = "crazy-empty-repeat5"
pattern = "((.*){1,}?)="
input = "a=b"
matches = [[0, 2]]
[[tests]]
name = "crazy-empty-repeat6"
pattern = "((.*){1,2}?)="
input = "a=b"
matches = [[0, 2]]
[[tests]]
name = "crazy-empty-repeat7"
pattern = "((.*)*)="
input = "a=b"
matches = [[0, 2]]
[[tests]]
name = "crazy-empty-repeat8"
pattern = "((.?)*)="
input = "a=b"
matches = [[0, 2]]
[[tests]]
name = "crazy-empty-repeat9"
pattern = "((.*)+)="
input = "a=b"
matches = [[0, 2]]
[[tests]]
name = "crazy-empty-repeat10"
pattern = "((.?)+)="
input = "a=b"
matches = [[0, 2]]
[[tests]]
name = "crazy-empty-repeat11"
pattern = "((.*){1,})="
input = "a=b"
matches = [[0, 2]]
[[tests]]
name = "crazy-empty-repeat12"
pattern = "((.*){1,2})="
input = "a=b"
matches = [[0, 2]]

View file

@ -0,0 +1,59 @@
[[tests]]
name = "flags1"
pattern = "(?i)abc"
input = "ABC"
matches = [[0, 3]]
[[tests]]
name = "flags2"
pattern = "(?i)a(?-i)bc"
input = "Abc"
matches = [[0, 3]]
[[tests]]
name = "flags3"
pattern = "(?i)a(?-i)bc"
input = "ABC"
matches = []
[[tests]]
name = "flags4"
pattern = "(?is)a."
input = "A\n"
matches = [[0, 2]]
[[tests]]
name = "flags5"
pattern = "(?is)a.(?-is)a."
input = "A\nab"
matches = [[0, 4]]
[[tests]]
name = "flags6"
pattern = "(?is)a.(?-is)a."
input = "A\na\n"
matches = []
[[tests]]
name = "flags7"
pattern = "(?is)a.(?-is:a.)?"
input = "A\na\n"
matches = [[0, 2]]
[[tests]]
name = "flags8"
pattern = "(?U)a+"
input = "aa"
matches = [[0, 1]]
[[tests]]
name = "flags9"
pattern = "(?U)a+?"
input = "aa"
matches = [[0, 2]]
[[tests]]
name = "flags10"
pattern = "(?U)(?-U)a+"
input = "aa"
matches = [[0, 2]]

View file

@ -0,0 +1,19 @@
The following license covers testregex.c and all associated test data.
Permission is hereby granted, free of charge, to any person obtaining a
copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software
without restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, and/or sell copies of the
Software, and to permit persons to whom the Software is furnished to do
so, subject to the following disclaimer:
THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View file

@ -0,0 +1,23 @@
Test data was taken from the Go distribution, which was in turn taken from the
testregex test suite:
http://www2.research.att.com/~astopen/testregex/testregex.html
Unfortunately, the above link is now dead, but the test data lives on.
The LICENSE in this directory corresponds to the LICENSE that the data was
originally released under.
The tests themselves were modified for RE2/Go. A couple were modified further
by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them.
(Yes, it seems like RE2/Go includes failing test cases.) This may or may not
have been a bad idea, but I think being consistent with an established Regex
library is worth something.
After some number of years, these tests were transformed into a TOML format
using the fowler-to-toml script in this directory, e.g.,
./fowler-to-toml basic.dat > basic.toml
which brings them into a sensible structured format in which other tests can
be written.

View file

@ -0,0 +1,221 @@
NOTE all standard compliant implementations should pass these : 2002-05-31
BE abracadabra$ abracadabracadabra (7,18)
BE a...b abababbb (2,7)
BE XXXXXX ..XXXXXX (2,8)
E \) () (1,2)
BE a] a]a (0,2)
B } } (0,1)
E \} } (0,1)
BE \] ] (0,1)
B ] ] (0,1)
E ] ] (0,1)
B { { (0,1)
B } } (0,1)
BE ^a ax (0,1)
BE \^a a^a (1,3)
BE a\^ a^ (0,2)
BE a$ aa (1,2)
BE a\$ a$ (0,2)
BE ^$ NULL (0,0)
E $^ NULL (0,0)
E a($) aa (1,2)(2,2)
E a*(^a) aa (0,1)(0,1)
E (..)*(...)* a (0,0)
E (..)*(...)* abcd (0,4)(2,4)
E (ab|a)(bc|c) abc (0,3)(0,2)(2,3)
E (ab)c|abc abc (0,3)(0,2)
E a{0}b ab (1,2)
E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7)
E a{9876543210} NULL BADBR
E ((a|a)|a) a (0,1)(0,1)(0,1)
E (a*)(a|aa) aaaa (0,4)(0,3)(3,4)
E a*(a.|aa) aaaa (0,4)(2,4)
E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2)
E (a|b)?.* b (0,1)(0,1)
E (a|b)c|a(b|c) ac (0,2)(0,1)
E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2)
E (a|b)*c|(a|ab)*c abc (0,3)(1,2)
E (a|b)*c|(a|ab)*c xc (1,2)
E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2)
E a?(ab|ba)ab abab (0,4)(0,2)
E a?(ac{0}b|ba)ab abab (0,4)(0,2)
E ab|abab abbabab (0,2)
E aba|bab|bba baaabbbaba (5,8)
E aba|bab baaabbbaba (6,9)
E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2)
E (a.|.a.)*|(a|.a...) aa (0,2)(0,2)
E ab|a xabc (1,3)
E ab|a xxabc (2,4)
Ei (Ab|cD)* aBcD (0,4)(2,4)
BE [^-] --a (2,3)
BE [a-]* --a (0,3)
BE [a-m-]* --amoma-- (0,4)
E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17)
E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17)
{E [[:upper:]] A (0,1) [[<element>]] not supported
E [[:lower:]]+ `az{ (1,3)
E [[:upper:]]+ @AZ[ (1,3)
# No collation in Go
#BE [[-]] [[-]] (2,4)
#BE [[.NIL.]] NULL ECOLLATE
#BE [[=aleph=]] NULL ECOLLATE
}
BE$ \n \n (0,1)
BEn$ \n \n (0,1)
BE$ [^a] \n (0,1)
BE$ \na \na (0,2)
E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3)
BE xxx xxx (0,3)
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6)
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3)
E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11)
E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1)
E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2)
E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81)
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25)
E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22)
E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11)
BE$ .* \x01\x7f (0,2)
E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57)
L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH
E a*a*a*a*a*b aaaaaaaaab (0,10)
BE ^ NULL (0,0)
BE $ NULL (0,0)
BE ^$ NULL (0,0)
BE ^a$ a (0,1)
BE abc abc (0,3)
BE abc xabcy (1,4)
BE abc ababc (2,5)
BE ab*c abc (0,3)
BE ab*bc abc (0,3)
BE ab*bc abbc (0,4)
BE ab*bc abbbbc (0,6)
E ab+bc abbc (0,4)
E ab+bc abbbbc (0,6)
E ab?bc abbc (0,4)
E ab?bc abc (0,3)
E ab?c abc (0,3)
BE ^abc$ abc (0,3)
BE ^abc abcc (0,3)
BE abc$ aabc (1,4)
BE ^ abc (0,0)
BE $ abc (3,3)
BE a.c abc (0,3)
BE a.c axc (0,3)
BE a.*c axyzc (0,5)
BE a[bc]d abd (0,3)
BE a[b-d]e ace (0,3)
BE a[b-d] aac (1,3)
BE a[-b] a- (0,2)
BE a[b-] a- (0,2)
BE a] a] (0,2)
BE a[]]b a]b (0,3)
BE a[^bc]d aed (0,3)
BE a[^-b]c adc (0,3)
BE a[^]b]c adc (0,3)
E ab|cd abc (0,2)
E ab|cd abcd (0,2)
E a\(b a(b (0,3)
E a\(*b ab (0,2)
E a\(*b a((b (0,4)
E ((a)) abc (0,1)(0,1)(0,1)
E (a)b(c) abc (0,3)(0,1)(2,3)
E a+b+c aabbabc (4,7)
E a* aaa (0,3)
#E (a*)* - (0,0)(0,0)
E (a*)* - (0,0)(?,?) RE2/Go
E (a*)+ - (0,0)(0,0)
#E (a*|b)* - (0,0)(0,0)
E (a*|b)* - (0,0)(?,?) RE2/Go
E (a+|b)* ab (0,2)(1,2)
E (a+|b)+ ab (0,2)(1,2)
E (a+|b)? ab (0,1)(0,1)
BE [^ab]* cde (0,3)
#E (^)* - (0,0)(0,0)
E (^)* - (0,0)(?,?) RE2/Go
BE a* NULL (0,0)
E ([abc])*d abbbcd (0,6)(4,5)
E ([abc])*bcd abcd (0,4)(0,1)
E a|b|c|d|e e (0,1)
E (a|b|c|d|e)f ef (0,2)(0,1)
#E ((a*|b))* - (0,0)(0,0)(0,0)
E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go
BE abcd*efg abcdefg (0,7)
BE ab* xabyabbbz (1,3)
BE ab* xayabbbz (1,2)
E (ab|cd)e abcde (2,5)(2,4)
BE [abhgefdc]ij hij (0,3)
E (a|b)c*d abcd (1,4)(1,2)
E (ab|ab*)bc abc (0,3)(0,1)
E a([bc]*)c* abc (0,3)(1,3)
E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4)
E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4)
E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4)
E a[bcd]*dcdcde adcdcde (0,7)
E (ab|a)b*c abc (0,3)(0,2)
E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4)
BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5)
E ^a(bc+|b[eh])g|.h$ abh (1,3)
E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5)
E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2)
E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6)
E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)
BE multiple words multiple words yeah (0,14)
E (.*)c(.*) abcde (0,5)(0,2)(3,5)
BE abcd abcd (0,4)
E a(bc)d abcd (0,4)(1,3)
E a[-]?c ac (0,3)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12)
E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12)
E a+(b|c)*d+ aabcdd (0,6)(3,4)
E ^.+$ vivi (0,4)
E ^(.+)$ vivi (0,4)(0,4)
E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19)
E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3)
E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7)
E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7)
E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11)
E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3)
E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7)
E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3)
E ((foo)|bar)!bas bar!bas (0,7)(0,3)
E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7)
E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3)
E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3)
E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7)
E (foo|(bar))!bas foo!bas (0,7)(0,3)
E (foo|bar)!bas bar!bas (0,7)(0,3)
E (foo|bar)!bas foo!bar!bas (4,11)(4,7)
E (foo|bar)!bas foo!bas (0,7)(0,3)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11)
E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11)
E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7)
E .*(/XXX).* /XXX (0,4)(0,4)
E .*(\\XXX).* \XXX (0,4)(0,4)
E \\XXX \XXX (0,4)
E .*(/000).* /000 (0,4)(0,4)
E .*(\\000).* \000 (0,4)(0,4)
E \\000 \000 (0,4)

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,76 @@
#!/usr/bin/env python
from __future__ import absolute_import, division, print_function
import argparse
import os.path as path
def read_tests(f):
    """Parse an AT&T POSIX ``.dat`` regex test file into a list of dicts.

    Each dict holds TOML-ready *strings* (already quoted/escaped for direct
    printing): ``name``, ``options``, ``pattern``, ``input`` and ``matches``.
    Only the overall match span is recorded in ``matches`` — the generated
    TOML tests do not track subgroup captures.
    """
    basename, _ = path.splitext(path.basename(f))
    tests = []
    prev_pattern = None
    # Use a context manager so the file handle is closed deterministically
    # instead of leaking until garbage collection.
    with open(f) as datfile:
        for lineno, line in enumerate(datfile, 1):
            fields = list(filter(None, map(str.strip, line.split('\t'))))
            # Skip comment lines, malformed rows, and tests that are not
            # flagged for ERE ('E' in the terse-options column).
            if not (4 <= len(fields) <= 5) \
                    or 'E' not in fields[0] or fields[0][0] == '#':
                continue
            terse_opts, pat, text, sgroups = fields[0:4]
            groups = []  # groups as integer ranges
            if sgroups == 'NOMATCH':
                groups = []
            elif ',' in sgroups:
                noparen = map(lambda s: s.strip('()'), sgroups.split(')('))
                for g in noparen:
                    s, e = map(str.strip, g.split(','))
                    groups.append([int(s), int(e)])
                    # Keep only the full-match span (the first tuple); later
                    # tuples may be '(?,?)', which int() cannot parse anyway.
                    break
            else:
                # This skips tests that should result in an error.
                # There aren't many, so I think we can just capture those
                # manually. Possibly fix this in future.
                continue
            opts = []
            if text == "NULL":
                text = ""
            if pat == 'SAME':
                pat = prev_pattern
            if '$' in terse_opts:
                # '$' marks rows whose pattern/text contain backslash escapes
                # (e.g. \n): interpret them, then re-escape for TOML output.
                pat = pat.encode('utf-8').decode('unicode_escape')
                text = text.encode('utf-8').decode('unicode_escape')
                text = text.encode('unicode_escape').decode('utf-8')
                opts.append('escaped')
            else:
                opts.append('escaped')
                text = text.encode('unicode_escape').decode('utf-8')
            if 'i' in terse_opts:
                opts.append('case-insensitive')
            pat = pat.encode('unicode_escape').decode('utf-8')
            pat = pat.replace('\\\\', '\\')
            tests.append({
                'name': '"%s%d"' % (basename, lineno),
                'options': repr(opts),
                'pattern': "'''%s'''" % pat,
                'input': "'''%s'''" % text,
                'matches': str(groups),
            })
            prev_pattern = pat
    return tests
# Command-line driver: convert one AT&T POSIX .dat file to TOML on stdout.
if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(
        description='Generate match tests from an AT&T POSIX test file.')
    arg_parser.add_argument('datfile', help='A dat AT&T POSIX test file.')
    cli_args = arg_parser.parse_args()
    # Emit one [[tests]] table per parsed test; values are already
    # TOML-formatted strings, so they are printed verbatim.
    for test in read_tests(cli_args.datfile):
        print('[[tests]]')
        for key, value in test.items():
            print('%s = %s' % (key, value))
        print('')

View file

@ -0,0 +1,79 @@
NOTE null subexpression matches : 2002-06-06
E (a*)* a (0,1)(0,1)
#E SAME x (0,0)(0,0)
E SAME x (0,0)(?,?) RE2/Go
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a*)+ a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a+)* a (0,1)(0,1)
E SAME x (0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E (a+)+ a (0,1)(0,1)
E SAME x NOMATCH
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([a]*)* a (0,1)(0,1)
#E SAME x (0,0)(0,0)
E SAME x (0,0)(?,?) RE2/Go
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([a]*)+ a (0,1)(0,1)
E SAME x (0,0)(0,0)
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaax (0,6)(0,6)
E ([^b]*)* a (0,1)(0,1)
#E SAME b (0,0)(0,0)
E SAME b (0,0)(?,?) RE2/Go
E SAME aaaaaa (0,6)(0,6)
E SAME aaaaaab (0,6)(0,6)
E ([ab]*)* a (0,1)(0,1)
E SAME aaaaaa (0,6)(0,6)
E SAME ababab (0,6)(0,6)
E SAME bababa (0,6)(0,6)
E SAME b (0,1)(0,1)
E SAME bbbbbb (0,6)(0,6)
E SAME aaaabcde (0,5)(0,5)
E ([^a]*)* b (0,1)(0,1)
E SAME bbbbbb (0,6)(0,6)
#E SAME aaaaaa (0,0)(0,0)
E SAME aaaaaa (0,0)(?,?) RE2/Go
E ([^ab]*)* ccccxx (0,6)(0,6)
#E SAME ababab (0,0)(0,0)
E SAME ababab (0,0)(?,?) RE2/Go
E ((z)+|a)* zabcde (0,2)(1,2)
#{E a+? aaaaaa (0,1) no *? +? minimal match ops
#E (a) aaa (0,1)(0,1)
#E (a*?) aaa (0,0)(0,0)
#E (a)*? aaa (0,0)
#E (a*?)*? aaa (0,0)
#}
B \(a*\)*\(x\) x (0,1)(0,0)(0,1)
B \(a*\)*\(x\) ax (0,2)(0,1)(1,2)
B \(a*\)*\(x\) axa (0,2)(0,1)(1,2)
B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1)
B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2)
B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3)
B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4)
B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3)
#E (a*)*(x) x (0,1)(0,0)(0,1)
E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go
E (a*)*(x) ax (0,2)(0,1)(1,2)
E (a*)*(x) axa (0,2)(0,1)(1,2)
E (a*)+(x) x (0,1)(0,0)(0,1)
E (a*)+(x) ax (0,2)(0,1)(1,2)
E (a*)+(x) axa (0,2)(0,1)(1,2)
E (a*){2}(x) x (0,1)(0,0)(0,1)
E (a*){2}(x) ax (0,2)(1,1)(1,2)
E (a*){2}(x) axa (0,2)(1,1)(1,2)

View file

@ -0,0 +1,350 @@
[[tests]]
name = "nullsubexpr3"
options = ['escaped']
pattern = '''(a*)*'''
input = '''a'''
matches = [[0, 1]]
[[tests]]
name = "nullsubexpr5"
options = ['escaped']
pattern = '''(a*)*'''
input = '''x'''
matches = [[0, 0]]
[[tests]]
name = "nullsubexpr6"
options = ['escaped']
pattern = '''(a*)*'''
input = '''aaaaaa'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr7"
options = ['escaped']
pattern = '''(a*)*'''
input = '''aaaaaax'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr8"
options = ['escaped']
pattern = '''(a*)+'''
input = '''a'''
matches = [[0, 1]]
[[tests]]
name = "nullsubexpr9"
options = ['escaped']
pattern = '''(a*)+'''
input = '''x'''
matches = [[0, 0]]
[[tests]]
name = "nullsubexpr10"
options = ['escaped']
pattern = '''(a*)+'''
input = '''aaaaaa'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr11"
options = ['escaped']
pattern = '''(a*)+'''
input = '''aaaaaax'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr12"
options = ['escaped']
pattern = '''(a+)*'''
input = '''a'''
matches = [[0, 1]]
[[tests]]
name = "nullsubexpr13"
options = ['escaped']
pattern = '''(a+)*'''
input = '''x'''
matches = [[0, 0]]
[[tests]]
name = "nullsubexpr14"
options = ['escaped']
pattern = '''(a+)*'''
input = '''aaaaaa'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr15"
options = ['escaped']
pattern = '''(a+)*'''
input = '''aaaaaax'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr16"
options = ['escaped']
pattern = '''(a+)+'''
input = '''a'''
matches = [[0, 1]]
[[tests]]
name = "nullsubexpr17"
options = ['escaped']
pattern = '''(a+)+'''
input = '''x'''
matches = []
[[tests]]
name = "nullsubexpr18"
options = ['escaped']
pattern = '''(a+)+'''
input = '''aaaaaa'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr19"
options = ['escaped']
pattern = '''(a+)+'''
input = '''aaaaaax'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr21"
options = ['escaped']
pattern = '''([a]*)*'''
input = '''a'''
matches = [[0, 1]]
[[tests]]
name = "nullsubexpr23"
options = ['escaped']
pattern = '''([a]*)*'''
input = '''x'''
matches = [[0, 0]]
[[tests]]
name = "nullsubexpr24"
options = ['escaped']
pattern = '''([a]*)*'''
input = '''aaaaaa'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr25"
options = ['escaped']
pattern = '''([a]*)*'''
input = '''aaaaaax'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr26"
options = ['escaped']
pattern = '''([a]*)+'''
input = '''a'''
matches = [[0, 1]]
[[tests]]
name = "nullsubexpr27"
options = ['escaped']
pattern = '''([a]*)+'''
input = '''x'''
matches = [[0, 0]]
[[tests]]
name = "nullsubexpr28"
options = ['escaped']
pattern = '''([a]*)+'''
input = '''aaaaaa'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr29"
options = ['escaped']
pattern = '''([a]*)+'''
input = '''aaaaaax'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr30"
options = ['escaped']
pattern = '''([^b]*)*'''
input = '''a'''
matches = [[0, 1]]
[[tests]]
name = "nullsubexpr32"
options = ['escaped']
pattern = '''([^b]*)*'''
input = '''b'''
matches = [[0, 0]]
[[tests]]
name = "nullsubexpr33"
options = ['escaped']
pattern = '''([^b]*)*'''
input = '''aaaaaa'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr34"
options = ['escaped']
pattern = '''([^b]*)*'''
input = '''aaaaaab'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr35"
options = ['escaped']
pattern = '''([ab]*)*'''
input = '''a'''
matches = [[0, 1]]
[[tests]]
name = "nullsubexpr36"
options = ['escaped']
pattern = '''([ab]*)*'''
input = '''aaaaaa'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr37"
options = ['escaped']
pattern = '''([ab]*)*'''
input = '''ababab'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr38"
options = ['escaped']
pattern = '''([ab]*)*'''
input = '''bababa'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr39"
options = ['escaped']
pattern = '''([ab]*)*'''
input = '''b'''
matches = [[0, 1]]
[[tests]]
name = "nullsubexpr40"
options = ['escaped']
pattern = '''([ab]*)*'''
input = '''bbbbbb'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr41"
options = ['escaped']
pattern = '''([ab]*)*'''
input = '''aaaabcde'''
matches = [[0, 5]]
[[tests]]
name = "nullsubexpr42"
options = ['escaped']
pattern = '''([^a]*)*'''
input = '''b'''
matches = [[0, 1]]
[[tests]]
name = "nullsubexpr43"
options = ['escaped']
pattern = '''([^a]*)*'''
input = '''bbbbbb'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr45"
options = ['escaped']
pattern = '''([^a]*)*'''
input = '''aaaaaa'''
matches = [[0, 0]]
[[tests]]
name = "nullsubexpr46"
options = ['escaped']
pattern = '''([^ab]*)*'''
input = '''ccccxx'''
matches = [[0, 6]]
[[tests]]
name = "nullsubexpr48"
options = ['escaped']
pattern = '''([^ab]*)*'''
input = '''ababab'''
matches = [[0, 0]]
[[tests]]
name = "nullsubexpr50"
options = ['escaped']
pattern = '''((z)+|a)*'''
input = '''zabcde'''
matches = [[0, 2]]
[[tests]]
name = "nullsubexpr69"
options = ['escaped']
pattern = '''(a*)*(x)'''
input = '''x'''
matches = [[0, 1]]
[[tests]]
name = "nullsubexpr70"
options = ['escaped']
pattern = '''(a*)*(x)'''
input = '''ax'''
matches = [[0, 2]]
[[tests]]
name = "nullsubexpr71"
options = ['escaped']
pattern = '''(a*)*(x)'''
input = '''axa'''
matches = [[0, 2]]
[[tests]]
name = "nullsubexpr73"
options = ['escaped']
pattern = '''(a*)+(x)'''
input = '''x'''
matches = [[0, 1]]
[[tests]]
name = "nullsubexpr74"
options = ['escaped']
pattern = '''(a*)+(x)'''
input = '''ax'''
matches = [[0, 2]]
[[tests]]
name = "nullsubexpr75"
options = ['escaped']
pattern = '''(a*)+(x)'''
input = '''axa'''
matches = [[0, 2]]
[[tests]]
name = "nullsubexpr77"
options = ['escaped']
pattern = '''(a*){2}(x)'''
input = '''x'''
matches = [[0, 1]]
[[tests]]
name = "nullsubexpr78"
options = ['escaped']
pattern = '''(a*){2}(x)'''
input = '''ax'''
matches = [[0, 2]]
[[tests]]
name = "nullsubexpr79"
options = ['escaped']
pattern = '''(a*){2}(x)'''
input = '''axa'''
matches = [[0, 2]]

View file

@ -0,0 +1,85 @@
NOTE implicit vs. explicit repetitions : 2009-02-02
# Glenn Fowler <gsf@research.att.com>
# conforming matches (column 4) must match one of the following BREs
# NOMATCH
# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
# i.e., each 3-tuple has two identical elements and one (?,?)
NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02
:HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8)
:HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8)
:HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8)
:HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8)
:HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8)
:HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8)
:HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8)
:HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8)
:HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8)
#:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8)
:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8)
:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8)
:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8)
:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8)
:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8)
:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8)
:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go
#:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8)
:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go
:HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8)
# These test a fixed bug in my regex-tdfa that did not keep the expanded
# form properly grouped, so right association did the wrong thing with
# these ambiguous patterns (crafted just to test my code when I became
# suspicious of my implementation). The first subexpression should use
# "ab" then "a" then "bcd".
# OS X / FreeBSD / NetBSD badly fail many of these, with impossible
# results like (0,6)(4,5)(6,6).
:HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1)
:HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1)
:HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH
:HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1)
:HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1)
:HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH
:HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1)
:HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1)
# The above worked on Linux/GLIBC but the following often fail.
# They also trip up OS X / FreeBSD / NetBSD:
#:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6)
:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
:HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH
#:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6)
:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
:HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH
#:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6)
:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go
#:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6)
:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go

View file

@ -0,0 +1,294 @@
[[tests]]
name = "repetition-long12"
options = ['escaped']
pattern = '''X(.?){0,}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long13"
options = ['escaped']
pattern = '''X(.?){1,}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long14"
options = ['escaped']
pattern = '''X(.?){2,}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long15"
options = ['escaped']
pattern = '''X(.?){3,}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long16"
options = ['escaped']
pattern = '''X(.?){4,}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long17"
options = ['escaped']
pattern = '''X(.?){5,}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long18"
options = ['escaped']
pattern = '''X(.?){6,}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long19"
options = ['escaped']
pattern = '''X(.?){7,}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long20"
options = ['escaped']
pattern = '''X(.?){8,}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long22"
options = ['escaped']
pattern = '''X(.?){0,8}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long24"
options = ['escaped']
pattern = '''X(.?){1,8}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long26"
options = ['escaped']
pattern = '''X(.?){2,8}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long28"
options = ['escaped']
pattern = '''X(.?){3,8}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long30"
options = ['escaped']
pattern = '''X(.?){4,8}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long32"
options = ['escaped']
pattern = '''X(.?){5,8}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long34"
options = ['escaped']
pattern = '''X(.?){6,8}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long36"
options = ['escaped']
pattern = '''X(.?){7,8}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long37"
options = ['escaped']
pattern = '''X(.?){8,8}Y'''
input = '''X1234567Y'''
matches = [[0, 9]]
[[tests]]
name = "repetition-long48"
options = ['escaped']
pattern = '''(a|ab|c|bcd){0,}(d*)'''
input = '''ababcd'''
matches = [[0, 1]]
[[tests]]
name = "repetition-long49"
options = ['escaped']
pattern = '''(a|ab|c|bcd){1,}(d*)'''
input = '''ababcd'''
matches = [[0, 1]]
[[tests]]
name = "repetition-long50"
options = ['escaped']
pattern = '''(a|ab|c|bcd){2,}(d*)'''
input = '''ababcd'''
matches = [[0, 6]]
[[tests]]
name = "repetition-long51"
options = ['escaped']
pattern = '''(a|ab|c|bcd){3,}(d*)'''
input = '''ababcd'''
matches = [[0, 6]]
[[tests]]
name = "repetition-long52"
options = ['escaped']
pattern = '''(a|ab|c|bcd){4,}(d*)'''
input = '''ababcd'''
matches = []
[[tests]]
name = "repetition-long53"
options = ['escaped']
pattern = '''(a|ab|c|bcd){0,10}(d*)'''
input = '''ababcd'''
matches = [[0, 1]]
[[tests]]
name = "repetition-long54"
options = ['escaped']
pattern = '''(a|ab|c|bcd){1,10}(d*)'''
input = '''ababcd'''
matches = [[0, 1]]
[[tests]]
name = "repetition-long55"
options = ['escaped']
pattern = '''(a|ab|c|bcd){2,10}(d*)'''
input = '''ababcd'''
matches = [[0, 6]]
[[tests]]
name = "repetition-long56"
options = ['escaped']
pattern = '''(a|ab|c|bcd){3,10}(d*)'''
input = '''ababcd'''
matches = [[0, 6]]
[[tests]]
name = "repetition-long57"
options = ['escaped']
pattern = '''(a|ab|c|bcd){4,10}(d*)'''
input = '''ababcd'''
matches = []
[[tests]]
name = "repetition-long58"
options = ['escaped']
pattern = '''(a|ab|c|bcd)*(d*)'''
input = '''ababcd'''
matches = [[0, 1]]
[[tests]]
name = "repetition-long59"
options = ['escaped']
pattern = '''(a|ab|c|bcd)+(d*)'''
input = '''ababcd'''
matches = [[0, 1]]
[[tests]]
name = "repetition-long65"
options = ['escaped']
pattern = '''(ab|a|c|bcd){0,}(d*)'''
input = '''ababcd'''
matches = [[0, 6]]
[[tests]]
name = "repetition-long67"
options = ['escaped']
pattern = '''(ab|a|c|bcd){1,}(d*)'''
input = '''ababcd'''
matches = [[0, 6]]
[[tests]]
name = "repetition-long69"
options = ['escaped']
pattern = '''(ab|a|c|bcd){2,}(d*)'''
input = '''ababcd'''
matches = [[0, 6]]
[[tests]]
name = "repetition-long71"
options = ['escaped']
pattern = '''(ab|a|c|bcd){3,}(d*)'''
input = '''ababcd'''
matches = [[0, 6]]
[[tests]]
name = "repetition-long72"
options = ['escaped']
pattern = '''(ab|a|c|bcd){4,}(d*)'''
input = '''ababcd'''
matches = []
[[tests]]
name = "repetition-long74"
options = ['escaped']
pattern = '''(ab|a|c|bcd){0,10}(d*)'''
input = '''ababcd'''
matches = [[0, 6]]
[[tests]]
name = "repetition-long76"
options = ['escaped']
pattern = '''(ab|a|c|bcd){1,10}(d*)'''
input = '''ababcd'''
matches = [[0, 6]]
[[tests]]
name = "repetition-long78"
options = ['escaped']
pattern = '''(ab|a|c|bcd){2,10}(d*)'''
input = '''ababcd'''
matches = [[0, 6]]
[[tests]]
name = "repetition-long80"
options = ['escaped']
pattern = '''(ab|a|c|bcd){3,10}(d*)'''
input = '''ababcd'''
matches = [[0, 6]]
[[tests]]
name = "repetition-long81"
options = ['escaped']
pattern = '''(ab|a|c|bcd){4,10}(d*)'''
input = '''ababcd'''
matches = []
[[tests]]
name = "repetition-long83"
options = ['escaped']
pattern = '''(ab|a|c|bcd)*(d*)'''
input = '''ababcd'''
matches = [[0, 6]]
[[tests]]
name = "repetition-long85"
options = ['escaped']
pattern = '''(ab|a|c|bcd)+(d*)'''
input = '''ababcd'''
matches = [[0, 6]]

View file

@ -0,0 +1,83 @@
NOTE implicit vs. explicit repetitions : 2009-02-02
# Glenn Fowler <gsf@research.att.com>
# conforming matches (column 4) must match one of the following BREs
# NOMATCH
# (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)*
# (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)*
# i.e., each 3-tuple has two identical elements and one (?,?)
E ((..)|(.)) NULL NOMATCH
E ((..)|(.))((..)|(.)) NULL NOMATCH
E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH
E ((..)|(.)){1} NULL NOMATCH
E ((..)|(.)){2} NULL NOMATCH
E ((..)|(.)){3} NULL NOMATCH
E ((..)|(.))* NULL (0,0)
E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1)
E ((..)|(.))((..)|(.)) a NOMATCH
E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH
E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1)
E ((..)|(.)){2} a NOMATCH
E ((..)|(.)){3} a NOMATCH
E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1)
E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)
E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH
E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2)
E ((..)|(.)){3} aa NOMATCH
E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)
E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3)
E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?)
#E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3)
E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3)
#E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3)
E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go
E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4)
E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?)
#E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4)
E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go
E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?)
E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5)
E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?)
#E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5)
E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
#E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5)
E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go
E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)
E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?)
E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?)
E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?)
E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?)
E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?)

View file

@ -0,0 +1,343 @@
[[tests]]
name = "repetition10"
options = ['escaped']
pattern = '''((..)|(.))'''
input = ''''''
matches = []
[[tests]]
name = "repetition11"
options = ['escaped']
pattern = '''((..)|(.))((..)|(.))'''
input = ''''''
matches = []
[[tests]]
name = "repetition12"
options = ['escaped']
pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
input = ''''''
matches = []
[[tests]]
name = "repetition14"
options = ['escaped']
pattern = '''((..)|(.)){1}'''
input = ''''''
matches = []
[[tests]]
name = "repetition15"
options = ['escaped']
pattern = '''((..)|(.)){2}'''
input = ''''''
matches = []
[[tests]]
name = "repetition16"
options = ['escaped']
pattern = '''((..)|(.)){3}'''
input = ''''''
matches = []
[[tests]]
name = "repetition18"
options = ['escaped']
pattern = '''((..)|(.))*'''
input = ''''''
matches = [[0, 0]]
[[tests]]
name = "repetition20"
options = ['escaped']
pattern = '''((..)|(.))'''
input = '''a'''
matches = [[0, 1]]
[[tests]]
name = "repetition21"
options = ['escaped']
pattern = '''((..)|(.))((..)|(.))'''
input = '''a'''
matches = []
[[tests]]
name = "repetition22"
options = ['escaped']
pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
input = '''a'''
matches = []
[[tests]]
name = "repetition24"
options = ['escaped']
pattern = '''((..)|(.)){1}'''
input = '''a'''
matches = [[0, 1]]
[[tests]]
name = "repetition25"
options = ['escaped']
pattern = '''((..)|(.)){2}'''
input = '''a'''
matches = []
[[tests]]
name = "repetition26"
options = ['escaped']
pattern = '''((..)|(.)){3}'''
input = '''a'''
matches = []
[[tests]]
name = "repetition28"
options = ['escaped']
pattern = '''((..)|(.))*'''
input = '''a'''
matches = [[0, 1]]
[[tests]]
name = "repetition30"
options = ['escaped']
pattern = '''((..)|(.))'''
input = '''aa'''
matches = [[0, 2]]
[[tests]]
name = "repetition31"
options = ['escaped']
pattern = '''((..)|(.))((..)|(.))'''
input = '''aa'''
matches = [[0, 2]]
[[tests]]
name = "repetition32"
options = ['escaped']
pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
input = '''aa'''
matches = []
[[tests]]
name = "repetition34"
options = ['escaped']
pattern = '''((..)|(.)){1}'''
input = '''aa'''
matches = [[0, 2]]
[[tests]]
name = "repetition35"
options = ['escaped']
pattern = '''((..)|(.)){2}'''
input = '''aa'''
matches = [[0, 2]]
[[tests]]
name = "repetition36"
options = ['escaped']
pattern = '''((..)|(.)){3}'''
input = '''aa'''
matches = []
[[tests]]
name = "repetition38"
options = ['escaped']
pattern = '''((..)|(.))*'''
input = '''aa'''
matches = [[0, 2]]
[[tests]]
name = "repetition40"
options = ['escaped']
pattern = '''((..)|(.))'''
input = '''aaa'''
matches = [[0, 2]]
[[tests]]
name = "repetition41"
options = ['escaped']
pattern = '''((..)|(.))((..)|(.))'''
input = '''aaa'''
matches = [[0, 3]]
[[tests]]
name = "repetition42"
options = ['escaped']
pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
input = '''aaa'''
matches = [[0, 3]]
[[tests]]
name = "repetition44"
options = ['escaped']
pattern = '''((..)|(.)){1}'''
input = '''aaa'''
matches = [[0, 2]]
[[tests]]
name = "repetition46"
options = ['escaped']
pattern = '''((..)|(.)){2}'''
input = '''aaa'''
matches = [[0, 3]]
[[tests]]
name = "repetition47"
options = ['escaped']
pattern = '''((..)|(.)){3}'''
input = '''aaa'''
matches = [[0, 3]]
[[tests]]
name = "repetition50"
options = ['escaped']
pattern = '''((..)|(.))*'''
input = '''aaa'''
matches = [[0, 3]]
[[tests]]
name = "repetition52"
options = ['escaped']
pattern = '''((..)|(.))'''
input = '''aaaa'''
matches = [[0, 2]]
[[tests]]
name = "repetition53"
options = ['escaped']
pattern = '''((..)|(.))((..)|(.))'''
input = '''aaaa'''
matches = [[0, 4]]
[[tests]]
name = "repetition54"
options = ['escaped']
pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
input = '''aaaa'''
matches = [[0, 4]]
[[tests]]
name = "repetition56"
options = ['escaped']
pattern = '''((..)|(.)){1}'''
input = '''aaaa'''
matches = [[0, 2]]
[[tests]]
name = "repetition57"
options = ['escaped']
pattern = '''((..)|(.)){2}'''
input = '''aaaa'''
matches = [[0, 4]]
[[tests]]
name = "repetition59"
options = ['escaped']
pattern = '''((..)|(.)){3}'''
input = '''aaaa'''
matches = [[0, 4]]
[[tests]]
name = "repetition61"
options = ['escaped']
pattern = '''((..)|(.))*'''
input = '''aaaa'''
matches = [[0, 4]]
[[tests]]
name = "repetition63"
options = ['escaped']
pattern = '''((..)|(.))'''
input = '''aaaaa'''
matches = [[0, 2]]
[[tests]]
name = "repetition64"
options = ['escaped']
pattern = '''((..)|(.))((..)|(.))'''
input = '''aaaaa'''
matches = [[0, 4]]
[[tests]]
name = "repetition65"
options = ['escaped']
pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
input = '''aaaaa'''
matches = [[0, 5]]
[[tests]]
name = "repetition67"
options = ['escaped']
pattern = '''((..)|(.)){1}'''
input = '''aaaaa'''
matches = [[0, 2]]
[[tests]]
name = "repetition68"
options = ['escaped']
pattern = '''((..)|(.)){2}'''
input = '''aaaaa'''
matches = [[0, 4]]
[[tests]]
name = "repetition70"
options = ['escaped']
pattern = '''((..)|(.)){3}'''
input = '''aaaaa'''
matches = [[0, 5]]
[[tests]]
name = "repetition73"
options = ['escaped']
pattern = '''((..)|(.))*'''
input = '''aaaaa'''
matches = [[0, 5]]
[[tests]]
name = "repetition75"
options = ['escaped']
pattern = '''((..)|(.))'''
input = '''aaaaaa'''
matches = [[0, 2]]
[[tests]]
name = "repetition76"
options = ['escaped']
pattern = '''((..)|(.))((..)|(.))'''
input = '''aaaaaa'''
matches = [[0, 4]]
[[tests]]
name = "repetition77"
options = ['escaped']
pattern = '''((..)|(.))((..)|(.))((..)|(.))'''
input = '''aaaaaa'''
matches = [[0, 6]]
[[tests]]
name = "repetition79"
options = ['escaped']
pattern = '''((..)|(.)){1}'''
input = '''aaaaaa'''
matches = [[0, 2]]
[[tests]]
name = "repetition80"
options = ['escaped']
pattern = '''((..)|(.)){2}'''
input = '''aaaaaa'''
matches = [[0, 4]]
[[tests]]
name = "repetition81"
options = ['escaped']
pattern = '''((..)|(.)){3}'''
input = '''aaaaaa'''
matches = [[0, 6]]
[[tests]]
name = "repetition83"
options = ['escaped']
pattern = '''((..)|(.))*'''
input = '''aaaaaa'''
matches = [[0, 6]]

View file

@ -0,0 +1,92 @@
[[tests]]
name = "iter1"
pattern = "a"
input = "aaa"
matches = [[0, 1], [1, 2], [2, 3]]
[[tests]]
name = "iter2"
pattern = "a"
input = "aba"
matches = [[0, 1], [2, 3]]
[[tests]]
name = "iter-empty1"
pattern = ''
input = ''
matches = [[0, 0]]
[[tests]]
name = "iter-empty2"
pattern = ''
input = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[tests]]
name = "iter-empty3"
pattern = '()'
input = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[tests]]
name = "iter-empty4"
pattern = '()*'
input = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[tests]]
name = "iter-empty5"
pattern = '()+'
input = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[tests]]
name = "iter-empty6"
pattern = '()?'
input = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[tests]]
name = "iter-empty7"
pattern = '()()'
input = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[tests]]
name = "iter-empty8"
pattern = '()+|z'
input = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[tests]]
name = "iter-empty9"
pattern = 'z|()+'
input = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[tests]]
name = "iter-empty10"
pattern = '()+|b'
input = 'abc'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[tests]]
name = "iter-empty11"
pattern = 'b|()+'
input = 'abc'
matches = [[0, 0], [1, 2], [3, 3]]
[[tests]]
options = ["anchored"]
name = "iter-anchored1"
pattern = "a"
input = "a"
matches = [[0, 1]]
[[tests]]
options = ["anchored"]
name = "iter-anchored2"
pattern = "a"
input = "aa"
matches = [[0, 1]]

View file

@ -0,0 +1,138 @@
[[tests]]
name = "invalid-utf8-literal1"
options = ["escaped", "invalid-utf8", "no-unicode"]
pattern = '\xFF'
input = '\xFF'
matches = [[0, 1]]
[[tests]]
name = "no-unicode-mixed"
options = ["escaped", "invalid-utf8"]
pattern = '(.+)(?-u)(.+)'
input = '\xCE\x93\xCE\x94\xFF'
matches = [[0, 5]]
[[tests]]
name = "no-unicode-case1"
options = ["case-insensitive", "no-unicode"]
pattern = "a"
input = "A"
matches = [[0, 1]]
[[tests]]
name = "no-unicode-case2"
options = ["case-insensitive", "no-unicode"]
pattern = "[a-z]+"
input = "AaAaA"
matches = [[0, 5]]
[[tests]]
name = "no-unicode-case3"
options = ["case-insensitive"]
pattern = "[a-z]+"
input = "aA\u212AaA"
matches = [[0, 7]]
[[tests]]
name = "no-unicode-case4"
options = ["case-insensitive", "no-unicode"]
pattern = "[a-z]+"
input = "aA\u212AaA"
matches = [[0, 2]]
[[tests]]
name = "no-unicode-negate1"
options = []
pattern = "[^a]"
input = "δ"
matches = [[0, 2]]
[[tests]]
name = "no-unicode-negate2"
options = ["no-unicode", "invalid-utf8"]
pattern = "[^a]"
input = "δ"
matches = [[0, 1]]
[[tests]]
name = "no-unicode-dotstar-prefix1"
options = ["escaped", "no-unicode", "invalid-utf8"]
pattern = "a"
input = '\xFFa'
matches = [[1, 2]]
[[tests]]
name = "no-unicode-dotstar-prefix2"
options = ["escaped", "invalid-utf8"]
pattern = "a"
input = '\xFFa'
matches = [[1, 2]]
[[tests]]
name = "no-unicode-null-bytes1"
options = ["escaped", "no-unicode", "invalid-utf8"]
pattern = '[^\x00]+\x00'
input = 'foo\x00'
matches = [[0, 4]]
[[tests]]
name = "no-unicode1"
options = ["no-unicode"]
pattern = '\w+'
input = "aδ"
matches = [[0, 1]]
[[tests]]
name = "no-unicode2"
options = []
pattern = '\w+'
input = "aδ"
matches = [[0, 3]]
[[tests]]
name = "no-unicode3"
options = ["no-unicode"]
pattern = '\d+'
input = "1२३9"
matches = [[0, 1]]
[[tests]]
name = "no-unicode4"
pattern = '\d+'
input = "1२३9"
matches = [[0, 8]]
[[tests]]
name = "no-unicode5"
options = ["no-unicode"]
pattern = '\s+'
input = " \u1680"
matches = [[0, 1]]
[[tests]]
name = "no-unicode6"
pattern = '\s+'
input = " \u1680"
matches = [[0, 4]]
[[tests]]
# See: https://github.com/rust-lang/regex/issues/484
name = "no-unicode-iter1"
pattern = ''
input = "☃"
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
[[tests]]
# See: https://github.com/rust-lang/regex/issues/484
options = ['escaped']
name = "no-unicode-iter2"
pattern = ''
input = 'b\xFFr'
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]

View file

@ -0,0 +1,489 @@
[[tests]]
name = "unicode-literal1"
pattern = '☃'
input = "☃"
matches = [[0, 3]]
[[tests]]
name = "unicode-literal2"
pattern = '☃+'
input = "☃"
matches = [[0, 3]]
[[tests]]
name = "unicode-literal3"
options = ["case-insensitive"]
pattern = '☃+'
input = "☃"
matches = [[0, 3]]
[[tests]]
name = "unicode-literal4"
options = ["case-insensitive"]
pattern = 'Δ'
input = "δ"
matches = [[0, 2]]
[[tests]]
name = "unicode-class1"
pattern = '[☃Ⅰ]+'
input = "☃"
matches = [[0, 3]]
[[tests]]
name = "unicode-class2"
pattern = '\pN'
input = ""
matches = [[0, 3]]
[[tests]]
name = "unicode-class3"
pattern = '\pN+'
input = "1Ⅱ2"
matches = [[0, 8]]
[[tests]]
name = "unicode-class4"
pattern = '\PN+'
input = "ab"
matches = [[0, 2]]
[[tests]]
name = "unicode-class5"
pattern = '[\PN]+'
input = "ab"
matches = [[0, 2]]
[[tests]]
name = "unicode-class6"
pattern = '[^\PN]+'
input = "ab"
matches = [[2, 5]]
[[tests]]
name = "unicode-class7"
pattern = '\p{Lu}+'
input = "ΛΘΓΔα"
matches = [[0, 8]]
[[tests]]
name = "unicode-class8"
options = ["case-insensitive"]
pattern = '\p{Lu}+'
input = "ΛΘΓΔα"
matches = [[0, 10]]
[[tests]]
name = "unicode-class9"
pattern = '\pL+'
input = "ΛΘΓΔα"
matches = [[0, 10]]
[[tests]]
name = "unicode-class10"
pattern = '\p{Ll}+'
input = "ΛΘΓΔα"
matches = [[8, 10]]
[[tests]]
name = "unicode-perl1"
pattern = '\w+'
input = "dδd"
matches = [[0, 4]]
[[tests]]
name = "unicode-perl2"
pattern = '\w+'
input = "⥡"
matches = []
[[tests]]
name = "unicode-perl3"
pattern = '\W+'
input = "⥡"
matches = [[0, 3]]
[[tests]]
name = "unicode-perl4"
pattern = '\d+'
input = "1२३9"
matches = [[0, 8]]
[[tests]]
name = "unicode-perl5"
pattern = '\d+'
input = "Ⅱ"
matches = []
[[tests]]
name = "unicode-perl6"
pattern = '\D+'
input = "Ⅱ"
matches = [[0, 3]]
[[tests]]
name = "unicode-perl7"
pattern = '\s+'
input = ""
matches = [[0, 3]]
[[tests]]
name = "unicode-perl8"
pattern = '\s+'
input = "☃"
matches = []
[[tests]]
name = "unicode-perl9"
pattern = '\S+'
input = "☃"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat1"
pattern = '\p{Cased_Letter}'
input = ""
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat2"
pattern = '\p{Close_Punctuation}'
input = ""
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat3"
pattern = '\p{Connector_Punctuation}'
input = "⁀"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat4"
pattern = '\p{Control}'
input = "\u009F"
matches = [[0, 2]]
[[tests]]
name = "unicode-class-gencat5"
pattern = '\p{Currency_Symbol}'
input = "£"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat6"
pattern = '\p{Dash_Punctuation}'
input = "〰"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat7"
pattern = '\p{Decimal_Number}'
input = "𑓙"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gencat8"
pattern = '\p{Enclosing_Mark}'
input = "\uA672"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat9"
pattern = '\p{Final_Punctuation}'
input = "⸡"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat10"
pattern = '\p{Format}'
input = "\U000E007F"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gencat11"
pattern = '\p{Initial_Punctuation}'
input = "⸜"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat12"
pattern = '\p{Letter}'
input = "Έ"
matches = [[0, 2]]
[[tests]]
name = "unicode-class-gencat13"
pattern = '\p{Letter_Number}'
input = "ↂ"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat14"
pattern = '\p{Line_Separator}'
input = "\u2028"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat15"
pattern = '\p{Lowercase_Letter}'
input = "ϛ"
matches = [[0, 2]]
[[tests]]
name = "unicode-class-gencat16"
pattern = '\p{Mark}'
input = "\U000E01EF"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gencat17"
pattern = '\p{Math}'
input = ""
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat18"
pattern = '\p{Modifier_Letter}'
input = "𖭃"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gencat19"
pattern = '\p{Modifier_Symbol}'
input = "🏿"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gencat20"
pattern = '\p{Nonspacing_Mark}'
input = "\U0001E94A"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gencat21"
pattern = '\p{Number}'
input = "⓿"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat22"
pattern = '\p{Open_Punctuation}'
input = "⦅"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat23"
pattern = '\p{Other}'
input = "\u0BC9"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat24"
pattern = '\p{Other_Letter}'
input = "ꓷ"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat25"
pattern = '\p{Other_Number}'
input = "㉏"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat26"
pattern = '\p{Other_Punctuation}'
input = "𞥞"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gencat27"
pattern = '\p{Other_Symbol}'
input = "⅌"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat28"
pattern = '\p{Paragraph_Separator}'
input = "\u2029"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat29"
pattern = '\p{Private_Use}'
input = "\U0010FFFD"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gencat30"
pattern = '\p{Punctuation}'
input = "𑁍"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gencat31"
pattern = '\p{Separator}'
input = "\u3000"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat32"
pattern = '\p{Space_Separator}'
input = "\u205F"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat33"
pattern = '\p{Spacing_Mark}'
input = "\U00016F7E"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gencat34"
pattern = '\p{Symbol}'
input = "⯈"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat35"
pattern = '\p{Titlecase_Letter}'
input = "ῼ"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gencat36"
pattern = '\p{Unassigned}'
input = "\U0010FFFF"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gencat37"
pattern = '\p{Uppercase_Letter}'
input = "Ꝋ"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-emoji1"
pattern = '\p{Emoji}'
input = "\u23E9"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-emoji2"
pattern = '\p{emoji}'
input = "\U0001F21A"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-emoji3"
pattern = '\p{extendedpictographic}'
input = "\U0001FA6E"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-emoji4"
pattern = '\p{extendedpictographic}'
input = "\U0001FFFD"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gcb1"
pattern = '\p{grapheme_cluster_break=prepend}'
input = "\U00011D46"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gcb2"
pattern = '\p{gcb=regional_indicator}'
input = "\U0001F1E6"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gcb3"
pattern = '\p{gcb=ri}'
input = "\U0001F1E7"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gcb4"
pattern = '\p{regionalindicator}'
input = "\U0001F1FF"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-gcb5"
pattern = '\p{gcb=lvt}'
input = "\uC989"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-gcb6"
pattern = '\p{gcb=zwj}'
input = "\u200D"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-word-break1"
pattern = '\p{word_break=Hebrew_Letter}'
input = "\uFB46"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-word-break2"
pattern = '\p{wb=hebrewletter}'
input = "\uFB46"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-word-break3"
pattern = '\p{wb=ExtendNumLet}'
input = "\uFF3F"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-word-break4"
pattern = '\p{wb=WSegSpace}'
input = "\u3000"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-word-break5"
pattern = '\p{wb=numeric}'
input = "\U0001E950"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-sentence-break1"
pattern = '\p{sentence_break=Lower}'
input = "\u0469"
matches = [[0, 2]]
[[tests]]
name = "unicode-class-sentence-break2"
pattern = '\p{sb=lower}'
input = "\u0469"
matches = [[0, 2]]
[[tests]]
name = "unicode-class-sentence-break3"
pattern = '\p{sb=Close}'
input = "\uFF60"
matches = [[0, 3]]
[[tests]]
name = "unicode-class-sentence-break4"
pattern = '\p{sb=Close}'
input = "\U0001F677"
matches = [[0, 4]]
[[tests]]
name = "unicode-class-sentence-break5"
pattern = '\p{sb=SContinue}'
input = "\uFF64"
matches = [[0, 3]]

View file

@ -0,0 +1,2 @@
max_width = 79
use_small_heuristics = "max"

View file

@ -0,0 +1,76 @@
use core::convert::TryInto;

/// A minimal polyfill for the pieces of the `byteorder` crate's `ByteOrder`
/// trait that this crate needs, built on the standard library's
/// `{from,to}_{be,le,ne}_bytes` conversions.
///
/// All methods panic (via slice indexing) if `buf` is too small, and the
/// `*_uint` methods additionally require `nbytes <= 8`.
pub trait ByteOrder {
    /// Reads an unsigned 16-bit integer from the first 2 bytes of `buf`.
    fn read_u16(buf: &[u8]) -> u16;
    /// Reads an unsigned 32-bit integer from the first 4 bytes of `buf`.
    fn read_u32(buf: &[u8]) -> u32;
    /// Reads an unsigned 64-bit integer from the first 8 bytes of `buf`.
    fn read_u64(buf: &[u8]) -> u64;
    /// Reads an `nbytes`-byte unsigned integer from the start of `buf`.
    fn read_uint(buf: &[u8], nbytes: usize) -> u64;
    /// Writes `n` to the first 2 bytes of `buf`.
    fn write_u16(buf: &mut [u8], n: u16);
    /// Writes `n` to the first 4 bytes of `buf`.
    fn write_u32(buf: &mut [u8], n: u32);
    /// Writes `n` to the first 8 bytes of `buf`.
    fn write_u64(buf: &mut [u8], n: u64);
    /// Writes the `nbytes` least significant bytes of `n` to the start of
    /// `buf`, in this type's byte order.
    fn write_uint(buf: &mut [u8], n: u64, nbytes: usize);
}

/// Big-endian byte order (most significant byte first).
pub enum BigEndian {}
/// Little-endian byte order (least significant byte first).
pub enum LittleEndian {}
/// The byte order of the compilation target.
pub enum NativeEndian {}

/// Offset within an 8-byte big-endian encoding of a `u64` at which the
/// `nbytes` least significant bytes live: the *last* `nbytes` bytes.
#[inline]
fn be_uint_offset(nbytes: usize) -> usize {
    8 - nbytes
}

/// Offset for little-endian: the value occupies the *first* `nbytes` bytes.
#[inline]
fn le_uint_offset(_nbytes: usize) -> usize {
    0
}

/// Offset for whichever byte order the target natively uses.
#[inline]
fn ne_uint_offset(nbytes: usize) -> usize {
    if cfg!(target_endian = "big") {
        8 - nbytes
    } else {
        0
    }
}

macro_rules! impl_endian {
    ($t:ty, $from_endian:ident, $to_endian:ident, $uint_offset:ident) => {
        impl ByteOrder for $t {
            #[inline]
            fn read_u16(buf: &[u8]) -> u16 {
                u16::$from_endian(buf[0..2].try_into().unwrap())
            }
            #[inline]
            fn read_u32(buf: &[u8]) -> u32 {
                u32::$from_endian(buf[0..4].try_into().unwrap())
            }
            #[inline]
            fn read_u64(buf: &[u8]) -> u64 {
                u64::$from_endian(buf[0..8].try_into().unwrap())
            }
            #[inline]
            fn read_uint(buf: &[u8], nbytes: usize) -> u64 {
                // Place the bytes at the correct end of a zeroed 8-byte
                // buffer so the full-width conversion yields the nbytes-wide
                // value. Copying to the front is only correct for
                // little-endian; big-endian needs right alignment, otherwise
                // the result is shifted up by 8*(8-nbytes) bits.
                let off = $uint_offset(nbytes);
                let mut dst = [0u8; 8];
                dst[off..off + nbytes].copy_from_slice(&buf[..nbytes]);
                u64::$from_endian(dst)
            }
            #[inline]
            fn write_u16(buf: &mut [u8], n: u16) {
                buf[0..2].copy_from_slice(&n.$to_endian()[..]);
            }
            #[inline]
            fn write_u32(buf: &mut [u8], n: u32) {
                buf[0..4].copy_from_slice(&n.$to_endian()[..]);
            }
            #[inline]
            fn write_u64(buf: &mut [u8], n: u64) {
                buf[0..8].copy_from_slice(&n.$to_endian()[..]);
            }
            #[inline]
            fn write_uint(buf: &mut [u8], n: u64, nbytes: usize) {
                // The nbytes least significant bytes live at this
                // endianness' offset within the 8-byte encoding of `n`.
                let off = $uint_offset(nbytes);
                buf[..nbytes]
                    .copy_from_slice(&n.$to_endian()[off..off + nbytes]);
            }
        }
    };
}
impl_endian! {
    BigEndian, from_be_bytes, to_be_bytes, be_uint_offset
}
impl_endian! {
    LittleEndian, from_le_bytes, to_le_bytes, le_uint_offset
}
impl_endian! {
    NativeEndian, from_ne_bytes, to_ne_bytes, ne_uint_offset
}

View file

@ -0,0 +1,271 @@
use core::fmt;
/// A representation of byte oriented equivalence classes.
///
/// This is used in a DFA to reduce the size of the transition table. This can
/// have a particularly large impact not only on the total size of a dense DFA,
/// but also on compile times.
///
/// Construction (see `ByteClassSet::byte_classes`) assigns class identifiers
/// in non-decreasing order of byte value, so the class of byte `255` is the
/// largest identifier in use; `alphabet_len` relies on this.
#[derive(Clone, Copy)]
pub struct ByteClasses([u8; 256]);

impl ByteClasses {
    /// Creates a new set of equivalence classes where all bytes are mapped to
    /// the same class.
    pub fn empty() -> ByteClasses {
        ByteClasses([0; 256])
    }

    /// Creates a new set of equivalence classes where each byte belongs to
    /// its own equivalence class.
    pub fn singletons() -> ByteClasses {
        let mut classes = ByteClasses::empty();
        for i in 0..256 {
            classes.set(i as u8, i as u8);
        }
        classes
    }

    /// Copies the byte classes given. The given slice must have length 0 or
    /// length 256. Slices of length 0 are treated as singletons (every byte
    /// is its own class).
    pub fn from_slice(slice: &[u8]) -> ByteClasses {
        assert!(slice.is_empty() || slice.len() == 256);
        if slice.is_empty() {
            ByteClasses::singletons()
        } else {
            let mut classes = ByteClasses::empty();
            for (b, &class) in slice.iter().enumerate() {
                classes.set(b as u8, class);
            }
            classes
        }
    }

    /// Set the equivalence class for the given byte.
    #[inline]
    pub fn set(&mut self, byte: u8, class: u8) {
        self.0[byte as usize] = class;
    }

    /// Get the equivalence class for the given byte.
    #[inline]
    pub fn get(&self, byte: u8) -> u8 {
        self.0[byte as usize]
    }

    /// Get the equivalence class for the given byte while forcefully
    /// eliding bounds checks.
    ///
    /// # Safety
    ///
    /// `byte as usize` is at most 255 and the table has exactly 256 entries,
    /// so the access is always in bounds; the method is `unsafe` only to
    /// mirror `slice::get_unchecked`.
    #[inline]
    pub unsafe fn get_unchecked(&self, byte: u8) -> u8 {
        *self.0.get_unchecked(byte as usize)
    }

    /// Return the total number of elements in the alphabet represented by
    /// these equivalence classes. Equivalently, this returns the total number
    /// of equivalence classes.
    #[inline]
    pub fn alphabet_len(&self) -> usize {
        // Byte 255 carries the maximum class identifier; see the invariant
        // documented on the type.
        self.0[255] as usize + 1
    }

    /// Returns true if and only if every byte in this class maps to its own
    /// equivalence class. Equivalently, there are 256 equivalence classes
    /// and each class contains exactly one byte.
    #[inline]
    pub fn is_singleton(&self) -> bool {
        self.alphabet_len() == 256
    }

    /// Returns an iterator over a sequence of representative bytes from each
    /// equivalence class. Namely, this yields exactly N items, where N is
    /// equivalent to the number of equivalence classes. Each item is an
    /// arbitrary byte drawn from each equivalence class.
    ///
    /// This is useful when one is determinizing an NFA and the NFA's alphabet
    /// hasn't been converted to equivalence classes yet. Picking an arbitrary
    /// byte from each equivalence class then permits a full exploration of
    /// the NFA instead of using every possible byte value.
    #[cfg(feature = "std")]
    pub fn representatives(&self) -> ByteClassRepresentatives {
        ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
    }

    /// Returns all of the bytes in the given equivalence class.
    ///
    /// The second element in the tuple indicates the number of elements in
    /// the array.
    fn elements(&self, equiv: u8) -> ([u8; 256], usize) {
        let (mut array, mut len) = ([0; 256], 0);
        for b in 0..256 {
            if self.get(b as u8) == equiv {
                array[len] = b as u8;
                len += 1;
            }
        }
        (array, len)
    }
}

impl fmt::Debug for ByteClasses {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        if self.is_singleton() {
            write!(f, "ByteClasses({{singletons}})")
        } else {
            write!(f, "ByteClasses(")?;
            for equiv in 0..self.alphabet_len() {
                // Bug fix: classes were previously written back-to-back with
                // no separator, producing unreadable output like
                // `0 => [...]1 => [...]`.
                if equiv > 0 {
                    write!(f, ", ")?;
                }
                let (members, len) = self.elements(equiv as u8);
                write!(f, "{} => {:?}", equiv, &members[..len])?;
            }
            write!(f, ")")
        }
    }
}
/// An iterator that yields one representative byte from each equivalence
/// class.
#[cfg(feature = "std")]
#[derive(Debug)]
pub struct ByteClassRepresentatives<'a> {
    classes: &'a ByteClasses,
    byte: usize,
    last_class: Option<u8>,
}

#[cfg(feature = "std")]
impl<'a> Iterator for ByteClassRepresentatives<'a> {
    type Item = u8;

    fn next(&mut self) -> Option<u8> {
        loop {
            if self.byte >= 256 {
                return None;
            }
            let byte = self.byte as u8;
            self.byte += 1;
            let class = self.classes.get(byte);
            // Skip bytes belonging to the class we just yielded; emit a byte
            // only when we cross into a different equivalence class.
            if self.last_class == Some(class) {
                continue;
            }
            self.last_class = Some(class);
            return Some(byte);
        }
    }
}
/// A byte class set keeps track of an *approximation* of equivalence classes
/// of bytes during NFA construction. That is, every byte in an equivalence
/// class cannot discriminate between a match and a non-match.
///
/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
/// same equivalence class because it never matters whether an `a` or a `b` is
/// seen, and no combination of `a`s and `b`s in the text can discriminate
/// a match.
///
/// Note though that this does not compute the minimal set of equivalence
/// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the
/// same equivalence class for the same reason that `a` and `b` are in the
/// same equivalence class in the aforementioned regex. However, in this
/// implementation, `a` and `c` are put into distinct equivalence classes.
/// The reason for this is implementation complexity. In the future, we should
/// endeavor to compute the minimal equivalence classes since they can have a
/// rather large impact on the size of the DFA.
///
/// The representation here is 256 booleans, all initially set to false. Each
/// boolean maps to its corresponding byte based on position. A `true` value
/// indicates the end of an equivalence class, where its corresponding byte
/// and all of the bytes corresponding to all previous contiguous `false`
/// values are in the same equivalence class.
///
/// This particular representation only permits contiguous ranges of bytes to
/// be in the same equivalence class, which means that we can never discover
/// the true minimal set of equivalence classes.
#[cfg(feature = "std")]
#[derive(Debug)]
pub struct ByteClassSet(Vec<bool>);

#[cfg(feature = "std")]
impl ByteClassSet {
    /// Create a new set of byte classes where all bytes are part of the same
    /// equivalence class.
    pub fn new() -> Self {
        ByteClassSet(vec![false; 256])
    }

    /// Indicate that the given range of bytes (inclusive) can discriminate a
    /// match between it and all other bytes outside of the range.
    pub fn set_range(&mut self, start: u8, end: u8) {
        debug_assert!(start <= end);
        // Mark a class boundary just before `start` (when one exists) and at
        // `end`, so the range becomes its own contiguous class.
        if start > 0 {
            self.0[usize::from(start) - 1] = true;
        }
        self.0[usize::from(end)] = true;
    }

    /// Convert this boolean set to a map that maps all byte values to their
    /// corresponding equivalence class. The last mapping indicates the largest
    /// equivalence class identifier (which is never bigger than 255).
    pub fn byte_classes(&self) -> ByteClasses {
        let mut classes = ByteClasses::empty();
        let mut class = 0u8;
        for b in 0usize..256 {
            classes.set(b as u8, class);
            // Byte 255 always ends the final class, so never bump past it.
            if b < 255 && self.0[b] {
                class = class.checked_add(1).unwrap();
            }
        }
        classes
    }
}
#[cfg(test)]
mod tests {
    #[cfg(feature = "std")]
    #[test]
    fn byte_classes() {
        use super::ByteClassSet;

        // A single alphabetic range yields three classes: before, inside
        // and after the range.
        let mut set = ByteClassSet::new();
        set.set_range(b'a', b'z');
        let classes = set.byte_classes();
        let expected = [
            (0u8, 0u8),
            (1, 0),
            (2, 0),
            (b'a' - 1, 0),
            (b'a', 1),
            (b'm', 1),
            (b'z', 1),
            (b'z' + 1, 2),
            (254, 2),
            (255, 2),
        ];
        for &(byte, class) in expected.iter() {
            assert_eq!(classes.get(byte), class);
        }

        // Two disjoint ranges yield four classes.
        let mut set = ByteClassSet::new();
        set.set_range(0, 2);
        set.set_range(4, 6);
        let classes = set.byte_classes();
        let expected = [
            (0u8, 0u8),
            (1, 0),
            (2, 0),
            (3, 1),
            (4, 2),
            (5, 2),
            (6, 2),
            (7, 3),
            (255, 3),
        ];
        for &(byte, class) in expected.iter() {
            assert_eq!(classes.get(byte), class);
        }
    }

    #[cfg(feature = "std")]
    #[test]
    fn full_byte_classes() {
        use super::ByteClassSet;

        // Making every byte its own range produces the maximal 256 classes.
        let mut set = ByteClassSet::new();
        for b in 0..256u16 {
            set.set_range(b as u8, b as u8);
        }
        assert_eq!(set.byte_classes().alphabet_len(), 256);
    }
}

View file

@ -0,0 +1,104 @@
// This module is unused. It was written as an experiment to get a ballpark
// idea of what state machines look like when translated to Rust code, and
// in particular, an idea of how much code it generates. The implementation
// below isn't optimal with respect to size, but the result wasn't exactly
// small. At some point, we should pursue building this out beyond
// experimentation, and in particular, probably provide a command line tool
// and/or a macro. It's a fair bit of work, so I abandoned it for the initial
// release. ---AG
use std::collections::HashMap;
use std::io::Write;
use dense::DFA;
use state_id::StateID;
// `write!`/`writeln!` wrappers that unwrap the result. All writes in this
// module target an in-memory `Vec<u8>`, which cannot fail.
macro_rules! wstr {
    ($($tt:tt)*) => { write!($($tt)*).unwrap() }
}
macro_rules! wstrln {
    ($($tt:tt)*) => { writeln!($($tt)*).unwrap() }
}
/// Generate Rust source text for a free function `is_match` that hard-codes
/// the given DFA's transitions and reports whether its input matches.
///
/// The generated function scans its input from the beginning, so this is
/// only meaningful for forward DFAs.
pub fn is_match_forward<S: StateID>(dfa: &DFA<S>) -> String {
    let names = state_variant_names(dfa);
    let mut buf = vec![];
    wstrln!(buf, "pub fn is_match(input: &[u8]) -> bool {{");
    // If the start state is already a match state, the DFA matches every
    // input (including the empty one), so emit a constant-true function.
    if dfa.is_match_state(dfa.start()) {
        wstrln!(buf, "    return true;");
        wstrln!(buf, "}}");
        return String::from_utf8(buf).unwrap();
    }
    wstrln!(buf, "{}", state_enum_def(dfa, &names));
    wstrln!(buf, "    let mut state = {};", names[&dfa.start()]);
    wstrln!(buf, "    for &b in input.iter() {{");
    wstrln!(buf, "        state = match state {{");
    for (id, s) in dfa.iter() {
        // Match states need no arm: the generated code returns `true` on
        // the transition *into* a match state, so they are never entered.
        if dfa.is_match_state(id) {
            continue;
        }
        wstrln!(buf, "            {} => {{", &names[&id]);
        wstrln!(buf, "                match b {{");
        for (start, end, next_id) in s.sparse_transitions() {
            if dfa.is_match_state(next_id) {
                // Entering a match state: report the match immediately.
                wstrln!(buf, "                    {:?}...{:?} => return true,", start, end);
            } else {
                // Collapse single-byte ranges into a plain pattern.
                if start == end {
                    wstrln!(buf, "                    {:?} => {},", start, &names[&next_id]);
                } else {
                    wstrln!(buf, "                    {:?}...{:?} => {},", start, end, &names[&next_id]);
                }
            }
        }
        // Bytes with no explicit transition fall into the dead state S0.
        wstrln!(buf, "                    _ => S::S0,");
        wstrln!(buf, "                }}");
        wstrln!(buf, "            }}");
    }
    wstrln!(buf, "        }};");
    wstrln!(buf, "    }}");
    wstrln!(buf, "    false");
    wstrln!(buf, "}}");
    String::from_utf8(buf).unwrap()
}
/// Generate the source text of a `Clone`/`Copy` enum `S` with one variant
/// per non-matching DFA state, using the names in `variant_names`.
fn state_enum_def<S: StateID>(
    dfa: &DFA<S>,
    variant_names: &HashMap<S, String>,
) -> String {
    let mut buf = vec![];
    wstrln!(buf, "    #[derive(Clone, Copy)]");
    wstr!(buf, "    enum S {{");
    // Emit ten variants per line to keep the generated code compact.
    let mut i = 0;
    for (id, _) in dfa.iter() {
        // Match states never appear in the generated matcher (see
        // `is_match_forward`), so they get no variant.
        if dfa.is_match_state(id) {
            continue;
        }
        if i % 10 == 0 {
            wstr!(buf, "\n       ");
        }
        let name = format!("S{}", id.to_usize());
        wstr!(buf, " {},", name);
        i += 1;
    }
    wstr!(buf, "\n");
    wstrln!(buf, "    }}");
    String::from_utf8(buf).unwrap()
}
/// Map each non-matching DFA state id to the qualified name of its generated
/// enum variant (e.g. `S::S3`).
fn state_variant_names<S: StateID>(dfa: &DFA<S>) -> HashMap<S, String> {
    let mut variants = HashMap::new();
    for (id, _) in dfa.iter() {
        // Match states get no variant; see `state_enum_def`.
        if !dfa.is_match_state(id) {
            variants.insert(id, format!("S::S{}", id.to_usize()));
        }
    }
    variants
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,286 @@
use std::collections::HashMap;
use std::mem;
use std::rc::Rc;
use dense;
use error::Result;
use nfa::{self, NFA};
use sparse_set::SparseSet;
use state_id::{dead_id, StateID};
/// Shorthand for the dense DFA representation under construction, backed by
/// a growable transition table.
type DFARepr<S> = dense::Repr<Vec<S>, S>;
/// A determinizer converts an NFA to a DFA.
///
/// This determinizer follows the typical powerset construction, where each
/// DFA state is comprised of one or more NFA states. In the worst case, there
/// is one DFA state for every possible combination of NFA states. In practice,
/// this only happens in certain conditions, typically when there are bounded
/// repetitions.
///
/// The type variable `S` refers to the chosen state identifier representation
/// used for the DFA.
///
/// The lifetime variable `'a` refers to the lifetime of the NFA being
/// converted to a DFA.
#[derive(Debug)]
pub(crate) struct Determinizer<'a, S: StateID> {
    /// The NFA we're converting into a DFA.
    nfa: &'a NFA,
    /// The DFA we're building.
    dfa: DFARepr<S>,
    /// Each DFA state being built is defined as an *ordered* set of NFA
    /// states, along with a flag indicating whether the state is a match
    /// state or not.
    ///
    /// This is never empty. The first state is always a dummy state such that
    /// a state id == 0 corresponds to a dead state.
    builder_states: Vec<Rc<State>>,
    /// A cache of DFA states that already exist and can be easily looked up
    /// via ordered sets of NFA states.
    cache: HashMap<Rc<State>, S>,
    /// Scratch space for a stack of NFA states to visit, for depth first
    /// visiting without recursion.
    stack: Vec<nfa::StateID>,
    /// Scratch space for storing an ordered sequence of NFA states, for
    /// amortizing allocation.
    scratch_nfa_states: Vec<nfa::StateID>,
    /// Whether to build a DFA that finds the longest possible match.
    longest_match: bool,
}
/// An intermediate representation for a DFA state during determinization.
///
/// `Hash`/`Eq` are derived so that states can serve as keys in the
/// determinizer's cache of already-built DFA states.
#[derive(Debug, Eq, Hash, PartialEq)]
struct State {
    /// Whether this state is a match state or not.
    is_match: bool,
    /// An ordered sequence of NFA states that make up this DFA state.
    nfa_states: Vec<nfa::StateID>,
}
impl<'a, S: StateID> Determinizer<'a, S> {
    /// Create a new determinizer for converting the given NFA to a DFA.
    pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> {
        // Seed the builder with the canonical dead state so that state
        // id 0 is always the dead state, and make it findable in the cache.
        let dead = Rc::new(State::dead());
        let mut cache = HashMap::default();
        cache.insert(dead.clone(), dead_id());
        Determinizer {
            nfa,
            dfa: DFARepr::empty().anchored(nfa.is_anchored()),
            builder_states: vec![dead],
            cache,
            stack: vec![],
            scratch_nfa_states: vec![],
            longest_match: false,
        }
    }
    /// Instruct the determinizer to use equivalence classes as the transition
    /// alphabet instead of all possible byte values.
    pub fn with_byte_classes(mut self) -> Determinizer<'a, S> {
        // Rebuild the (still empty) DFA representation with the NFA's byte
        // classes baked in; nothing has been added to `self.dfa` yet.
        let byte_classes = self.nfa.byte_classes().clone();
        self.dfa = DFARepr::empty_with_byte_classes(byte_classes)
            .anchored(self.nfa.is_anchored());
        self
    }
    /// Instruct the determinizer to build a DFA that recognizes the longest
    /// possible match instead of the leftmost first match. This is useful when
    /// constructing reverse DFAs for finding the start of a match.
    pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> {
        self.longest_match = yes;
        self
    }
    /// Build the DFA. If there was a problem constructing the DFA (e.g., if
    /// the chosen state identifier representation is too small), then an error
    /// is returned.
    pub fn build(mut self) -> Result<DFARepr<S>> {
        // One arbitrary byte per equivalence class suffices to explore every
        // distinct transition.
        let representative_bytes: Vec<u8> =
            self.dfa.byte_classes().representatives().collect();
        let mut sparse = self.new_sparse_set();
        // `uncompiled` is a stack (LIFO frontier) of DFA states whose
        // outgoing transitions have not been computed yet.
        let mut uncompiled = vec![self.add_start(&mut sparse)?];
        while let Some(dfa_id) = uncompiled.pop() {
            for &b in &representative_bytes {
                let (next_dfa_id, is_new) =
                    self.cached_state(dfa_id, b, &mut sparse)?;
                self.dfa.add_transition(dfa_id, b, next_dfa_id);
                if is_new {
                    uncompiled.push(next_dfa_id);
                }
            }
        }
        // At this point, we shuffle the matching states in the final DFA to
        // the beginning. This permits a DFA's match loop to detect a match
        // condition by merely inspecting the current state's identifier, and
        // avoids the need for any additional auxiliary storage.
        let is_match: Vec<bool> =
            self.builder_states.iter().map(|s| s.is_match).collect();
        self.dfa.shuffle_match_states(&is_match);
        Ok(self.dfa)
    }
    /// Return the identifier for the next DFA state given an existing DFA
    /// state and an input byte. If the next DFA state already exists, then
    /// return its identifier from the cache. Otherwise, build the state, cache
    /// it and return its identifier.
    ///
    /// The given sparse set is used for scratch space. It must have a capacity
    /// equivalent to the total number of NFA states, but its contents are
    /// otherwise unspecified.
    ///
    /// This routine returns a boolean indicating whether a new state was
    /// built. If a new state is built, then the caller needs to add it to its
    /// frontier of uncompiled DFA states to compute transitions for.
    fn cached_state(
        &mut self,
        dfa_id: S,
        b: u8,
        sparse: &mut SparseSet,
    ) -> Result<(S, bool)> {
        sparse.clear();
        // Compute the set of all reachable NFA states, including epsilons.
        self.next(dfa_id, b, sparse);
        // Build a candidate state and check if it has already been built.
        let state = self.new_state(sparse);
        if let Some(&cached_id) = self.cache.get(&state) {
            // Since we have a cached state, put the constructed state's
            // memory back into our scratch space, so that it can be reused.
            let _ =
                mem::replace(&mut self.scratch_nfa_states, state.nfa_states);
            return Ok((cached_id, false));
        }
        // Nothing was in the cache, so add this state to the cache.
        self.add_state(state).map(|s| (s, true))
    }
    /// Compute the set of all reachable NFA states, including the full epsilon
    /// closure, from a DFA state for a single byte of input.
    fn next(&mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet) {
        next_nfa_states.clear();
        // Iterate by index rather than by iterator so that `self` remains
        // free for the `&mut self` call to `epsilon_closure` below.
        for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() {
            let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i];
            match *self.nfa.state(nfa_id) {
                // Epsilon-only and terminal states contribute no byte
                // transitions of their own.
                nfa::State::Union { .. }
                | nfa::State::Fail
                | nfa::State::Match => {}
                nfa::State::Range { range: ref r } => {
                    if r.start <= b && b <= r.end {
                        self.epsilon_closure(r.next, next_nfa_states);
                    }
                }
                nfa::State::Sparse { ref ranges } => {
                    // Ranges are ordered by start byte, so stop early once
                    // a range begins past `b`.
                    for r in ranges.iter() {
                        if r.start > b {
                            break;
                        } else if r.start <= b && b <= r.end {
                            self.epsilon_closure(r.next, next_nfa_states);
                            break;
                        }
                    }
                }
            }
        }
    }
    /// Compute the epsilon closure for the given NFA state.
    fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) {
        // Fast path: a non-epsilon state is its own closure.
        if !self.nfa.state(start).is_epsilon() {
            set.insert(start);
            return;
        }
        // Depth-first traversal using an explicit stack (self.stack) instead
        // of recursion. The first alternate of a union is followed inline;
        // the rest are pushed (reversed, to preserve priority order).
        self.stack.push(start);
        while let Some(mut id) = self.stack.pop() {
            loop {
                if set.contains(id) {
                    break;
                }
                set.insert(id);
                match *self.nfa.state(id) {
                    nfa::State::Range { .. }
                    | nfa::State::Sparse { .. }
                    | nfa::State::Fail
                    | nfa::State::Match => break,
                    nfa::State::Union { ref alternates } => {
                        id = match alternates.get(0) {
                            None => break,
                            Some(&id) => id,
                        };
                        self.stack.extend(alternates[1..].iter().rev());
                    }
                }
            }
        }
    }
    /// Compute the initial DFA state and return its identifier.
    ///
    /// The sparse set given is used for scratch space, and must have capacity
    /// equal to the total number of NFA states. Its contents are unspecified.
    fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> {
        sparse.clear();
        self.epsilon_closure(self.nfa.start(), sparse);
        let state = self.new_state(&sparse);
        let id = self.add_state(state)?;
        self.dfa.set_start_state(id);
        Ok(id)
    }
    /// Add the given state to the DFA and make it available in the cache.
    ///
    /// The state initially has no transitions. That is, it transitions to the
    /// dead state for all possible inputs.
    fn add_state(&mut self, state: State) -> Result<S> {
        let id = self.dfa.add_empty_state()?;
        // Share one allocation between the ordered list and the cache key.
        let rstate = Rc::new(state);
        self.builder_states.push(rstate.clone());
        self.cache.insert(rstate, id);
        Ok(id)
    }
    /// Convert the given set of ordered NFA states to a DFA state.
    fn new_state(&mut self, set: &SparseSet) -> State {
        // Reuse the scratch buffer to avoid allocating a fresh Vec for
        // every candidate state; `cached_state` returns it on a cache hit.
        let mut state = State {
            is_match: false,
            nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]),
        };
        state.nfa_states.clear();
        for &id in set {
            match *self.nfa.state(id) {
                nfa::State::Range { .. } => {
                    state.nfa_states.push(id);
                }
                nfa::State::Sparse { .. } => {
                    state.nfa_states.push(id);
                }
                nfa::State::Fail => {
                    // NOTE(review): states are in priority order; presumably
                    // anything after a Fail can never participate in a
                    // match, so the rest are dropped — confirm.
                    break;
                }
                nfa::State::Match => {
                    state.is_match = true;
                    // Under leftmost-first semantics, lower-priority states
                    // after a match are irrelevant; keep them only when
                    // searching for the longest match.
                    if !self.longest_match {
                        break;
                    }
                }
                nfa::State::Union { .. } => {}
            }
        }
        state
    }
    /// Create a new sparse set with enough capacity to hold all NFA states.
    fn new_sparse_set(&self) -> SparseSet {
        SparseSet::new(self.nfa.len())
    }
}
impl State {
    /// Construct the canonical dead state: not a match, and built from no
    /// NFA states at all.
    fn dead() -> State {
        State { is_match: false, nfa_states: Vec::new() }
    }
}

View file

@ -0,0 +1,363 @@
use state_id::StateID;
/// A trait describing the interface of a deterministic finite automaton (DFA).
///
/// Every DFA has exactly one start state and at least one dead state (which
/// may be the same, as in the case of an empty DFA). In all cases, a state
/// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)`
/// always returns `true`.
///
/// Every DFA also has zero or more match states, such that
/// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to
/// a match state.
///
/// In general, users of this trait likely will only need to use the search
/// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other
/// methods are lower level and are used for walking the transitions of a DFA
/// manually. In particular, the aforementioned search routines are implemented
/// generically in terms of the lower level transition walking routines.
pub trait DFA {
    /// The representation used for state identifiers in this DFA.
    ///
    /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`.
    type ID: StateID;
    /// Return the identifier of this DFA's start state.
    fn start_state(&self) -> Self::ID;
    /// Returns true if and only if the given identifier corresponds to a match
    /// state.
    fn is_match_state(&self, id: Self::ID) -> bool;
    /// Returns true if and only if the given identifier corresponds to a dead
    /// state. When a DFA enters a dead state, it is impossible to leave and
    /// thus can never lead to a match.
    fn is_dead_state(&self, id: Self::ID) -> bool;
    /// Returns true if and only if the given identifier corresponds to either
    /// a dead state or a match state, such that one of `is_match_state(id)`
    /// or `is_dead_state(id)` must return true.
    ///
    /// Depending on the implementation of the DFA, this routine can be used
    /// to save a branch in the core matching loop. Nevertheless,
    /// `is_match_state(id) || is_dead_state(id)` is always a valid
    /// implementation.
    fn is_match_or_dead_state(&self, id: Self::ID) -> bool;
    /// Returns true if and only if this DFA is anchored.
    ///
    /// When a DFA is anchored, it is only allowed to report matches that
    /// start at index `0`.
    fn is_anchored(&self) -> bool;
    /// Given the current state that this DFA is in and the next input byte,
    /// this method returns the identifier of the next state. The identifier
    /// returned is always valid, but it may correspond to a dead state.
    fn next_state(&self, current: Self::ID, input: u8) -> Self::ID;
    /// Like `next_state`, but its implementation may look up the next state
    /// without memory safety checks such as bounds checks. As such, callers
    /// must ensure that the given identifier corresponds to a valid DFA
    /// state. Implementors must, in turn, ensure that this routine is safe
    /// for all valid state identifiers and for all possible `u8` values.
    unsafe fn next_state_unchecked(
        &self,
        current: Self::ID,
        input: u8,
    ) -> Self::ID;
    /// Returns true if and only if the given bytes match this DFA.
    ///
    /// This routine may short circuit if it knows that scanning future input
    /// will never lead to a different result. In particular, if a DFA enters
    /// a match state or a dead state, then this routine will return `true` or
    /// `false`, respectively, without inspecting any future input.
    ///
    /// # Example
    ///
    /// This example shows how to use this method with a
    /// [`DenseDFA`](enum.DenseDFA.html).
    ///
    /// ```
    /// use regex_automata::{DFA, DenseDFA};
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let dfa = DenseDFA::new("foo[0-9]+bar")?;
    /// assert_eq!(true, dfa.is_match(b"foo12345bar"));
    /// assert_eq!(false, dfa.is_match(b"foobar"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    #[inline]
    fn is_match(&self, bytes: &[u8]) -> bool {
        self.is_match_at(bytes, 0)
    }
    /// Returns the first position at which a match is found.
    ///
    /// This routine stops scanning input in precisely the same circumstances
    /// as `is_match`. The key difference is that this routine returns the
    /// position at which it stopped scanning input if and only if a match
    /// was found. If no match is found, then `None` is returned.
    ///
    /// # Example
    ///
    /// This example shows how to use this method with a
    /// [`DenseDFA`](enum.DenseDFA.html).
    ///
    /// ```
    /// use regex_automata::{DFA, DenseDFA};
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let dfa = DenseDFA::new("foo[0-9]+")?;
    /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345"));
    ///
    /// // Normally, the end of the leftmost first match here would be 3,
    /// // but the shortest match semantics detect a match earlier.
    /// let dfa = DenseDFA::new("abc|a")?;
    /// assert_eq!(Some(1), dfa.shortest_match(b"abc"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    #[inline]
    fn shortest_match(&self, bytes: &[u8]) -> Option<usize> {
        self.shortest_match_at(bytes, 0)
    }
    /// Returns the end offset of the longest match. If no match exists,
    /// then `None` is returned.
    ///
    /// Implementors of this trait are not required to implement any particular
    /// match semantics (such as leftmost-first), which are instead manifest in
    /// the DFA's topology itself.
    ///
    /// In particular, this method must continue searching even after it
    /// enters a match state. The search should only terminate once it has
    /// reached the end of the input or when it has entered a dead state. Upon
    /// termination, the position of the last byte seen while still in a match
    /// state is returned.
    ///
    /// # Example
    ///
    /// This example shows how to use this method with a
    /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses
    /// "leftmost first" match semantics.
    ///
    /// Leftmost first match semantics corresponds to the match with the
    /// smallest starting offset, but where the end offset is determined by
    /// preferring earlier branches in the original regular expression. For
    /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
    /// will match `Samwise` in `Samwise`.
    ///
    /// Generally speaking, the "leftmost first" match is how most backtracking
    /// regular expressions tend to work. This is in contrast to POSIX-style
    /// regular expressions that yield "leftmost longest" matches. Namely,
    /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
    /// leftmost longest semantics.
    ///
    /// ```
    /// use regex_automata::{DFA, DenseDFA};
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let dfa = DenseDFA::new("foo[0-9]+")?;
    /// assert_eq!(Some(8), dfa.find(b"foo12345"));
    ///
    /// // Even though a match is found after reading the first byte (`a`),
    /// // the leftmost first match semantics demand that we find the earliest
    /// // match that prefers earlier parts of the pattern over latter parts.
    /// let dfa = DenseDFA::new("abc|a")?;
    /// assert_eq!(Some(3), dfa.find(b"abc"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    #[inline]
    fn find(&self, bytes: &[u8]) -> Option<usize> {
        self.find_at(bytes, 0)
    }
    /// Returns the start offset of the longest match in reverse, by searching
    /// from the end of the input towards the start of the input. If no match
    /// exists, then `None` is returned. In other words, this has the same
    /// match semantics as `find`, but in reverse.
    ///
    /// # Example
    ///
    /// This example shows how to use this method with a
    /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine
    /// is principally useful when used in conjunction with the
    /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse)
    /// configuration knob. In general, it's unlikely to be correct to use both
    /// `find` and `rfind` with the same DFA since any particular DFA will only
    /// support searching in one direction.
    ///
    /// ```
    /// use regex_automata::{dense, DFA};
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?;
    /// assert_eq!(Some(0), dfa.rfind(b"foo12345"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    #[inline]
    fn rfind(&self, bytes: &[u8]) -> Option<usize> {
        self.rfind_at(bytes, bytes.len())
    }
    /// Returns the same as `is_match`, but starts the search at the given
    /// offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, if the DFA is anchored, then
    /// a match can only occur when `start == 0`.
    #[inline]
    fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
        if self.is_anchored() && start > 0 {
            return false;
        }
        let mut state = self.start_state();
        // The start state may itself be a match (empty match) or dead
        // (DFA matches nothing); either way we can answer immediately.
        if self.is_match_or_dead_state(state) {
            return self.is_match_state(state);
        }
        for &b in bytes[start..].iter() {
            // SAFETY: `state` comes from `start_state` or a previous
            // transition, so it is a valid state id, and the trait contract
            // requires `next_state_unchecked` to be safe for all valid ids
            // and all byte values.
            state = unsafe { self.next_state_unchecked(state, b) };
            if self.is_match_or_dead_state(state) {
                return self.is_match_state(state);
            }
        }
        false
    }
    /// Returns the same as `shortest_match`, but starts the search at the
    /// given offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, if the DFA is anchored, then
    /// a match can only occur when `start == 0`.
    #[inline]
    fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
        if self.is_anchored() && start > 0 {
            return None;
        }
        let mut state = self.start_state();
        // A matching start state is an empty match ending at `start`.
        if self.is_match_or_dead_state(state) {
            return if self.is_dead_state(state) { None } else { Some(start) };
        }
        for (i, &b) in bytes[start..].iter().enumerate() {
            // SAFETY: `state` is always a valid id; see `is_match_at`.
            state = unsafe { self.next_state_unchecked(state, b) };
            if self.is_match_or_dead_state(state) {
                return if self.is_dead_state(state) {
                    None
                } else {
                    // `i` indexes relative to `start`; the match ends just
                    // after the byte that entered the match state.
                    Some(start + i + 1)
                };
            }
        }
        None
    }
    /// Returns the same as `find`, but starts the search at the given
    /// offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, if the DFA is anchored, then
    /// a match can only occur when `start == 0`.
    #[inline]
    fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
        if self.is_anchored() && start > 0 {
            return None;
        }
        let mut state = self.start_state();
        // Track the end offset of the most recent match state entered;
        // unlike `shortest_match_at`, the scan continues past matches until
        // the input ends or a dead state is reached.
        let mut last_match = if self.is_dead_state(state) {
            return None;
        } else if self.is_match_state(state) {
            Some(start)
        } else {
            None
        };
        for (i, &b) in bytes[start..].iter().enumerate() {
            // SAFETY: `state` is always a valid id; see `is_match_at`.
            state = unsafe { self.next_state_unchecked(state, b) };
            if self.is_match_or_dead_state(state) {
                if self.is_dead_state(state) {
                    return last_match;
                }
                last_match = Some(start + i + 1);
            }
        }
        last_match
    }
    /// Returns the same as `rfind`, but starts the search at the given
    /// offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, if the DFA is anchored, then
    /// a match can only occur when `start == bytes.len()`.
    // NOTE(review): every other default method here is `#[inline]`; confirm
    // whether `#[inline(never)]` is intentional or a profiling leftover.
    #[inline(never)]
    fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
        if self.is_anchored() && start < bytes.len() {
            return None;
        }
        let mut state = self.start_state();
        let mut last_match = if self.is_dead_state(state) {
            return None;
        } else if self.is_match_state(state) {
            Some(start)
        } else {
            None
        };
        // Scan backwards; `enumerate` runs before `rev`, so `i` is the true
        // index of each byte, which is exactly the match *start* offset.
        for (i, &b) in bytes[..start].iter().enumerate().rev() {
            // SAFETY: `state` is always a valid id; see `is_match_at`.
            state = unsafe { self.next_state_unchecked(state, b) };
            if self.is_match_or_dead_state(state) {
                if self.is_dead_state(state) {
                    return last_match;
                }
                last_match = Some(i);
            }
        }
        last_match
    }
}
/// A shared reference to a DFA is itself a DFA: every method simply forwards
/// to the underlying implementation. This lets search routines accept either
/// owned or borrowed automata.
impl<'a, T: DFA> DFA for &'a T {
    type ID = T::ID;
    #[inline]
    fn start_state(&self) -> Self::ID {
        (**self).start_state()
    }
    #[inline]
    fn is_match_state(&self, id: Self::ID) -> bool {
        (**self).is_match_state(id)
    }
    #[inline]
    fn is_match_or_dead_state(&self, id: Self::ID) -> bool {
        (**self).is_match_or_dead_state(id)
    }
    #[inline]
    fn is_dead_state(&self, id: Self::ID) -> bool {
        (**self).is_dead_state(id)
    }
    #[inline]
    fn is_anchored(&self) -> bool {
        (**self).is_anchored()
    }
    #[inline]
    fn next_state(&self, current: Self::ID, input: u8) -> Self::ID {
        (**self).next_state(current, input)
    }
    #[inline]
    unsafe fn next_state_unchecked(
        &self,
        current: Self::ID,
        input: u8,
    ) -> Self::ID {
        (**self).next_state_unchecked(current, input)
    }
}

View file

@ -0,0 +1,150 @@
use std::error;
use std::fmt;
use std::result;
use regex_syntax;
/// A type alias for `Result` specialized to this crate's [`Error`], used by
/// all fallible DFA-construction APIs.
pub type Result<T> = result::Result<T, Error>;
/// An error that occurred during the construction of a DFA.
#[derive(Clone, Debug)]
pub struct Error {
    // The specific failure; exposed read-only via `Error::kind`.
    kind: ErrorKind,
}
/// The kind of error that occurred.
#[derive(Clone, Debug)]
pub enum ErrorKind {
    /// An error that occurred while parsing a regular expression. Note that
    /// this error may be printed over multiple lines, and is generally
    /// intended to be end user readable on its own.
    Syntax(String),
    /// An error that occurred because an unsupported regex feature was used.
    /// The message string describes which unsupported feature was used.
    ///
    /// The primary regex features that are unsupported are those that require
    /// look-around, such as the `^` and `$` anchors and the word boundary
    /// assertion `\b`. These may be supported in the future.
    Unsupported(String),
    /// An error that occurred when attempting to serialize a DFA to bytes.
    /// The message string describes what went wrong.
    Serialize(String),
    /// An error that occurs when constructing a DFA would require the use of
    /// a state ID that overflows the chosen state ID representation. For
    /// example, if one is using `u8` for state IDs and builds a DFA with
    /// 257 states, then the last state's ID will be `256` which cannot be
    /// represented with `u8`.
    ///
    /// Typically, this error occurs in the determinization process of building
    /// a DFA (the conversion step from NFA to DFA). It can also occur when
    /// trying to build a smaller DFA from an existing one.
    StateIDOverflow {
        /// The maximum possible state ID.
        max: usize,
    },
    /// An error that occurs when premultiplication of state IDs is requested,
    /// but doing so would overflow the chosen state ID representation.
    ///
    /// When `max == requested_max`, then the state ID would overflow `usize`.
    PremultiplyOverflow {
        /// The maximum possible state id.
        max: usize,
        /// The maximum ID required by premultiplication.
        requested_max: usize,
    },
}
impl Error {
/// Return the kind of this error.
pub fn kind(&self) -> &ErrorKind {
&self.kind
}
pub(crate) fn syntax(err: regex_syntax::Error) -> Error {
Error { kind: ErrorKind::Syntax(err.to_string()) }
}
pub(crate) fn unsupported_anchor() -> Error {
let msg = r"anchors such as ^, $, \A and \z are not supported";
Error { kind: ErrorKind::Unsupported(msg.to_string()) }
}
pub(crate) fn unsupported_word() -> Error {
let msg = r"word boundary assertions (\b and \B) are not supported";
Error { kind: ErrorKind::Unsupported(msg.to_string()) }
}
pub(crate) fn unsupported_longest_match() -> Error {
let msg = "unachored searches with longest match \
semantics are not supported";
Error { kind: ErrorKind::Unsupported(msg.to_string()) }
}
pub(crate) fn serialize(message: &str) -> Error {
Error { kind: ErrorKind::Serialize(message.to_string()) }
}
pub(crate) fn state_id_overflow(max: usize) -> Error {
Error { kind: ErrorKind::StateIDOverflow { max } }
}
pub(crate) fn premultiply_overflow(
max: usize,
requested_max: usize,
) -> Error {
Error { kind: ErrorKind::PremultiplyOverflow { max, requested_max } }
}
}
impl error::Error for Error {
fn description(&self) -> &str {
match self.kind {
ErrorKind::Syntax(_) => "syntax error",
ErrorKind::Unsupported(_) => "unsupported syntax",
ErrorKind::Serialize(_) => "serialization error",
ErrorKind::StateIDOverflow { .. } => {
"state id representation too small"
}
ErrorKind::PremultiplyOverflow { .. } => {
"state id representation too small for premultiplication"
}
}
}
}
impl fmt::Display for Error {
    /// Formats a human-readable message describing this error.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.kind {
            // Syntax and unsupported-feature messages are self-contained.
            ErrorKind::Syntax(ref msg) => write!(f, "{}", msg),
            ErrorKind::Unsupported(ref msg) => write!(f, "{}", msg),
            ErrorKind::Serialize(ref msg) => {
                write!(f, "DFA serialization error: {}", msg)
            }
            // Bug fix: the message previously read "more states that can
            // be identified"; the correct comparative is "than".
            ErrorKind::StateIDOverflow { max } => write!(
                f,
                "building the DFA failed because it required building \
                 more states than can be identified, where the maximum \
                 ID for the chosen representation is {}",
                max,
            ),
            ErrorKind::PremultiplyOverflow { max, requested_max } => {
                // `max == requested_max` signals that even `usize` itself
                // cannot represent the required state ID.
                if max == requested_max {
                    write!(
                        f,
                        "premultiplication of states requires the ability to \
                         represent a state ID greater than what can fit on \
                         this platform's usize, which is {}",
                        ::std::usize::MAX,
                    )
                } else {
                    write!(
                        f,
                        "premultiplication of states requires the ability to \
                         represent at least a state ID of {}, but the chosen \
                         representation only permits a maximum state ID of {}",
                        requested_max, max,
                    )
                }
            }
        }
    }
}

View file

@ -0,0 +1,360 @@
/*!
A low level regular expression library that uses deterministic finite automata.
It supports a rich syntax with Unicode support, has extensive options for
configuring the best space vs time trade off for your use case and provides
support for cheap deserialization of automata for use in `no_std` environments.
# Overview
This section gives a brief overview of the primary types in this crate:
* A [`Regex`](struct.Regex.html) provides a way to search for matches of a
regular expression. This includes iterating over matches with both the start
and end positions of each match.
* A [`RegexBuilder`](struct.RegexBuilder.html) provides a way to configure many
compilation options for a regex.
* A [`DenseDFA`](enum.DenseDFA.html) provides low level access to a DFA that
uses a dense representation (uses lots of space, but fast searching).
* A [`SparseDFA`](enum.SparseDFA.html) provides the same API as a `DenseDFA`,
but uses a sparse representation (uses less space, but slower matching).
* A [`DFA`](trait.DFA.html) trait that defines an interface that all DFAs must
implement.
* Both dense DFAs and sparse DFAs support
[serialization to raw bytes](enum.DenseDFA.html#method.to_bytes_little_endian)
and
[cheap deserialization](enum.DenseDFA.html#method.from_bytes).
# Example: basic regex searching
This example shows how to compile a regex using the default configuration
and then use it to find matches in a byte string:
```
use regex_automata::Regex;
let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
let text = b"2018-12-24 2016-10-08";
let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
assert_eq!(matches, vec![(0, 10), (11, 21)]);
```
# Example: use sparse DFAs
By default, compiling a regex will use dense DFAs internally. This uses more
memory, but executes searches more quickly. If you can abide slower searches
(somewhere around 3-5x), then sparse DFAs might make more sense since they can
use significantly less space.
Using sparse DFAs is as easy as using `Regex::new_sparse` instead of
`Regex::new`:
```
use regex_automata::Regex;
# fn example() -> Result<(), regex_automata::Error> {
let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
let text = b"2018-12-24 2016-10-08";
let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
assert_eq!(matches, vec![(0, 10), (11, 21)]);
# Ok(()) }; example().unwrap()
```
If you already have dense DFAs for some reason, they can be converted to sparse
DFAs and used to build a new `Regex`. For example:
```
use regex_automata::Regex;
# fn example() -> Result<(), regex_automata::Error> {
let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
let sparse_re = Regex::from_dfas(
dense_re.forward().to_sparse()?,
dense_re.reverse().to_sparse()?,
);
let text = b"2018-12-24 2016-10-08";
let matches: Vec<(usize, usize)> = sparse_re.find_iter(text).collect();
assert_eq!(matches, vec![(0, 10), (11, 21)]);
# Ok(()) }; example().unwrap()
```
# Example: deserialize a DFA
This shows how to first serialize a DFA into raw bytes, and then deserialize
those raw bytes back into a DFA. While this particular example is a bit
contrived, this same technique can be used in your program to deserialize a
DFA at start up time or by memory mapping a file. In particular,
deserialization is guaranteed to be cheap because it will always be a constant
time operation.
```
use regex_automata::{DenseDFA, Regex};
# fn example() -> Result<(), regex_automata::Error> {
let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
// serialize both the forward and reverse DFAs, see note below
let fwd_bytes = re1.forward().to_u16()?.to_bytes_native_endian()?;
let rev_bytes = re1.reverse().to_u16()?.to_bytes_native_endian()?;
// now deserialize both---we need to specify the correct type!
let fwd: DenseDFA<&[u16], u16> = unsafe { DenseDFA::from_bytes(&fwd_bytes) };
let rev: DenseDFA<&[u16], u16> = unsafe { DenseDFA::from_bytes(&rev_bytes) };
// finally, reconstruct our regex
let re2 = Regex::from_dfas(fwd, rev);
// we can use it like normal
let text = b"2018-12-24 2016-10-08";
let matches: Vec<(usize, usize)> = re2.find_iter(text).collect();
assert_eq!(matches, vec![(0, 10), (11, 21)]);
# Ok(()) }; example().unwrap()
```
There are a few points worth noting here:
* We need to extract the raw DFAs used by the regex and serialize those. You
can build the DFAs manually yourself using
[`dense::Builder`](dense/struct.Builder.html), but using the DFAs from a
`Regex` guarantees that the DFAs are built correctly.
* We specifically convert the dense DFA to a representation that uses `u16`
for its state identifiers using
[`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16). While this isn't
strictly necessary, if we skipped this step, then the serialized bytes would
use `usize` for state identifiers, which does not have a fixed size. Using
`u16` ensures that we can deserialize this DFA even on platforms with a
smaller pointer size. If our DFA is too big for `u16` state identifiers, then
one can use `u32` or `u64`.
* To convert the DFA to raw bytes, we use the `to_bytes_native_endian`
method. In practice, you'll want to use either
[`DenseDFA::to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian)
or
[`DenseDFA::to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian),
depending on which platform you're deserializing your DFA from. If you intend
to deserialize on either platform, then you'll need to serialize both and
deserialize the right one depending on your target's endianness.
* Deserializing a DFA requires the use of `unsafe` because the raw bytes must
be *trusted*. In particular, while some degree of sanity checks are
performed, nothing guarantees the integrity of the DFA's transition table
since deserialization is a constant time operation. Since searching with a
DFA must be able to follow transitions blindly for performance reasons,
giving incorrect bytes to the deserialization API can result in memory
unsafety.
The same process can be achieved with sparse DFAs as well:
```
use regex_automata::{SparseDFA, Regex};
# fn example() -> Result<(), regex_automata::Error> {
let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
// serialize both
let fwd_bytes = re1.forward().to_u16()?.to_sparse()?.to_bytes_native_endian()?;
let rev_bytes = re1.reverse().to_u16()?.to_sparse()?.to_bytes_native_endian()?;
// now deserialize both---we need to specify the correct type!
let fwd: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(&fwd_bytes) };
let rev: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(&rev_bytes) };
// finally, reconstruct our regex
let re2 = Regex::from_dfas(fwd, rev);
// we can use it like normal
let text = b"2018-12-24 2016-10-08";
let matches: Vec<(usize, usize)> = re2.find_iter(text).collect();
assert_eq!(matches, vec![(0, 10), (11, 21)]);
# Ok(()) }; example().unwrap()
```
Note that unlike dense DFAs, sparse DFAs have no alignment requirements.
Conversely, dense DFAs must be aligned to the same alignment as their
state identifier representation.
# Support for `no_std`
This crate comes with a `std` feature that is enabled by default. When the
`std` feature is enabled, the API of this crate will include the facilities
necessary for compiling, serializing, deserializing and searching with regular
expressions. When the `std` feature is disabled, the API of this crate will
shrink such that it only includes the facilities necessary for deserializing
and searching with regular expressions.
The intended workflow for `no_std` environments is thus as follows:
* Write a program with the `std` feature that compiles and serializes a
regular expression. Serialization should only happen after first converting
the DFAs to use a fixed size state identifier instead of the default `usize`.
You may also need to serialize both little and big endian versions of each
DFA. (So that's 4 DFAs in total for each regex.)
* In your `no_std` environment, follow the examples above for deserializing
your previously serialized DFAs into regexes. You can then search with them
as you would any regex.
Deserialization can happen anywhere. For example, with bytes embedded into a
binary or with a file memory mapped at runtime.
Note that the
[`ucd-generate`](https://github.com/BurntSushi/ucd-generate)
tool will do the first step for you with its `dfa` or `regex` sub-commands.
# Syntax
This crate supports the same syntax as the `regex` crate, since they share the
same parser. You can find an exhaustive list of supported syntax in the
[documentation for the `regex` crate](https://docs.rs/regex/1.1/regex/#syntax).
Currently, there are a couple limitations. In general, this crate does not
support zero-width assertions, although they may be added in the future. This
includes:
* Anchors such as `^`, `$`, `\A` and `\z`.
* Word boundary assertions such as `\b` and `\B`.
It is possible to run a search that is anchored at the beginning of the input.
To do that, set the
[`RegexBuilder::anchored`](struct.RegexBuilder.html#method.anchored)
option when building a regex. By default, all searches are unanchored.
# Differences with the regex crate
The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
general purpose regular expression engine. It aims to automatically balance low
compile times, fast search times and low memory usage, while also providing
a convenient API for users. In contrast, this crate provides a lower level
regular expression interface that is a bit less convenient while providing more
explicit control over memory usage and search times.
Here are some specific negative differences:
* **Compilation can take an exponential amount of time and space** in the size
of the regex pattern. While most patterns do not exhibit worst case
exponential time, such patterns do exist. For example, `[01]*1[01]{N}` will
build a DFA with `2^(N+1)` states. For this reason, untrusted patterns should
not be compiled with this library. (In the future, the API may expose an
option to return an error if the DFA gets too big.)
* This crate does not support sub-match extraction, which can be achieved with
the regex crate's "captures" API. This may be added in the future, but is
unlikely.
* While the regex crate doesn't necessarily sport fast compilation times, the
regexes in this crate are almost universally slow to compile, especially when
they contain large Unicode character classes. For example, on my system,
compiling `\w{3}` with byte classes enabled takes just over 1 second and
almost 5MB of memory! (Compiling a sparse regex takes about the same time
but only uses about 500KB of memory.) Conversely, compiling the same regex
without Unicode support, e.g., `(?-u)\w{3}`, takes under 1 millisecond and
less than 5KB of memory. For this reason, you should only use Unicode
character classes if you absolutely need them!
* This crate does not support regex sets.
* This crate does not support zero-width assertions such as `^`, `$`, `\b` or
`\B`.
* As a lower level crate, this library does not do literal optimizations. In
exchange, you get predictable performance regardless of input. The
philosophy here is that literal optimizations should be applied at a higher
level, although there is no easy support for this in the ecosystem yet.
* There is no `&str` API like in the regex crate. In this crate, all APIs
operate on `&[u8]`. By default, match indices are guaranteed to fall on
UTF-8 boundaries, unless
[`RegexBuilder::allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8)
is enabled.
With some of the downsides out of the way, here are some positive differences:
* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
deserialized. Deserialization always takes constant time since searching can
be performed directly on the raw serialized bytes of a DFA.
* This crate was specifically designed so that the searching phase of a DFA has
minimal runtime requirements, and can therefore be used in `no_std`
environments. While `no_std` environments cannot compile regexes, they can
deserialize pre-compiled regexes.
* Since this crate builds DFAs ahead of time, it will generally out-perform
the `regex` crate on equivalent tasks. The performance difference is likely
not large. However, because of a complex set of optimizations in the regex
crate (like literal optimizations), an accurate performance comparison may be
difficult to do.
* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search
performance a small amount, but uses much less storage space. Potentially
even less than what the regex crate uses.
* This crate exposes DFAs directly, such as
[`DenseDFA`](enum.DenseDFA.html) and [`SparseDFA`](enum.SparseDFA.html),
which enables one to do less work in some cases. For example, if you only
need the end of a match and not the start of a match, then you can use a DFA
directly without building a `Regex`, which always requires a second DFA to
find the start of a match.
* Aside from choosing between dense and sparse DFAs, there are several options
for configuring the space usage vs search time trade off. These include
things like choosing a smaller state identifier representation, to
premultiplying state identifiers and splitting a DFA's alphabet into
equivalence classes. Finally, DFA minimization is also provided, but can
increase compilation times dramatically.
*/
#![deny(missing_docs)]
#![cfg_attr(not(feature = "std"), no_std)]
#[cfg(feature = "std")]
extern crate core;
#[cfg(all(test, feature = "transducer"))]
extern crate bstr;
#[cfg(feature = "transducer")]
extern crate fst;
#[cfg(feature = "std")]
extern crate regex_syntax;
pub use dense::DenseDFA;
pub use dfa::DFA;
#[cfg(feature = "std")]
pub use error::{Error, ErrorKind};
pub use regex::Regex;
#[cfg(feature = "std")]
pub use regex::RegexBuilder;
pub use sparse::SparseDFA;
pub use state_id::StateID;
mod byteorder;
mod classes;
#[path = "dense.rs"]
mod dense_imp;
#[cfg(feature = "std")]
mod determinize;
mod dfa;
#[cfg(feature = "std")]
mod error;
#[cfg(feature = "std")]
mod minimize;
#[cfg(feature = "std")]
#[doc(hidden)]
pub mod nfa;
mod regex;
#[path = "sparse.rs"]
mod sparse_imp;
#[cfg(feature = "std")]
mod sparse_set;
mod state_id;
#[cfg(feature = "transducer")]
mod transducer;
/// Types and routines specific to dense DFAs.
///
/// This module is the home of [`DenseDFA`](enum.DenseDFA.html) and each of its
/// corresponding variant DFA types, such as [`Standard`](struct.Standard.html)
/// and [`ByteClass`](struct.ByteClass.html).
///
/// This module also contains a [builder](struct.Builder.html) for
/// configuring the construction of a dense DFA.
pub mod dense {
    // The implementation lives in src/dense.rs (compiled as `dense_imp`);
    // re-export it here so the public path is `dense::*`.
    pub use dense_imp::*;
}
/// Types and routines specific to sparse DFAs.
///
/// This module is the home of [`SparseDFA`](enum.SparseDFA.html) and each of
/// its corresponding variant DFA types, such as
/// [`Standard`](struct.Standard.html) and
/// [`ByteClass`](struct.ByteClass.html).
///
/// Unlike the [`dense`](../dense/index.html) module, this module does not
/// contain a builder specific for sparse DFAs. Instead, the intended way to
/// build a sparse DFA is either by using a default configuration with its
/// [constructor](enum.SparseDFA.html#method.new),
/// or by first
/// [configuring the construction of a dense DFA](../dense/struct.Builder.html)
/// and then calling
/// [`DenseDFA::to_sparse`](../enum.DenseDFA.html#method.to_sparse).
pub mod sparse {
    // The implementation lives in src/sparse.rs (compiled as `sparse_imp`);
    // re-export it here so the public path is `sparse::*`.
    pub use sparse_imp::*;
}

View file

@ -0,0 +1,373 @@
use std::cell::RefCell;
use std::fmt;
use std::mem;
use std::rc::Rc;
use dense;
use state_id::{dead_id, StateID};
// Shorthand for the owned, dense DFA representation that minimization
// operates on in place.
type DFARepr<S> = dense::Repr<Vec<S>, S>;
/// An implementation of Hopcroft's algorithm for minimizing DFAs.
///
/// The algorithm implemented here is mostly taken from Wikipedia:
/// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm
///
/// This code has had some light optimization attention paid to it,
/// particularly in the form of reducing allocation as much as possible.
/// However, it is still generally slow. Future optimization work should
/// probably focus on the bigger picture rather than micro-optimizations. For
/// example:
///
/// 1. Figure out how to more intelligently create initial partitions. That is,
///    Hopcroft's algorithm starts by creating two partitions of DFA states
///    that are known to NOT be equivalent: match states and non-match states.
///    The algorithm proceeds by progressively refining these partitions into
///    smaller partitions. If we could start with more partitions, then we
///    could reduce the amount of work that Hopcroft's algorithm needs to do.
/// 2. For every partition that we visit, we find all incoming transitions to
///    every state in the partition for *every* element in the alphabet. (This
///    is why using byte classes can significantly decrease minimization times,
///    since byte classes shrink the alphabet.) This is quite costly and there
///    is perhaps some redundant work being performed depending on the specific
///    states in the set. For example, we might be able to only visit some
///    elements of the alphabet based on the transitions.
/// 3. Move parts of minimization into determinization. If minimization has
///    fewer states to deal with, then it should run faster. A prime example
///    of this might be large Unicode classes, which are generated in a way
///    that can create a lot of redundant states. (Some work has been done on
///    this point during NFA compilation via the algorithm described in the
///    "Incremental Construction of Minimal Acyclic Finite-State Automata"
///    paper.)
pub(crate) struct Minimizer<'a, S: 'a> {
    // The DFA being minimized; rewritten in place by `run`.
    dfa: &'a mut DFARepr<S>,
    // in_transitions[state][byte] is the list of states that transition to
    // `state` on `byte` (i.e., the reverse of the DFA's transition table).
    in_transitions: Vec<Vec<Vec<S>>>,
    // The current partitioning of states into equivalence classes.
    partitions: Vec<StateSet<S>>,
    // The sets still waiting to be used as "distinguishers" for refining
    // the partitions.
    waiting: Vec<StateSet<S>>,
}
impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> {
    /// Formats every field of the minimizer using the standard struct style.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let mut dbg = f.debug_struct("Minimizer");
        dbg.field("dfa", &self.dfa);
        dbg.field("in_transitions", &self.in_transitions);
        dbg.field("partitions", &self.partitions);
        dbg.field("waiting", &self.waiting);
        dbg.finish()
    }
}
/// A set of states. A state set makes up a single partition in Hopcroft's
/// algorithm.
///
/// It is represented by an ordered set of state identifiers. We use shared
/// ownership so that a single state set can be in both the set of partitions
/// and in the set of waiting sets simultaneously without an additional
/// allocation. Generally, once a state set is built, it becomes immutable.
///
/// We use this representation because it avoids the overhead of more
/// traditional set data structures (HashSet/BTreeSet), and also because
/// computing intersection/subtraction on this representation is especially
/// fast.
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
// Invariant: after `canonicalize`, the vector is sorted ascending with no
// duplicates; `min`, `intersection` and `subtract` rely on this ordering.
struct StateSet<S>(Rc<RefCell<Vec<S>>>);
impl<'a, S: StateID> Minimizer<'a, S> {
    /// Create a new minimizer for the given DFA, precomputing the reverse
    /// transition table and the initial match/non-match partitions.
    pub fn new(dfa: &'a mut DFARepr<S>) -> Minimizer<'a, S> {
        let in_transitions = Minimizer::incoming_transitions(dfa);
        let partitions = Minimizer::initial_partitions(dfa);
        // Seed the waiting list with the smallest initial partition
        // (`initial_partitions` sorts partitions by length, ascending).
        let waiting = vec![partitions[0].clone()];
        Minimizer { dfa, in_transitions, partitions, waiting }
    }

    /// Run Hopcroft's algorithm and rewrite the DFA in place so that it
    /// contains exactly one state per equivalence class of states.
    pub fn run(mut self) {
        // Scratch sets, reused across iterations to avoid reallocation.
        let mut incoming = StateSet::empty();
        let mut scratch1 = StateSet::empty();
        let mut scratch2 = StateSet::empty();
        let mut newparts = vec![];

        // Refine the partitions until no distinguishing set remains.
        while let Some(set) = self.waiting.pop() {
            for b in (0..self.dfa.alphabet_len()).map(|b| b as u8) {
                // Collect every state that transitions into `set` on `b`.
                self.find_incoming_to(b, &set, &mut incoming);

                for p in 0..self.partitions.len() {
                    // scratch1 = partitions[p] ∩ incoming
                    self.partitions[p].intersection(&incoming, &mut scratch1);
                    if scratch1.is_empty() {
                        // No state in this partition reaches `set` on `b`,
                        // so `set` does not split it. Keep it unchanged.
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    // scratch2 = partitions[p] - incoming
                    self.partitions[p].subtract(&incoming, &mut scratch2);
                    if scratch2.is_empty() {
                        // Every state reaches `set` on `b`; again no split.
                        newparts.push(self.partitions[p].clone());
                        continue;
                    }

                    // `incoming` splits this partition into two halves.
                    let (x, y) =
                        (scratch1.deep_clone(), scratch2.deep_clone());
                    newparts.push(x.clone());
                    newparts.push(y.clone());
                    match self.find_waiting(&self.partitions[p]) {
                        Some(i) => {
                            // The split partition was itself waiting, so
                            // replace it with both of its halves.
                            self.waiting[i] = x;
                            self.waiting.push(y);
                        }
                        None => {
                            // Otherwise, only the smaller half needs to be
                            // queued; this choice is what gives Hopcroft's
                            // algorithm its O(n log n) refinement bound.
                            if x.len() <= y.len() {
                                self.waiting.push(x);
                            } else {
                                self.waiting.push(y);
                            }
                        }
                    }
                }
                // Swap in the refined partitioning and recycle the old
                // vector as the scratch list for the next byte.
                newparts = mem::replace(&mut self.partitions, newparts);
                newparts.clear();
            }
        }

        // At this point, we now have a minimal partitioning of states, where
        // each partition is an equivalence class of DFA states. Now we need to
        // use this partitioning to update the DFA to only contain one state
        // for each partition.

        // Create a map from DFA state ID to the representative ID of the
        // equivalence class to which it belongs. The representative ID of an
        // equivalence class of states is the minimum ID in that class.
        let mut state_to_part = vec![dead_id(); self.dfa.state_count()];
        for p in &self.partitions {
            p.iter(|id| state_to_part[id.to_usize()] = p.min());
        }

        // Generate a new contiguous sequence of IDs for minimal states, and
        // create a map from equivalence IDs to the new IDs. Thus, the new
        // minimal ID of *any* state in the unminimized DFA can be obtained
        // with minimal_ids[state_to_part[old_id]].
        let mut minimal_ids = vec![dead_id(); self.dfa.state_count()];
        let mut new_id = S::from_usize(0);
        for (id, _) in self.dfa.states() {
            if state_to_part[id.to_usize()] == id {
                minimal_ids[id.to_usize()] = new_id;
                new_id = S::from_usize(new_id.to_usize() + 1);
            }
        }
        // The total number of states in the minimal DFA.
        let minimal_count = new_id.to_usize();

        // Re-map this DFA in place such that the only states remaining
        // correspond to the representative states of every equivalence class.
        for id in (0..self.dfa.state_count()).map(S::from_usize) {
            // If this state isn't a representative for an equivalence class,
            // then we skip it since it won't appear in the minimal DFA.
            if state_to_part[id.to_usize()] != id {
                continue;
            }
            // Rewrite each transition target to its new minimal ID.
            for (_, next) in self.dfa.get_state_mut(id).iter_mut() {
                *next = minimal_ids[state_to_part[next.to_usize()].to_usize()];
            }
            self.dfa.swap_states(id, minimal_ids[id.to_usize()]);
        }

        // Trim off all unused states from the pre-minimized DFA. This
        // represents all states that were merged into a non-singleton
        // equivalence class of states, and appeared after the first state
        // in each such class. (Because the state with the smallest ID in each
        // equivalence class is its representative ID.)
        self.dfa.truncate_states(minimal_count);

        // Update the new start state, which is now just the minimal ID of
        // whatever state the old start state was collapsed into.
        let old_start = self.dfa.start_state();
        self.dfa.set_start_state(
            minimal_ids[state_to_part[old_start.to_usize()].to_usize()],
        );

        // In order to update the ID of the maximum match state, we need to
        // find the maximum ID among all of the match states in the minimized
        // DFA. This is not necessarily the new ID of the unminimized maximum
        // match state, since that could have been collapsed with a much
        // earlier match state. Therefore, to find the new max match state,
        // we iterate over all previous match states, find their corresponding
        // new minimal ID, and take the maximum of those.
        let old_max = self.dfa.max_match_state();
        self.dfa.set_max_match_state(dead_id());
        for id in (0..(old_max.to_usize() + 1)).map(S::from_usize) {
            let part = state_to_part[id.to_usize()];
            let new_id = minimal_ids[part.to_usize()];
            if new_id > self.dfa.max_match_state() {
                self.dfa.set_max_match_state(new_id);
            }
        }
    }

    /// Returns the position of `set` in the waiting list, if present.
    fn find_waiting(&self, set: &StateSet<S>) -> Option<usize> {
        self.waiting.iter().position(|s| s == set)
    }

    /// Compute, into `incoming`, the set of states with a transition on byte
    /// `b` into any state of `set`.
    fn find_incoming_to(
        &self,
        b: u8,
        set: &StateSet<S>,
        incoming: &mut StateSet<S>,
    ) {
        incoming.clear();
        set.iter(|id| {
            for &inid in &self.in_transitions[id.to_usize()][b as usize] {
                incoming.add(inid);
            }
        });
        // Sort and dedup so the merge-based set operations stay correct.
        incoming.canonicalize();
    }

    /// Build the initial partitions: match states and non-match states,
    /// ordered so that the smaller partition comes first.
    fn initial_partitions(dfa: &DFARepr<S>) -> Vec<StateSet<S>> {
        let mut is_match = StateSet::empty();
        let mut no_match = StateSet::empty();
        for (id, _) in dfa.states() {
            if dfa.is_match_state(id) {
                is_match.add(id);
            } else {
                no_match.add(id);
            }
        }

        let mut sets = vec![is_match];
        if !no_match.is_empty() {
            sets.push(no_match);
        }
        sets.sort_by_key(|s| s.len());
        sets
    }

    /// Build the reverse transition table: for each state and each input
    /// byte, the list of states that transition to it on that byte.
    fn incoming_transitions(dfa: &DFARepr<S>) -> Vec<Vec<Vec<S>>> {
        let mut incoming = vec![];
        for _ in dfa.states() {
            incoming.push(vec![vec![]; dfa.alphabet_len()]);
        }
        for (id, state) in dfa.states() {
            for (b, next) in state.transitions() {
                incoming[next.to_usize()][b as usize].push(id);
            }
        }
        incoming
    }
}
impl<S: StateID> StateSet<S> {
    /// Create a new, empty set.
    fn empty() -> StateSet<S> {
        StateSet(Rc::new(RefCell::new(vec![])))
    }

    /// Append a state ID. This does not maintain the sorted/deduped
    /// invariant; callers are expected to `canonicalize` when done adding.
    fn add(&mut self, id: S) {
        self.0.borrow_mut().push(id);
    }

    /// Return the minimum ID in this set.
    ///
    /// Assumes the set is non-empty and canonicalized (sorted), so the
    /// first element is the minimum; panics on an empty set.
    fn min(&self) -> S {
        self.0.borrow()[0]
    }

    /// Sort and deduplicate, restoring the ordered-set invariant.
    fn canonicalize(&mut self) {
        self.0.borrow_mut().sort();
        self.0.borrow_mut().dedup();
    }

    /// Remove all elements, keeping the allocation for reuse.
    fn clear(&mut self) {
        self.0.borrow_mut().clear();
    }

    fn len(&self) -> usize {
        self.0.borrow().len()
    }

    fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Copy the underlying vector itself, as opposed to `clone`, which only
    /// bumps the shared `Rc` reference count.
    fn deep_clone(&self) -> StateSet<S> {
        let ids = self.0.borrow().iter().cloned().collect();
        StateSet(Rc::new(RefCell::new(ids)))
    }

    /// Invoke `f` on each ID in this set, in order.
    fn iter<F: FnMut(S)>(&self, mut f: F) {
        for &id in self.0.borrow().iter() {
            f(id);
        }
    }

    /// Write the intersection of `self` and `other` into `dest`.
    ///
    /// Both inputs must be canonicalized; this is a linear-time merge over
    /// the two sorted sequences.
    fn intersection(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
        dest.clear();
        if self.is_empty() || other.is_empty() {
            return;
        }

        // Walk both sorted sequences in lock step, emitting common elements.
        let (seta, setb) = (self.0.borrow(), other.0.borrow());
        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
        loop {
            if a == b {
                dest.add(a);
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
                b = match itb.next() {
                    None => break,
                    Some(b) => b,
                };
            } else if a < b {
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
            } else {
                b = match itb.next() {
                    None => break,
                    Some(b) => b,
                };
            }
        }
    }

    /// Write the set difference `self - other` into `dest`.
    ///
    /// Both inputs must be canonicalized; this is a linear-time merge over
    /// the two sorted sequences.
    fn subtract(&self, other: &StateSet<S>, dest: &mut StateSet<S>) {
        dest.clear();
        if self.is_empty() || other.is_empty() {
            // Subtracting nothing (or from nothing) yields `self` itself.
            self.iter(|s| dest.add(s));
            return;
        }

        // Walk both sorted sequences in lock step, emitting elements of
        // `self` that never appear in `other`.
        let (seta, setb) = (self.0.borrow(), other.0.borrow());
        let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned());
        let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap());
        loop {
            if a == b {
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
                b = match itb.next() {
                    None => {
                        // `other` is exhausted, so the new `a` survives.
                        dest.add(a);
                        break;
                    }
                    Some(b) => b,
                };
            } else if a < b {
                dest.add(a);
                a = match ita.next() {
                    None => break,
                    Some(a) => a,
                };
            } else {
                b = match itb.next() {
                    None => {
                        dest.add(a);
                        break;
                    }
                    Some(b) => b,
                };
            }
        }
        // Any elements remaining in `self` cannot occur in `other`.
        for a in ita {
            dest.add(a);
        }
    }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,282 @@
// This module contains a couple simple and purpose built hash maps. The key
// trade off they make is that they serve as caches rather than true maps. That
// is, inserting a new entry may cause eviction of another entry. This gives
// us two things. First, there's less overhead associated with inserts and
// lookups. Secondly, it lets us control our memory usage.
//
// These maps are used in some fairly hot code when generating NFA states for
// large Unicode character classes.
//
// Instead of exposing a rich hashmap entry API, we just permit the caller
// to produce a hash of the key directly. The hash can then be reused for both
// lookups and insertions at the cost of leaking things a bit. But these are
// for internal use only, so it's fine.
//
// The Utf8BoundedMap is used for Daciuk's algorithm for constructing a
// (almost) minimal DFA for large Unicode character classes in linear time.
// (Daciuk's algorithm is always used when compiling forward NFAs. For reverse
// NFAs, it's only used when the compiler is configured to 'shrink' the NFA,
// since there's a bit more expense in the reverse direction.)
//
// The Utf8SuffixMap is used when compiling large Unicode character classes for
// reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive
// construction of UTF-8 automata by caching common suffixes. This doesn't
// get the same space savings as Daciuk's algorithm, but it's basically as
// fast as the naive approach and typically winds up using less memory (since
// it generates smaller NFAs) despite the presence of the cache.
//
// These maps effectively represent caching mechanisms for CState::Sparse and
// CState::Range, respectively. The former represents a single NFA state with
// many transitions of equivalent priority while the latter represents a single
// NFA state with a single transition. (Neither state ever has or is an
// epsilon transition.) Thus, they have different key types. It's likely we
// could make one generic map, but the machinery didn't seem worth it. They
// are simple enough.
use nfa::{StateID, Transition};
// Basic FNV-1a hash constants as described in:
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
// PRIME is the 64-bit FNV prime; INIT is the 64-bit FNV offset basis.
const PRIME: u64 = 1099511628211;
const INIT: u64 = 14695981039346656037;
/// A bounded hash map where the key is a sequence of NFA transitions and the
/// value is a pre-existing NFA state ID.
///
/// std's hashmap can be used for this, however, this map has two important
/// advantages. Firstly, it has lower overhead. Secondly, it permits us to
/// control our memory usage by limiting the number of slots. In general, the
/// cost here is that this map acts as a cache. That is, inserting a new entry
/// may remove an old entry. We are okay with this, since it does not impact
/// correctness in the cases where it is used. The only effect that dropping
/// states from the cache has is that the resulting NFA generated may be bigger
/// than it otherwise would be.
///
/// This improves benchmarks that compile large Unicode character classes,
/// since it makes the generation of (almost) minimal UTF-8 automaton faster.
/// Specifically, one could observe the difference with std's hashmap via
/// something like the following benchmark:
///
/// hyperfine "regex-automata-debug debug -acqr '\w{40} ecurB'"
///
/// But to observe that difference, you'd have to modify the code to use
/// std's hashmap.
///
/// It is quite possible that there is a better way to approach this problem.
/// For example, if there happens to be a very common state that collides with
/// a lot of less frequent states, then we could wind up with very poor caching
/// behavior. Alas, the effectiveness of this cache has not been measured.
/// Instead, ad hoc experiments suggest that it is "good enough." Additional
/// smarts (such as an LRU eviction policy) have to be weighed against the
/// amount of extra time they cost.
#[derive(Clone, Debug)]
pub struct Utf8BoundedMap {
    /// The current version of this map. Only entries with matching versions
    /// are considered during lookups. If an entry is found with a mismatched
    /// version, then the map behaves as if the entry does not exist.
    version: u16,
    /// The total number of entries this map can store.
    capacity: usize,
    /// The actual entries, keyed by hash. Collisions between different states
    /// result in the old state being dropped.
    map: Vec<Utf8BoundedEntry>,
}
/// An entry in this map.
///
/// A default constructed entry (as produced when the table is allocated)
/// carries version 0 and an empty key.
#[derive(Clone, Debug, Default)]
struct Utf8BoundedEntry {
    /// The version of the map used to produce this entry. If this entry's
    /// version does not match the current version of the map, then the map
    /// should behave as if this entry does not exist.
    version: u16,
    /// The key, which is a sorted sequence of non-overlapping NFA transitions.
    key: Vec<Transition>,
    /// The state ID corresponding to the state containing the transitions in
    /// this entry.
    val: StateID,
}
impl Utf8BoundedMap {
    /// Create a new bounded map with the given capacity. The map will never
    /// grow beyond the given size.
    ///
    /// Note that this does not allocate. Instead, callers must call `clear`
    /// before using this map. `clear` will allocate space if necessary.
    ///
    /// This avoids the need to pay for the allocation of this map when
    /// compiling regexes that lack large Unicode character classes.
    pub fn new(capacity: usize) -> Utf8BoundedMap {
        assert!(capacity > 0);
        Utf8BoundedMap { version: 0, capacity, map: vec![] }
    }

    /// Clear this map of all entries, but permit the reuse of allocation
    /// if possible.
    ///
    /// This must be called before the map can be used.
    pub fn clear(&mut self) {
        // The very first call allocates the table.
        if self.map.is_empty() {
            self.map = vec![Utf8BoundedEntry::default(); self.capacity];
            return;
        }
        // Subsequent calls invalidate every existing entry in constant time
        // by bumping the version counter.
        self.version = self.version.wrapping_add(1);
        // When the version wraps back to 0, stale entries written under an
        // old version 0 could be mistaken for live ones, so reallocate.
        if self.version == 0 {
            self.map = vec![Utf8BoundedEntry::default(); self.capacity];
        }
    }

    /// Return a hash of the given transitions.
    pub fn hash(&self, key: &[Transition]) -> usize {
        // FNV-1a over the raw components of every transition in the key.
        let hash = key.iter().fold(INIT, |h, t| {
            let h = (h ^ (t.start as u64)).wrapping_mul(PRIME);
            let h = (h ^ (t.end as u64)).wrapping_mul(PRIME);
            (h ^ (t.next as u64)).wrapping_mul(PRIME)
        });
        (hash as usize) % self.map.len()
    }

    /// Retrieve the cached state ID corresponding to the given key. The hash
    /// given must have been computed with `hash` using the same key value.
    ///
    /// If there is no cached state with the given transitions, then None is
    /// returned.
    pub fn get(&mut self, key: &[Transition], hash: usize) -> Option<StateID> {
        let entry = &self.map[hash];
        // The slot is only a hit when it is live for the current version
        // AND its key truly matches, since hash collisions are possible.
        if entry.version == self.version && entry.key == key {
            Some(entry.val)
        } else {
            None
        }
    }

    /// Add a cached state to this map with the given key. Callers should
    /// ensure that `state_id` points to a state that contains precisely the
    /// NFA transitions given.
    ///
    /// `hash` must have been computed using the `hash` method with the same
    /// key.
    pub fn set(
        &mut self,
        key: Vec<Transition>,
        hash: usize,
        state_id: StateID,
    ) {
        // Unconditionally overwrite the slot: this map is a cache, so
        // evicting a colliding entry is acceptable.
        self.map[hash] =
            Utf8BoundedEntry { version: self.version, key, val: state_id };
    }
}
/// A cache of suffixes used to modestly compress UTF-8 automata for large
/// Unicode character classes.
///
/// (Per the module comments above, this is used when compiling large Unicode
/// character classes for reverse NFAs when 'shrink' is disabled.)
#[derive(Clone, Debug)]
pub struct Utf8SuffixMap {
    /// The current version of this map. Only entries with matching versions
    /// are considered during lookups. If an entry is found with a mismatched
    /// version, then the map behaves as if the entry does not exist.
    version: u16,
    /// The total number of entries this map can store.
    capacity: usize,
    /// The actual entries, keyed by hash. Collisions between different states
    /// result in the old state being dropped.
    map: Vec<Utf8SuffixEntry>,
}
/// A key that uniquely identifies an NFA state. It is a triple that represents
/// a transition from one state for a particular byte range.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct Utf8SuffixKey {
    /// The state the transition originates from.
    pub from: StateID,
    /// The start of the transition's byte range.
    pub start: u8,
    /// The end of the transition's byte range.
    pub end: u8,
}
/// An entry in this map.
///
/// A default constructed entry (as produced when the table is allocated)
/// carries version 0 and a default key.
#[derive(Clone, Debug, Default)]
struct Utf8SuffixEntry {
    /// The version of the map used to produce this entry. If this entry's
    /// version does not match the current version of the map, then the map
    /// should behave as if this entry does not exist.
    version: u16,
    /// The key, which consists of a transition in a particular state.
    key: Utf8SuffixKey,
    /// The identifier that the transition in the key maps to.
    val: StateID,
}
impl Utf8SuffixMap {
    /// Create a new bounded map with the given capacity. The map will never
    /// grow beyond the given size.
    ///
    /// Note that this does not allocate. Instead, callers must call `clear`
    /// before using this map. `clear` will allocate space if necessary.
    ///
    /// This avoids the need to pay for the allocation of this map when
    /// compiling regexes that lack large Unicode character classes.
    pub fn new(capacity: usize) -> Utf8SuffixMap {
        assert!(capacity > 0);
        Utf8SuffixMap { version: 0, capacity, map: vec![] }
    }

    /// Clear this map of all entries, but permit the reuse of allocation
    /// if possible.
    ///
    /// This must be called before the map can be used.
    pub fn clear(&mut self) {
        // The very first call allocates the table.
        if self.map.is_empty() {
            self.map = vec![Utf8SuffixEntry::default(); self.capacity];
            return;
        }
        // Subsequent calls invalidate every existing entry in constant time
        // by bumping the version counter. On wrap-around, reallocate so that
        // stale entries cannot be mistaken for live ones.
        self.version = self.version.wrapping_add(1);
        if self.version == 0 {
            self.map = vec![Utf8SuffixEntry::default(); self.capacity];
        }
    }

    /// Return a hash of the given transition.
    pub fn hash(&self, key: &Utf8SuffixKey) -> usize {
        // Basic FNV-1a hash as described:
        // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
        const PRIME: u64 = 1099511628211;
        const INIT: u64 = 14695981039346656037;
        let h = (INIT ^ (key.from as u64)).wrapping_mul(PRIME);
        let h = (h ^ (key.start as u64)).wrapping_mul(PRIME);
        let h = (h ^ (key.end as u64)).wrapping_mul(PRIME);
        (h as usize) % self.map.len()
    }

    /// Retrieve the cached state ID corresponding to the given key. The hash
    /// given must have been computed with `hash` using the same key value.
    ///
    /// If there is no cached state with the given key, then None is returned.
    pub fn get(
        &mut self,
        key: &Utf8SuffixKey,
        hash: usize,
    ) -> Option<StateID> {
        let entry = &self.map[hash];
        // The slot is only a hit when it is live for the current version
        // AND its key truly matches, since hash collisions are possible.
        if entry.version == self.version && &entry.key == key {
            Some(entry.val)
        } else {
            None
        }
    }

    /// Add a cached state to this map with the given key. Callers should
    /// ensure that `state_id` points to a state that contains precisely the
    /// NFA transition given.
    ///
    /// `hash` must have been computed using the `hash` method with the same
    /// key.
    pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) {
        // Unconditionally overwrite the slot: this map is a cache, so
        // evicting a colliding entry is acceptable.
        self.map[hash] =
            Utf8SuffixEntry { version: self.version, key, val: state_id };
    }
}

View file

@ -0,0 +1,252 @@
use std::fmt;
use classes::ByteClasses;
pub use nfa::compiler::Builder;
mod compiler;
mod map;
mod range_trie;
/// The representation for an NFA state identifier.
///
/// State IDs index directly into an NFA's state list.
pub type StateID = usize;
/// A final compiled NFA.
///
/// The states of the NFA are indexed by state IDs, which are how transitions
/// are expressed.
///
/// An NFA may be constructed via the `Builder` re-exported from this module,
/// or via the `NFA::always_match`/`NFA::never_match` convenience
/// constructors.
#[derive(Clone)]
pub struct NFA {
    /// Whether this NFA can only match at the beginning of input or not.
    ///
    /// When true, a match should only be reported if it begins at the 0th
    /// index of the haystack.
    anchored: bool,
    /// The starting state of this NFA.
    start: StateID,
    /// The state list. This list is guaranteed to be indexable by the starting
    /// state ID, and it is also guaranteed to contain exactly one `Match`
    /// state.
    states: Vec<State>,
    /// A mapping from any byte value to its corresponding equivalence class
    /// identifier. Two bytes in the same equivalence class cannot discriminate
    /// between a match or a non-match. This map can be used to shrink the
    /// total size of a DFA's transition table with a small match-time cost.
    ///
    /// Note that the NFA's transitions are *not* defined in terms of these
    /// equivalence classes. The NFA's transitions are defined on the original
    /// byte values. For the most part, this is because they wouldn't really
    /// help the NFA much since the NFA already uses a sparse representation
    /// to represent transitions. Byte classes are most effective in a dense
    /// representation.
    byte_classes: ByteClasses,
}
impl NFA {
    /// Returns an NFA that always matches at every position.
    pub fn always_match() -> NFA {
        // A single `Match` state that is also the start state.
        let states = vec![State::Match];
        let byte_classes = ByteClasses::empty();
        NFA { anchored: false, start: 0, states, byte_classes }
    }

    /// Returns an NFA that never matches at any position.
    pub fn never_match() -> NFA {
        // A single `Fail` state guarantees that no match is ever reported.
        let states = vec![State::Fail];
        let byte_classes = ByteClasses::empty();
        NFA { anchored: false, start: 0, states, byte_classes }
    }

    /// Returns true if and only if this NFA is anchored, i.e., it may only
    /// match starting at the 0th index of the haystack.
    pub fn is_anchored(&self) -> bool {
        self.anchored
    }

    /// Return the number of states in this NFA.
    pub fn len(&self) -> usize {
        self.states.len()
    }

    /// Return the ID of the initial state of this NFA.
    pub fn start(&self) -> StateID {
        self.start
    }

    /// Return the NFA state corresponding to the given ID.
    ///
    /// This panics if `id` is not a valid index into this NFA's state list.
    pub fn state(&self, id: StateID) -> &State {
        &self.states[id]
    }

    /// Return the set of equivalence classes for this NFA. The slice returned
    /// always has length 256 and maps each possible byte value to its
    /// corresponding equivalence class ID (which is never more than 255).
    pub fn byte_classes(&self) -> &ByteClasses {
        &self.byte_classes
    }
}
impl fmt::Debug for NFA {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // One line per state; the start state is marked with a '>' prefix.
        for (id, state) in self.states.iter().enumerate() {
            let marker = if id == self.start { '>' } else { ' ' };
            writeln!(f, "{}{:06}: {:?}", marker, id, state)?;
        }
        Ok(())
    }
}
/// A state in a final compiled NFA.
///
/// Only `Union` states carry epsilon transitions; see `State::is_epsilon`.
#[derive(Clone, Eq, PartialEq)]
pub enum State {
    /// A state that transitions to `next` if and only if the current input
    /// byte is in the range `[start, end]` (inclusive).
    ///
    /// This is a special case of Sparse in that it encodes only one transition
    /// (and therefore avoids the allocation).
    Range { range: Transition },
    /// A state with possibly many transitions, represented in a sparse
    /// fashion. Transitions are ordered lexicographically by input range.
    /// As such, this may only be used when every transition has equal
    /// priority. (In practice, this is only used for encoding large UTF-8
    /// automata.)
    Sparse { ranges: Box<[Transition]> },
    /// An alternation such that there exists an epsilon transition to all
    /// states in `alternates`, where matches found via earlier transitions
    /// are preferred over later transitions.
    Union { alternates: Box<[StateID]> },
    /// A fail state. When encountered, the automaton is guaranteed to never
    /// reach a match state.
    Fail,
    /// A match state. There is exactly one such occurrence of this state in
    /// an NFA.
    Match,
}
/// A transition to another state, only if the given byte falls in the
/// inclusive range specified.
#[derive(Clone, Copy, Eq, Hash, PartialEq)]
pub struct Transition {
    /// The inclusive start of the byte range.
    pub start: u8,
    /// The inclusive end of the byte range.
    pub end: u8,
    /// The identifier of the state to transition to.
    pub next: StateID,
}
impl State {
    /// Returns true if and only if this state contains one or more epsilon
    /// transitions.
    pub fn is_epsilon(&self) -> bool {
        // Enumerate every variant so that adding a new variant forces this
        // routine to be revisited.
        match *self {
            State::Union { .. } => true,
            State::Range { .. }
            | State::Sparse { .. }
            | State::Fail
            | State::Match => false,
        }
    }

    /// Remap the transitions in this state using the given map. Namely, the
    /// given map should be indexed according to the transitions currently
    /// in this state.
    ///
    /// This is used during the final phase of the NFA compiler, which turns
    /// its intermediate NFA into the final NFA.
    fn remap(&mut self, remap: &[StateID]) {
        match *self {
            // States without transitions have nothing to remap.
            State::Fail | State::Match => {}
            State::Range { ref mut range } => {
                range.next = remap[range.next];
            }
            State::Sparse { ref mut ranges } => {
                for t in ranges.iter_mut() {
                    t.next = remap[t.next];
                }
            }
            State::Union { ref mut alternates } => {
                for id in alternates.iter_mut() {
                    *id = remap[*id];
                }
            }
        }
    }
}
impl fmt::Debug for State {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            State::Range { ref range } => range.fmt(f),
            State::Sparse { ref ranges } => {
                // Render each transition and comma-separate them.
                let mut parts = Vec::with_capacity(ranges.len());
                for t in ranges.iter() {
                    parts.push(format!("{:?}", t));
                }
                write!(f, "sparse({})", parts.join(", "))
            }
            State::Union { ref alternates } => {
                // Render each alternate state ID and comma-separate them.
                let mut parts = Vec::with_capacity(alternates.len());
                for id in alternates.iter() {
                    parts.push(id.to_string());
                }
                write!(f, "alt({})", parts.join(", "))
            }
            State::Fail => write!(f, "FAIL"),
            State::Match => write!(f, "MATCH"),
        }
    }
}
impl fmt::Debug for Transition {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // A degenerate single-byte range is printed without the `-end` part.
        if self.start == self.end {
            write!(f, "{} => {}", escape(self.start), self.next)
        } else {
            write!(
                f,
                "{}-{} => {}",
                escape(self.start),
                escape(self.end),
                self.next
            )
        }
    }
}
/// Return the given byte as its escaped string form.
fn escape(b: u8) -> String {
    use std::ascii;

    // `escape_default` only yields printable ASCII bytes, so converting
    // each byte directly to a `char` is lossless.
    ascii::escape_default(b).map(|b| b as char).collect()
}
#[cfg(test)]
mod tests {
    use super::*;
    use dense;
    use dfa::DFA;
    // A DFA built from `always_match` reports an (empty) match at every
    // position, including the position just past the end of the haystack.
    // (`find_at` returns the end offset of the match found when searching
    // from the given start position.)
    #[test]
    fn always_match() {
        let nfa = NFA::always_match();
        let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
        assert_eq!(Some(0), dfa.find_at(b"", 0));
        assert_eq!(Some(0), dfa.find_at(b"a", 0));
        assert_eq!(Some(1), dfa.find_at(b"a", 1));
        assert_eq!(Some(0), dfa.find_at(b"ab", 0));
        assert_eq!(Some(1), dfa.find_at(b"ab", 1));
        assert_eq!(Some(2), dfa.find_at(b"ab", 2));
    }
    // A DFA built from `never_match` reports no match at any position.
    #[test]
    fn never_match() {
        let nfa = NFA::never_match();
        let dfa = dense::Builder::new().build_from_nfa::<usize>(&nfa).unwrap();
        assert_eq!(None, dfa.find_at(b"", 0));
        assert_eq!(None, dfa.find_at(b"a", 0));
        assert_eq!(None, dfa.find_at(b"a", 1));
        assert_eq!(None, dfa.find_at(b"ab", 0));
        assert_eq!(None, dfa.find_at(b"ab", 1));
        assert_eq!(None, dfa.find_at(b"ab", 2));
    }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,771 @@
#[cfg(feature = "std")]
use dense::{self, DenseDFA};
use dfa::DFA;
#[cfg(feature = "std")]
use error::Result;
#[cfg(feature = "std")]
use sparse::SparseDFA;
#[cfg(feature = "std")]
use state_id::StateID;
/// A regular expression that uses deterministic finite automata for fast
/// searching.
///
/// A regular expression is comprised of two DFAs, a "forward" DFA and a
/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
/// match while the reverse DFA is responsible for detecting the start of a
/// match. Thus, in order to find the bounds of any given match, a forward
/// search must first be run followed by a reverse search. A match found by
/// the forward DFA guarantees that the reverse DFA will also find a match.
///
/// The type of the DFA used by a `Regex` corresponds to the `D` type
/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
/// search faster, while sparse DFAs use less memory but search more slowly.
///
/// By default, a regex's DFA type parameter is set to
/// `DenseDFA<Vec<usize>, usize>`. For most in-memory work loads, this is the
/// most convenient type that gives the best search performance.
///
/// # Sparse DFAs
///
/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
/// enough to build corresponding sparse DFAs, and then build a regex from
/// them:
///
/// ```
/// use regex_automata::Regex;
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// // First, build a regex that uses dense DFAs.
/// let dense_re = Regex::new("foo[0-9]+")?;
///
/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
/// let fwd = dense_re.forward().to_sparse()?;
/// let rev = dense_re.reverse().to_sparse()?;
///
/// // Third, build a new regex from the constituent sparse DFAs.
/// let sparse_re = Regex::from_dfas(fwd, rev);
///
/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
/// assert_eq!(true, sparse_re.is_match(b"foo123"));
/// # Ok(()) }; example().unwrap()
/// ```
#[cfg(feature = "std")]
#[derive(Clone, Debug)]
pub struct Regex<D: DFA = DenseDFA<Vec<usize>, usize>> {
    /// The DFA responsible for detecting the end of a match.
    forward: D,
    /// The DFA responsible for detecting the start of a match.
    reverse: D,
}
/// A regular expression that uses deterministic finite automata for fast
/// searching.
///
/// A regular expression is comprised of two DFAs, a "forward" DFA and a
/// "reverse" DFA. The forward DFA is responsible for detecting the end of a
/// match while the reverse DFA is responsible for detecting the start of a
/// match. Thus, in order to find the bounds of any given match, a forward
/// search must first be run followed by a reverse search. A match found by
/// the forward DFA guarantees that the reverse DFA will also find a match.
///
/// The type of the DFA used by a `Regex` corresponds to the `D` type
/// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically,
/// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a
/// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but
/// search faster, while sparse DFAs use less memory but search more slowly.
///
/// When using this crate without the standard library, the `Regex` type has
/// no default type parameter.
///
/// # Sparse DFAs
///
/// Since a `Regex` is generic over the `DFA` trait, it can be used with any
/// kind of DFA. While this crate constructs dense DFAs by default, it is easy
/// enough to build corresponding sparse DFAs, and then build a regex from
/// them:
///
/// ```
/// use regex_automata::Regex;
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// // First, build a regex that uses dense DFAs.
/// let dense_re = Regex::new("foo[0-9]+")?;
///
/// // Second, build sparse DFAs from the forward and reverse dense DFAs.
/// let fwd = dense_re.forward().to_sparse()?;
/// let rev = dense_re.reverse().to_sparse()?;
///
/// // Third, build a new regex from the constituent sparse DFAs.
/// let sparse_re = Regex::from_dfas(fwd, rev);
///
/// // A regex that uses sparse DFAs can be used just like with dense DFAs.
/// assert_eq!(true, sparse_re.is_match(b"foo123"));
/// # Ok(()) }; example().unwrap()
/// ```
#[cfg(not(feature = "std"))]
#[derive(Clone, Debug)]
pub struct Regex<D> {
    /// The DFA responsible for detecting the end of a match.
    forward: D,
    /// The DFA responsible for detecting the start of a match.
    reverse: D,
}
#[cfg(feature = "std")]
impl Regex {
    /// Parse the given regular expression using a default configuration and
    /// return the corresponding regex.
    ///
    /// The default configuration uses `usize` for state IDs, premultiplies
    /// them and reduces the alphabet size by splitting bytes into equivalence
    /// classes. The underlying DFAs are *not* minimized.
    ///
    /// If you want a non-default configuration, then use the
    /// [`RegexBuilder`](struct.RegexBuilder.html)
    /// to set your own configuration.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = Regex::new("foo[0-9]+bar")?;
    /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn new(pattern: &str) -> Result<Regex> {
        // Delegate to a builder carrying the default configuration.
        let builder = RegexBuilder::new();
        builder.build(pattern)
    }
}
#[cfg(feature = "std")]
impl Regex<SparseDFA<Vec<u8>, usize>> {
    /// Parse the given regular expression using a default configuration and
    /// return the corresponding regex using sparse DFAs.
    ///
    /// The default configuration uses `usize` for state IDs, reduces the
    /// alphabet size by splitting bytes into equivalence classes. The
    /// underlying DFAs are *not* minimized.
    ///
    /// If you want a non-default configuration, then use the
    /// [`RegexBuilder`](struct.RegexBuilder.html)
    /// to set your own configuration.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = Regex::new_sparse("foo[0-9]+bar")?;
    /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn new_sparse(
        pattern: &str,
    ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
        // Delegate to a builder carrying the default configuration.
        let builder = RegexBuilder::new();
        builder.build_sparse(pattern)
    }
}
impl<D: DFA> Regex<D> {
    /// Returns true if and only if the given bytes match.
    ///
    /// This routine may short circuit if it knows that scanning future input
    /// will never lead to a different result. In particular, if the underlying
    /// DFA enters a match state or a dead state, then this routine will return
    /// `true` or `false`, respectively, without inspecting any future input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = Regex::new("foo[0-9]+bar")?;
    /// assert_eq!(true, re.is_match(b"foo12345bar"));
    /// assert_eq!(false, re.is_match(b"foobar"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn is_match(&self, input: &[u8]) -> bool {
        self.is_match_at(input, 0)
    }

    /// Returns the first position at which a match is found.
    ///
    /// This routine stops scanning input in precisely the same circumstances
    /// as `is_match`. The key difference is that this routine returns the
    /// position at which it stopped scanning input if and only if a match
    /// was found. If no match is found, then `None` is returned.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = Regex::new("foo[0-9]+")?;
    /// assert_eq!(Some(4), re.shortest_match(b"foo12345"));
    ///
    /// // Normally, the end of the leftmost first match here would be 3,
    /// // but the shortest match semantics detect a match earlier.
    /// let re = Regex::new("abc|a")?;
    /// assert_eq!(Some(1), re.shortest_match(b"abc"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn shortest_match(&self, input: &[u8]) -> Option<usize> {
        self.shortest_match_at(input, 0)
    }

    /// Returns the start and end offset of the leftmost first match. If no
    /// match exists, then `None` is returned.
    ///
    /// The "leftmost first" match corresponds to the match with the smallest
    /// starting offset, but where the end offset is determined by preferring
    /// earlier branches in the original regular expression. For example,
    /// `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` will
    /// match `Samwise` in `Samwise`.
    ///
    /// Generally speaking, the "leftmost first" match is how most backtracking
    /// regular expressions tend to work. This is in contrast to POSIX-style
    /// regular expressions that yield "leftmost longest" matches. Namely,
    /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
    /// leftmost longest semantics.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = Regex::new("foo[0-9]+")?;
    /// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz"));
    ///
    /// // Even though a match is found after reading the first byte (`a`),
    /// // the leftmost first match semantics demand that we find the earliest
    /// // match that prefers earlier parts of the pattern over latter parts.
    /// let re = Regex::new("abc|a")?;
    /// assert_eq!(Some((0, 3)), re.find(b"abc"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> {
        self.find_at(input, 0)
    }

    /// Returns the same as `is_match`, but starts the search at the given
    /// offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, if the DFA is anchored, then
    /// a match can only occur when `start == 0`.
    pub fn is_match_at(&self, input: &[u8], start: usize) -> bool {
        self.forward().is_match_at(input, start)
    }

    /// Returns the same as `shortest_match`, but starts the search at the
    /// given offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, if the DFA is anchored, then
    /// a match can only occur when `start == 0`.
    pub fn shortest_match_at(
        &self,
        input: &[u8],
        start: usize,
    ) -> Option<usize> {
        self.forward().shortest_match_at(input, start)
    }

    /// Returns the same as `find`, but starts the search at the given
    /// offset.
    ///
    /// The significance of the starting point is that it takes the surrounding
    /// context into consideration. For example, if the DFA is anchored, then
    /// a match can only occur when `start == 0`.
    pub fn find_at(
        &self,
        input: &[u8],
        start: usize,
    ) -> Option<(usize, usize)> {
        // Run the forward DFA to find the end of the match; `?` propagates
        // the "no match" case instead of matching on the Option by hand.
        let end = self.forward().find_at(input, start)?;
        // The reverse search runs only over the span found by the forward
        // search, and is guaranteed to find the start of that match.
        let start = self
            .reverse()
            .rfind(&input[start..end])
            .map(|i| start + i)
            .expect("reverse search must match if forward search does");
        Some((start, end))
    }

    /// Returns an iterator over all non-overlapping leftmost first matches
    /// in the given bytes. If no match exists, then the iterator yields no
    /// elements.
    ///
    /// Note that if the regex can match the empty string, then it is
    /// possible for the iterator to yield a zero-width match at a location
    /// that is not a valid UTF-8 boundary (for example, between the code units
    /// of a UTF-8 encoded codepoint). This can happen regardless of whether
    /// [`allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8)
    /// was enabled or not.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let re = Regex::new("foo[0-9]+")?;
    /// let text = b"foo1 foo12 foo123";
    /// let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
    /// assert_eq!(matches, vec![(0, 4), (5, 10), (11, 17)]);
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn find_iter<'r, 't>(&'r self, input: &'t [u8]) -> Matches<'r, 't, D> {
        Matches::new(self, input)
    }

    /// Build a new regex from its constituent forward and reverse DFAs.
    ///
    /// This is useful when deserializing a regex from some arbitrary
    /// memory region. This is also useful for building regexes from other
    /// types of DFAs.
    ///
    /// # Example
    ///
    /// This example is a bit a contrived. The usual use of these methods
    /// would involve serializing `initial_re` somewhere and then deserializing
    /// it later to build a regex.
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let initial_re = Regex::new("foo[0-9]+")?;
    /// assert_eq!(true, initial_re.is_match(b"foo123"));
    ///
    /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse());
    /// let re = Regex::from_dfas(fwd, rev);
    /// assert_eq!(true, re.is_match(b"foo123"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    ///
    /// This example shows how you might build smaller DFAs, and then use those
    /// smaller DFAs to build a new regex.
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let initial_re = Regex::new("foo[0-9]+")?;
    /// assert_eq!(true, initial_re.is_match(b"foo123"));
    ///
    /// let fwd = initial_re.forward().to_u16()?;
    /// let rev = initial_re.reverse().to_u16()?;
    /// let re = Regex::from_dfas(fwd, rev);
    /// assert_eq!(true, re.is_match(b"foo123"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    ///
    /// This example shows how to build a `Regex` that uses sparse DFAs instead
    /// of dense DFAs:
    ///
    /// ```
    /// use regex_automata::Regex;
    ///
    /// # fn example() -> Result<(), regex_automata::Error> {
    /// let initial_re = Regex::new("foo[0-9]+")?;
    /// assert_eq!(true, initial_re.is_match(b"foo123"));
    ///
    /// let fwd = initial_re.forward().to_sparse()?;
    /// let rev = initial_re.reverse().to_sparse()?;
    /// let re = Regex::from_dfas(fwd, rev);
    /// assert_eq!(true, re.is_match(b"foo123"));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn from_dfas(forward: D, reverse: D) -> Regex<D> {
        Regex { forward, reverse }
    }

    /// Return the underlying DFA responsible for forward matching.
    pub fn forward(&self) -> &D {
        &self.forward
    }

    /// Return the underlying DFA responsible for reverse matching.
    pub fn reverse(&self) -> &D {
        &self.reverse
    }
}
/// An iterator over all non-overlapping matches for a particular search.
///
/// The iterator yields a `(usize, usize)` value until no more matches could be
/// found. The first `usize` is the start of the match (inclusive) while the
/// second `usize` is the end of the match (exclusive).
///
/// `D` is the type of the underlying DFA used by the regex. The lifetime
/// variables are as follows:
///
/// * `'r` is the lifetime of the regular expression value itself.
/// * `'t` is the lifetime of the text being searched.
#[derive(Clone, Debug)]
pub struct Matches<'r, 't, D: DFA + 'r> {
    /// The regex used to execute each search.
    re: &'r Regex<D>,
    /// The text being searched.
    text: &'t [u8],
    /// The offset at which the next search begins.
    last_end: usize,
    /// The end offset of the most recent match, used to skip an empty match
    /// occurring immediately after a preceding match.
    last_match: Option<usize>,
}
impl<'r, 't, D: DFA> Matches<'r, 't, D> {
    /// Create an iterator over all non-overlapping matches of `re` in `text`,
    /// with the first search starting at offset 0.
    fn new(re: &'r Regex<D>, text: &'t [u8]) -> Matches<'r, 't, D> {
        Matches { re, text, last_end: 0, last_match: None }
    }
}
impl<'r, 't, D: DFA> Iterator for Matches<'r, 't, D> {
    type Item = (usize, usize);
    fn next(&mut self) -> Option<(usize, usize)> {
        // `last_end` can exceed `text.len()` after an empty match at the
        // very end of the text, in which case iteration is finished.
        if self.last_end > self.text.len() {
            return None;
        }
        let (s, e) = match self.re.find_at(self.text, self.last_end) {
            None => return None,
            Some((s, e)) => (s, e),
        };
        if s == e {
            // This is an empty match. To ensure we make progress, start
            // the next search at the smallest possible starting position
            // of the next match following this one.
            self.last_end = e + 1;
            // Don't accept empty matches immediately following a match.
            // Just move on to the next match.
            if Some(e) == self.last_match {
                return self.next();
            }
        } else {
            self.last_end = e;
        }
        self.last_match = Some(e);
        Some((s, e))
    }
}
/// A builder for a regex based on deterministic finite automatons.
///
/// This builder permits configuring several aspects of the construction
/// process such as case insensitivity, Unicode support and various options
/// that impact the size of the underlying DFAs. In some cases, options (like
/// performing DFA minimization) can come with a substantial additional cost.
///
/// This builder generally constructs two DFAs, where one is responsible for
/// finding the end of a match and the other is responsible for finding the
/// start of a match. If you only need to detect whether something matched,
/// or only the end of a match, then you should use a
/// [`dense::Builder`](dense/struct.Builder.html)
/// to construct a single DFA, which is cheaper than building two DFAs.
#[cfg(feature = "std")]
#[derive(Clone, Debug)]
pub struct RegexBuilder {
    /// The dense DFA builder carrying the configuration used to build the
    /// underlying (forward and reverse) DFAs.
    dfa: dense::Builder,
}
#[cfg(feature = "std")]
impl RegexBuilder {
    /// Create a new regex builder with the default configuration.
    pub fn new() -> RegexBuilder {
        RegexBuilder { dfa: dense::Builder::new() }
    }
    /// Build a regex from the given pattern.
    ///
    /// If there was a problem parsing or compiling the pattern, then an error
    /// is returned.
    pub fn build(&self, pattern: &str) -> Result<Regex> {
        self.build_with_size::<usize>(pattern)
    }
    /// Build a regex from the given pattern using sparse DFAs.
    ///
    /// If there was a problem parsing or compiling the pattern, then an error
    /// is returned.
    pub fn build_sparse(
        &self,
        pattern: &str,
    ) -> Result<Regex<SparseDFA<Vec<u8>, usize>>> {
        self.build_with_size_sparse::<usize>(pattern)
    }
    /// Build a regex from the given pattern using a specific representation
    /// for the underlying DFA state IDs.
    ///
    /// If there was a problem parsing or compiling the pattern, then an error
    /// is returned.
    ///
    /// The representation of state IDs is determined by the `S` type
    /// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64`
    /// or `usize`, where `usize` is the default used for `build`. The purpose
    /// of specifying a representation for state IDs is to reduce the memory
    /// footprint of the underlying DFAs.
    ///
    /// When using this routine, the chosen state ID representation will be
    /// used throughout determinization and minimization, if minimization was
    /// requested. Even if the minimized DFAs can fit into the chosen state ID
    /// representation but the initial determinized DFA cannot, then this will
    /// still return an error. To get a minimized DFA with a smaller state ID
    /// representation, first build it with a bigger state ID representation,
    /// and then shrink the sizes of the DFAs using one of its conversion
    /// routines, such as [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16).
    /// Finally, reconstitute the regex via
    /// [`Regex::from_dfa`](struct.Regex.html#method.from_dfa).
    pub fn build_with_size<S: StateID>(
        &self,
        pattern: &str,
    ) -> Result<Regex<DenseDFA<Vec<S>, S>>> {
        let forward = self.dfa.build_with_size(pattern)?;
        // The reverse DFA finds the start of a match; it must be anchored
        // and use leftmost-longest semantics to find the true start.
        let reverse = self
            .dfa
            .clone()
            .anchored(true)
            .reverse(true)
            .longest_match(true)
            .build_with_size(pattern)?;
        Ok(Regex::from_dfas(forward, reverse))
    }
    /// Build a regex from the given pattern using a specific representation
    /// for the underlying DFA state IDs using sparse DFAs.
    ///
    /// If there was a problem parsing or compiling the pattern, then an error
    /// is returned.
    pub fn build_with_size_sparse<S: StateID>(
        &self,
        pattern: &str,
    ) -> Result<Regex<SparseDFA<Vec<u8>, S>>> {
        let re = self.build_with_size(pattern)?;
        let fwd = re.forward().to_sparse()?;
        let rev = re.reverse().to_sparse()?;
        Ok(Regex::from_dfas(fwd, rev))
    }
    /// Set whether matching must be anchored at the beginning of the input.
    ///
    /// When enabled, a match must begin at the start of the input. When
    /// disabled, the regex will act as if the pattern started with a `.*?`,
    /// which enables a match to appear anywhere.
    ///
    /// By default this is disabled.
    pub fn anchored(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.anchored(yes);
        self
    }
    /// Enable or disable the case insensitive flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `i` flag.
    pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.case_insensitive(yes);
        self
    }
    /// Enable verbose mode in the regular expression.
    ///
    /// When enabled, verbose mode permits insignificant whitespace in many
    /// places in the regular expression, as well as comments. Comments are
    /// started using `#` and continue until the end of the line.
    ///
    /// By default, this is disabled. It may be selectively enabled in the
    /// regular expression by using the `x` flag regardless of this setting.
    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.ignore_whitespace(yes);
        self
    }
    /// Enable or disable the "dot matches any character" flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `s` flag.
    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.dot_matches_new_line(yes);
        self
    }
    /// Enable or disable the "swap greed" flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `U` flag.
    pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.swap_greed(yes);
        self
    }
    /// Enable or disable the Unicode flag (`u`) by default.
    ///
    /// By default this is **enabled**. It may alternatively be selectively
    /// disabled in the regular expression itself via the `u` flag.
    ///
    /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
    /// default), a regular expression will fail to parse if Unicode mode is
    /// disabled and a sub-expression could possibly match invalid UTF-8.
    pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.unicode(yes);
        self
    }
    /// When enabled, the builder will permit the construction of a regular
    /// expression that may match invalid UTF-8.
    ///
    /// When disabled (the default), the builder is guaranteed to produce a
    /// regex that will only ever match valid UTF-8 (otherwise, the builder
    /// will return an error).
    pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.allow_invalid_utf8(yes);
        self
    }
    /// Set the nesting limit used for the regular expression parser.
    ///
    /// The nesting limit controls how deep the abstract syntax tree is allowed
    /// to be. If the AST exceeds the given limit (e.g., with too many nested
    /// groups), then an error is returned by the parser.
    ///
    /// The purpose of this limit is to act as a heuristic to prevent stack
    /// overflow when building a finite automaton from a regular expression's
    /// abstract syntax tree. In particular, construction currently uses
    /// recursion. In the future, the implementation may stop using recursion
    /// and this option will no longer be necessary.
    ///
    /// This limit is not checked until the entire AST is parsed. Therefore,
    /// if callers want to put a limit on the amount of heap space used, then
    /// they should impose a limit on the length, in bytes, of the concrete
    /// pattern string. In particular, this is viable since the parser will
    /// limit itself to heap space proportional to the length of the pattern
    /// string.
    ///
    /// Note that a nest limit of `0` will return a nest limit error for most
    /// patterns but not all. For example, a nest limit of `0` permits `a` but
    /// not `ab`, since `ab` requires a concatenation AST item, which results
    /// in a nest depth of `1`. In general, a nest limit is not something that
    /// manifests in an obvious way in the concrete syntax, therefore, it
    /// should not be used in a granular way.
    pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
        self.dfa.nest_limit(limit);
        self
    }
    /// Minimize the underlying DFAs.
    ///
    /// When enabled, the DFAs powering the resulting regex will be minimized
    /// such that it is as small as possible.
    ///
    /// Whether one enables minimization or not depends on the types of costs
    /// you're willing to pay and how much you care about its benefits. In
    /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)`
    /// space, where `n` is the number of DFA states and `k` is the alphabet
    /// size. In practice, minimization can be quite costly in terms of both
    /// space and time, so it should only be done if you're willing to wait
    /// longer to produce a DFA. In general, you might want a minimal DFA in
    /// the following circumstances:
    ///
    /// 1. You would like to optimize for the size of the automaton. This can
    ///    manifest in one of two ways. Firstly, if you're converting the
    ///    DFA into Rust code (or a table embedded in the code), then a minimal
    ///    DFA will translate into a corresponding reduction in code size, and
    ///    thus, also the final compiled binary size. Secondly, if you are
    ///    building many DFAs and putting them on the heap, you'll be able to
    ///    fit more if they are smaller. Note though that building a minimal
    ///    DFA itself requires additional space; you only realize the space
    ///    savings once the minimal DFA is constructed (at which point, the
    ///    space used for minimization is freed).
    /// 2. You've observed that a smaller DFA results in faster match
    ///    performance. Naively, this isn't guaranteed since there is no
    ///    inherent difference between matching with a bigger-than-minimal
    ///    DFA and a minimal DFA. However, a smaller DFA may make use of your
    ///    CPU's cache more efficiently.
    /// 3. You are trying to establish an equivalence between regular
    ///    languages. The standard method for this is to build a minimal DFA
    ///    for each language and then compare them. If the DFAs are equivalent
    ///    (up to state renaming), then the languages are equivalent.
    ///
    /// This option is disabled by default.
    pub fn minimize(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.minimize(yes);
        self
    }
    /// Premultiply state identifiers in the underlying DFA transition tables.
    ///
    /// When enabled, state identifiers are premultiplied to point to their
    /// corresponding row in the DFA's transition table. That is, given the
    /// `i`th state, its corresponding premultiplied identifier is `i * k`
    /// where `k` is the alphabet size of the DFA. (The alphabet size is at
    /// most 256, but is in practice smaller if byte classes are enabled.)
    ///
    /// When state identifiers are not premultiplied, then the identifier of
    /// the `i`th state is `i`.
    ///
    /// The advantage of premultiplying state identifiers is that it saves
    /// a multiplication instruction per byte when searching with the DFA.
    /// This has been observed to lead to a 20% performance benefit in
    /// micro-benchmarks.
    ///
    /// The primary disadvantage of premultiplying state identifiers is
    /// that they require a larger integer size to represent. For example,
    /// if your DFA has 200 states, then its premultiplied form requires
    /// 16 bits to represent every possible state identifier, where as its
    /// non-premultiplied form only requires 8 bits.
    ///
    /// This option is enabled by default.
    pub fn premultiply(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.premultiply(yes);
        self
    }
    /// Shrink the size of the underlying DFA alphabet by mapping bytes to
    /// their equivalence classes.
    ///
    /// When enabled, each DFA will use a map from all possible bytes to their
    /// corresponding equivalence class. Each equivalence class represents a
    /// set of bytes that does not discriminate between a match and a non-match
    /// in the DFA. For example, the pattern `[ab]+` has at least two
    /// equivalence classes: a set containing `a` and `b` and a set containing
    /// every byte except for `a` and `b`. `a` and `b` are in the same
    /// equivalence classes because they never discriminate between a match
    /// and a non-match.
    ///
    /// The advantage of this map is that the size of the transition table can
    /// be reduced drastically from `#states * 256 * sizeof(id)` to
    /// `#states * k * sizeof(id)` where `k` is the number of equivalence
    /// classes. As a result, total space usage can decrease substantially.
    /// Moreover, since a smaller alphabet is used, compilation becomes faster
    /// as well.
    ///
    /// The disadvantage of this map is that every byte searched must be
    /// passed through this map before it can be used to determine the next
    /// transition. This has a small match time performance cost.
    ///
    /// This option is enabled by default.
    pub fn byte_classes(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.byte_classes(yes);
        self
    }
    /// Apply best effort heuristics to shrink the NFA at the expense of more
    /// time/memory.
    ///
    /// This may be exposed in the future, but for now is exported for use in
    /// the `regex-automata-debug` tool.
    #[doc(hidden)]
    pub fn shrink(&mut self, yes: bool) -> &mut RegexBuilder {
        self.dfa.shrink(yes);
        self
    }
}
#[cfg(feature = "std")]
impl Default for RegexBuilder {
    // Equivalent to `RegexBuilder::new`.
    fn default() -> RegexBuilder {
        Self::new()
    }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,60 @@
use std::slice;
/// A sparse set used for representing ordered NFA states.
///
/// This supports constant time addition and membership testing. Clearing an
/// entire set can also be done in constant time. Iteration yields elements
/// in the order in which they were inserted.
///
/// The data structure is based on: https://research.swtch.com/sparse
/// Note though that we don't actually use uninitialized memory. We generally
/// reuse sparse sets, so the initial allocation cost is bearable. However,
/// its other properties listed above are extremely useful.
#[derive(Clone, Debug)]
pub struct SparseSet {
    /// Dense contains the instruction pointers in the order in which they
    /// were inserted.
    dense: Vec<usize>,
    /// Sparse maps instruction pointers to their location in dense.
    ///
    /// An instruction pointer is in the set if and only if
    /// sparse[ip] < dense.len() && ip == dense[sparse[ip]].
    sparse: Box<[usize]>,
}

impl SparseSet {
    /// Create an empty set capable of holding values in `0..size`.
    pub fn new(size: usize) -> SparseSet {
        let dense = Vec::with_capacity(size);
        let sparse = vec![0; size].into_boxed_slice();
        SparseSet { dense, sparse }
    }

    /// Return the number of elements currently in the set.
    pub fn len(&self) -> usize {
        self.dense.len()
    }

    /// Add `value` to the set. `value` must be less than the capacity given
    /// to `new` and must not already be present.
    pub fn insert(&mut self, value: usize) {
        let slot = self.len();
        assert!(slot < self.dense.capacity());
        self.dense.push(value);
        self.sparse[value] = slot;
    }

    /// Report whether `value` is in the set.
    pub fn contains(&self, value: usize) -> bool {
        self.dense.get(self.sparse[value]).map_or(false, |&v| v == value)
    }

    /// Remove every element from the set in constant time.
    pub fn clear(&mut self) {
        self.dense.clear();
    }
}

impl<'a> IntoIterator for &'a SparseSet {
    type Item = &'a usize;
    type IntoIter = slice::Iter<'a, usize>;

    // Iterate in insertion order.
    fn into_iter(self) -> Self::IntoIter {
        self.dense.iter()
    }
}

View file

@ -0,0 +1,291 @@
use core::fmt::Debug;
use core::hash::Hash;
use core::mem::size_of;
use byteorder::{ByteOrder, NativeEndian};
#[cfg(feature = "std")]
pub use self::std::*;
#[cfg(feature = "std")]
mod std {
    use byteorder::ByteOrder;
    use core::mem::size_of;

    use error::{Error, Result};

    use super::StateID;

    /// Check that the premultiplication of the given state identifier can
    /// fit into the representation indicated by `S`. If it cannot, or if it
    /// overflows `usize` itself, then an error is returned.
    pub fn premultiply_overflow_error<S: StateID>(
        last_state: S,
        alphabet_len: usize,
    ) -> Result<()> {
        let requested = last_state
            .to_usize()
            .checked_mul(alphabet_len)
            .ok_or_else(|| Error::premultiply_overflow(0, 0))?;
        if requested > S::max_id() {
            Err(Error::premultiply_overflow(S::max_id(), requested))
        } else {
            Ok(())
        }
    }

    /// Allocate the next sequential identifier for a fresh state given
    /// the previously constructed state identified by `current`. If the
    /// next sequential identifier would overflow `usize` or the chosen
    /// representation indicated by `S`, then an error is returned.
    pub fn next_state_id<S: StateID>(current: S) -> Result<S> {
        let next = current
            .to_usize()
            .checked_add(1)
            .ok_or_else(|| Error::state_id_overflow(::std::usize::MAX))?;
        if next > S::max_id() {
            Err(Error::state_id_overflow(S::max_id()))
        } else {
            Ok(S::from_usize(next))
        }
    }

    /// Convert the given `usize` to the chosen state identifier
    /// representation. If the given value cannot fit in the chosen
    /// representation, then an error is returned.
    pub fn usize_to_state_id<S: StateID>(value: usize) -> Result<S> {
        if value <= S::max_id() {
            Ok(S::from_usize(value))
        } else {
            Err(Error::state_id_overflow(S::max_id()))
        }
    }

    /// Write the given identifier to the given slice of bytes using the
    /// specified endianness. The given slice must have length at least
    /// `size_of::<S>()`.
    ///
    /// The given state identifier representation must have size 1, 2, 4 or 8.
    pub fn write_state_id_bytes<E: ByteOrder, S: StateID>(
        slice: &mut [u8],
        id: S,
    ) {
        assert!(
            1 == size_of::<S>()
                || 2 == size_of::<S>()
                || 4 == size_of::<S>()
                || 8 == size_of::<S>()
        );

        match size_of::<S>() {
            1 => slice[0] = id.to_usize() as u8,
            2 => E::write_u16(slice, id.to_usize() as u16),
            4 => E::write_u32(slice, id.to_usize() as u32),
            8 => E::write_u64(slice, id.to_usize() as u64),
            _ => unreachable!(),
        }
    }
}
/// Return the unique identifier for a DFA's dead state in the chosen
/// representation indicated by `S`.
///
/// The dead state is always identifier `0` in every representation.
pub fn dead_id<S: StateID>() -> S {
    S::from_usize(0)
}
/// A trait describing the representation of a DFA's state identifier.
///
/// The purpose of this trait is to safely express both the possible state
/// identifier representations that can be used in a DFA and to convert between
/// state identifier representations and types that can be used to efficiently
/// index memory (such as `usize`).
///
/// In general, one should not need to implement this trait explicitly. In
/// particular, this crate provides implementations for `u8`, `u16`, `u32`,
/// `u64` and `usize`. (`u32` and `u64` are only provided for targets that can
/// represent all corresponding values in a `usize`.)
///
/// # Safety
///
/// This trait is unsafe because the correctness of its implementations may be
/// relied upon by other unsafe code. For example, one possible way to
/// implement this trait incorrectly would be to return a maximum identifier
/// in `max_id` that is greater than the real maximum identifier. This will
/// likely result in wrap-on-overflow semantics in release mode, which can in
/// turn produce incorrect state identifiers. Those state identifiers may then
/// in turn access out-of-bounds memory in a DFA's search routine, where bounds
/// checks are explicitly elided for performance reasons.
pub unsafe trait StateID:
    Clone + Copy + Debug + Eq + Hash + PartialEq + PartialOrd + Ord
{
    /// Convert from a `usize` to this implementation's representation.
    ///
    /// Implementors may assume that `n <= Self::max_id`. That is, implementors
    /// do not need to check whether `n` can fit inside this implementation's
    /// representation.
    fn from_usize(n: usize) -> Self;
    /// Convert this implementation's representation to a `usize`.
    ///
    /// Implementors must not return a `usize` value greater than
    /// `Self::max_id` and must not permit overflow when converting between the
    /// implementor's representation and `usize`. In general, the preferred
    /// way for implementors to achieve this is to simply not provide
    /// implementations of `StateID` that cannot fit into the target platform's
    /// `usize`.
    fn to_usize(self) -> usize;
    /// Return the maximum state identifier supported by this representation.
    ///
    /// Implementors must return a correct bound. Doing otherwise may result
    /// in memory unsafety.
    fn max_id() -> usize;
    /// Read a single state identifier from the given slice of bytes in native
    /// endian format.
    ///
    /// Implementors may assume that the given slice has length at least
    /// `size_of::<Self>()`.
    fn read_bytes(slice: &[u8]) -> Self;
    /// Write this state identifier to the given slice of bytes in native
    /// endian format.
    ///
    /// Implementors may assume that the given slice has length at least
    /// `size_of::<Self>()`.
    fn write_bytes(self, slice: &mut [u8]);
}
// State IDs represented by `usize` (the default). Serialization uses
// `size_of::<usize>()` bytes in native endian format.
unsafe impl StateID for usize {
    #[inline]
    fn from_usize(n: usize) -> usize {
        n
    }
    #[inline]
    fn to_usize(self) -> usize {
        self
    }
    #[inline]
    fn max_id() -> usize {
        ::core::usize::MAX
    }
    #[inline]
    fn read_bytes(slice: &[u8]) -> Self {
        NativeEndian::read_uint(slice, size_of::<usize>()) as usize
    }
    #[inline]
    fn write_bytes(self, slice: &mut [u8]) {
        NativeEndian::write_uint(slice, self as u64, size_of::<usize>())
    }
}
// State IDs represented by a single byte.
unsafe impl StateID for u8 {
    #[inline]
    fn from_usize(n: usize) -> u8 {
        n as u8
    }
    #[inline]
    fn to_usize(self) -> usize {
        self as usize
    }
    #[inline]
    fn max_id() -> usize {
        ::core::u8::MAX as usize
    }
    #[inline]
    fn read_bytes(slice: &[u8]) -> Self {
        slice[0]
    }
    #[inline]
    fn write_bytes(self, slice: &mut [u8]) {
        slice[0] = self;
    }
}
// State IDs represented by two bytes in native endian format.
unsafe impl StateID for u16 {
    #[inline]
    fn from_usize(n: usize) -> u16 {
        n as u16
    }
    #[inline]
    fn to_usize(self) -> usize {
        self as usize
    }
    #[inline]
    fn max_id() -> usize {
        ::core::u16::MAX as usize
    }
    #[inline]
    fn read_bytes(slice: &[u8]) -> Self {
        NativeEndian::read_u16(slice)
    }
    #[inline]
    fn write_bytes(self, slice: &mut [u8]) {
        NativeEndian::write_u16(slice, self)
    }
}
// State IDs represented by four bytes in native endian format. Only
// provided on targets where every `u32` fits in a `usize`.
#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
unsafe impl StateID for u32 {
    #[inline]
    fn from_usize(n: usize) -> u32 {
        n as u32
    }
    #[inline]
    fn to_usize(self) -> usize {
        self as usize
    }
    #[inline]
    fn max_id() -> usize {
        ::core::u32::MAX as usize
    }
    #[inline]
    fn read_bytes(slice: &[u8]) -> Self {
        NativeEndian::read_u32(slice)
    }
    #[inline]
    fn write_bytes(self, slice: &mut [u8]) {
        NativeEndian::write_u32(slice, self)
    }
}
// State IDs represented by eight bytes in native endian format. Only
// provided on targets where every `u64` fits in a `usize`.
#[cfg(target_pointer_width = "64")]
unsafe impl StateID for u64 {
    #[inline]
    fn from_usize(n: usize) -> u64 {
        n as u64
    }
    #[inline]
    fn to_usize(self) -> usize {
        self as usize
    }
    #[inline]
    fn max_id() -> usize {
        ::core::u64::MAX as usize
    }
    #[inline]
    fn read_bytes(slice: &[u8]) -> Self {
        NativeEndian::read_u64(slice)
    }
    #[inline]
    fn write_bytes(self, slice: &mut [u8]) {
        NativeEndian::write_u64(slice, self)
    }
}

View file

@ -0,0 +1,107 @@
use fst::Automaton;
use crate::{StateID, DFA};
// Generates an `fst::Automaton` impl for the given DFA type, so that any of
// this crate's DFAs can drive an `fst` set/map search directly.
//
// `$ty` is the DFA type and `$id` is the element type of its storage's
// `AsRef` bound: the state ID type `S` for dense DFAs, and raw `u8` bytes
// for sparse DFAs.
macro_rules! imp {
    ($ty:ty, $id:ty) => {
        impl<T: AsRef<[$id]>, S: StateID> Automaton for $ty {
            type State = S;
            #[inline]
            fn start(&self) -> S {
                self.start_state()
            }
            #[inline]
            fn is_match(&self, state: &S) -> bool {
                self.is_match_state(*state)
            }
            #[inline]
            fn accept(&self, state: &S, byte: u8) -> S {
                self.next_state(*state, byte)
            }
            #[inline]
            fn can_match(&self, state: &S) -> bool {
                // A dead state can never lead to a match, so the stream
                // search can prune this branch entirely.
                !self.is_dead_state(*state)
            }
        }
    };
}
// Dense DFA storage is a slice of state IDs.
imp!(crate::dense::DenseDFA<T, S>, S);
imp!(crate::dense::Standard<T, S>, S);
imp!(crate::dense::ByteClass<T, S>, S);
imp!(crate::dense::Premultiplied<T, S>, S);
imp!(crate::dense::PremultipliedByteClass<T, S>, S);
// Sparse DFA storage is a slice of raw bytes.
imp!(crate::sparse::SparseDFA<T, S>, u8);
imp!(crate::sparse::Standard<T, S>, u8);
imp!(crate::sparse::ByteClass<T, S>, u8);
#[cfg(test)]
mod tests {
    use bstr::BString;
    use fst::{Automaton, IntoStreamer, Set, Streamer};

    use crate::dense::{self, DenseDFA};
    use crate::sparse::SparseDFA;

    /// Drive `aut` over every key in `set`, returning the accepted keys in
    /// the order the stream yields them.
    fn search<A: Automaton, D: AsRef<[u8]>>(
        set: &Set<D>,
        aut: A,
    ) -> Vec<BString> {
        let mut found = vec![];
        let mut stream = set.search(aut).into_stream();
        while let Some(key) = stream.next() {
            found.push(BString::from(key));
        }
        found
    }

    #[test]
    fn dense_anywhere() {
        let dfa = DenseDFA::new("ba.*").unwrap();
        let set =
            Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
                .unwrap();
        let found = search(&set, &dfa);
        assert_eq!(found, vec!["bar", "baz", "xba", "xbax"]);
    }

    #[test]
    fn dense_anchored() {
        let dfa = dense::Builder::new().anchored(true).build("ba.*").unwrap();
        let set =
            Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
                .unwrap();
        let found = search(&set, &dfa);
        assert_eq!(found, vec!["bar", "baz"]);
    }

    #[test]
    fn sparse_anywhere() {
        let dfa = SparseDFA::new("ba.*").unwrap();
        let set =
            Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
                .unwrap();
        let found = search(&set, &dfa);
        assert_eq!(found, vec!["bar", "baz", "xba", "xbax"]);
    }

    #[test]
    fn sparse_anchored() {
        let dfa = dense::Builder::new()
            .anchored(true)
            .build("ba.*")
            .unwrap()
            .to_sparse()
            .unwrap();
        let set =
            Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
                .unwrap();
        let found = search(&set, &dfa);
        assert_eq!(found, vec!["bar", "baz"]);
    }
}

View file

@ -0,0 +1,461 @@
use std::collections::BTreeMap;
use std::env;
use std::fmt::{self, Write};
use std::thread;
use regex;
use regex_automata::{DenseDFA, ErrorKind, Regex, RegexBuilder, StateID, DFA};
use serde_bytes;
use toml;
// Loads one TOML test file into the given collection. The file's bytes are
// embedded into the test binary at compile time; `$path` is relative to the
// crate's `data/tests` directory.
macro_rules! load {
    ($col:ident, $path:expr) => {
        $col.extend(RegexTests::load(
            concat!("../data/tests/", $path),
            include_bytes!(concat!("../data/tests/", $path)),
        ));
    };
}
lazy_static! {
    /// The complete regex test suite, loaded from the embedded TOML files
    /// on first access.
    pub static ref SUITE: RegexTestCollection = {
        let mut col = RegexTestCollection::new();
        load!(col, "fowler/basic.toml");
        load!(col, "fowler/nullsubexpr.toml");
        load!(col, "fowler/repetition.toml");
        load!(col, "fowler/repetition-long.toml");
        load!(col, "crazy.toml");
        load!(col, "flags.toml");
        load!(col, "iter.toml");
        load!(col, "no-unicode.toml");
        load!(col, "unicode.toml");
        col
    };
}
/// A collection of regex tests, keyed by their (unique) names.
#[derive(Clone, Debug)]
pub struct RegexTestCollection {
    pub by_name: BTreeMap<String, RegexTest>,
}
/// The deserialized form of a single TOML test file: a flat list of tests.
#[derive(Clone, Debug, Deserialize)]
pub struct RegexTests {
    pub tests: Vec<RegexTest>,
}
/// A single regex test case: a pattern, an input and the expected matches.
#[derive(Clone, Debug, Deserialize)]
pub struct RegexTest {
    /// A unique name for this test.
    pub name: String,
    /// Options applied before compiling `pattern` (see `RegexTestOption`).
    #[serde(default)]
    pub options: Vec<RegexTestOption>,
    /// The regex pattern to compile.
    pub pattern: String,
    /// The haystack to search; raw bytes, not necessarily valid UTF-8.
    #[serde(with = "serde_bytes")]
    pub input: Vec<u8>,
    /// Every expected non-overlapping match, in order.
    #[serde(rename = "matches")]
    pub matches: Vec<Match>,
    /// Expected capture groups, if any.
    #[serde(default)]
    pub captures: Vec<Option<Match>>,
    /// For tests converted from the fowler suite, the original line number.
    #[serde(default)]
    pub fowler_line_number: Option<u64>,
}
/// Per-test tweaks declared in the TOML files, mapped onto builder settings
/// (or, for `Escaped`, onto a preprocessing step at load time).
#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
#[serde(rename_all = "kebab-case")]
pub enum RegexTestOption {
    Anchored,
    CaseInsensitive,
    NoUnicode,
    /// The test's input contains escape sequences that must be unescaped
    /// before searching.
    Escaped,
    #[serde(rename = "invalid-utf8")]
    InvalidUTF8,
}
/// A match position: start offset (inclusive) and end offset (exclusive).
#[derive(Clone, Copy, Deserialize, Eq, PartialEq)]
pub struct Match {
    pub start: usize,
    pub end: usize,
}
impl RegexTestCollection {
    /// Create an empty collection.
    fn new() -> RegexTestCollection {
        RegexTestCollection { by_name: BTreeMap::new() }
    }

    /// Add every test in `tests` to the collection, panicking if any test
    /// name is already present.
    fn extend(&mut self, tests: RegexTests) {
        for test in tests.tests {
            let name = test.name.clone();
            assert!(
                !self.by_name.contains_key(&name),
                "found duplicate test {}",
                name
            );
            self.by_name.insert(name, test);
        }
    }

    /// Return all tests, ordered by name.
    pub fn tests(&self) -> Vec<&RegexTest> {
        self.by_name.values().collect()
    }
}
impl RegexTests {
    /// Deserialize the TOML in `slice` (embedded from `path`, which is used
    /// only for error messages), applying input unescaping to any test that
    /// declares the `escaped` option.
    fn load(path: &str, slice: &[u8]) -> RegexTests {
        let mut data: RegexTests = toml::from_slice(slice)
            .expect(&format!("failed to load {}", path));
        for test in &mut data.tests {
            if test.options.contains(&RegexTestOption::Escaped) {
                test.input = unescape_bytes(&test.input);
            }
        }
        data
    }
}
/// A harness for running the test suite against a regex engine, recording
/// successes and failures instead of panicking on the first mismatch.
#[derive(Debug)]
pub struct RegexTester {
    // Whether `assert` has been called since the last test ran; enforced
    // by the Drop impl so a test cannot forget to check its results.
    asserted: bool,
    // Accumulated pass/fail results.
    results: RegexTestResults,
    // When true, known-expensive tests (e.g., repetition-long) are skipped.
    skip_expensive: bool,
    // If non-empty, only tests whose name matches one of these run.
    whitelist: Vec<regex::Regex>,
    // Tests whose name matches one of these never run.
    blacklist: Vec<regex::Regex>,
}
// Guards against tests that run the suite but forget to check the results.
impl Drop for RegexTester {
    fn drop(&mut self) {
        // If we haven't asserted yet, then the test is probably buggy, so
        // fail it. But if we're already panicking (e.g., a bug in the regex
        // engine), then don't double-panic, which causes an immediate abort.
        if !thread::panicking() && !self.asserted {
            panic!("must call RegexTester::assert at end of test");
        }
    }
}
impl RegexTester {
    /// Create a new tester with its white/blacklists seeded from the
    /// `REGEX_TEST` environment variable.
    ///
    /// `REGEX_TEST` is a comma separated list of test-name regexes; an
    /// entry with a leading `-` becomes a blacklist entry instead.
    pub fn new() -> RegexTester {
        let mut tester = RegexTester {
            asserted: false,
            results: RegexTestResults::default(),
            skip_expensive: false,
            whitelist: vec![],
            blacklist: vec![],
        };
        // `unwrap_or_default` avoids eagerly allocating an empty String
        // even when the variable is set (unlike `unwrap_or("".to_string())`).
        for x in env::var("REGEX_TEST").unwrap_or_default().split(',') {
            let x = x.trim();
            if x.is_empty() {
                continue;
            }
            if x.starts_with('-') {
                tester = tester.blacklist(&x[1..]);
            } else {
                tester = tester.whitelist(x);
            }
        }
        tester
    }
    /// Skip tests known to be expensive (the `repetition-long` group).
    pub fn skip_expensive(mut self) -> RegexTester {
        self.skip_expensive = true;
        self
    }
    /// Only run tests whose name matches the given regex.
    pub fn whitelist(mut self, name: &str) -> RegexTester {
        self.whitelist.push(regex::Regex::new(name).unwrap());
        self
    }
    /// Never run tests whose name matches the given regex.
    pub fn blacklist(mut self, name: &str) -> RegexTester {
        self.blacklist.push(regex::Regex::new(name).unwrap());
        self
    }
    /// Assert that every test that ran passed. Must be called before the
    /// tester is dropped (enforced by the `Drop` impl).
    pub fn assert(&mut self) {
        self.asserted = true;
        self.results.assert();
    }
    /// Compile the given test's pattern with `builder`.
    ///
    /// Returns `None` if the test is skipped (by the white/blacklists or
    /// the expensive-test setting) or if the pattern uses a feature this
    /// crate does not support. Panics on any other compilation error.
    pub fn build_regex<S: StateID>(
        &self,
        mut builder: RegexBuilder,
        test: &RegexTest,
    ) -> Option<Regex<DenseDFA<Vec<S>, S>>> {
        if self.skip(test) {
            return None;
        }
        self.apply_options(test, &mut builder);
        match builder.build_with_size::<S>(&test.pattern) {
            Ok(re) => Some(re),
            Err(err) => {
                if let ErrorKind::Unsupported(_) = *err.kind() {
                    None
                } else {
                    panic!(
                        "failed to build {:?} with pattern '{:?}': {}",
                        test.name, test.pattern, err
                    );
                }
            }
        }
    }
    /// Run every one of the given tests against a regex compiled with a
    /// fresh clone of `builder`.
    pub fn test_all<'a, I, T>(&mut self, builder: RegexBuilder, tests: I)
    where
        I: IntoIterator<IntoIter = T, Item = &'a RegexTest>,
        T: Iterator<Item = &'a RegexTest>,
    {
        for test in tests {
            let builder = builder.clone();
            let re: Regex = match self.build_regex(builder, test) {
                None => continue,
                Some(re) => re,
            };
            self.test(test, &re);
        }
    }
    /// Run a single test against the given regex, recording the result.
    pub fn test<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
        self.test_is_match(test, re);
        self.test_find(test, re);
        // Some tests (namely, fowler) are designed only to detect the
        // first match even if there are more subsequent matches. To that
        // end, we only test match iteration when the number of matches
        // expected is not 1, or if the test name has 'iter' in it.
        if test.name.contains("iter") || test.matches.len() != 1 {
            self.test_find_iter(test, re);
        }
    }
    /// Check that `is_match` agrees with whether the test expects at least
    /// one match.
    pub fn test_is_match<'a, D: DFA>(
        &mut self,
        test: &RegexTest,
        re: &Regex<D>,
    ) {
        self.asserted = false;
        let got = re.is_match(&test.input);
        let expected = !test.matches.is_empty();
        if got == expected {
            self.results.succeeded.push(test.clone());
            return;
        }
        self.results.failed.push(RegexTestFailure {
            test: test.clone(),
            kind: RegexTestFailureKind::IsMatch,
        });
    }
    /// Check that `find` returns exactly the first expected match (if any).
    pub fn test_find<'a, D: DFA>(&mut self, test: &RegexTest, re: &Regex<D>) {
        self.asserted = false;
        let got =
            re.find(&test.input).map(|(start, end)| Match { start, end });
        if got == test.matches.get(0).cloned() {
            self.results.succeeded.push(test.clone());
            return;
        }
        self.results.failed.push(RegexTestFailure {
            test: test.clone(),
            kind: RegexTestFailureKind::Find { got },
        });
    }
    /// Check that iterating yields exactly the expected sequence of matches.
    pub fn test_find_iter<'a, D: DFA>(
        &mut self,
        test: &RegexTest,
        re: &Regex<D>,
    ) {
        self.asserted = false;
        let got: Vec<Match> = re
            .find_iter(&test.input)
            .map(|(start, end)| Match { start, end })
            .collect();
        if got == test.matches {
            self.results.succeeded.push(test.clone());
            return;
        }
        self.results.failed.push(RegexTestFailure {
            test: test.clone(),
            kind: RegexTestFailureKind::FindIter { got },
        });
    }
    /// Report whether `test` should be skipped, according to the
    /// expensive-test setting and the white/blacklists.
    fn skip(&self, test: &RegexTest) -> bool {
        if self.skip_expensive && test.name.starts_with("repetition-long") {
            return true;
        }
        // `any` on an empty blacklist is trivially false, so no emptiness
        // check is needed here.
        if self.blacklist.iter().any(|re| re.is_match(&test.name)) {
            return true;
        }
        if !self.whitelist.is_empty()
            && !self.whitelist.iter().any(|re| re.is_match(&test.name))
        {
            return true;
        }
        false
    }
    /// Translate a test's declared options into builder settings.
    fn apply_options(&self, test: &RegexTest, builder: &mut RegexBuilder) {
        for opt in &test.options {
            match *opt {
                RegexTestOption::Anchored => {
                    builder.anchored(true);
                }
                RegexTestOption::CaseInsensitive => {
                    builder.case_insensitive(true);
                }
                RegexTestOption::NoUnicode => {
                    builder.unicode(false);
                }
                // Escaping is applied when the test data is loaded.
                RegexTestOption::Escaped => {}
                RegexTestOption::InvalidUTF8 => {
                    builder.allow_invalid_utf8(true);
                }
            }
        }
    }
}
/// The accumulated outcome of running some subset of the test suite.
#[derive(Clone, Debug, Default)]
pub struct RegexTestResults {
    /// Tests that succeeded.
    pub succeeded: Vec<RegexTest>,
    /// Failed tests, indexed by group name.
    pub failed: Vec<RegexTestFailure>,
}
/// A single failed test together with what kind of check failed.
#[derive(Clone, Debug)]
pub struct RegexTestFailure {
    test: RegexTest,
    kind: RegexTestFailureKind,
}
/// Which of the three checks (is_match / find / find_iter) failed, along
/// with what the regex engine actually produced.
#[derive(Clone, Debug)]
pub enum RegexTestFailureKind {
    IsMatch,
    Find { got: Option<Match> },
    FindIter { got: Vec<Match> },
}
impl RegexTestResults {
    /// Panic with a formatted report if any test failed; do nothing when
    /// every test passed.
    pub fn assert(&self) {
        if self.failed.is_empty() {
            return;
        }
        let mut reports = Vec::with_capacity(self.failed.len());
        for failure in &self.failed {
            reports.push(failure.to_string());
        }
        let failures = reports.join("\n\n");
        panic!(
            "found {} failures:\n{}\n{}\n{}\n\n\
             Set the REGEX_TEST environment variable to filter tests, \n\
             e.g., REGEX_TEST=crazy-misc,-crazy-misc2 runs every test \n\
             whose name contains crazy-misc but not crazy-misc2\n\n",
            self.failed.len(),
            "~".repeat(79),
            failures.trim(),
            "~".repeat(79)
        )
    }
}
// Renders a failure report showing the test's configuration and its input
// in three forms (raw, escaped, hex) so binary inputs are legible.
impl fmt::Display for RegexTestFailure {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "{}: {}\n    \
             options: {:?}\n    \
             pattern: {}\n    \
             pattern (escape): {}\n    \
             input: {}\n    \
             input (escape): {}\n    \
             input (hex): {}",
            self.test.name,
            self.kind.fmt(&self.test)?,
            self.test.options,
            self.test.pattern,
            escape_default(&self.test.pattern),
            nice_raw_bytes(&self.test.input),
            escape_bytes(&self.test.input),
            hex_bytes(&self.test.input)
        )
    }
}
impl RegexTestFailureKind {
    /// Render a human readable expected-vs-got description for this failure,
    /// using `test` for the expected values. (Note: this is an inherent
    /// method returning `String`, not the `fmt::Display` trait method.)
    fn fmt(&self, test: &RegexTest) -> Result<String, fmt::Error> {
        let mut buf = String::new();
        match *self {
            RegexTestFailureKind::IsMatch => {
                if let Some(&m) = test.matches.get(0) {
                    write!(buf, "expected match (at {}), but none found", m)?
                } else {
                    write!(buf, "expected no match, but found a match")?
                }
            }
            RegexTestFailureKind::Find { got } => write!(
                buf,
                "expected {:?}, but found {:?}",
                test.matches.get(0),
                got
            )?,
            RegexTestFailureKind::FindIter { ref got } => write!(
                buf,
                "expected {:?}, but found {:?}",
                test.matches, got
            )?,
        }
        Ok(buf)
    }
}
impl fmt::Display for Match {
    /// Renders a match as its `(start, end)` offset pair.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "({}, {})", self.start, self.end)
    }
}
impl fmt::Debug for Match {
    /// Debug output is intentionally identical to `Display`; delegate so
    /// the two representations can never drift apart.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        fmt::Display::fmt(self, f)
    }
}
/// Shows `bytes` as plain text when they are valid UTF-8; otherwise
/// falls back to a fully escaped rendering of the raw bytes.
fn nice_raw_bytes(bytes: &[u8]) -> String {
    use std::str;
    str::from_utf8(bytes)
        .map(|s| s.to_string())
        .unwrap_or_else(|_| escape_bytes(bytes))
}
/// Renders each byte with Rust's ASCII escaping rules (e.g. `b'\n'`
/// becomes `\n` and `0xFF` becomes `\xff`) and concatenates the results.
fn escape_bytes(bytes: &[u8]) -> String {
    use std::ascii;
    let mut out = String::with_capacity(bytes.len());
    for &b in bytes {
        for esc in ascii::escape_default(b) {
            // escape_default only yields ASCII bytes, so the cast is
            // lossless.
            out.push(esc as char);
        }
    }
    out
}
/// Produces an uppercase hex dump: one `\xNN` escape per input byte.
fn hex_bytes(bytes: &[u8]) -> String {
    // Each byte expands to exactly four characters.
    let mut out = String::with_capacity(4 * bytes.len());
    for &b in bytes {
        out.push_str(&format!(r"\x{:02X}", b));
    }
    out
}
/// Escapes every char of `s` exactly as `char::escape_default` does
/// (quotes, backslashes, control characters, etc.), concatenated.
fn escape_default(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for c in s.chars() {
        out.extend(c.escape_default());
    }
    out
}
/// Decodes an escaped byte string: the escaped form must itself be valid
/// UTF-8, which is then expanded into raw bytes via `unescape`.
fn unescape_bytes(bytes: &[u8]) -> Vec<u8> {
    use std::str;
    use unescape::unescape;
    let escaped = str::from_utf8(bytes).expect("all input must be valid UTF-8");
    unescape(escaped)
}

View file

@ -0,0 +1,42 @@
use regex_automata::{dense, DFA};
// A regression test for checking that minimization correctly translates
// whether a state is a match state or not. Previously, it was possible for
// minimization to mark a non-matching state as matching.
#[test]
fn minimize_sets_correct_match_states() {
    let pattern =
        // This is a subset of the grapheme matching regex. I couldn't seem
        // to get a repro any smaller than this unfortunately.
        r"(?x)
        (?:
            \p{gcb=Prepend}*
            (?:
                (?:
                    (?:
                        \p{gcb=L}*
                        (?:\p{gcb=V}+|\p{gcb=LV}\p{gcb=V}*|\p{gcb=LVT})
                        \p{gcb=T}*
                    )
                    |
                    \p{gcb=L}+
                    |
                    \p{gcb=T}+
                )
                |
                \p{Extended_Pictographic}
                (?:\p{gcb=Extend}*\p{gcb=ZWJ}\p{Extended_Pictographic})*
                |
                [^\p{gcb=Control}\p{gcb=CR}\p{gcb=LF}]
            )
            [\p{gcb=Extend}\p{gcb=ZWJ}\p{gcb=SpacingMark}]*
        )
        ";
    // Build with minimization enabled, which is the code path under test.
    let dfa = dense::Builder::new()
        .minimize(true)
        .anchored(true)
        .build(pattern)
        .unwrap();
    // `\xE2` alone is not valid UTF-8 (it is only a leading byte), so a
    // correctly minimized DFA must report no match here rather than
    // reaching a bogus match state.
    assert_eq!(None, dfa.find(b"\xE2"));
}

View file

@ -0,0 +1,250 @@
use regex_automata::{DenseDFA, Regex, RegexBuilder, SparseDFA};
use collection::{RegexTester, SUITE};
#[test]
fn unminimized_standard() {
    // Baseline configuration: no minimization, no premultiplication and
    // no byte classes.
    let mut tester = RegexTester::new().skip_expensive();
    let mut builder = RegexBuilder::new();
    builder
        .minimize(false)
        .premultiply(false)
        .byte_classes(false);
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
#[test]
fn unminimized_premultiply() {
    // Like the baseline, but with premultiplied state identifiers.
    let mut tester = RegexTester::new().skip_expensive();
    let mut builder = RegexBuilder::new();
    builder
        .minimize(false)
        .premultiply(true)
        .byte_classes(false);
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
#[test]
fn unminimized_byte_class() {
    // Unminimized DFAs with byte class compression enabled.
    let mut tester = RegexTester::new();
    let mut builder = RegexBuilder::new();
    builder
        .minimize(false)
        .premultiply(false)
        .byte_classes(true);
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
#[test]
fn unminimized_premultiply_byte_class() {
    // Unminimized DFAs with both premultiplication and byte classes.
    let mut tester = RegexTester::new();
    let mut builder = RegexBuilder::new();
    builder
        .minimize(false)
        .premultiply(true)
        .byte_classes(true);
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
#[test]
fn unminimized_standard_no_nfa_shrink() {
    // Baseline configuration with NFA shrinking disabled as well.
    let mut tester = RegexTester::new().skip_expensive();
    let mut builder = RegexBuilder::new();
    builder.minimize(false).premultiply(false).byte_classes(false).shrink(false);
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
#[test]
fn minimized_standard() {
    // Minimized DFAs with no premultiplication and no byte classes.
    let mut tester = RegexTester::new().skip_expensive();
    let mut builder = RegexBuilder::new();
    builder
        .minimize(true)
        .premultiply(false)
        .byte_classes(false);
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
#[test]
fn minimized_premultiply() {
    // Minimized DFAs with premultiplied state identifiers.
    let mut tester = RegexTester::new().skip_expensive();
    let mut builder = RegexBuilder::new();
    builder
        .minimize(true)
        .premultiply(true)
        .byte_classes(false);
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
#[test]
fn minimized_byte_class() {
    // Minimized DFAs with byte class compression enabled.
    let mut tester = RegexTester::new();
    let mut builder = RegexBuilder::new();
    builder
        .minimize(true)
        .premultiply(false)
        .byte_classes(true);
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
#[test]
fn minimized_premultiply_byte_class() {
    // Minimized DFAs with both premultiplication and byte classes.
    let mut tester = RegexTester::new();
    let mut builder = RegexBuilder::new();
    builder
        .minimize(true)
        .premultiply(true)
        .byte_classes(true);
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
#[test]
fn minimized_standard_no_nfa_shrink() {
    // Minimized DFAs with NFA shrinking disabled.
    let mut tester = RegexTester::new().skip_expensive();
    let mut builder = RegexBuilder::new();
    builder.minimize(true).premultiply(false).byte_classes(false).shrink(false);
    tester.test_all(builder, SUITE.tests());
    tester.assert();
}
// A basic sanity test that checks we can convert a regex to a smaller
// representation and that the resulting regex still passes our tests.
//
// If tests grow minimal regexes that cannot be represented in 16 bits, then
// we'll either want to skip those or increase the size to test to u32.
#[test]
fn u16() {
    let mut builder = RegexBuilder::new();
    builder.minimize(true).premultiply(false).byte_classes(true);
    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let builder = builder.clone();
        // Tests the tester declines to build for this configuration are
        // skipped (build_regex returns None for those).
        let re: Regex = match tester.build_regex(builder, test) {
            None => continue,
            Some(re) => re,
        };
        // Shrink the state ID representation of both underlying DFAs to
        // u16 and re-run the test against the converted regex.
        let small_re = Regex::from_dfas(
            re.forward().to_u16().unwrap(),
            re.reverse().to_u16().unwrap(),
        );
        tester.test(test, &small_re);
    }
    tester.assert();
}
// Test that sparse DFAs work using the standard configuration.
#[test]
fn sparse_unminimized_standard() {
    let mut builder = RegexBuilder::new();
    builder.minimize(false).premultiply(false).byte_classes(false);
    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let builder = builder.clone();
        // Tests the tester declines to build for this configuration are
        // skipped.
        let re: Regex = match tester.build_regex(builder, test) {
            None => continue,
            Some(re) => re,
        };
        // Convert both underlying DFAs to their sparse representation
        // and run the test against the resulting sparse regex.
        let fwd = re.forward().to_sparse().unwrap();
        let rev = re.reverse().to_sparse().unwrap();
        let sparse_re = Regex::from_dfas(fwd, rev);
        tester.test(test, &sparse_re);
    }
    tester.assert();
}
// Test that sparse DFAs work after converting them to a different state ID
// representation.
#[test]
fn sparse_u16() {
    let mut builder = RegexBuilder::new();
    builder.minimize(true).premultiply(false).byte_classes(false);
    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let builder = builder.clone();
        // Tests the tester declines to build for this configuration are
        // skipped.
        let re: Regex = match tester.build_regex(builder, test) {
            None => continue,
            Some(re) => re,
        };
        // Convert each DFA to sparse form, then shrink its state IDs to
        // u16, and verify the converted regex still passes.
        let fwd = re.forward().to_sparse().unwrap().to_u16().unwrap();
        let rev = re.reverse().to_sparse().unwrap().to_u16().unwrap();
        let sparse_re = Regex::from_dfas(fwd, rev);
        tester.test(test, &sparse_re);
    }
    tester.assert();
}
// Another basic sanity test that checks we can serialize and then deserialize
// a regex, and that the resulting regex can be used for searching correctly.
#[test]
fn serialization_roundtrip() {
    let mut builder = RegexBuilder::new();
    builder.premultiply(false).byte_classes(true);
    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let builder = builder.clone();
        // Tests the tester declines to build for this configuration are
        // skipped.
        let re: Regex = match tester.build_regex(builder, test) {
            None => continue,
            Some(re) => re,
        };
        // Serialize both underlying DFAs to raw bytes in this machine's
        // endianness...
        let fwd_bytes = re.forward().to_bytes_native_endian().unwrap();
        let rev_bytes = re.reverse().to_bytes_native_endian().unwrap();
        // ...then deserialize DFAs that borrow those byte buffers.
        // SAFETY: the buffers were produced by `to_bytes_native_endian`
        // just above, so they hold a valid native-endian serialization.
        let fwd: DenseDFA<&[usize], usize> =
            unsafe { DenseDFA::from_bytes(&fwd_bytes) };
        let rev: DenseDFA<&[usize], usize> =
            unsafe { DenseDFA::from_bytes(&rev_bytes) };
        let re = Regex::from_dfas(fwd, rev);
        tester.test(test, &re);
    }
    tester.assert();
}
// A basic sanity test that checks we can serialize and then deserialize a
// regex using sparse DFAs, and that the resulting regex can be used for
// searching correctly.
#[test]
fn sparse_serialization_roundtrip() {
    let mut builder = RegexBuilder::new();
    builder.byte_classes(true);
    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let builder = builder.clone();
        // Tests the tester declines to build for this configuration are
        // skipped.
        let re: Regex = match tester.build_regex(builder, test) {
            None => continue,
            Some(re) => re,
        };
        // Convert each DFA to sparse form and serialize it to raw bytes
        // in this machine's endianness...
        let fwd_bytes = re
            .forward()
            .to_sparse()
            .unwrap()
            .to_bytes_native_endian()
            .unwrap();
        let rev_bytes = re
            .reverse()
            .to_sparse()
            .unwrap()
            .to_bytes_native_endian()
            .unwrap();
        // ...then deserialize sparse DFAs that borrow those buffers.
        // SAFETY: the buffers were produced by `to_bytes_native_endian`
        // just above, so they hold a valid native-endian serialization.
        let fwd: SparseDFA<&[u8], usize> =
            unsafe { SparseDFA::from_bytes(&fwd_bytes) };
        let rev: SparseDFA<&[u8], usize> =
            unsafe { SparseDFA::from_bytes(&rev_bytes) };
        let re = Regex::from_dfas(fwd, rev);
        tester.test(test, &re);
    }
    tester.assert();
}

View file

@ -0,0 +1,25 @@
#[cfg(feature = "std")]
#[macro_use]
extern crate lazy_static;
#[cfg(feature = "std")]
extern crate regex;
#[cfg(feature = "std")]
extern crate regex_automata;
#[cfg(feature = "std")]
extern crate serde;
#[cfg(feature = "std")]
extern crate serde_bytes;
#[cfg(feature = "std")]
#[macro_use]
extern crate serde_derive;
#[cfg(feature = "std")]
extern crate toml;
#[cfg(feature = "std")]
mod collection;
#[cfg(feature = "std")]
mod regression;
#[cfg(feature = "std")]
mod suite;
#[cfg(feature = "std")]
mod unescape;

View file

@ -0,0 +1,84 @@
/// The state of the unescaping scanner: tracks how much of a pending
/// escape sequence has been consumed so far.
#[derive(Clone, Copy, Eq, PartialEq)]
enum State {
    /// The state after seeing a `\`.
    Escape,
    /// The state after seeing a `\x`.
    HexFirst,
    /// The state after seeing a `\x[0-9A-Fa-f]`; carries that first hex
    /// digit.
    HexSecond(char),
    /// Default state.
    Literal,
}
/// Expands the escape sequences `\\`, `\n`, `\r`, `\t` and `\xNN` in `s`
/// into their raw byte values. Unrecognized or incomplete escapes are
/// passed through literally, backslash included; all other characters
/// are copied as their UTF-8 bytes.
pub fn unescape(s: &str) -> Vec<u8> {
    let mut bytes = vec![];
    let mut chars = s.chars();
    while let Some(c) = chars.next() {
        if c != '\\' {
            // Ordinary character: copy its UTF-8 encoding verbatim.
            bytes.extend(c.to_string().as_bytes());
            continue;
        }
        match chars.next() {
            // Trailing backslash at end of input: emit it literally.
            None => bytes.push(b'\\'),
            Some('\\') => bytes.push(b'\\'),
            Some('n') => bytes.push(b'\n'),
            Some('r') => bytes.push(b'\r'),
            Some('t') => bytes.push(b'\t'),
            Some('x') => match chars.next() {
                // `\x` at end of input: emit it literally.
                None => bytes.extend_from_slice(b"\\x"),
                Some(d1) if d1.is_ascii_hexdigit() => match chars.next() {
                    // `\x` plus one digit at end of input: emit literally.
                    None => bytes.extend(format!(r"\x{}", d1).into_bytes()),
                    Some(d2) if d2.is_ascii_hexdigit() => {
                        // Two hex digits: decode them into a single byte.
                        let hex = format!("{}{}", d1, d2);
                        bytes.push(u8::from_str_radix(&hex, 16).unwrap());
                    }
                    // Second digit malformed: emit the sequence literally.
                    Some(d2) => {
                        bytes.extend(format!(r"\x{}{}", d1, d2).into_bytes())
                    }
                },
                // First digit malformed: emit `\x` and the char literally.
                Some(d1) => bytes.extend(format!(r"\x{}", d1).into_bytes()),
            },
            // Unknown escape: pass `\` and the char through unchanged.
            Some(other) => bytes.extend(format!(r"\{}", other).into_bytes()),
        }
    }
    bytes
}