Vendor things
This commit is contained in:
parent
5deceec006
commit
977e3c17e5
19434 changed files with 10682014 additions and 0 deletions
222
third-party/vendor/regex/testdata/no-unicode.toml
vendored
Normal file
222
third-party/vendor/regex/testdata/no-unicode.toml
vendored
Normal file
|
|
@ -0,0 +1,222 @@
|
|||
[[test]]
|
||||
name = "invalid-utf8-literal1"
|
||||
regex = '\xFF'
|
||||
haystack = '\xFF'
|
||||
matches = [[0, 1]]
|
||||
unicode = false
|
||||
utf8 = false
|
||||
unescape = true
|
||||
|
||||
|
||||
[[test]]
|
||||
name = "mixed"
|
||||
regex = '(?:.+)(?-u)(?:.+)'
|
||||
haystack = '\xCE\x93\xCE\x94\xFF'
|
||||
matches = [[0, 5]]
|
||||
utf8 = false
|
||||
unescape = true
|
||||
|
||||
|
||||
[[test]]
|
||||
name = "case1"
|
||||
regex = "a"
|
||||
haystack = "A"
|
||||
matches = [[0, 1]]
|
||||
case-insensitive = true
|
||||
unicode = false
|
||||
|
||||
[[test]]
|
||||
name = "case2"
|
||||
regex = "[a-z]+"
|
||||
haystack = "AaAaA"
|
||||
matches = [[0, 5]]
|
||||
case-insensitive = true
|
||||
unicode = false
|
||||
|
||||
[[test]]
|
||||
name = "case3"
|
||||
regex = "[a-z]+"
|
||||
haystack = "aA\u212AaA"
|
||||
matches = [[0, 7]]
|
||||
case-insensitive = true
|
||||
|
||||
[[test]]
|
||||
name = "case4"
|
||||
regex = "[a-z]+"
|
||||
haystack = "aA\u212AaA"
|
||||
matches = [[0, 2], [5, 7]]
|
||||
case-insensitive = true
|
||||
unicode = false
|
||||
|
||||
|
||||
[[test]]
|
||||
name = "negate1"
|
||||
regex = "[^a]"
|
||||
haystack = "δ"
|
||||
matches = [[0, 2]]
|
||||
|
||||
[[test]]
|
||||
name = "negate2"
|
||||
regex = "[^a]"
|
||||
haystack = "δ"
|
||||
matches = [[0, 1], [1, 2]]
|
||||
unicode = false
|
||||
utf8 = false
|
||||
|
||||
|
||||
[[test]]
|
||||
name = "dotstar-prefix1"
|
||||
regex = "a"
|
||||
haystack = '\xFFa'
|
||||
matches = [[1, 2]]
|
||||
unicode = false
|
||||
utf8 = false
|
||||
unescape = true
|
||||
|
||||
[[test]]
|
||||
name = "dotstar-prefix2"
|
||||
regex = "a"
|
||||
haystack = '\xFFa'
|
||||
matches = [[1, 2]]
|
||||
utf8 = false
|
||||
unescape = true
|
||||
|
||||
|
||||
[[test]]
|
||||
name = "null-bytes1"
|
||||
regex = '[^\x00]+\x00'
|
||||
haystack = 'foo\x00'
|
||||
matches = [[0, 4]]
|
||||
unicode = false
|
||||
utf8 = false
|
||||
unescape = true
|
||||
|
||||
|
||||
[[test]]
|
||||
name = "word-ascii"
|
||||
regex = '\w+'
|
||||
haystack = "aδ"
|
||||
matches = [[0, 1]]
|
||||
unicode = false
|
||||
|
||||
[[test]]
|
||||
name = "word-unicode"
|
||||
regex = '\w+'
|
||||
haystack = "aδ"
|
||||
matches = [[0, 3]]
|
||||
|
||||
[[test]]
|
||||
name = "decimal-ascii"
|
||||
regex = '\d+'
|
||||
haystack = "1२३9"
|
||||
matches = [[0, 1], [7, 8]]
|
||||
unicode = false
|
||||
|
||||
[[test]]
|
||||
name = "decimal-unicode"
|
||||
regex = '\d+'
|
||||
haystack = "1२३9"
|
||||
matches = [[0, 8]]
|
||||
|
||||
[[test]]
|
||||
name = "space-ascii"
|
||||
regex = '\s+'
|
||||
haystack = " \u1680"
|
||||
matches = [[0, 1]]
|
||||
unicode = false
|
||||
|
||||
[[test]]
|
||||
name = "space-unicode"
|
||||
regex = '\s+'
|
||||
haystack = " \u1680"
|
||||
matches = [[0, 4]]
|
||||
|
||||
|
||||
[[test]]
|
||||
# See: https://github.com/rust-lang/regex/issues/484
|
||||
name = "iter1-bytes"
|
||||
regex = ''
|
||||
haystack = "☃"
|
||||
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
|
||||
utf8 = false
|
||||
|
||||
[[test]]
|
||||
# See: https://github.com/rust-lang/regex/issues/484
|
||||
name = "iter1-utf8"
|
||||
regex = ''
|
||||
haystack = "☃"
|
||||
matches = [[0, 0], [3, 3]]
|
||||
|
||||
[[test]]
|
||||
# See: https://github.com/rust-lang/regex/issues/484
|
||||
# Note that iter2-utf8 doesn't make sense here, since the input isn't UTF-8.
|
||||
name = "iter2-bytes"
|
||||
regex = ''
|
||||
haystack = 'b\xFFr'
|
||||
matches = [[0, 0], [1, 1], [2, 2], [3, 3]]
|
||||
unescape = true
|
||||
utf8 = false
|
||||
|
||||
|
||||
# These test that unanchored prefixes can munch through invalid UTF-8 even when
|
||||
# utf8 is enabled.
|
||||
#
|
||||
# This test actually reflects an interesting simplification in how the Thompson
|
||||
# NFA is constructed. It used to be that the NFA could be built with an
|
||||
# unanchored prefix that either matched any byte or _only_ matched valid UTF-8.
|
||||
# But the latter turns out to be pretty precarious when it comes to prefilters,
|
||||
# because if you search a haystack that contains invalid UTF-8 but have an
|
||||
# unanchored prefix that requires UTF-8, then prefilters are no longer a valid
|
||||
# optimization because you actually have to check that everything is valid
|
||||
# UTF-8.
|
||||
#
|
||||
# Originally, I had thought that we needed a valid UTF-8 unanchored prefix in
|
||||
# order to guarantee that we only match at valid UTF-8 boundaries. But this
|
||||
# isn't actually true! There are really only two things to consider here:
|
||||
#
|
||||
# 1) Will a regex match split an encoded codepoint? No. Because by construction,
|
||||
# we ensure that a MATCH state can only be reached by following valid UTF-8 (assuming
|
||||
# all of the UTF-8 modes are enabled).
|
||||
#
|
||||
# 2) Will a regex match arbitrary bytes that aren't valid UTF-8? Again, no,
|
||||
# assuming all of the UTF-8 modes are enabled.
|
||||
[[test]]
|
||||
name = "unanchored-invalid-utf8-match-100"
|
||||
regex = '[a-z]'
|
||||
haystack = '\xFFa\xFF'
|
||||
matches = [[1, 2]]
|
||||
unescape = true
|
||||
utf8 = false
|
||||
|
||||
# This test shows that we can still prevent a match from occurring by requiring
|
||||
# that valid UTF-8 match by inserting our own unanchored prefix. Thus, if the
|
||||
# behavior of not munching through invalid UTF-8 anywhere is needed, then it
|
||||
# can be achieved thusly.
|
||||
[[test]]
|
||||
name = "unanchored-invalid-utf8-nomatch"
|
||||
regex = '^(?s:.)*?[a-z]'
|
||||
haystack = '\xFFa\xFF'
|
||||
matches = []
|
||||
unescape = true
|
||||
utf8 = false
|
||||
|
||||
# This is a tricky test that makes sure we don't accidentally do a kind of
|
||||
# unanchored search when we've requested that a regex engine not report
|
||||
# empty matches that split a codepoint. This test caught a regression during
|
||||
# development where the code for skipping over bad empty matches would do so
|
||||
# even if the search should have been anchored. This is ultimately what led to
|
||||
# making 'anchored' an 'Input' option, so that it was always clear what kind
|
||||
# of search was being performed. (Before that, whether a search was anchored
|
||||
# or not was a config knob on the regex engine.) This did wind up making DFAs
|
||||
# a little more complex to configure (with their 'StartKind' knob), but it
|
||||
# generally smoothed out everything else.
|
||||
#
|
||||
# Great example of a test whose failure motivated a sweeping API refactoring.
|
||||
[[test]]
|
||||
name = "anchored-iter-empty-utf8"
|
||||
regex = ''
|
||||
haystack = 'a☃z'
|
||||
matches = [[0, 0], [1, 1]]
|
||||
unescape = false
|
||||
utf8 = true
|
||||
anchored = true
|
||||
Loading…
Add table
Add a link
Reference in a new issue