Vendor things

2024-03-08 11:03:01 -08:00 · 2024-03-08 11:03:01 -08:00 · 977e3c17e5
commit 977e3c17e5
parent 5deceec006
19434 changed files with 10682014 additions and 0 deletions
--- a/third-party/vendor/regex/testdata/regex-lite.toml
+++ b/third-party/vendor/regex/testdata/regex-lite.toml
@ -0,0 +1,98 @@
+# These tests are specifically written to test the regex-lite crate. While it
+# largely has the same semantics as the regex crate, there are some differences
+# around Unicode support and UTF-8.
+#
+# To be clear, regex-lite supports far fewer patterns because of its lack of
+# Unicode support, nested character classes and character class set operations.
+# What we're talking about here are the patterns that both crates support but
+# where the semantics might differ.
+
+# regex-lite uses ASCII definitions for Perl character classes.
+[[test]]
+name = "perl-class-decimal"
+regex = '\d'
+haystack = '᠕'
+matches = []
+unicode = true
+
+# regex-lite uses ASCII definitions for Perl character classes.
+[[test]]
+name = "perl-class-space"
+regex = '\s'
+haystack = "\u2000"
+matches = []
+unicode = true
+
+# regex-lite uses ASCII definitions for Perl character classes.
+[[test]]
+name = "perl-class-word"
+regex = '\w'
+haystack = 'δ'
+matches = []
+unicode = true
+
+# regex-lite uses the ASCII definition of word for word boundary assertions.
+[[test]]
+name = "word-boundary"
+regex = '\b'
+haystack = 'δ'
+matches = []
+unicode = true
+
+# regex-lite uses the ASCII definition of word for negated word boundary
+# assertions. But note that it should still not split codepoints!
+[[test]]
+name = "word-boundary-negated"
+regex = '\B'
+haystack = 'δ'
+matches = [[0, 0], [2, 2]]
+unicode = true
+
+# While we're here, the empty regex---which matches at every
+# position---shouldn't split a codepoint either.
+[[test]]
+name = "empty-no-split-codepoint"
+regex = ''
+haystack = '💩'
+matches = [[0, 0], [4, 4]]
+unicode = true
+
+# A dot always matches a full codepoint.
+[[test]]
+name = "dot-always-matches-codepoint"
+regex = '.'
+haystack = '💩'
+matches = [[0, 4]]
+unicode = false
+
+# A negated character class also always matches a full codepoint.
+[[test]]
+name = "negated-class-always-matches-codepoint"
+regex = '[^a]'
+haystack = '💩'
+matches = [[0, 4]]
+unicode = false
+
+# regex-lite only supports ASCII-aware case insensitive matching.
+[[test]]
+name = "case-insensitive-is-ascii-only"
+regex = 's'
+haystack = 'ſ'
+matches = []
+unicode = true
+case-insensitive = true
+
+# Negated word boundaries shouldn't split a codepoint, but they will match
+# between invalid UTF-8.
+#
+# This test is only valid for a 'bytes' API, but that doesn't (yet) exist in
+# regex-lite. This can't happen in the main API because &str can't contain
+# invalid UTF-8.
+# [[test]]
+# name = "word-boundary-invalid-utf8"
+# regex = '\B'
+# haystack = '\xFF\xFF\xFF\xFF'
+# unescape = true
+# matches = [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]]
+# unicode = true
+# utf8 = false