Vendor things
This commit is contained in:
parent
5deceec006
commit
977e3c17e5
19434 changed files with 10682014 additions and 0 deletions
1
third-party/vendor/regex-syntax/.cargo-checksum.json
vendored
Normal file
1
third-party/vendor/regex-syntax/.cargo-checksum.json
vendored
Normal file
|
|
@ -0,0 +1 @@
|
|||
{"files":{"Cargo.toml":"33c96af38ed9f42d1ccbf85ecfeea1d46202943d01c595b8ee4dddef760e6bd5","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"b2484aa7e66fb92d1378e9a7ce7605af18f77cb12c179866eaf92ba28cfec1d9","benches/bench.rs":"d2b6ae5b939abd6093064f144b981b7739d7f474ec0698a1268052fc92406635","src/ast/mod.rs":"700c2f779fccb529db7b444819d53c38f916b065d3d05a74282f929af581e8b1","src/ast/parse.rs":"fcd45146eaf747d15a2a519d34754638d451ab83e88b5962841cf7a0dd32e988","src/ast/print.rs":"99cb69ece252ef31e0be177fb3364797eb30b785f936532b8dcd8106e7be0738","src/ast/visitor.rs":"f0fdf758801fe70e6b299b73ab63196e814af95ef6eccad7ef4f72075743fcf6","src/debug.rs":"7a16cca02be9715fdc8c26a32279465774623cd12fab1ec59ac25a6e3047817f","src/either.rs":"1758e3edd056884eccadd995708d1e374ba9aa65846bd0e13b1aae852607c560","src/error.rs":"01a67e3407b0d0d869119363e47a94d92158834bfe5936366c2e3f6f4ed13f36","src/hir/interval.rs":"2358e74b4d4aabfa62f79df855fd5d183779b86c4e14aae4ee42d8695bb3d010","src/hir/literal.rs":"6a8108b8919fbfd9ab93072846124c51d2998489810fcd6e7a89fdccc45833e0","src/hir/mod.rs":"eca183b8e173f486c1a11a5fa10895c96067162c8ec936871f937ca7fca5f710","src/hir/print.rs":"ad51c515c933bfd67d307ba3d7e6ac59c9c5903b4f393a9f9a4785c92b88348d","src/hir/translate.rs":"5fbff527c53f217ba2bac9b0948d7de74164625d08674b91a479ced271159ebd","src/hir/visitor.rs":"71ca9c93aa48a5ed445399659fa6455093a1bbd9ef44b66bc7095c1b08b2ec1f","src/lib.rs":"5ae457d402e49443bdb23b71353693dd3b0d263b57a6eeb9eb5b5dae5c901bdd","src/parser.rs":"6b2f4f27e3331a01a25b87c89368dd2e54396bd425dac57941f9c1ebfd238ac8","src/rank.rs":"ff3d58b0cc5ffa69e2e8c56fc7d9ef41dd399d59a639a253a51551b858cb5bbd","src/unicode.rs":"9829458ef321b3bc22c21eae4b22805b33f8b5e67022928ffd9a9e0287bc7c31","src/unicode_tables/LICENSE-UNICODE":"74db5baf44a41b1000312c673544b3374e4198af5605c7f9080a402cec42cfa3","src/unicode_tables/age.rs":"2a2
599a4e406fbbd0efd16aa6ce385c3f97b87c34820d6686a9f9113a5231c67","src/unicode_tables/case_folding_simple.rs":"9583803d4a10486da372b76979dbd26349b40766229467238eff972c1d78e47b","src/unicode_tables/general_category.rs":"36a93ba1cdeed96a00ff29a5ab5afd2c578a89541bf4dd8b18478146cebda0aa","src/unicode_tables/grapheme_cluster_break.rs":"39c388e9805a8391d3d3e69d74d831ce4fb99aa7e13e52c64dd2bd16d4765301","src/unicode_tables/mod.rs":"26c837099cd934c8062e24bc9a0aaecf15fe1de03f9c6da3f3e1e5ac3ca24bee","src/unicode_tables/perl_decimal.rs":"a98ea4afe71c2947023ae12bd25c46bf4c7de48eeb40979eca5c96ba62cee02e","src/unicode_tables/perl_space.rs":"ea2b3b84b4a48334082dadc6c37d9fcc9c9ded84b40e8f5c9c9314898638967e","src/unicode_tables/perl_word.rs":"6f1156bd6af32151ecffea4abe07a38fa04b1fc1b227ec1a8dac5d5f08d9d74b","src/unicode_tables/property_bool.rs":"0bd64f6e3228eaecf47824e238bdf1f8a9eef113ace6e790a57f045a8106701c","src/unicode_tables/property_names.rs":"5ca25437927eb70c62adf7d038e99a601cfb8a718677fd6de832589664d3c481","src/unicode_tables/property_values.rs":"5b4cc02392d382cf7af60455fc87b9980e97409b62a4b8d6c5843190d2e2d21d","src/unicode_tables/script.rs":"ea1d771b6d0a4b12d143f9bad2ea9342a0887878cbbe3c11262b6eabedaf2dd4","src/unicode_tables/script_extension.rs":"beeb8349703d903ff861beb8401bfd2599e457dc25df872e69d6ad1615f8b5e9","src/unicode_tables/sentence_break.rs":"2befe2a27cc4e8aecb624e310ef9f371462470dd3b2f572cec1f5873a5e30aa9","src/unicode_tables/word_break.rs":"94679177731b515f0c360eff394286a1f99b59527bdbc826cbf51d32f9666187","src/utf8.rs":"e9a13623a94295b81969c5483de17219ff74bb20768be13c527010351245acbd","test":"c7de5fbc0010d9b5b758cd49956375a64b88601c068167fd366808950257f108"},"package":"c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f"}
|
||||
61
third-party/vendor/regex-syntax/Cargo.toml
vendored
Normal file
61
third-party/vendor/regex-syntax/Cargo.toml
vendored
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
|
||||
#
|
||||
# When uploading crates to the registry Cargo will automatically
|
||||
# "normalize" Cargo.toml files for maximal compatibility
|
||||
# with all versions of Cargo and also rewrite `path` dependencies
|
||||
# to registry (e.g., crates.io) dependencies.
|
||||
#
|
||||
# If you are reading this file be aware that the original Cargo.toml
|
||||
# will likely look very different (and much more reasonable).
|
||||
# See Cargo.toml.orig for the original contents.
|
||||
|
||||
[package]
|
||||
edition = "2021"
|
||||
rust-version = "1.65"
|
||||
name = "regex-syntax"
|
||||
version = "0.8.2"
|
||||
authors = [
|
||||
"The Rust Project Developers",
|
||||
"Andrew Gallant <jamslam@gmail.com>",
|
||||
]
|
||||
description = "A regular expression parser."
|
||||
documentation = "https://docs.rs/regex-syntax"
|
||||
readme = "README.md"
|
||||
license = "MIT OR Apache-2.0"
|
||||
repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax"
|
||||
|
||||
[package.metadata.docs.rs]
|
||||
all-features = true
|
||||
rustdoc-args = [
|
||||
"--cfg",
|
||||
"docsrs",
|
||||
]
|
||||
|
||||
[dependencies.arbitrary]
|
||||
version = "1.3.0"
|
||||
features = ["derive"]
|
||||
optional = true
|
||||
|
||||
[features]
|
||||
arbitrary = ["dep:arbitrary"]
|
||||
default = [
|
||||
"std",
|
||||
"unicode",
|
||||
]
|
||||
std = []
|
||||
unicode = [
|
||||
"unicode-age",
|
||||
"unicode-bool",
|
||||
"unicode-case",
|
||||
"unicode-gencat",
|
||||
"unicode-perl",
|
||||
"unicode-script",
|
||||
"unicode-segment",
|
||||
]
|
||||
unicode-age = []
|
||||
unicode-bool = []
|
||||
unicode-case = []
|
||||
unicode-gencat = []
|
||||
unicode-perl = []
|
||||
unicode-script = []
|
||||
unicode-segment = []
|
||||
201
third-party/vendor/regex-syntax/LICENSE-APACHE
vendored
Normal file
201
third-party/vendor/regex-syntax/LICENSE-APACHE
vendored
Normal file
|
|
@ -0,0 +1,201 @@
|
|||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
25
third-party/vendor/regex-syntax/LICENSE-MIT
vendored
Normal file
25
third-party/vendor/regex-syntax/LICENSE-MIT
vendored
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
Copyright (c) 2014 The Rust Project Developers
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software
|
||||
is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice
|
||||
shall be included in all copies or substantial portions
|
||||
of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
96
third-party/vendor/regex-syntax/README.md
vendored
Normal file
96
third-party/vendor/regex-syntax/README.md
vendored
Normal file
|
|
@ -0,0 +1,96 @@
|
|||
regex-syntax
|
||||
============
|
||||
This crate provides a robust regular expression parser.
|
||||
|
||||
[](https://github.com/rust-lang/regex/actions)
|
||||
[](https://crates.io/crates/regex-syntax)
|
||||
|
||||
|
||||
### Documentation
|
||||
|
||||
https://docs.rs/regex-syntax
|
||||
|
||||
|
||||
### Overview
|
||||
|
||||
There are two primary types exported by this crate: `Ast` and `Hir`. The former
|
||||
is a faithful abstract syntax of a regular expression, and can convert regular
|
||||
expressions back to their concrete syntax while mostly preserving its original
|
||||
form. The latter type is a high level intermediate representation of a regular
|
||||
expression that is amenable to analysis and compilation into byte codes or
|
||||
automata. An `Hir` achieves this by drastically simplifying the syntactic
|
||||
structure of the regular expression. While an `Hir` can be converted back to
|
||||
its equivalent concrete syntax, the result is unlikely to resemble the original
|
||||
concrete syntax that produced the `Hir`.
|
||||
|
||||
|
||||
### Example
|
||||
|
||||
This example shows how to parse a pattern string into its HIR:
|
||||
|
||||
```rust
|
||||
use regex_syntax::{hir::Hir, parse};
|
||||
|
||||
let hir = parse("a|b").unwrap();
|
||||
assert_eq!(hir, Hir::alternation(vec![
|
||||
Hir::literal("a".as_bytes()),
|
||||
Hir::literal("b".as_bytes()),
|
||||
]));
|
||||
```
|
||||
|
||||
|
||||
### Safety
|
||||
|
||||
This crate has no `unsafe` code and sets `forbid(unsafe_code)`. While it's
|
||||
possible this crate could use `unsafe` code in the future, the standard
|
||||
for doing so is extremely high. In general, most code in this crate is not
|
||||
performance critical, since it tends to be dwarfed by the time it takes to
|
||||
compile a regular expression into an automaton. Therefore, there is little need
|
||||
for extreme optimization, and therefore, use of `unsafe`.
|
||||
|
||||
The standard for using `unsafe` in this crate is extremely high because this
|
||||
crate is intended to be reasonably safe to use with user supplied regular
|
||||
expressions. Therefore, while there may be bugs in the regex parser itself,
|
||||
they should _never_ result in memory unsafety unless there is either a bug
|
||||
in the compiler or the standard library. (Since `regex-syntax` has zero
|
||||
dependencies.)
|
||||
|
||||
|
||||
### Crate features
|
||||
|
||||
By default, this crate bundles a fairly large amount of Unicode data tables
|
||||
(a source size of ~750KB). Because of their large size, one can disable some
|
||||
or all of these data tables. If a regular expression attempts to use Unicode
|
||||
data that is not available, then an error will occur when translating the `Ast`
|
||||
to the `Hir`.
|
||||
|
||||
The full set of features one can disable are
|
||||
[in the "Crate features" section of the documentation](https://docs.rs/regex-syntax/*/#crate-features).
|
||||
|
||||
|
||||
### Testing
|
||||
|
||||
Simply running `cargo test` will give you very good coverage. However, because
|
||||
of the large number of features exposed by this crate, a `test` script is
|
||||
included in this directory which will test several feature combinations. This
|
||||
is the same script that is run in CI.
|
||||
|
||||
|
||||
### Motivation
|
||||
|
||||
The primary purpose of this crate is to provide the parser used by `regex`.
|
||||
Specifically, this crate is treated as an implementation detail of the `regex`,
|
||||
and is primarily developed for the needs of `regex`.
|
||||
|
||||
Since this crate is an implementation detail of `regex`, it may experience
|
||||
breaking change releases at a different cadence from `regex`. This is only
|
||||
possible because this crate is _not_ a public dependency of `regex`.
|
||||
|
||||
Another consequence of this de-coupling is that there is no direct way to
|
||||
compile a `regex::Regex` from a `regex_syntax::hir::Hir`. Instead, one must
|
||||
first convert the `Hir` to a string (via its `std::fmt::Display`) and then
|
||||
compile that via `Regex::new`. While this does repeat some work, compilation
|
||||
typically takes much longer than parsing.
|
||||
|
||||
Stated differently, the coupling between `regex` and `regex-syntax` exists only
|
||||
at the level of the concrete syntax.
|
||||
63
third-party/vendor/regex-syntax/benches/bench.rs
vendored
Normal file
63
third-party/vendor/regex-syntax/benches/bench.rs
vendored
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
#![feature(test)]
|
||||
|
||||
extern crate test;
|
||||
|
||||
use regex_syntax::Parser;
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn parse_simple1(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let re = r"^bc(d|e)*$";
|
||||
Parser::new().parse(re).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn parse_simple2(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let re = r"'[a-zA-Z_][a-zA-Z0-9_]*(')\b";
|
||||
Parser::new().parse(re).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn parse_small1(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let re = r"\p{L}|\p{N}|\s|.|\d";
|
||||
Parser::new().parse(re).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn parse_medium1(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let re = r"\pL\p{Greek}\p{Hiragana}\p{Alphabetic}\p{Hebrew}\p{Arabic}";
|
||||
Parser::new().parse(re).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn parse_medium2(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let re = r"\s\S\w\W\d\D";
|
||||
Parser::new().parse(re).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn parse_medium3(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let re =
|
||||
r"\p{age:3.2}\p{hira}\p{scx:hira}\p{alphabetic}\p{sc:Greek}\pL";
|
||||
Parser::new().parse(re).unwrap()
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn parse_huge(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let re = r"\p{L}{100}";
|
||||
Parser::new().parse(re).unwrap()
|
||||
});
|
||||
}
|
||||
1809
third-party/vendor/regex-syntax/src/ast/mod.rs
vendored
Normal file
1809
third-party/vendor/regex-syntax/src/ast/mod.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
6326
third-party/vendor/regex-syntax/src/ast/parse.rs
vendored
Normal file
6326
third-party/vendor/regex-syntax/src/ast/parse.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
577
third-party/vendor/regex-syntax/src/ast/print.rs
vendored
Normal file
577
third-party/vendor/regex-syntax/src/ast/print.rs
vendored
Normal file
|
|
@ -0,0 +1,577 @@
|
|||
/*!
|
||||
This module provides a regular expression printer for `Ast`.
|
||||
*/
|
||||
|
||||
use core::fmt;
|
||||
|
||||
use crate::ast::{
|
||||
self,
|
||||
visitor::{self, Visitor},
|
||||
Ast,
|
||||
};
|
||||
|
||||
/// A builder for constructing a printer.
|
||||
///
|
||||
/// Note that since a printer doesn't have any configuration knobs, this type
|
||||
/// remains unexported.
|
||||
#[derive(Clone, Debug)]
|
||||
struct PrinterBuilder {
|
||||
_priv: (),
|
||||
}
|
||||
|
||||
impl Default for PrinterBuilder {
|
||||
fn default() -> PrinterBuilder {
|
||||
PrinterBuilder::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl PrinterBuilder {
|
||||
fn new() -> PrinterBuilder {
|
||||
PrinterBuilder { _priv: () }
|
||||
}
|
||||
|
||||
fn build(&self) -> Printer {
|
||||
Printer { _priv: () }
|
||||
}
|
||||
}
|
||||
|
||||
/// A printer for a regular expression abstract syntax tree.
|
||||
///
|
||||
/// A printer converts an abstract syntax tree (AST) to a regular expression
|
||||
/// pattern string. This particular printer uses constant stack space and heap
|
||||
/// space proportional to the size of the AST.
|
||||
///
|
||||
/// This printer will not necessarily preserve the original formatting of the
|
||||
/// regular expression pattern string. For example, all whitespace and comments
|
||||
/// are ignored.
|
||||
#[derive(Debug)]
|
||||
pub struct Printer {
|
||||
_priv: (),
|
||||
}
|
||||
|
||||
impl Printer {
|
||||
/// Create a new printer.
|
||||
pub fn new() -> Printer {
|
||||
PrinterBuilder::new().build()
|
||||
}
|
||||
|
||||
/// Print the given `Ast` to the given writer. The writer must implement
|
||||
/// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
|
||||
/// here are a `fmt::Formatter` (which is available in `fmt::Display`
|
||||
/// implementations) or a `&mut String`.
|
||||
pub fn print<W: fmt::Write>(&mut self, ast: &Ast, wtr: W) -> fmt::Result {
|
||||
visitor::visit(ast, Writer { wtr })
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct Writer<W> {
|
||||
wtr: W,
|
||||
}
|
||||
|
||||
impl<W: fmt::Write> Visitor for Writer<W> {
|
||||
type Output = ();
|
||||
type Err = fmt::Error;
|
||||
|
||||
fn finish(self) -> fmt::Result {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn visit_pre(&mut self, ast: &Ast) -> fmt::Result {
|
||||
match *ast {
|
||||
Ast::Group(ref x) => self.fmt_group_pre(x),
|
||||
Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x),
|
||||
_ => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_post(&mut self, ast: &Ast) -> fmt::Result {
|
||||
match *ast {
|
||||
Ast::Empty(_) => Ok(()),
|
||||
Ast::Flags(ref x) => self.fmt_set_flags(x),
|
||||
Ast::Literal(ref x) => self.fmt_literal(x),
|
||||
Ast::Dot(_) => self.wtr.write_str("."),
|
||||
Ast::Assertion(ref x) => self.fmt_assertion(x),
|
||||
Ast::ClassPerl(ref x) => self.fmt_class_perl(x),
|
||||
Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x),
|
||||
Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x),
|
||||
Ast::Repetition(ref x) => self.fmt_repetition(x),
|
||||
Ast::Group(ref x) => self.fmt_group_post(x),
|
||||
Ast::Alternation(_) => Ok(()),
|
||||
Ast::Concat(_) => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_alternation_in(&mut self) -> fmt::Result {
|
||||
self.wtr.write_str("|")
|
||||
}
|
||||
|
||||
fn visit_class_set_item_pre(
|
||||
&mut self,
|
||||
ast: &ast::ClassSetItem,
|
||||
) -> Result<(), Self::Err> {
|
||||
match *ast {
|
||||
ast::ClassSetItem::Bracketed(ref x) => {
|
||||
self.fmt_class_bracketed_pre(x)
|
||||
}
|
||||
_ => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_class_set_item_post(
|
||||
&mut self,
|
||||
ast: &ast::ClassSetItem,
|
||||
) -> Result<(), Self::Err> {
|
||||
use crate::ast::ClassSetItem::*;
|
||||
|
||||
match *ast {
|
||||
Empty(_) => Ok(()),
|
||||
Literal(ref x) => self.fmt_literal(x),
|
||||
Range(ref x) => {
|
||||
self.fmt_literal(&x.start)?;
|
||||
self.wtr.write_str("-")?;
|
||||
self.fmt_literal(&x.end)?;
|
||||
Ok(())
|
||||
}
|
||||
Ascii(ref x) => self.fmt_class_ascii(x),
|
||||
Unicode(ref x) => self.fmt_class_unicode(x),
|
||||
Perl(ref x) => self.fmt_class_perl(x),
|
||||
Bracketed(ref x) => self.fmt_class_bracketed_post(x),
|
||||
Union(_) => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
fn visit_class_set_binary_op_in(
|
||||
&mut self,
|
||||
ast: &ast::ClassSetBinaryOp,
|
||||
) -> Result<(), Self::Err> {
|
||||
self.fmt_class_set_binary_op_kind(&ast.kind)
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: fmt::Write> Writer<W> {
    /// Writes the opening syntax of a group: `(` for an indexed capture,
    /// `(?P<name>` or `(?<name>` for a named capture (preserving whichever
    /// spelling the input used), or `(?flags:` for a non-capturing group.
    fn fmt_group_pre(&mut self, ast: &ast::Group) -> fmt::Result {
        use crate::ast::GroupKind::*;
        match ast.kind {
            CaptureIndex(_) => self.wtr.write_str("("),
            CaptureName { ref name, starts_with_p } => {
                // `starts_with_p` records whether the original pattern used
                // the `(?P<` form; round-tripping preserves it.
                let start = if starts_with_p { "(?P<" } else { "(?<" };
                self.wtr.write_str(start)?;
                self.wtr.write_str(&name.name)?;
                self.wtr.write_str(">")?;
                Ok(())
            }
            NonCapturing(ref flags) => {
                self.wtr.write_str("(?")?;
                self.fmt_flags(flags)?;
                self.wtr.write_str(":")?;
                Ok(())
            }
        }
    }

    /// Writes the closing `)` of any group.
    fn fmt_group_post(&mut self, _ast: &ast::Group) -> fmt::Result {
        self.wtr.write_str(")")
    }

    /// Writes a repetition operator (`?`, `*`, `+` or a `{...}` range),
    /// appending a trailing `?` when the repetition is non-greedy.
    fn fmt_repetition(&mut self, ast: &ast::Repetition) -> fmt::Result {
        use crate::ast::RepetitionKind::*;
        match ast.op.kind {
            ZeroOrOne if ast.greedy => self.wtr.write_str("?"),
            ZeroOrOne => self.wtr.write_str("??"),
            ZeroOrMore if ast.greedy => self.wtr.write_str("*"),
            ZeroOrMore => self.wtr.write_str("*?"),
            OneOrMore if ast.greedy => self.wtr.write_str("+"),
            OneOrMore => self.wtr.write_str("+?"),
            Range(ref x) => {
                self.fmt_repetition_range(x)?;
                if !ast.greedy {
                    self.wtr.write_str("?")?;
                }
                Ok(())
            }
        }
    }

    /// Writes a counted repetition range: `{m}`, `{m,}` or `{m,n}`.
    fn fmt_repetition_range(
        &mut self,
        ast: &ast::RepetitionRange,
    ) -> fmt::Result {
        use crate::ast::RepetitionRange::*;
        match *ast {
            Exactly(x) => write!(self.wtr, "{{{}}}", x),
            AtLeast(x) => write!(self.wtr, "{{{},}}", x),
            Bounded(x, y) => write!(self.wtr, "{{{},{}}}", x, y),
        }
    }

    /// Writes a literal in the same syntactic form it was parsed from:
    /// verbatim, escaped (`\{`), octal, fixed-width or braced hex, or one
    /// of the special escapes such as `\n` or `\t`.
    fn fmt_literal(&mut self, ast: &ast::Literal) -> fmt::Result {
        use crate::ast::LiteralKind::*;

        match ast.kind {
            Verbatim => self.wtr.write_char(ast.c),
            Meta | Superfluous => write!(self.wtr, r"\{}", ast.c),
            Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)),
            HexFixed(ast::HexLiteralKind::X) => {
                // Fixed-width forms are zero-padded to their exact width.
                write!(self.wtr, r"\x{:02X}", u32::from(ast.c))
            }
            HexFixed(ast::HexLiteralKind::UnicodeShort) => {
                write!(self.wtr, r"\u{:04X}", u32::from(ast.c))
            }
            HexFixed(ast::HexLiteralKind::UnicodeLong) => {
                write!(self.wtr, r"\U{:08X}", u32::from(ast.c))
            }
            HexBrace(ast::HexLiteralKind::X) => {
                // Braced forms are not padded.
                write!(self.wtr, r"\x{{{:X}}}", u32::from(ast.c))
            }
            HexBrace(ast::HexLiteralKind::UnicodeShort) => {
                write!(self.wtr, r"\u{{{:X}}}", u32::from(ast.c))
            }
            HexBrace(ast::HexLiteralKind::UnicodeLong) => {
                write!(self.wtr, r"\U{{{:X}}}", u32::from(ast.c))
            }
            Special(ast::SpecialLiteralKind::Bell) => {
                self.wtr.write_str(r"\a")
            }
            Special(ast::SpecialLiteralKind::FormFeed) => {
                self.wtr.write_str(r"\f")
            }
            Special(ast::SpecialLiteralKind::Tab) => self.wtr.write_str(r"\t"),
            Special(ast::SpecialLiteralKind::LineFeed) => {
                self.wtr.write_str(r"\n")
            }
            Special(ast::SpecialLiteralKind::CarriageReturn) => {
                self.wtr.write_str(r"\r")
            }
            Special(ast::SpecialLiteralKind::VerticalTab) => {
                self.wtr.write_str(r"\v")
            }
            Special(ast::SpecialLiteralKind::Space) => {
                // An escaped space, only meaningful in `x` (verbose) mode.
                self.wtr.write_str(r"\ ")
            }
        }
    }

    /// Writes a zero-width assertion (anchors and word boundaries).
    fn fmt_assertion(&mut self, ast: &ast::Assertion) -> fmt::Result {
        use crate::ast::AssertionKind::*;
        match ast.kind {
            StartLine => self.wtr.write_str("^"),
            EndLine => self.wtr.write_str("$"),
            StartText => self.wtr.write_str(r"\A"),
            EndText => self.wtr.write_str(r"\z"),
            WordBoundary => self.wtr.write_str(r"\b"),
            NotWordBoundary => self.wtr.write_str(r"\B"),
            WordBoundaryStart => self.wtr.write_str(r"\b{start}"),
            WordBoundaryEnd => self.wtr.write_str(r"\b{end}"),
            WordBoundaryStartAngle => self.wtr.write_str(r"\<"),
            WordBoundaryEndAngle => self.wtr.write_str(r"\>"),
            WordBoundaryStartHalf => self.wtr.write_str(r"\b{start-half}"),
            WordBoundaryEndHalf => self.wtr.write_str(r"\b{end-half}"),
        }
    }

    /// Writes a free-standing flag group, e.g. `(?is-m)`.
    fn fmt_set_flags(&mut self, ast: &ast::SetFlags) -> fmt::Result {
        self.wtr.write_str("(?")?;
        self.fmt_flags(&ast.flags)?;
        self.wtr.write_str(")")?;
        Ok(())
    }

    /// Writes a sequence of flag items (single-letter flags and the `-`
    /// negation marker) in the order they appear in the AST.
    fn fmt_flags(&mut self, ast: &ast::Flags) -> fmt::Result {
        use crate::ast::{Flag, FlagsItemKind};

        for item in &ast.items {
            match item.kind {
                FlagsItemKind::Negation => self.wtr.write_str("-"),
                FlagsItemKind::Flag(ref flag) => match *flag {
                    Flag::CaseInsensitive => self.wtr.write_str("i"),
                    Flag::MultiLine => self.wtr.write_str("m"),
                    Flag::DotMatchesNewLine => self.wtr.write_str("s"),
                    Flag::SwapGreed => self.wtr.write_str("U"),
                    Flag::Unicode => self.wtr.write_str("u"),
                    Flag::CRLF => self.wtr.write_str("R"),
                    Flag::IgnoreWhitespace => self.wtr.write_str("x"),
                },
            }?;
        }
        Ok(())
    }

    /// Writes the opening of a bracketed class: `[` or `[^` when negated.
    fn fmt_class_bracketed_pre(
        &mut self,
        ast: &ast::ClassBracketed,
    ) -> fmt::Result {
        if ast.negated {
            self.wtr.write_str("[^")
        } else {
            self.wtr.write_str("[")
        }
    }

    /// Writes the closing `]` of a bracketed class.
    fn fmt_class_bracketed_post(
        &mut self,
        _ast: &ast::ClassBracketed,
    ) -> fmt::Result {
        self.wtr.write_str("]")
    }

    /// Writes a class set binary operator token.
    fn fmt_class_set_binary_op_kind(
        &mut self,
        ast: &ast::ClassSetBinaryOpKind,
    ) -> fmt::Result {
        use crate::ast::ClassSetBinaryOpKind::*;
        match *ast {
            Intersection => self.wtr.write_str("&&"),
            Difference => self.wtr.write_str("--"),
            SymmetricDifference => self.wtr.write_str("~~"),
        }
    }

    /// Writes a Perl-style class escape (`\d`, `\s`, `\w`), uppercased
    /// when the class is negated.
    fn fmt_class_perl(&mut self, ast: &ast::ClassPerl) -> fmt::Result {
        use crate::ast::ClassPerlKind::*;
        match ast.kind {
            Digit if ast.negated => self.wtr.write_str(r"\D"),
            Digit => self.wtr.write_str(r"\d"),
            Space if ast.negated => self.wtr.write_str(r"\S"),
            Space => self.wtr.write_str(r"\s"),
            Word if ast.negated => self.wtr.write_str(r"\W"),
            Word => self.wtr.write_str(r"\w"),
        }
    }

    /// Writes a POSIX character class, e.g. `[:alnum:]` or, when negated,
    /// `[:^alnum:]`.
    fn fmt_class_ascii(&mut self, ast: &ast::ClassAscii) -> fmt::Result {
        use crate::ast::ClassAsciiKind::*;
        match ast.kind {
            Alnum if ast.negated => self.wtr.write_str("[:^alnum:]"),
            Alnum => self.wtr.write_str("[:alnum:]"),
            Alpha if ast.negated => self.wtr.write_str("[:^alpha:]"),
            Alpha => self.wtr.write_str("[:alpha:]"),
            Ascii if ast.negated => self.wtr.write_str("[:^ascii:]"),
            Ascii => self.wtr.write_str("[:ascii:]"),
            Blank if ast.negated => self.wtr.write_str("[:^blank:]"),
            Blank => self.wtr.write_str("[:blank:]"),
            Cntrl if ast.negated => self.wtr.write_str("[:^cntrl:]"),
            Cntrl => self.wtr.write_str("[:cntrl:]"),
            Digit if ast.negated => self.wtr.write_str("[:^digit:]"),
            Digit => self.wtr.write_str("[:digit:]"),
            Graph if ast.negated => self.wtr.write_str("[:^graph:]"),
            Graph => self.wtr.write_str("[:graph:]"),
            Lower if ast.negated => self.wtr.write_str("[:^lower:]"),
            Lower => self.wtr.write_str("[:lower:]"),
            Print if ast.negated => self.wtr.write_str("[:^print:]"),
            Print => self.wtr.write_str("[:print:]"),
            Punct if ast.negated => self.wtr.write_str("[:^punct:]"),
            Punct => self.wtr.write_str("[:punct:]"),
            Space if ast.negated => self.wtr.write_str("[:^space:]"),
            Space => self.wtr.write_str("[:space:]"),
            Upper if ast.negated => self.wtr.write_str("[:^upper:]"),
            Upper => self.wtr.write_str("[:upper:]"),
            Word if ast.negated => self.wtr.write_str("[:^word:]"),
            Word => self.wtr.write_str("[:word:]"),
            Xdigit if ast.negated => self.wtr.write_str("[:^xdigit:]"),
            Xdigit => self.wtr.write_str("[:xdigit:]"),
        }
    }

    /// Writes a Unicode class (`\pL`, `\p{Greek}`, `\p{sc:Greek}`, ...),
    /// using `\P` in place of `\p` when the class is negated.
    fn fmt_class_unicode(&mut self, ast: &ast::ClassUnicode) -> fmt::Result {
        use crate::ast::ClassUnicodeKind::*;
        use crate::ast::ClassUnicodeOpKind::*;

        if ast.negated {
            self.wtr.write_str(r"\P")?;
        } else {
            self.wtr.write_str(r"\p")?;
        }
        match ast.kind {
            OneLetter(c) => self.wtr.write_char(c),
            Named(ref x) => write!(self.wtr, "{{{}}}", x),
            NamedValue { op: Equal, ref name, ref value } => {
                write!(self.wtr, "{{{}={}}}", name, value)
            }
            NamedValue { op: Colon, ref name, ref value } => {
                write!(self.wtr, "{{{}:{}}}", name, value)
            }
            NamedValue { op: NotEqual, ref name, ref value } => {
                write!(self.wtr, "{{{}!={}}}", name, value)
            }
        }
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    // These tests check that printing a parsed AST reproduces the original
    // pattern byte-for-byte (a "round trip" through parse + print).
    use alloc::string::String;

    use crate::ast::parse::ParserBuilder;

    use super::*;

    // Parse `given`, print the resulting AST, and assert the printed form
    // is identical to the input.
    fn roundtrip(given: &str) {
        roundtrip_with(|b| b, given);
    }

    // Like `roundtrip`, but lets the caller configure the parser first
    // (e.g. enabling octal literal support).
    fn roundtrip_with<F>(mut f: F, given: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let ast = builder.build().parse(given).unwrap();

        let mut printer = Printer::new();
        let mut dst = String::new();
        printer.print(&ast, &mut dst).unwrap();
        assert_eq!(given, dst);
    }

    #[test]
    fn print_literal() {
        roundtrip("a");
        roundtrip(r"\[");
        // Octal literals only parse when explicitly enabled.
        roundtrip_with(|b| b.octal(true), r"\141");
        roundtrip(r"\x61");
        roundtrip(r"\x7F");
        roundtrip(r"\u0061");
        roundtrip(r"\U00000061");
        roundtrip(r"\x{61}");
        roundtrip(r"\x{7F}");
        roundtrip(r"\u{61}");
        roundtrip(r"\U{61}");

        roundtrip(r"\a");
        roundtrip(r"\f");
        roundtrip(r"\t");
        roundtrip(r"\n");
        roundtrip(r"\r");
        roundtrip(r"\v");
        // An escaped space is only valid in verbose mode.
        roundtrip(r"(?x)\ ");
    }

    #[test]
    fn print_dot() {
        roundtrip(".");
    }

    #[test]
    fn print_concat() {
        roundtrip("ab");
        roundtrip("abcde");
        roundtrip("a(bcd)ef");
    }

    #[test]
    fn print_alternation() {
        roundtrip("a|b");
        roundtrip("a|b|c|d|e");
        // Empty branches at either end must round-trip too.
        roundtrip("|a|b|c|d|e");
        roundtrip("|a|b|c|d|e|");
        roundtrip("a(b|c|d)|e|f");
    }

    #[test]
    fn print_assertion() {
        roundtrip(r"^");
        roundtrip(r"$");
        roundtrip(r"\A");
        roundtrip(r"\z");
        roundtrip(r"\b");
        roundtrip(r"\B");
    }

    #[test]
    fn print_repetition() {
        // Each operator in both greedy and non-greedy form.
        roundtrip("a?");
        roundtrip("a??");
        roundtrip("a*");
        roundtrip("a*?");
        roundtrip("a+");
        roundtrip("a+?");
        roundtrip("a{5}");
        roundtrip("a{5}?");
        roundtrip("a{5,}");
        roundtrip("a{5,}?");
        roundtrip("a{5,10}");
        roundtrip("a{5,10}?");
    }

    #[test]
    fn print_flags() {
        roundtrip("(?i)");
        roundtrip("(?-i)");
        roundtrip("(?s-i)");
        roundtrip("(?-si)");
        roundtrip("(?siUmux)");
    }

    #[test]
    fn print_group() {
        roundtrip("(?i:a)");
        roundtrip("(?P<foo>a)");
        roundtrip("(?<foo>a)");
        roundtrip("(a)");
    }

    #[test]
    fn print_class() {
        roundtrip(r"[abc]");
        roundtrip(r"[a-z]");
        roundtrip(r"[^a-z]");
        roundtrip(r"[a-z0-9]");
        roundtrip(r"[-a-z0-9]");
        roundtrip(r"[-a-z0-9]");
        roundtrip(r"[a-z0-9---]");
        // Set operations: intersection, difference, symmetric difference.
        roundtrip(r"[a-z&&m-n]");
        roundtrip(r"[[a-z&&m-n]]");
        roundtrip(r"[a-z--m-n]");
        roundtrip(r"[a-z~~m-n]");
        roundtrip(r"[a-z[0-9]]");
        roundtrip(r"[a-z[^0-9]]");

        roundtrip(r"\d");
        roundtrip(r"\D");
        roundtrip(r"\s");
        roundtrip(r"\S");
        roundtrip(r"\w");
        roundtrip(r"\W");

        roundtrip(r"[[:alnum:]]");
        roundtrip(r"[[:^alnum:]]");
        roundtrip(r"[[:alpha:]]");
        roundtrip(r"[[:^alpha:]]");
        roundtrip(r"[[:ascii:]]");
        roundtrip(r"[[:^ascii:]]");
        roundtrip(r"[[:blank:]]");
        roundtrip(r"[[:^blank:]]");
        roundtrip(r"[[:cntrl:]]");
        roundtrip(r"[[:^cntrl:]]");
        roundtrip(r"[[:digit:]]");
        roundtrip(r"[[:^digit:]]");
        roundtrip(r"[[:graph:]]");
        roundtrip(r"[[:^graph:]]");
        roundtrip(r"[[:lower:]]");
        roundtrip(r"[[:^lower:]]");
        roundtrip(r"[[:print:]]");
        roundtrip(r"[[:^print:]]");
        roundtrip(r"[[:punct:]]");
        roundtrip(r"[[:^punct:]]");
        roundtrip(r"[[:space:]]");
        roundtrip(r"[[:^space:]]");
        roundtrip(r"[[:upper:]]");
        roundtrip(r"[[:^upper:]]");
        roundtrip(r"[[:word:]]");
        roundtrip(r"[[:^word:]]");
        roundtrip(r"[[:xdigit:]]");
        roundtrip(r"[[:^xdigit:]]");

        roundtrip(r"\pL");
        roundtrip(r"\PL");
        roundtrip(r"\p{L}");
        roundtrip(r"\P{L}");
        roundtrip(r"\p{X=Y}");
        roundtrip(r"\P{X=Y}");
        roundtrip(r"\p{X:Y}");
        roundtrip(r"\P{X:Y}");
        roundtrip(r"\p{X!=Y}");
        roundtrip(r"\P{X!=Y}");
    }
}
|
||||
522
third-party/vendor/regex-syntax/src/ast/visitor.rs
vendored
Normal file
522
third-party/vendor/regex-syntax/src/ast/visitor.rs
vendored
Normal file
|
|
@ -0,0 +1,522 @@
|
|||
use alloc::{vec, vec::Vec};
|
||||
|
||||
use crate::ast::{self, Ast};
|
||||
|
||||
/// A trait for visiting an abstract syntax tree (AST) in depth first order.
///
/// The principle aim of this trait is to enable callers to perform case
/// analysis on an abstract syntax tree without necessarily using recursion.
/// In particular, this permits callers to do case analysis with constant stack
/// usage, which can be important since the size of an abstract syntax tree
/// may be proportional to end user input.
///
/// Typical usage of this trait involves providing an implementation and then
/// running it using the [`visit`] function.
///
/// Note that the abstract syntax tree for a regular expression is quite
/// complex. Unless you specifically need it, you might be able to use the much
/// simpler [high-level intermediate representation](crate::hir::Hir) and its
/// [corresponding `Visitor` trait](crate::hir::Visitor) instead.
pub trait Visitor {
    /// The result of visiting an AST.
    type Output;
    /// An error that visiting an AST might return.
    type Err;

    /// All implementors of `Visitor` must provide a `finish` method, which
    /// yields the result of visiting the AST or an error.
    fn finish(self) -> Result<Self::Output, Self::Err>;

    /// This method is called before beginning traversal of the AST.
    fn start(&mut self) {}

    /// This method is called on an `Ast` before descending into child `Ast`
    /// nodes. Returning an error stops the traversal immediately.
    fn visit_pre(&mut self, _ast: &Ast) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on an `Ast` after descending all of its child
    /// `Ast` nodes. Returning an error stops the traversal immediately.
    fn visit_post(&mut self, _ast: &Ast) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called between child nodes of an
    /// [`Alternation`](ast::Alternation).
    fn visit_alternation_in(&mut self) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called between child nodes of a concatenation.
    fn visit_concat_in(&mut self) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on every [`ClassSetItem`](ast::ClassSetItem)
    /// before descending into child nodes.
    fn visit_class_set_item_pre(
        &mut self,
        _ast: &ast::ClassSetItem,
    ) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on every [`ClassSetItem`](ast::ClassSetItem)
    /// after descending into child nodes.
    fn visit_class_set_item_post(
        &mut self,
        _ast: &ast::ClassSetItem,
    ) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on every
    /// [`ClassSetBinaryOp`](ast::ClassSetBinaryOp) before descending into
    /// child nodes.
    fn visit_class_set_binary_op_pre(
        &mut self,
        _ast: &ast::ClassSetBinaryOp,
    ) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on every
    /// [`ClassSetBinaryOp`](ast::ClassSetBinaryOp) after descending into child
    /// nodes.
    fn visit_class_set_binary_op_post(
        &mut self,
        _ast: &ast::ClassSetBinaryOp,
    ) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called between the left hand and right hand child nodes
    /// of a [`ClassSetBinaryOp`](ast::ClassSetBinaryOp).
    fn visit_class_set_binary_op_in(
        &mut self,
        _ast: &ast::ClassSetBinaryOp,
    ) -> Result<(), Self::Err> {
        Ok(())
    }
}
|
||||
|
||||
/// Executes an implementation of `Visitor` in constant stack space.
///
/// This function will visit every node in the given `Ast` while calling the
/// appropriate methods provided by the [`Visitor`] trait.
///
/// The primary use case for this method is when one wants to perform case
/// analysis over an `Ast` without using a stack size proportional to the depth
/// of the `Ast`. Namely, this method will instead use constant stack size, but
/// will use heap space proportional to the size of the `Ast`. This may be
/// desirable in cases where the size of `Ast` is proportional to end user
/// input.
///
/// If the visitor returns an error at any point, then visiting is stopped and
/// the error is returned.
pub fn visit<V: Visitor>(ast: &Ast, visitor: V) -> Result<V::Output, V::Err> {
    // All the traversal machinery lives in the private `HeapVisitor`.
    HeapVisitor::new().visit(ast, visitor)
}
|
||||
|
||||
/// HeapVisitor visits every item in an `Ast` recursively using constant stack
/// size and a heap size proportional to the size of the `Ast`.
struct HeapVisitor<'a> {
    /// A stack of `Ast` nodes. This is roughly analogous to the call stack
    /// used in a typical recursive visitor.
    stack: Vec<(&'a Ast, Frame<'a>)>,
    /// Similar to the `Ast` stack above, but is used only for character
    /// classes. In particular, character classes embed their own mini
    /// recursive syntax.
    stack_class: Vec<(ClassInduct<'a>, ClassFrame<'a>)>,
}
|
||||
|
||||
/// Represents a single stack frame while performing structural induction over
/// an `Ast`.
enum Frame<'a> {
    /// A stack frame allocated just before descending into a repetition
    /// operator's child node.
    Repetition(&'a ast::Repetition),
    /// A stack frame allocated just before descending into a group's child
    /// node.
    Group(&'a ast::Group),
    /// The stack frame used while visiting every child node of a concatenation
    /// of expressions.
    Concat {
        /// The child node we are currently visiting.
        head: &'a Ast,
        /// The remaining child nodes to visit (which may be empty).
        tail: &'a [Ast],
    },
    /// The stack frame used while visiting every child node of an alternation
    /// of expressions.
    Alternation {
        /// The child node we are currently visiting.
        head: &'a Ast,
        /// The remaining child nodes to visit (which may be empty).
        tail: &'a [Ast],
    },
}
|
||||
|
||||
/// Represents a single stack frame while performing structural induction over
/// a character class.
enum ClassFrame<'a> {
    /// The stack frame used while visiting every child node of a union of
    /// character class items.
    Union {
        /// The child node we are currently visiting.
        head: &'a ast::ClassSetItem,
        /// The remaining child nodes to visit (which may be empty).
        tail: &'a [ast::ClassSetItem],
    },
    /// The stack frame used while visiting a binary class operation.
    Binary { op: &'a ast::ClassSetBinaryOp },
    /// A stack frame allocated just before descending into a binary operator's
    /// left hand child node.
    BinaryLHS {
        op: &'a ast::ClassSetBinaryOp,
        lhs: &'a ast::ClassSet,
        rhs: &'a ast::ClassSet,
    },
    /// A stack frame allocated just before descending into a binary operator's
    /// right hand child node.
    BinaryRHS { op: &'a ast::ClassSetBinaryOp, rhs: &'a ast::ClassSet },
}
|
||||
|
||||
/// A representation of the inductive step when performing structural induction
/// over a character class.
///
/// Note that there is no analogous explicit type for the inductive step for
/// `Ast` nodes because the inductive step is just an `Ast`. For character
/// classes, the inductive step can produce one of two possible child nodes:
/// an item or a binary operation. (An item cannot be a binary operation
/// because that would imply binary operations can be unioned in the concrete
/// syntax, which is not possible.)
enum ClassInduct<'a> {
    /// The next node to visit is a class set item.
    Item(&'a ast::ClassSetItem),
    /// The next node to visit is a binary operation between two class sets.
    BinaryOp(&'a ast::ClassSetBinaryOp),
}
|
||||
|
||||
impl<'a> HeapVisitor<'a> {
    /// Creates a visitor with empty traversal stacks.
    fn new() -> HeapVisitor<'a> {
        HeapVisitor { stack: vec![], stack_class: vec![] }
    }

    /// Drives `visitor` over the whole `ast` iteratively, using the explicit
    /// heap stacks in `self` instead of the call stack. Any error returned
    /// by a visitor callback aborts the traversal via `?`.
    fn visit<V: Visitor>(
        &mut self,
        mut ast: &'a Ast,
        mut visitor: V,
    ) -> Result<V::Output, V::Err> {
        // Reset state in case this visitor instance is reused.
        self.stack.clear();
        self.stack_class.clear();

        visitor.start();
        loop {
            visitor.visit_pre(ast)?;
            if let Some(x) = self.induct(ast, &mut visitor)? {
                let child = x.child();
                self.stack.push((ast, x));
                ast = child;
                continue;
            }
            // No induction means we have a base case, so we can post visit
            // it now.
            visitor.visit_post(ast)?;

            // At this point, we now try to pop our call stack until it is
            // either empty or we hit another inductive case.
            loop {
                let (post_ast, frame) = match self.stack.pop() {
                    None => return visitor.finish(),
                    Some((post_ast, frame)) => (post_ast, frame),
                };
                // If this is a concat/alternate, then we might have additional
                // inductive steps to process.
                if let Some(x) = self.pop(frame) {
                    match x {
                        Frame::Alternation { .. } => {
                            visitor.visit_alternation_in()?;
                        }
                        Frame::Concat { .. } => {
                            visitor.visit_concat_in()?;
                        }
                        _ => {}
                    }
                    ast = x.child();
                    self.stack.push((post_ast, x));
                    break;
                }
                // Otherwise, we've finished visiting all the child nodes for
                // this AST, so we can post visit it now.
                visitor.visit_post(post_ast)?;
            }
        }
    }

    /// Build a stack frame for the given AST if one is needed (which occurs if
    /// and only if there are child nodes in the AST). Otherwise, return None.
    ///
    /// If this visits a class, then the underlying visitor implementation may
    /// return an error which will be passed on here.
    fn induct<V: Visitor>(
        &mut self,
        ast: &'a Ast,
        visitor: &mut V,
    ) -> Result<Option<Frame<'a>>, V::Err> {
        Ok(match *ast {
            // Character classes are traversed in full right here via the
            // separate class stack, so no `Frame` is produced for them.
            Ast::ClassBracketed(ref x) => {
                self.visit_class(x, visitor)?;
                None
            }
            Ast::Repetition(ref x) => Some(Frame::Repetition(x)),
            Ast::Group(ref x) => Some(Frame::Group(x)),
            Ast::Concat(ref x) if x.asts.is_empty() => None,
            Ast::Concat(ref x) => {
                Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] })
            }
            Ast::Alternation(ref x) if x.asts.is_empty() => None,
            Ast::Alternation(ref x) => Some(Frame::Alternation {
                head: &x.asts[0],
                tail: &x.asts[1..],
            }),
            _ => None,
        })
    }

    /// Pops the given frame. If the frame has an additional inductive step,
    /// then return it, otherwise return `None`.
    fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> {
        match induct {
            // Repetitions and groups have exactly one child, so once it has
            // been visited the frame is exhausted.
            Frame::Repetition(_) => None,
            Frame::Group(_) => None,
            Frame::Concat { tail, .. } => {
                if tail.is_empty() {
                    None
                } else {
                    Some(Frame::Concat { head: &tail[0], tail: &tail[1..] })
                }
            }
            Frame::Alternation { tail, .. } => {
                if tail.is_empty() {
                    None
                } else {
                    Some(Frame::Alternation {
                        head: &tail[0],
                        tail: &tail[1..],
                    })
                }
            }
        }
    }

    /// Iteratively traverses a bracketed character class using the dedicated
    /// `stack_class`, mirroring the structure of `visit` above.
    fn visit_class<V: Visitor>(
        &mut self,
        ast: &'a ast::ClassBracketed,
        visitor: &mut V,
    ) -> Result<(), V::Err> {
        let mut ast = ClassInduct::from_bracketed(ast);
        loop {
            self.visit_class_pre(&ast, visitor)?;
            if let Some(x) = self.induct_class(&ast) {
                let child = x.child();
                self.stack_class.push((ast, x));
                ast = child;
                continue;
            }
            self.visit_class_post(&ast, visitor)?;

            // At this point, we now try to pop our call stack until it is
            // either empty or we hit another inductive case.
            loop {
                let (post_ast, frame) = match self.stack_class.pop() {
                    None => return Ok(()),
                    Some((post_ast, frame)) => (post_ast, frame),
                };
                // If this is a union or a binary op, then we might have
                // additional inductive steps to process.
                if let Some(x) = self.pop_class(frame) {
                    // Moving from the LHS to the RHS of a binary op is the
                    // "in between" point for the visitor.
                    if let ClassFrame::BinaryRHS { ref op, .. } = x {
                        visitor.visit_class_set_binary_op_in(op)?;
                    }
                    ast = x.child();
                    self.stack_class.push((post_ast, x));
                    break;
                }
                // Otherwise, we've finished visiting all the child nodes for
                // this class node, so we can post visit it now.
                self.visit_class_post(&post_ast, visitor)?;
            }
        }
    }

    /// Call the appropriate `Visitor` methods given an inductive step.
    fn visit_class_pre<V: Visitor>(
        &self,
        ast: &ClassInduct<'a>,
        visitor: &mut V,
    ) -> Result<(), V::Err> {
        match *ast {
            ClassInduct::Item(item) => {
                visitor.visit_class_set_item_pre(item)?;
            }
            ClassInduct::BinaryOp(op) => {
                visitor.visit_class_set_binary_op_pre(op)?;
            }
        }
        Ok(())
    }

    /// Call the appropriate `Visitor` methods given an inductive step.
    fn visit_class_post<V: Visitor>(
        &self,
        ast: &ClassInduct<'a>,
        visitor: &mut V,
    ) -> Result<(), V::Err> {
        match *ast {
            ClassInduct::Item(item) => {
                visitor.visit_class_set_item_post(item)?;
            }
            ClassInduct::BinaryOp(op) => {
                visitor.visit_class_set_binary_op_post(op)?;
            }
        }
        Ok(())
    }

    /// Build a stack frame for the given class node if one is needed (which
    /// occurs if and only if there are child nodes). Otherwise, return None.
    fn induct_class(&self, ast: &ClassInduct<'a>) -> Option<ClassFrame<'a>> {
        match *ast {
            ClassInduct::Item(&ast::ClassSetItem::Bracketed(ref x)) => {
                match x.kind {
                    ast::ClassSet::Item(ref item) => {
                        // A single item is modeled as a one-element union.
                        Some(ClassFrame::Union { head: item, tail: &[] })
                    }
                    ast::ClassSet::BinaryOp(ref op) => {
                        Some(ClassFrame::Binary { op })
                    }
                }
            }
            ClassInduct::Item(&ast::ClassSetItem::Union(ref x)) => {
                if x.items.is_empty() {
                    None
                } else {
                    Some(ClassFrame::Union {
                        head: &x.items[0],
                        tail: &x.items[1..],
                    })
                }
            }
            ClassInduct::BinaryOp(op) => {
                Some(ClassFrame::BinaryLHS { op, lhs: &op.lhs, rhs: &op.rhs })
            }
            _ => None,
        }
    }

    /// Pops the given frame. If the frame has an additional inductive step,
    /// then return it, otherwise return `None`.
    fn pop_class(&self, induct: ClassFrame<'a>) -> Option<ClassFrame<'a>> {
        match induct {
            ClassFrame::Union { tail, .. } => {
                if tail.is_empty() {
                    None
                } else {
                    Some(ClassFrame::Union {
                        head: &tail[0],
                        tail: &tail[1..],
                    })
                }
            }
            ClassFrame::Binary { .. } => None,
            // After the LHS is done, descend into the RHS next.
            ClassFrame::BinaryLHS { op, rhs, .. } => {
                Some(ClassFrame::BinaryRHS { op, rhs })
            }
            ClassFrame::BinaryRHS { .. } => None,
        }
    }
}
|
||||
|
||||
impl<'a> Frame<'a> {
    /// Returns the child AST node that this frame is currently positioned
    /// on, i.e. the next node the traversal should descend into.
    fn child(&self) -> &'a Ast {
        match *self {
            Frame::Repetition(rep) => &rep.ast,
            Frame::Group(group) => &group.ast,
            Frame::Concat { head, .. } => head,
            Frame::Alternation { head, .. } => head,
        }
    }
}
|
||||
|
||||
impl<'a> ClassFrame<'a> {
    /// Returns the child class node that this frame is currently positioned
    /// on, i.e. the next node the class traversal should descend into.
    fn child(&self) -> ClassInduct<'a> {
        match *self {
            ClassFrame::Union { head, .. } => ClassInduct::Item(head),
            ClassFrame::Binary { op, .. } => ClassInduct::BinaryOp(op),
            ClassFrame::BinaryLHS { ref lhs, .. } => {
                ClassInduct::from_set(lhs)
            }
            ClassFrame::BinaryRHS { ref rhs, .. } => {
                ClassInduct::from_set(rhs)
            }
        }
    }
}
|
||||
|
||||
impl<'a> ClassInduct<'a> {
    /// Builds the initial inductive step for a bracketed class by unwrapping
    /// its top-level `ClassSet`.
    fn from_bracketed(ast: &'a ast::ClassBracketed) -> ClassInduct<'a> {
        ClassInduct::from_set(&ast.kind)
    }

    /// Converts a `ClassSet` into the corresponding inductive step.
    fn from_set(ast: &'a ast::ClassSet) -> ClassInduct<'a> {
        match *ast {
            ast::ClassSet::Item(ref item) => ClassInduct::Item(item),
            ast::ClassSet::BinaryOp(ref op) => ClassInduct::BinaryOp(op),
        }
    }
}
|
||||
|
||||
impl<'a> core::fmt::Debug for ClassFrame<'a> {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
let x = match *self {
|
||||
ClassFrame::Union { .. } => "Union",
|
||||
ClassFrame::Binary { .. } => "Binary",
|
||||
ClassFrame::BinaryLHS { .. } => "BinaryLHS",
|
||||
ClassFrame::BinaryRHS { .. } => "BinaryRHS",
|
||||
};
|
||||
write!(f, "{}", x)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> core::fmt::Debug for ClassInduct<'a> {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
let x = match *self {
|
||||
ClassInduct::Item(it) => match *it {
|
||||
ast::ClassSetItem::Empty(_) => "Item(Empty)",
|
||||
ast::ClassSetItem::Literal(_) => "Item(Literal)",
|
||||
ast::ClassSetItem::Range(_) => "Item(Range)",
|
||||
ast::ClassSetItem::Ascii(_) => "Item(Ascii)",
|
||||
ast::ClassSetItem::Perl(_) => "Item(Perl)",
|
||||
ast::ClassSetItem::Unicode(_) => "Item(Unicode)",
|
||||
ast::ClassSetItem::Bracketed(_) => "Item(Bracketed)",
|
||||
ast::ClassSetItem::Union(_) => "Item(Union)",
|
||||
},
|
||||
ClassInduct::BinaryOp(it) => match it.kind {
|
||||
ast::ClassSetBinaryOpKind::Intersection => {
|
||||
"BinaryOp(Intersection)"
|
||||
}
|
||||
ast::ClassSetBinaryOpKind::Difference => {
|
||||
"BinaryOp(Difference)"
|
||||
}
|
||||
ast::ClassSetBinaryOpKind::SymmetricDifference => {
|
||||
"BinaryOp(SymmetricDifference)"
|
||||
}
|
||||
},
|
||||
};
|
||||
write!(f, "{}", x)
|
||||
}
|
||||
}
|
||||
107
third-party/vendor/regex-syntax/src/debug.rs
vendored
Normal file
107
third-party/vendor/regex-syntax/src/debug.rs
vendored
Normal file
|
|
@ -0,0 +1,107 @@
|
|||
/// A type that wraps a single byte with a convenient fmt::Debug impl that
/// escapes the byte.
pub(crate) struct Byte(pub(crate) u8);

impl core::fmt::Debug for Byte {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Special case ASCII space. It's too hard to read otherwise, so
        // put quotes around it.
        if self.0 == b' ' {
            return f.write_str("' '");
        }
        // 10 bytes is enough to cover any output from ascii::escape_default.
        let mut buf = [0u8; 10];
        let mut len = 0;
        for (pos, b) in core::ascii::escape_default(self.0).enumerate() {
            // Positions 0 and 1 of a hex escape are `\x`; uppercase the hex
            // digits that follow so \xab renders as \xAB.
            let b = if pos >= 2 && matches!(b, b'a'..=b'f') { b - 32 } else { b };
            buf[len] = b;
            len += 1;
        }
        f.write_str(core::str::from_utf8(&buf[..len]).unwrap())
    }
}
|
||||
|
||||
/// A type that provides a human readable debug impl for arbitrary bytes.
///
/// This generally works best when the bytes are presumed to be mostly UTF-8,
/// but will work for anything.
///
/// N.B. This is copied nearly verbatim from regex-automata. Sigh.
pub(crate) struct Bytes<'a>(pub(crate) &'a [u8]);

impl<'a> core::fmt::Debug for Bytes<'a> {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(f, "\"")?;
        // This is a sad re-implementation of a similar impl found in bstr.
        // Decode the slice chunk by chunk: valid UTF-8 sequences are printed
        // as (possibly escaped) chars, invalid bytes as lowercase \xNN.
        let mut bytes = self.0;
        while let Some(result) = utf8_decode(bytes) {
            let ch = match result {
                Ok(ch) => ch,
                Err(byte) => {
                    // Not valid UTF-8 at this position: emit the single byte
                    // as a hex escape and resume decoding right after it.
                    write!(f, r"\x{:02x}", byte)?;
                    bytes = &bytes[1..];
                    continue;
                }
            };
            bytes = &bytes[ch.len_utf8()..];
            match ch {
                '\0' => write!(f, "\\0")?,
                // ASCII control characters except \0, \n, \r, \t
                '\x01'..='\x08'
                | '\x0b'
                | '\x0c'
                | '\x0e'..='\x19'
                | '\x7f' => {
                    write!(f, "\\x{:02x}", u32::from(ch))?;
                }
                // NOTE: the trailing `_` makes the listed alternatives
                // redundant; they are kept to document which characters are
                // deliberately routed through `escape_debug`.
                '\n' | '\r' | '\t' | _ => {
                    write!(f, "{}", ch.escape_debug())?;
                }
            }
        }
        write!(f, "\"")?;
        Ok(())
    }
}
|
||||
|
||||
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
|
||||
///
|
||||
/// If no valid encoding of a codepoint exists at the beginning of the given
|
||||
/// byte slice, then the first byte is returned instead.
|
||||
///
|
||||
/// This returns `None` if and only if `bytes` is empty.
|
||||
pub(crate) fn utf8_decode(bytes: &[u8]) -> Option<Result<char, u8>> {
|
||||
fn len(byte: u8) -> Option<usize> {
|
||||
if byte <= 0x7F {
|
||||
return Some(1);
|
||||
} else if byte & 0b1100_0000 == 0b1000_0000 {
|
||||
return None;
|
||||
} else if byte <= 0b1101_1111 {
|
||||
Some(2)
|
||||
} else if byte <= 0b1110_1111 {
|
||||
Some(3)
|
||||
} else if byte <= 0b1111_0111 {
|
||||
Some(4)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
if bytes.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let len = match len(bytes[0]) {
|
||||
None => return Some(Err(bytes[0])),
|
||||
Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
|
||||
Some(1) => return Some(Ok(char::from(bytes[0]))),
|
||||
Some(len) => len,
|
||||
};
|
||||
match core::str::from_utf8(&bytes[..len]) {
|
||||
Ok(s) => Some(Ok(s.chars().next().unwrap())),
|
||||
Err(_) => Some(Err(bytes[0])),
|
||||
}
|
||||
}
|
||||
8
third-party/vendor/regex-syntax/src/either.rs
vendored
Normal file
8
third-party/vendor/regex-syntax/src/either.rs
vendored
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
/// A simple binary sum type.
|
||||
///
|
||||
/// This is occasionally useful in an ad hoc fashion.
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
pub enum Either<Left, Right> {
|
||||
Left(Left),
|
||||
Right(Right),
|
||||
}
|
||||
311
third-party/vendor/regex-syntax/src/error.rs
vendored
Normal file
311
third-party/vendor/regex-syntax/src/error.rs
vendored
Normal file
|
|
@ -0,0 +1,311 @@
|
|||
use alloc::{
|
||||
format,
|
||||
string::{String, ToString},
|
||||
vec,
|
||||
vec::Vec,
|
||||
};
|
||||
|
||||
use crate::{ast, hir};
|
||||
|
||||
/// This error type encompasses any error that can be returned by this crate.
|
||||
///
|
||||
/// This error type is marked as `non_exhaustive`. This means that adding a
|
||||
/// new variant is not considered a breaking change.
|
||||
#[non_exhaustive]
|
||||
#[derive(Clone, Debug, Eq, PartialEq)]
|
||||
pub enum Error {
|
||||
/// An error that occurred while translating concrete syntax into abstract
|
||||
/// syntax (AST).
|
||||
Parse(ast::Error),
|
||||
/// An error that occurred while translating abstract syntax into a high
|
||||
/// level intermediate representation (HIR).
|
||||
Translate(hir::Error),
|
||||
}
|
||||
|
||||
impl From<ast::Error> for Error {
|
||||
fn from(err: ast::Error) -> Error {
|
||||
Error::Parse(err)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<hir::Error> for Error {
|
||||
fn from(err: hir::Error) -> Error {
|
||||
Error::Translate(err)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "std")]
|
||||
impl std::error::Error for Error {}
|
||||
|
||||
impl core::fmt::Display for Error {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
match *self {
|
||||
Error::Parse(ref x) => x.fmt(f),
|
||||
Error::Translate(ref x) => x.fmt(f),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A helper type for formatting nice error messages.
|
||||
///
|
||||
/// This type is responsible for reporting regex parse errors in a nice human
|
||||
/// readable format. Most of its complexity is from interspersing notational
|
||||
/// markers pointing out the position where an error occurred.
|
||||
#[derive(Debug)]
|
||||
pub struct Formatter<'e, E> {
|
||||
/// The original regex pattern in which the error occurred.
|
||||
pattern: &'e str,
|
||||
/// The error kind. It must impl fmt::Display.
|
||||
err: &'e E,
|
||||
/// The primary span of the error.
|
||||
span: &'e ast::Span,
|
||||
/// An auxiliary and optional span, in case the error needs to point to
|
||||
/// two locations (e.g., when reporting a duplicate capture group name).
|
||||
aux_span: Option<&'e ast::Span>,
|
||||
}
|
||||
|
||||
impl<'e> From<&'e ast::Error> for Formatter<'e, ast::ErrorKind> {
|
||||
fn from(err: &'e ast::Error) -> Self {
|
||||
Formatter {
|
||||
pattern: err.pattern(),
|
||||
err: err.kind(),
|
||||
span: err.span(),
|
||||
aux_span: err.auxiliary_span(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'e> From<&'e hir::Error> for Formatter<'e, hir::ErrorKind> {
|
||||
fn from(err: &'e hir::Error) -> Self {
|
||||
Formatter {
|
||||
pattern: err.pattern(),
|
||||
err: err.kind(),
|
||||
span: err.span(),
|
||||
aux_span: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'e, E: core::fmt::Display> core::fmt::Display for Formatter<'e, E> {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
|
||||
let spans = Spans::from_formatter(self);
|
||||
if self.pattern.contains('\n') {
|
||||
let divider = repeat_char('~', 79);
|
||||
|
||||
writeln!(f, "regex parse error:")?;
|
||||
writeln!(f, "{}", divider)?;
|
||||
let notated = spans.notate();
|
||||
write!(f, "{}", notated)?;
|
||||
writeln!(f, "{}", divider)?;
|
||||
// If we have error spans that cover multiple lines, then we just
|
||||
// note the line numbers.
|
||||
if !spans.multi_line.is_empty() {
|
||||
let mut notes = vec![];
|
||||
for span in &spans.multi_line {
|
||||
notes.push(format!(
|
||||
"on line {} (column {}) through line {} (column {})",
|
||||
span.start.line,
|
||||
span.start.column,
|
||||
span.end.line,
|
||||
span.end.column - 1
|
||||
));
|
||||
}
|
||||
writeln!(f, "{}", notes.join("\n"))?;
|
||||
}
|
||||
write!(f, "error: {}", self.err)?;
|
||||
} else {
|
||||
writeln!(f, "regex parse error:")?;
|
||||
let notated = Spans::from_formatter(self).notate();
|
||||
write!(f, "{}", notated)?;
|
||||
write!(f, "error: {}", self.err)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// This type represents an arbitrary number of error spans in a way that makes
|
||||
/// it convenient to notate the regex pattern. ("Notate" means "point out
|
||||
/// exactly where the error occurred in the regex pattern.")
|
||||
///
|
||||
/// Technically, we can only ever have two spans given our current error
|
||||
/// structure. However, after toiling with a specific algorithm for handling
|
||||
/// two spans, it became obvious that an algorithm to handle an arbitrary
|
||||
/// number of spans was actually much simpler.
|
||||
struct Spans<'p> {
|
||||
/// The original regex pattern string.
|
||||
pattern: &'p str,
|
||||
/// The total width that should be used for line numbers. The width is
|
||||
/// used for left padding the line numbers for alignment.
|
||||
///
|
||||
/// A value of `0` means line numbers should not be displayed. That is,
|
||||
/// the pattern is itself only one line.
|
||||
line_number_width: usize,
|
||||
/// All error spans that occur on a single line. This sequence always has
|
||||
/// length equivalent to the number of lines in `pattern`, where the index
|
||||
/// of the sequence represents a line number, starting at `0`. The spans
|
||||
/// in each line are sorted in ascending order.
|
||||
by_line: Vec<Vec<ast::Span>>,
|
||||
/// All error spans that occur over one or more lines. That is, the start
|
||||
/// and end position of the span have different line numbers. The spans are
|
||||
/// sorted in ascending order.
|
||||
multi_line: Vec<ast::Span>,
|
||||
}
|
||||
|
||||
impl<'p> Spans<'p> {
|
||||
/// Build a sequence of spans from a formatter.
|
||||
fn from_formatter<'e, E: core::fmt::Display>(
|
||||
fmter: &'p Formatter<'e, E>,
|
||||
) -> Spans<'p> {
|
||||
let mut line_count = fmter.pattern.lines().count();
|
||||
// If the pattern ends with a `\n` literal, then our line count is
|
||||
// off by one, since a span can occur immediately after the last `\n`,
|
||||
// which is consider to be an additional line.
|
||||
if fmter.pattern.ends_with('\n') {
|
||||
line_count += 1;
|
||||
}
|
||||
let line_number_width =
|
||||
if line_count <= 1 { 0 } else { line_count.to_string().len() };
|
||||
let mut spans = Spans {
|
||||
pattern: &fmter.pattern,
|
||||
line_number_width,
|
||||
by_line: vec![vec![]; line_count],
|
||||
multi_line: vec![],
|
||||
};
|
||||
spans.add(fmter.span.clone());
|
||||
if let Some(span) = fmter.aux_span {
|
||||
spans.add(span.clone());
|
||||
}
|
||||
spans
|
||||
}
|
||||
|
||||
/// Add the given span to this sequence, putting it in the right place.
|
||||
fn add(&mut self, span: ast::Span) {
|
||||
// This is grossly inefficient since we sort after each add, but right
|
||||
// now, we only ever add two spans at most.
|
||||
if span.is_one_line() {
|
||||
let i = span.start.line - 1; // because lines are 1-indexed
|
||||
self.by_line[i].push(span);
|
||||
self.by_line[i].sort();
|
||||
} else {
|
||||
self.multi_line.push(span);
|
||||
self.multi_line.sort();
|
||||
}
|
||||
}
|
||||
|
||||
/// Notate the pattern string with carents (`^`) pointing at each span
|
||||
/// location. This only applies to spans that occur within a single line.
|
||||
fn notate(&self) -> String {
|
||||
let mut notated = String::new();
|
||||
for (i, line) in self.pattern.lines().enumerate() {
|
||||
if self.line_number_width > 0 {
|
||||
notated.push_str(&self.left_pad_line_number(i + 1));
|
||||
notated.push_str(": ");
|
||||
} else {
|
||||
notated.push_str(" ");
|
||||
}
|
||||
notated.push_str(line);
|
||||
notated.push('\n');
|
||||
if let Some(notes) = self.notate_line(i) {
|
||||
notated.push_str(¬es);
|
||||
notated.push('\n');
|
||||
}
|
||||
}
|
||||
notated
|
||||
}
|
||||
|
||||
/// Return notes for the line indexed at `i` (zero-based). If there are no
|
||||
/// spans for the given line, then `None` is returned. Otherwise, an
|
||||
/// appropriately space padded string with correctly positioned `^` is
|
||||
/// returned, accounting for line numbers.
|
||||
fn notate_line(&self, i: usize) -> Option<String> {
|
||||
let spans = &self.by_line[i];
|
||||
if spans.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let mut notes = String::new();
|
||||
for _ in 0..self.line_number_padding() {
|
||||
notes.push(' ');
|
||||
}
|
||||
let mut pos = 0;
|
||||
for span in spans {
|
||||
for _ in pos..(span.start.column - 1) {
|
||||
notes.push(' ');
|
||||
pos += 1;
|
||||
}
|
||||
let note_len = span.end.column.saturating_sub(span.start.column);
|
||||
for _ in 0..core::cmp::max(1, note_len) {
|
||||
notes.push('^');
|
||||
pos += 1;
|
||||
}
|
||||
}
|
||||
Some(notes)
|
||||
}
|
||||
|
||||
/// Left pad the given line number with spaces such that it is aligned with
|
||||
/// other line numbers.
|
||||
fn left_pad_line_number(&self, n: usize) -> String {
|
||||
let n = n.to_string();
|
||||
let pad = self.line_number_width.checked_sub(n.len()).unwrap();
|
||||
let mut result = repeat_char(' ', pad);
|
||||
result.push_str(&n);
|
||||
result
|
||||
}
|
||||
|
||||
/// Return the line number padding beginning at the start of each line of
|
||||
/// the pattern.
|
||||
///
|
||||
/// If the pattern is only one line, then this returns a fixed padding
|
||||
/// for visual indentation.
|
||||
fn line_number_padding(&self) -> usize {
|
||||
if self.line_number_width == 0 {
|
||||
4
|
||||
} else {
|
||||
2 + self.line_number_width
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn repeat_char(c: char, count: usize) -> String {
|
||||
core::iter::repeat(c).take(count).collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use alloc::string::ToString;
|
||||
|
||||
use crate::ast::parse::Parser;
|
||||
|
||||
fn assert_panic_message(pattern: &str, expected_msg: &str) {
|
||||
let result = Parser::new().parse(pattern);
|
||||
match result {
|
||||
Ok(_) => {
|
||||
panic!("regex should not have parsed");
|
||||
}
|
||||
Err(err) => {
|
||||
assert_eq!(err.to_string(), expected_msg.trim());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/464
|
||||
#[test]
|
||||
fn regression_464() {
|
||||
let err = Parser::new().parse("a{\n").unwrap_err();
|
||||
// This test checks that the error formatter doesn't panic.
|
||||
assert!(!err.to_string().is_empty());
|
||||
}
|
||||
|
||||
// See: https://github.com/rust-lang/regex/issues/545
|
||||
#[test]
|
||||
fn repetition_quantifier_expects_a_valid_decimal() {
|
||||
assert_panic_message(
|
||||
r"\\u{[^}]*}",
|
||||
r#"
|
||||
regex parse error:
|
||||
\\u{[^}]*}
|
||||
^
|
||||
error: repetition quantifier expects a valid decimal
|
||||
"#,
|
||||
);
|
||||
}
|
||||
}
|
||||
581
third-party/vendor/regex-syntax/src/hir/interval.rs
vendored
Normal file
581
third-party/vendor/regex-syntax/src/hir/interval.rs
vendored
Normal file
|
|
@ -0,0 +1,581 @@
|
|||
use core::{char, cmp, fmt::Debug, slice};
|
||||
|
||||
use alloc::vec::Vec;
|
||||
|
||||
use crate::unicode;
|
||||
|
||||
// This module contains an *internal* implementation of interval sets.
|
||||
//
|
||||
// The primary invariant that interval sets guards is canonical ordering. That
|
||||
// is, every interval set contains an ordered sequence of intervals where
|
||||
// no two intervals are overlapping or adjacent. While this invariant is
|
||||
// occasionally broken within the implementation, it should be impossible for
|
||||
// callers to observe it.
|
||||
//
|
||||
// Since case folding (as implemented below) breaks that invariant, we roll
|
||||
// that into this API even though it is a little out of place in an otherwise
|
||||
// generic interval set. (Hence the reason why the `unicode` module is imported
|
||||
// here.)
|
||||
//
|
||||
// Some of the implementation complexity here is a result of me wanting to
|
||||
// preserve the sequential representation without using additional memory.
|
||||
// In many cases, we do use linear extra memory, but it is at most 2x and it
|
||||
// is amortized. If we relaxed the memory requirements, this implementation
|
||||
// could become much simpler. The extra memory is honestly probably OK, but
|
||||
// character classes (especially of the Unicode variety) can become quite
|
||||
// large, and it would be nice to keep regex compilation snappy even in debug
|
||||
// builds. (In the past, I have been careless with this area of code and it has
|
||||
// caused slow regex compilations in debug mode, so this isn't entirely
|
||||
// unwarranted.)
|
||||
//
|
||||
// Tests on this are relegated to the public API of HIR in src/hir.rs.
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct IntervalSet<I> {
|
||||
/// A sorted set of non-overlapping ranges.
|
||||
ranges: Vec<I>,
|
||||
/// While not required at all for correctness, we keep track of whether an
|
||||
/// interval set has been case folded or not. This helps us avoid doing
|
||||
/// redundant work if, for example, a set has already been cased folded.
|
||||
/// And note that whether a set is folded or not is preserved through
|
||||
/// all of the pairwise set operations. That is, if both interval sets
|
||||
/// have been case folded, then any of difference, union, intersection or
|
||||
/// symmetric difference all produce a case folded set.
|
||||
///
|
||||
/// Note that when this is true, it *must* be the case that the set is case
|
||||
/// folded. But when it's false, the set *may* be case folded. In other
|
||||
/// words, we only set this to true when we know it to be case, but we're
|
||||
/// okay with it being false if it would otherwise be costly to determine
|
||||
/// whether it should be true. This means code cannot assume that a false
|
||||
/// value necessarily indicates that the set is not case folded.
|
||||
///
|
||||
/// Bottom line: this is a performance optimization.
|
||||
folded: bool,
|
||||
}
|
||||
|
||||
impl<I: Interval> Eq for IntervalSet<I> {}
|
||||
|
||||
// We implement PartialEq manually so that we don't consider the set's internal
|
||||
// 'folded' property to be part of its identity. The 'folded' property is
|
||||
// strictly an optimization.
|
||||
impl<I: Interval> PartialEq for IntervalSet<I> {
|
||||
fn eq(&self, other: &IntervalSet<I>) -> bool {
|
||||
self.ranges.eq(&other.ranges)
|
||||
}
|
||||
}
|
||||
|
||||
impl<I: Interval> IntervalSet<I> {
|
||||
/// Create a new set from a sequence of intervals. Each interval is
|
||||
/// specified as a pair of bounds, where both bounds are inclusive.
|
||||
///
|
||||
/// The given ranges do not need to be in any specific order, and ranges
|
||||
/// may overlap.
|
||||
pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
|
||||
let ranges: Vec<I> = intervals.into_iter().collect();
|
||||
// An empty set is case folded.
|
||||
let folded = ranges.is_empty();
|
||||
let mut set = IntervalSet { ranges, folded };
|
||||
set.canonicalize();
|
||||
set
|
||||
}
|
||||
|
||||
/// Add a new interval to this set.
|
||||
pub fn push(&mut self, interval: I) {
|
||||
// TODO: This could be faster. e.g., Push the interval such that
|
||||
// it preserves canonicalization.
|
||||
self.ranges.push(interval);
|
||||
self.canonicalize();
|
||||
// We don't know whether the new interval added here is considered
|
||||
// case folded, so we conservatively assume that the entire set is
|
||||
// no longer case folded if it was previously.
|
||||
self.folded = false;
|
||||
}
|
||||
|
||||
/// Return an iterator over all intervals in this set.
|
||||
///
|
||||
/// The iterator yields intervals in ascending order.
|
||||
pub fn iter(&self) -> IntervalSetIter<'_, I> {
|
||||
IntervalSetIter(self.ranges.iter())
|
||||
}
|
||||
|
||||
/// Return an immutable slice of intervals in this set.
|
||||
///
|
||||
/// The sequence returned is in canonical ordering.
|
||||
pub fn intervals(&self) -> &[I] {
|
||||
&self.ranges
|
||||
}
|
||||
|
||||
/// Expand this interval set such that it contains all case folded
|
||||
/// characters. For example, if this class consists of the range `a-z`,
|
||||
/// then applying case folding will result in the class containing both the
|
||||
/// ranges `a-z` and `A-Z`.
|
||||
///
|
||||
/// This returns an error if the necessary case mapping data is not
|
||||
/// available.
|
||||
pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
|
||||
if self.folded {
|
||||
return Ok(());
|
||||
}
|
||||
let len = self.ranges.len();
|
||||
for i in 0..len {
|
||||
let range = self.ranges[i];
|
||||
if let Err(err) = range.case_fold_simple(&mut self.ranges) {
|
||||
self.canonicalize();
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
self.canonicalize();
|
||||
self.folded = true;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Union this set with the given set, in place.
|
||||
pub fn union(&mut self, other: &IntervalSet<I>) {
|
||||
if other.ranges.is_empty() || self.ranges == other.ranges {
|
||||
return;
|
||||
}
|
||||
// This could almost certainly be done more efficiently.
|
||||
self.ranges.extend(&other.ranges);
|
||||
self.canonicalize();
|
||||
self.folded = self.folded && other.folded;
|
||||
}
|
||||
|
||||
/// Intersect this set with the given set, in place.
|
||||
pub fn intersect(&mut self, other: &IntervalSet<I>) {
|
||||
if self.ranges.is_empty() {
|
||||
return;
|
||||
}
|
||||
if other.ranges.is_empty() {
|
||||
self.ranges.clear();
|
||||
// An empty set is case folded.
|
||||
self.folded = true;
|
||||
return;
|
||||
}
|
||||
|
||||
// There should be a way to do this in-place with constant memory,
|
||||
// but I couldn't figure out a simple way to do it. So just append
|
||||
// the intersection to the end of this range, and then drain it before
|
||||
// we're done.
|
||||
let drain_end = self.ranges.len();
|
||||
|
||||
let mut ita = 0..drain_end;
|
||||
let mut itb = 0..other.ranges.len();
|
||||
let mut a = ita.next().unwrap();
|
||||
let mut b = itb.next().unwrap();
|
||||
loop {
|
||||
if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) {
|
||||
self.ranges.push(ab);
|
||||
}
|
||||
let (it, aorb) =
|
||||
if self.ranges[a].upper() < other.ranges[b].upper() {
|
||||
(&mut ita, &mut a)
|
||||
} else {
|
||||
(&mut itb, &mut b)
|
||||
};
|
||||
match it.next() {
|
||||
Some(v) => *aorb = v,
|
||||
None => break,
|
||||
}
|
||||
}
|
||||
self.ranges.drain(..drain_end);
|
||||
self.folded = self.folded && other.folded;
|
||||
}
|
||||
|
||||
/// Subtract the given set from this set, in place.
|
||||
pub fn difference(&mut self, other: &IntervalSet<I>) {
|
||||
if self.ranges.is_empty() || other.ranges.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
// This algorithm is (to me) surprisingly complex. A search of the
|
||||
// interwebs indicate that this is a potentially interesting problem.
|
||||
// Folks seem to suggest interval or segment trees, but I'd like to
|
||||
// avoid the overhead (both runtime and conceptual) of that.
|
||||
//
|
||||
// The following is basically my Shitty First Draft. Therefore, in
|
||||
// order to grok it, you probably need to read each line carefully.
|
||||
// Simplifications are most welcome!
|
||||
//
|
||||
// Remember, we can assume the canonical format invariant here, which
|
||||
// says that all ranges are sorted, not overlapping and not adjacent in
|
||||
// each class.
|
||||
let drain_end = self.ranges.len();
|
||||
let (mut a, mut b) = (0, 0);
|
||||
'LOOP: while a < drain_end && b < other.ranges.len() {
|
||||
// Basically, the easy cases are when neither range overlaps with
|
||||
// each other. If the `b` range is less than our current `a`
|
||||
// range, then we can skip it and move on.
|
||||
if other.ranges[b].upper() < self.ranges[a].lower() {
|
||||
b += 1;
|
||||
continue;
|
||||
}
|
||||
// ... similarly for the `a` range. If it's less than the smallest
|
||||
// `b` range, then we can add it as-is.
|
||||
if self.ranges[a].upper() < other.ranges[b].lower() {
|
||||
let range = self.ranges[a];
|
||||
self.ranges.push(range);
|
||||
a += 1;
|
||||
continue;
|
||||
}
|
||||
// Otherwise, we have overlapping ranges.
|
||||
assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));
|
||||
|
||||
// This part is tricky and was non-obvious to me without looking
|
||||
// at explicit examples (see the tests). The trickiness stems from
|
||||
// two things: 1) subtracting a range from another range could
|
||||
// yield two ranges and 2) after subtracting a range, it's possible
|
||||
// that future ranges can have an impact. The loop below advances
|
||||
// the `b` ranges until they can't possible impact the current
|
||||
// range.
|
||||
//
|
||||
// For example, if our `a` range is `a-t` and our next three `b`
|
||||
// ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
|
||||
// subtraction three times before moving on to the next `a` range.
|
||||
let mut range = self.ranges[a];
|
||||
while b < other.ranges.len()
|
||||
&& !range.is_intersection_empty(&other.ranges[b])
|
||||
{
|
||||
let old_range = range;
|
||||
range = match range.difference(&other.ranges[b]) {
|
||||
(None, None) => {
|
||||
// We lost the entire range, so move on to the next
|
||||
// without adding this one.
|
||||
a += 1;
|
||||
continue 'LOOP;
|
||||
}
|
||||
(Some(range1), None) | (None, Some(range1)) => range1,
|
||||
(Some(range1), Some(range2)) => {
|
||||
self.ranges.push(range1);
|
||||
range2
|
||||
}
|
||||
};
|
||||
// It's possible that the `b` range has more to contribute
|
||||
// here. In particular, if it is greater than the original
|
||||
// range, then it might impact the next `a` range *and* it
|
||||
// has impacted the current `a` range as much as possible,
|
||||
// so we can quit. We don't bump `b` so that the next `a`
|
||||
// range can apply it.
|
||||
if other.ranges[b].upper() > old_range.upper() {
|
||||
break;
|
||||
}
|
||||
// Otherwise, the next `b` range might apply to the current
|
||||
// `a` range.
|
||||
b += 1;
|
||||
}
|
||||
self.ranges.push(range);
|
||||
a += 1;
|
||||
}
|
||||
while a < drain_end {
|
||||
let range = self.ranges[a];
|
||||
self.ranges.push(range);
|
||||
a += 1;
|
||||
}
|
||||
self.ranges.drain(..drain_end);
|
||||
self.folded = self.folded && other.folded;
|
||||
}
|
||||
|
||||
/// Compute the symmetric difference of the two sets, in place.
|
||||
///
|
||||
/// This computes the symmetric difference of two interval sets. This
|
||||
/// removes all elements in this set that are also in the given set,
|
||||
/// but also adds all elements from the given set that aren't in this
|
||||
/// set. That is, the set will contain all elements in either set,
|
||||
/// but will not contain any elements that are in both sets.
|
||||
pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
|
||||
// TODO(burntsushi): Fix this so that it amortizes allocation.
|
||||
let mut intersection = self.clone();
|
||||
intersection.intersect(other);
|
||||
self.union(other);
|
||||
self.difference(&intersection);
|
||||
}
|
||||
|
||||
/// Negate this interval set.
|
||||
///
|
||||
/// For all `x` where `x` is any element, if `x` was in this set, then it
|
||||
/// will not be in this set after negation.
|
||||
pub fn negate(&mut self) {
|
||||
if self.ranges.is_empty() {
|
||||
let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
|
||||
self.ranges.push(I::create(min, max));
|
||||
// The set containing everything must case folded.
|
||||
self.folded = true;
|
||||
return;
|
||||
}
|
||||
|
||||
// There should be a way to do this in-place with constant memory,
|
||||
// but I couldn't figure out a simple way to do it. So just append
|
||||
// the negation to the end of this range, and then drain it before
|
||||
// we're done.
|
||||
let drain_end = self.ranges.len();
|
||||
|
||||
// We do checked arithmetic below because of the canonical ordering
|
||||
// invariant.
|
||||
if self.ranges[0].lower() > I::Bound::min_value() {
|
||||
let upper = self.ranges[0].lower().decrement();
|
||||
self.ranges.push(I::create(I::Bound::min_value(), upper));
|
||||
}
|
||||
for i in 1..drain_end {
|
||||
let lower = self.ranges[i - 1].upper().increment();
|
||||
let upper = self.ranges[i].lower().decrement();
|
||||
self.ranges.push(I::create(lower, upper));
|
||||
}
|
||||
if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
|
||||
let lower = self.ranges[drain_end - 1].upper().increment();
|
||||
self.ranges.push(I::create(lower, I::Bound::max_value()));
|
||||
}
|
||||
self.ranges.drain(..drain_end);
|
||||
// We don't need to update whether this set is folded or not, because
|
||||
// it is conservatively preserved through negation. Namely, if a set
|
||||
// is not folded, then it is possible that its negation is folded, for
|
||||
// example, [^☃]. But we're fine with assuming that the set is not
|
||||
// folded in that case. (`folded` permits false negatives but not false
|
||||
// positives.)
|
||||
//
|
||||
// But what about when a set is folded, is its negation also
|
||||
// necessarily folded? Yes. Because if a set is folded, then for every
|
||||
// character in the set, it necessarily included its equivalence class
|
||||
// of case folded characters. Negating it in turn means that all
|
||||
// equivalence classes in the set are negated, and any equivalence
|
||||
// class that was previously not in the set is now entirely in the set.
|
||||
}
|
||||
|
||||
/// Converts this set into a canonical ordering.
|
||||
fn canonicalize(&mut self) {
|
||||
if self.is_canonical() {
|
||||
return;
|
||||
}
|
||||
self.ranges.sort();
|
||||
assert!(!self.ranges.is_empty());
|
||||
|
||||
// Is there a way to do this in-place with constant memory? I couldn't
|
||||
// figure out a way to do it. So just append the canonicalization to
|
||||
// the end of this range, and then drain it before we're done.
|
||||
let drain_end = self.ranges.len();
|
||||
for oldi in 0..drain_end {
|
||||
// If we've added at least one new range, then check if we can
|
||||
// merge this range in the previously added range.
|
||||
if self.ranges.len() > drain_end {
|
||||
let (last, rest) = self.ranges.split_last_mut().unwrap();
|
||||
if let Some(union) = last.union(&rest[oldi]) {
|
||||
*last = union;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
let range = self.ranges[oldi];
|
||||
self.ranges.push(range);
|
||||
}
|
||||
self.ranges.drain(..drain_end);
|
||||
}
|
||||
|
||||
/// Returns true if and only if this class is in a canonical ordering.
|
||||
fn is_canonical(&self) -> bool {
|
||||
for pair in self.ranges.windows(2) {
|
||||
if pair[0] >= pair[1] {
|
||||
return false;
|
||||
}
|
||||
if pair[0].is_contiguous(&pair[1]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over intervals.
|
||||
#[derive(Debug)]
|
||||
pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);
|
||||
|
||||
impl<'a, I> Iterator for IntervalSetIter<'a, I> {
|
||||
type Item = &'a I;
|
||||
|
||||
fn next(&mut self) -> Option<&'a I> {
|
||||
self.0.next()
|
||||
}
|
||||
}
|
||||
|
||||
/// A trait for any type that can represent a contiguous, inclusive range of
/// values over some totally ordered bound type (e.g., `u8` or `char`).
///
/// Intervals produced by `create` are always normalized so that
/// `lower() <= upper()`, and are therefore never empty.
pub trait Interval:
    Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
{
    /// The type of the interval's endpoints.
    type Bound: Bound;

    /// Return the inclusive lower bound of this interval.
    fn lower(&self) -> Self::Bound;
    /// Return the inclusive upper bound of this interval.
    fn upper(&self) -> Self::Bound;
    /// Set the inclusive lower bound of this interval.
    fn set_lower(&mut self, bound: Self::Bound);
    /// Set the inclusive upper bound of this interval.
    fn set_upper(&mut self, bound: Self::Bound);
    /// Apply simple case folding to this interval, pushing any resulting
    /// intervals onto `intervals`. Fails when the required case folding
    /// data is unavailable.
    fn case_fold_simple(
        &self,
        intervals: &mut Vec<Self>,
    ) -> Result<(), unicode::CaseFoldError>;

    /// Create a new interval.
    ///
    /// If `lower > upper`, the bounds are swapped so the resulting interval
    /// is always normalized.
    fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
        let mut int = Self::default();
        if lower <= upper {
            int.set_lower(lower);
            int.set_upper(upper);
        } else {
            int.set_lower(upper);
            int.set_upper(lower);
        }
        int
    }

    /// Union the given overlapping range into this range.
    ///
    /// If the two ranges aren't contiguous, then this returns `None`.
    fn union(&self, other: &Self) -> Option<Self> {
        if !self.is_contiguous(other) {
            return None;
        }
        let lower = cmp::min(self.lower(), other.lower());
        let upper = cmp::max(self.upper(), other.upper());
        Some(Self::create(lower, upper))
    }

    /// Intersect this range with the given range and return the result.
    ///
    /// If the intersection is empty, then this returns `None`.
    fn intersect(&self, other: &Self) -> Option<Self> {
        let lower = cmp::max(self.lower(), other.lower());
        let upper = cmp::min(self.upper(), other.upper());
        if lower <= upper {
            Some(Self::create(lower, upper))
        } else {
            None
        }
    }

    /// Subtract the given range from this range and return the resulting
    /// ranges.
    ///
    /// If subtraction would result in an empty range, then no ranges are
    /// returned.
    fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
        // If `self` is entirely covered by `other`, nothing remains.
        if self.is_subset(other) {
            return (None, None);
        }
        // If the ranges don't overlap at all, `self` survives unchanged.
        if self.is_intersection_empty(other) {
            return (Some(self.clone()), None);
        }
        let add_lower = other.lower() > self.lower();
        let add_upper = other.upper() < self.upper();
        // We know this because !self.is_subset(other) and the ranges have
        // a non-empty intersection.
        assert!(add_lower || add_upper);
        let mut ret = (None, None);
        if add_lower {
            let upper = other.lower().decrement();
            ret.0 = Some(Self::create(self.lower(), upper));
        }
        if add_upper {
            let lower = other.upper().increment();
            let range = Self::create(lower, self.upper());
            // Pack a single surviving piece into the first slot so callers
            // only need to check `ret.1` when both pieces exist.
            if ret.0.is_none() {
                ret.0 = Some(range);
            } else {
                ret.1 = Some(range);
            }
        }
        ret
    }

    /// Compute the symmetric difference the given range from this range. This
    /// returns the union of the two ranges minus its intersection.
    fn symmetric_difference(
        &self,
        other: &Self,
    ) -> (Option<Self>, Option<Self>) {
        let union = match self.union(other) {
            // Non-contiguous ranges: the symmetric difference is both ranges.
            None => return (Some(self.clone()), Some(other.clone())),
            Some(union) => union,
        };
        let intersection = match self.intersect(other) {
            None => return (Some(self.clone()), Some(other.clone())),
            Some(intersection) => intersection,
        };
        union.difference(&intersection)
    }

    /// Returns true if and only if the two ranges are contiguous. Two ranges
    /// are contiguous if and only if the ranges are either overlapping or
    /// adjacent.
    fn is_contiguous(&self, other: &Self) -> bool {
        let lower1 = self.lower().as_u32();
        let upper1 = self.upper().as_u32();
        let lower2 = other.lower().as_u32();
        let upper2 = other.upper().as_u32();
        // saturating_add(1) makes adjacency count as contiguity without
        // overflowing at u32::MAX.
        cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1)
    }

    /// Returns true if and only if the intersection of this range and the
    /// other range is empty.
    fn is_intersection_empty(&self, other: &Self) -> bool {
        let (lower1, upper1) = (self.lower(), self.upper());
        let (lower2, upper2) = (other.lower(), other.upper());
        cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
    }

    /// Returns true if and only if this range is a subset of the other range.
    fn is_subset(&self, other: &Self) -> bool {
        let (lower1, upper1) = (self.lower(), self.upper());
        let (lower2, upper2) = (other.lower(), other.upper());
        (lower2 <= lower1 && lower1 <= upper2)
            && (lower2 <= upper1 && upper1 <= upper2)
    }
}
|
||||
|
||||
/// A trait for the endpoint type of an [`Interval`], e.g. `u8` or `char`.
///
/// A bound type is totally ordered and has a well-defined successor and
/// predecessor for every value except the extremes.
pub trait Bound:
    Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
{
    /// The smallest representable bound value.
    fn min_value() -> Self;
    /// The largest representable bound value.
    fn max_value() -> Self;
    /// Convert this bound to a `u32` for arithmetic (e.g. adjacency checks).
    fn as_u32(self) -> u32;
    /// Return the successor of this bound. Callers must not call this on
    /// `max_value()`.
    fn increment(self) -> Self;
    /// Return the predecessor of this bound. Callers must not call this on
    /// `min_value()`.
    fn decrement(self) -> Self;
}
|
||||
|
||||
impl Bound for u8 {
    fn min_value() -> Self {
        u8::MIN
    }
    fn max_value() -> Self {
        u8::MAX
    }
    fn as_u32(self) -> u32 {
        u32::from(self)
    }
    // Panics on overflow: per the trait contract, increment() must never be
    // called on max_value().
    fn increment(self) -> Self {
        self.checked_add(1).unwrap()
    }
    // Panics on underflow: per the trait contract, decrement() must never be
    // called on min_value().
    fn decrement(self) -> Self {
        self.checked_sub(1).unwrap()
    }
}
|
||||
|
||||
impl Bound for char {
    fn min_value() -> Self {
        '\x00'
    }
    fn max_value() -> Self {
        '\u{10FFFF}'
    }
    fn as_u32(self) -> u32 {
        u32::from(self)
    }

    // Successor in the space of Unicode scalar values. The range
    // U+D800..=U+DFFF (surrogates) contains no scalar values, so the
    // successor of U+D7FF jumps directly to U+E000.
    fn increment(self) -> Self {
        match self {
            '\u{D7FF}' => '\u{E000}',
            // Panics if called on max_value(), per the trait contract; the
            // from_u32 unwrap is otherwise infallible because every other
            // incremented value skips the surrogate gap above.
            c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(),
        }
    }

    // Predecessor in the space of Unicode scalar values; mirrors increment()
    // by jumping backwards over the surrogate gap.
    fn decrement(self) -> Self {
        match self {
            '\u{E000}' => '\u{D7FF}',
            // Panics if called on min_value(), per the trait contract.
            c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(),
        }
    }
}
|
||||
|
||||
// Tests for interval sets are written in src/hir.rs against the public API.
|
||||
3214
third-party/vendor/regex-syntax/src/hir/literal.rs
vendored
Normal file
3214
third-party/vendor/regex-syntax/src/hir/literal.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
3861
third-party/vendor/regex-syntax/src/hir/mod.rs
vendored
Normal file
3861
third-party/vendor/regex-syntax/src/hir/mod.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
608
third-party/vendor/regex-syntax/src/hir/print.rs
vendored
Normal file
608
third-party/vendor/regex-syntax/src/hir/print.rs
vendored
Normal file
|
|
@ -0,0 +1,608 @@
|
|||
/*!
|
||||
This module provides a regular expression printer for `Hir`.
|
||||
*/
|
||||
|
||||
use core::fmt;
|
||||
|
||||
use crate::{
|
||||
hir::{
|
||||
self,
|
||||
visitor::{self, Visitor},
|
||||
Hir, HirKind,
|
||||
},
|
||||
is_meta_character,
|
||||
};
|
||||
|
||||
/// A builder for constructing a printer.
///
/// Note that since a printer doesn't have any configuration knobs, this type
/// remains unexported.
#[derive(Clone, Debug)]
struct PrinterBuilder {
    // Prevents construction outside this module while keeping the option of
    // adding configuration fields later without breaking struct literals.
    _priv: (),
}
|
||||
|
||||
impl Default for PrinterBuilder {
|
||||
fn default() -> PrinterBuilder {
|
||||
PrinterBuilder::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl PrinterBuilder {
    /// Create a new builder with the (only possible) default configuration.
    fn new() -> PrinterBuilder {
        PrinterBuilder { _priv: () }
    }

    /// Build a printer from this builder's (empty) configuration.
    fn build(&self) -> Printer {
        Printer { _priv: () }
    }
}
|
||||
|
||||
/// A printer for a regular expression's high-level intermediate
/// representation.
///
/// A printer converts a high-level intermediate representation (HIR) to a
/// regular expression pattern string. This particular printer uses constant
/// stack space and heap space proportional to the size of the HIR.
///
/// Since this printer is only using the HIR, the pattern it prints will likely
/// not resemble the original pattern at all. For example, a pattern like
/// `\pL` will have its entire class written out.
///
/// The purpose of this printer is to provide a means to mutate an HIR and then
/// build a regular expression from the result of that mutation. (A regex
/// library could provide a constructor from this HIR explicitly, but that
/// creates an unnecessary public coupling between the regex library and this
/// specific HIR representation.)
#[derive(Debug)]
pub struct Printer {
    // No configuration yet; field reserves the right to add some privately.
    _priv: (),
}
|
||||
|
||||
impl Printer {
    /// Create a new printer.
    pub fn new() -> Printer {
        PrinterBuilder::new().build()
    }

    /// Print the given `Hir` to the given writer. The writer must implement
    /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used
    /// here are a `fmt::Formatter` (which is available in `fmt::Display`
    /// implementations) or a `&mut String`.
    pub fn print<W: fmt::Write>(&mut self, hir: &Hir, wtr: W) -> fmt::Result {
        // Traversal is done with constant stack space via the heap-based
        // visitor, so arbitrarily deep HIRs cannot overflow the stack here.
        visitor::visit(hir, Writer { wtr })
    }
}
|
||||
|
||||
/// The visitor that does the actual printing: it writes the concrete syntax
/// for each HIR node it visits into `wtr`.
#[derive(Debug)]
struct Writer<W> {
    wtr: W,
}
|
||||
|
||||
impl<W: fmt::Write> Visitor for Writer<W> {
    type Output = ();
    type Err = fmt::Error;

    fn finish(self) -> fmt::Result {
        Ok(())
    }

    // Emit everything that syntactically precedes a node's children:
    // leaf nodes in full, and opening delimiters for composite nodes.
    fn visit_pre(&mut self, hir: &Hir) -> fmt::Result {
        match *hir.kind() {
            HirKind::Empty => {
                // Technically an empty sub-expression could be "printed" by
                // just ignoring it, but in practice, you could have a
                // repetition operator attached to an empty expression, and you
                // really need something in the concrete syntax to make that
                // work as you'd expect.
                self.wtr.write_str(r"(?:)")?;
            }
            // Repetition operators are strictly suffix oriented.
            HirKind::Repetition(_) => {}
            HirKind::Literal(hir::Literal(ref bytes)) => {
                // See the comment on the 'Concat' and 'Alternation' case below
                // for why we put parens here. Literals are, conceptually,
                // a special case of concatenation where each element is a
                // character. The HIR flattens this into a Box<[u8]>, but we
                // still need to treat it like a concatenation for correct
                // printing. As a special case, we don't write parens if there
                // is only one character. One character means there is no
                // concat so we don't need parens. Adding parens would still be
                // correct, but we drop them here because it tends to create
                // rather noisy regexes even in simple cases.
                let result = core::str::from_utf8(bytes);
                let len = result.map_or(bytes.len(), |s| s.chars().count());
                if len > 1 {
                    self.wtr.write_str(r"(?:")?;
                }
                match result {
                    Ok(string) => {
                        for c in string.chars() {
                            self.write_literal_char(c)?;
                        }
                    }
                    Err(_) => {
                        // Not valid UTF-8: fall back to byte-wise escapes.
                        for &b in bytes.iter() {
                            self.write_literal_byte(b)?;
                        }
                    }
                }
                if len > 1 {
                    self.wtr.write_str(r")")?;
                }
            }
            HirKind::Class(hir::Class::Unicode(ref cls)) => {
                if cls.ranges().is_empty() {
                    // `[a&&b]` (intersection of disjoint singletons) is the
                    // concrete syntax used for a class matching nothing.
                    return self.wtr.write_str("[a&&b]");
                }
                self.wtr.write_str("[")?;
                for range in cls.iter() {
                    if range.start() == range.end() {
                        self.write_literal_char(range.start())?;
                    } else if u32::from(range.start()) + 1
                        == u32::from(range.end())
                    {
                        // Two adjacent characters: writing both is shorter
                        // than `a-b` range syntax.
                        self.write_literal_char(range.start())?;
                        self.write_literal_char(range.end())?;
                    } else {
                        self.write_literal_char(range.start())?;
                        self.wtr.write_str("-")?;
                        self.write_literal_char(range.end())?;
                    }
                }
                self.wtr.write_str("]")?;
            }
            HirKind::Class(hir::Class::Bytes(ref cls)) => {
                if cls.ranges().is_empty() {
                    return self.wtr.write_str("[a&&b]");
                }
                // Byte classes must be wrapped in `(?-u:...)` so the pattern
                // is interpreted without Unicode mode.
                self.wtr.write_str("(?-u:[")?;
                for range in cls.iter() {
                    if range.start() == range.end() {
                        self.write_literal_class_byte(range.start())?;
                    } else if range.start() + 1 == range.end() {
                        self.write_literal_class_byte(range.start())?;
                        self.write_literal_class_byte(range.end())?;
                    } else {
                        self.write_literal_class_byte(range.start())?;
                        self.wtr.write_str("-")?;
                        self.write_literal_class_byte(range.end())?;
                    }
                }
                self.wtr.write_str("])")?;
            }
            // Each look-around assertion has a fixed concrete syntax.
            HirKind::Look(ref look) => match *look {
                hir::Look::Start => {
                    self.wtr.write_str(r"\A")?;
                }
                hir::Look::End => {
                    self.wtr.write_str(r"\z")?;
                }
                hir::Look::StartLF => {
                    self.wtr.write_str("(?m:^)")?;
                }
                hir::Look::EndLF => {
                    self.wtr.write_str("(?m:$)")?;
                }
                hir::Look::StartCRLF => {
                    self.wtr.write_str("(?mR:^)")?;
                }
                hir::Look::EndCRLF => {
                    self.wtr.write_str("(?mR:$)")?;
                }
                hir::Look::WordAscii => {
                    self.wtr.write_str(r"(?-u:\b)")?;
                }
                hir::Look::WordAsciiNegate => {
                    self.wtr.write_str(r"(?-u:\B)")?;
                }
                hir::Look::WordUnicode => {
                    self.wtr.write_str(r"\b")?;
                }
                hir::Look::WordUnicodeNegate => {
                    self.wtr.write_str(r"\B")?;
                }
                hir::Look::WordStartAscii => {
                    self.wtr.write_str(r"(?-u:\b{start})")?;
                }
                hir::Look::WordEndAscii => {
                    self.wtr.write_str(r"(?-u:\b{end})")?;
                }
                hir::Look::WordStartUnicode => {
                    self.wtr.write_str(r"\b{start}")?;
                }
                hir::Look::WordEndUnicode => {
                    self.wtr.write_str(r"\b{end}")?;
                }
                hir::Look::WordStartHalfAscii => {
                    self.wtr.write_str(r"(?-u:\b{start-half})")?;
                }
                hir::Look::WordEndHalfAscii => {
                    self.wtr.write_str(r"(?-u:\b{end-half})")?;
                }
                hir::Look::WordStartHalfUnicode => {
                    self.wtr.write_str(r"\b{start-half}")?;
                }
                hir::Look::WordEndHalfUnicode => {
                    self.wtr.write_str(r"\b{end-half}")?;
                }
            },
            HirKind::Capture(hir::Capture { ref name, .. }) => {
                // Closing ')' is written in visit_post.
                self.wtr.write_str("(")?;
                if let Some(ref name) = *name {
                    write!(self.wtr, "?P<{}>", name)?;
                }
            }
            // Why do this? Wrapping concats and alts in non-capturing groups
            // is not *always* necessary, but is sometimes necessary. For
            // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)'
            // and not 'ab|c'. The former is clearly the intended meaning, but
            // the latter is actually 'alt(concat(a, b), c)'.
            //
            // It would be possible to only group these things in cases where
            // it's strictly necessary, but it requires knowing the parent
            // expression. And since this technique is simpler and always
            // correct, we take this route. More to the point, it is a non-goal
            // of an HIR printer to show a nice easy-to-read regex. Indeed,
            // its construction forbids it from doing so. Therefore, inserting
            // extra groups where they aren't necessary is perfectly okay.
            HirKind::Concat(_) | HirKind::Alternation(_) => {
                self.wtr.write_str(r"(?:")?;
            }
        }
        Ok(())
    }

    // Emit everything that syntactically follows a node's children: suffix
    // repetition operators and closing delimiters.
    fn visit_post(&mut self, hir: &Hir) -> fmt::Result {
        match *hir.kind() {
            // Handled during visit_pre
            HirKind::Empty
            | HirKind::Literal(_)
            | HirKind::Class(_)
            | HirKind::Look(_) => {}
            HirKind::Repetition(ref x) => {
                match (x.min, x.max) {
                    (0, Some(1)) => {
                        self.wtr.write_str("?")?;
                    }
                    (0, None) => {
                        self.wtr.write_str("*")?;
                    }
                    (1, None) => {
                        self.wtr.write_str("+")?;
                    }
                    (1, Some(1)) => {
                        // 'a{1}' and 'a{1}?' are exactly equivalent to 'a'.
                        return Ok(());
                    }
                    (m, None) => {
                        write!(self.wtr, "{{{},}}", m)?;
                    }
                    (m, Some(n)) if m == n => {
                        write!(self.wtr, "{{{}}}", m)?;
                        // a{m} and a{m}? are always exactly equivalent.
                        return Ok(());
                    }
                    (m, Some(n)) => {
                        write!(self.wtr, "{{{},{}}}", m, n)?;
                    }
                }
                if !x.greedy {
                    self.wtr.write_str("?")?;
                }
            }
            HirKind::Capture(_)
            | HirKind::Concat(_)
            | HirKind::Alternation(_) => {
                // Close the group opened in visit_pre.
                self.wtr.write_str(r")")?;
            }
        }
        Ok(())
    }

    fn visit_alternation_in(&mut self) -> fmt::Result {
        self.wtr.write_str("|")
    }
}
|
||||
|
||||
impl<W: fmt::Write> Writer<W> {
|
||||
fn write_literal_char(&mut self, c: char) -> fmt::Result {
|
||||
if is_meta_character(c) {
|
||||
self.wtr.write_str("\\")?;
|
||||
}
|
||||
self.wtr.write_char(c)
|
||||
}
|
||||
|
||||
fn write_literal_byte(&mut self, b: u8) -> fmt::Result {
|
||||
if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() {
|
||||
self.write_literal_char(char::try_from(b).unwrap())
|
||||
} else {
|
||||
write!(self.wtr, "(?-u:\\x{:02X})", b)
|
||||
}
|
||||
}
|
||||
|
||||
fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result {
|
||||
if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() {
|
||||
self.write_literal_char(char::try_from(b).unwrap())
|
||||
} else {
|
||||
write!(self.wtr, "\\x{:02X}", b)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Round-trip tests: parse a pattern into an HIR, print the HIR, re-parse the
// printed pattern to prove it is valid, and compare against the expected
// output string.
#[cfg(test)]
mod tests {
    use alloc::{
        boxed::Box,
        string::{String, ToString},
    };

    use crate::ParserBuilder;

    use super::*;

    // Round-trip with the default (Unicode, UTF-8) parser configuration.
    fn roundtrip(given: &str, expected: &str) {
        roundtrip_with(|b| b, given, expected);
    }

    // Round-trip with UTF-8 validation disabled, permitting byte-oriented
    // patterns like `(?-u)\xFF`.
    fn roundtrip_bytes(given: &str, expected: &str) {
        roundtrip_with(|b| b.utf8(false), given, expected);
    }

    fn roundtrip_with<F>(mut f: F, given: &str, expected: &str)
    where
        F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder,
    {
        let mut builder = ParserBuilder::new();
        f(&mut builder);
        let hir = builder.build().parse(given).unwrap();

        let mut printer = Printer::new();
        let mut dst = String::new();
        printer.print(&hir, &mut dst).unwrap();

        // Check that the result is actually valid.
        builder.build().parse(&dst).unwrap();

        assert_eq!(expected, dst);
    }

    #[test]
    fn print_literal() {
        roundtrip("a", "a");
        roundtrip(r"\xff", "\u{FF}");
        roundtrip_bytes(r"\xff", "\u{FF}");
        roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)");
        roundtrip("☃", "☃");
    }

    #[test]
    fn print_class() {
        roundtrip(r"[a]", r"a");
        roundtrip(r"[ab]", r"[ab]");
        roundtrip(r"[a-z]", r"[a-z]");
        roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]");
        roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}");
        roundtrip(r"[-]", r"\-");
        roundtrip(r"[☃-⛄]", r"[☃-⛄]");

        roundtrip(r"(?-u)[a]", r"a");
        roundtrip(r"(?-u)[ab]", r"(?-u:[ab])");
        roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])");
        roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])");

        // The following test that the printer escapes meta characters
        // in character classes.
        roundtrip(r"[\[]", r"\[");
        roundtrip(r"[Z-_]", r"[Z-_]");
        roundtrip(r"[Z-_--Z]", r"[\[-_]");

        // The following test that the printer escapes meta characters
        // in byte oriented character classes.
        roundtrip_bytes(r"(?-u)[\[]", r"\[");
        roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])");
        roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])");

        // This tests that an empty character class is correctly roundtripped.
        #[cfg(feature = "unicode-gencat")]
        roundtrip(r"\P{any}", r"[a&&b]");
        roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]");
    }

    #[test]
    fn print_anchor() {
        roundtrip(r"^", r"\A");
        roundtrip(r"$", r"\z");
        roundtrip(r"(?m)^", r"(?m:^)");
        roundtrip(r"(?m)$", r"(?m:$)");
    }

    #[test]
    fn print_word_boundary() {
        roundtrip(r"\b", r"\b");
        roundtrip(r"\B", r"\B");
        roundtrip(r"(?-u)\b", r"(?-u:\b)");
        roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)");
    }

    #[test]
    fn print_repetition() {
        roundtrip("a?", "a?");
        roundtrip("a??", "a??");
        roundtrip("(?U)a?", "a??");

        roundtrip("a*", "a*");
        roundtrip("a*?", "a*?");
        roundtrip("(?U)a*", "a*?");

        roundtrip("a+", "a+");
        roundtrip("a+?", "a+?");
        roundtrip("(?U)a+", "a+?");

        roundtrip("a{1}", "a");
        roundtrip("a{2}", "a{2}");
        roundtrip("a{1,}", "a+");
        roundtrip("a{1,5}", "a{1,5}");
        roundtrip("a{1}?", "a");
        roundtrip("a{2}?", "a{2}");
        roundtrip("a{1,}?", "a+?");
        roundtrip("a{1,5}?", "a{1,5}?");
        roundtrip("(?U)a{1}", "a");
        roundtrip("(?U)a{2}", "a{2}");
        roundtrip("(?U)a{1,}", "a+?");
        roundtrip("(?U)a{1,5}", "a{1,5}?");

        // Test that various zero-length repetitions always translate to an
        // empty regex. This is more a property of HIR's smart constructors
        // than the printer though.
        roundtrip("a{0}", "(?:)");
        roundtrip("(?:ab){0}", "(?:)");
        #[cfg(feature = "unicode-gencat")]
        {
            roundtrip(r"\p{any}{0}", "(?:)");
            roundtrip(r"\P{any}{0}", "(?:)");
        }
    }

    #[test]
    fn print_group() {
        roundtrip("()", "((?:))");
        roundtrip("(?P<foo>)", "(?P<foo>(?:))");
        roundtrip("(?:)", "(?:)");

        roundtrip("(a)", "(a)");
        roundtrip("(?P<foo>a)", "(?P<foo>a)");
        roundtrip("(?:a)", "a");

        roundtrip("((((a))))", "((((a))))");
    }

    #[test]
    fn print_alternation() {
        roundtrip("|", "(?:(?:)|(?:))");
        roundtrip("||", "(?:(?:)|(?:)|(?:))");

        roundtrip("a|b", "[ab]");
        roundtrip("ab|cd", "(?:(?:ab)|(?:cd))");
        roundtrip("a|b|c", "[a-c]");
        roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))");
        roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))");
    }

    // This is a regression test that stresses a peculiarity of how the HIR
    // is both constructed and printed. Namely, it is legal for a repetition
    // to directly contain a concatenation. This particular construct isn't
    // really possible to build from the concrete syntax directly, since you'd
    // be forced to put the concatenation into (at least) a non-capturing
    // group. Concurrently, the printer doesn't consider this case and just
    // kind of naively prints the child expression and tacks on the repetition
    // operator.
    //
    // As a result, if you attached '+' to a 'concat(a, b)', the printer gives
    // you 'ab+', but clearly it really should be '(?:ab)+'.
    //
    // This bug isn't easy to surface because most ways of building an HIR
    // come directly from the concrete syntax, and as mentioned above, it just
    // isn't possible to build this kind of HIR from the concrete syntax.
    // Nevertheless, this is definitely a bug.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_concat() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("x".as_bytes()),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::literal("ab".as_bytes())),
            }),
            Hir::literal("y".as_bytes()),
        ]);
        assert_eq!(r"(?:x(?:ab)+y)", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::concat(alloc::vec![
                    Hir::look(hir::Look::Start),
                    Hir::look(hir::Look::End),
                ])),
            }),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A\A\z\z)", expr.to_string());
    }

    // Just like regression_repetition_concat, but with the repetition using
    // an alternation as a child expression instead.
    //
    // See: https://github.com/rust-lang/regex/issues/731
    #[test]
    fn regression_repetition_alternation() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("ab".as_bytes()),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::alternation(alloc::vec![
                    Hir::literal("cd".as_bytes()),
                    Hir::literal("ef".as_bytes()),
                ])),
            }),
            Hir::literal("gh".as_bytes()),
        ]);
        assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::alternation(alloc::vec![
                    Hir::look(hir::Look::Start),
                    Hir::look(hir::Look::End),
                ])),
            }),
            Hir::look(hir::Look::End),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string());
    }

    // This regression test is very similar in flavor to
    // regression_repetition_concat in that the root of the issue lies in a
    // peculiarity of how the HIR is represented and how the printer writes it
    // out. Like the other regression, this one is also rooted in the fact that
    // you can't produce the peculiar HIR from the concrete syntax. Namely, you
    // just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally
    // be in (at least) a non-capturing group. Why? Because the '|' has very
    // low precedence (lower that concatenation), and so something like 'ab|c'
    // is actually 'alt(ab, c)'.
    //
    // See: https://github.com/rust-lang/regex/issues/516
    #[test]
    fn regression_alternation_concat() {
        let expr = Hir::concat(alloc::vec![
            Hir::literal("ab".as_bytes()),
            Hir::alternation(alloc::vec![
                Hir::literal("mn".as_bytes()),
                Hir::literal("xy".as_bytes()),
            ]),
        ]);
        assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string());

        let expr = Hir::concat(alloc::vec![
            Hir::look(hir::Look::Start),
            Hir::alternation(alloc::vec![
                Hir::look(hir::Look::Start),
                Hir::look(hir::Look::End),
            ]),
        ]);
        assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string());
    }
}
|
||||
3724
third-party/vendor/regex-syntax/src/hir/translate.rs
vendored
Normal file
3724
third-party/vendor/regex-syntax/src/hir/translate.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
215
third-party/vendor/regex-syntax/src/hir/visitor.rs
vendored
Normal file
215
third-party/vendor/regex-syntax/src/hir/visitor.rs
vendored
Normal file
|
|
@ -0,0 +1,215 @@
|
|||
use alloc::{vec, vec::Vec};
|
||||
|
||||
use crate::hir::{self, Hir, HirKind};
|
||||
|
||||
/// A trait for visiting the high-level IR (HIR) in depth first order.
///
/// The principle aim of this trait is to enable callers to perform case
/// analysis on a high-level intermediate representation of a regular
/// expression without necessarily using recursion. In particular, this permits
/// callers to do case analysis with constant stack usage, which can be
/// important since the size of an HIR may be proportional to end user input.
///
/// Typical usage of this trait involves providing an implementation and then
/// running it using the [`visit`] function.
pub trait Visitor {
    /// The result of visiting an HIR.
    type Output;
    /// An error that visiting an HIR might return.
    type Err;

    /// All implementors of `Visitor` must provide a `finish` method, which
    /// yields the result of visiting the HIR or an error.
    fn finish(self) -> Result<Self::Output, Self::Err>;

    /// This method is called before beginning traversal of the HIR.
    fn start(&mut self) {}

    /// This method is called on an `Hir` before descending into child `Hir`
    /// nodes.
    fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called on an `Hir` after descending all of its child
    /// `Hir` nodes.
    fn visit_post(&mut self, _hir: &Hir) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called between child nodes of an alternation.
    fn visit_alternation_in(&mut self) -> Result<(), Self::Err> {
        Ok(())
    }

    /// This method is called between child nodes of a concatenation.
    fn visit_concat_in(&mut self) -> Result<(), Self::Err> {
        Ok(())
    }
}
|
||||
|
||||
/// Executes an implementation of `Visitor` in constant stack space.
///
/// This function will visit every node in the given `Hir` while calling
/// appropriate methods provided by the [`Visitor`] trait.
///
/// The primary use case for this method is when one wants to perform case
/// analysis over an `Hir` without using a stack size proportional to the depth
/// of the `Hir`. Namely, this method will instead use constant stack space,
/// but will use heap space proportional to the size of the `Hir`. This may be
/// desirable in cases where the size of `Hir` is proportional to end user
/// input.
///
/// If the visitor returns an error at any point, then visiting is stopped and
/// the error is returned.
pub fn visit<V: Visitor>(hir: &Hir, visitor: V) -> Result<V::Output, V::Err> {
    HeapVisitor::new().visit(hir, visitor)
}
|
||||
|
||||
/// HeapVisitor visits every item in an `Hir` recursively using constant stack
/// size and a heap size proportional to the size of the `Hir`.
struct HeapVisitor<'a> {
    /// A stack of `Hir` nodes. This is roughly analogous to the call stack
    /// used in a typical recursive visitor.
    stack: Vec<(&'a Hir, Frame<'a>)>,
}
|
||||
|
||||
/// Represents a single stack frame while performing structural induction over
/// an `Hir`.
enum Frame<'a> {
    /// A stack frame allocated just before descending into a repetition
    /// operator's child node.
    Repetition(&'a hir::Repetition),
    /// A stack frame allocated just before descending into a capture's child
    /// node.
    Capture(&'a hir::Capture),
    /// The stack frame used while visiting every child node of a concatenation
    /// of expressions.
    Concat {
        /// The child node we are currently visiting.
        head: &'a Hir,
        /// The remaining child nodes to visit (which may be empty).
        tail: &'a [Hir],
    },
    /// The stack frame used while visiting every child node of an alternation
    /// of expressions.
    Alternation {
        /// The child node we are currently visiting.
        head: &'a Hir,
        /// The remaining child nodes to visit (which may be empty).
        tail: &'a [Hir],
    },
}
|
||||
|
||||
impl<'a> HeapVisitor<'a> {
    fn new() -> HeapVisitor<'a> {
        HeapVisitor { stack: vec![] }
    }

    /// Drive the given visitor over the whole HIR iteratively. The explicit
    /// `stack` field replaces the call stack of a recursive traversal, so
    /// stack usage here is constant regardless of HIR depth.
    fn visit<V: Visitor>(
        &mut self,
        mut hir: &'a Hir,
        mut visitor: V,
    ) -> Result<V::Output, V::Err> {
        self.stack.clear();

        visitor.start();
        loop {
            visitor.visit_pre(hir)?;
            if let Some(x) = self.induct(hir) {
                // Descend into the first child, remembering where we were.
                let child = x.child();
                self.stack.push((hir, x));
                hir = child;
                continue;
            }
            // No induction means we have a base case, so we can post visit
            // it now.
            visitor.visit_post(hir)?;

            // At this point, we now try to pop our call stack until it is
            // either empty or we hit another inductive case.
            loop {
                let (post_hir, frame) = match self.stack.pop() {
                    None => return visitor.finish(),
                    Some((post_hir, frame)) => (post_hir, frame),
                };
                // If this is a concat/alternate, then we might have additional
                // inductive steps to process.
                if let Some(x) = self.pop(frame) {
                    match x {
                        Frame::Alternation { .. } => {
                            visitor.visit_alternation_in()?;
                        }
                        Frame::Concat { .. } => {
                            visitor.visit_concat_in()?;
                        }
                        _ => {}
                    }
                    hir = x.child();
                    self.stack.push((post_hir, x));
                    break;
                }
                // Otherwise, we've finished visiting all the child nodes for
                // this HIR, so we can post visit it now.
                visitor.visit_post(post_hir)?;
            }
        }
    }

    /// Build a stack frame for the given HIR if one is needed (which occurs if
    /// and only if there are child nodes in the HIR). Otherwise, return None.
    fn induct(&mut self, hir: &'a Hir) -> Option<Frame<'a>> {
        match *hir.kind() {
            HirKind::Repetition(ref x) => Some(Frame::Repetition(x)),
            HirKind::Capture(ref x) => Some(Frame::Capture(x)),
            // Empty concats/alternations have no children to descend into.
            HirKind::Concat(ref x) if x.is_empty() => None,
            HirKind::Concat(ref x) => {
                Some(Frame::Concat { head: &x[0], tail: &x[1..] })
            }
            HirKind::Alternation(ref x) if x.is_empty() => None,
            HirKind::Alternation(ref x) => {
                Some(Frame::Alternation { head: &x[0], tail: &x[1..] })
            }
            _ => None,
        }
    }

    /// Pops the given frame. If the frame has an additional inductive step,
    /// then return it, otherwise return `None`.
    fn pop(&self, induct: Frame<'a>) -> Option<Frame<'a>> {
        match induct {
            // Repetitions and captures have exactly one child, so once it is
            // visited there is nothing left to do for this frame.
            Frame::Repetition(_) => None,
            Frame::Capture(_) => None,
            Frame::Concat { tail, .. } => {
                if tail.is_empty() {
                    None
                } else {
                    Some(Frame::Concat { head: &tail[0], tail: &tail[1..] })
                }
            }
            Frame::Alternation { tail, .. } => {
                if tail.is_empty() {
                    None
                } else {
                    Some(Frame::Alternation {
                        head: &tail[0],
                        tail: &tail[1..],
                    })
                }
            }
        }
    }
}
|
||||
|
||||
impl<'a> Frame<'a> {
|
||||
/// Perform the next inductive step on this frame and return the next
|
||||
/// child HIR node to visit.
|
||||
fn child(&self) -> &'a Hir {
|
||||
match *self {
|
||||
Frame::Repetition(rep) => &rep.sub,
|
||||
Frame::Capture(capture) => &capture.sub,
|
||||
Frame::Concat { head, .. } => head,
|
||||
Frame::Alternation { head, .. } => head,
|
||||
}
|
||||
}
|
||||
}
|
||||
431
third-party/vendor/regex-syntax/src/lib.rs
vendored
Normal file
431
third-party/vendor/regex-syntax/src/lib.rs
vendored
Normal file
|
|
@ -0,0 +1,431 @@
|
|||
/*!
|
||||
This crate provides a robust regular expression parser.
|
||||
|
||||
This crate defines two primary types:
|
||||
|
||||
* [`Ast`](ast::Ast) is the abstract syntax of a regular expression.
|
||||
An abstract syntax corresponds to a *structured representation* of the
|
||||
concrete syntax of a regular expression, where the concrete syntax is the
|
||||
pattern string itself (e.g., `foo(bar)+`). Given some abstract syntax, it
|
||||
can be converted back to the original concrete syntax (modulo some details,
|
||||
like whitespace). To a first approximation, the abstract syntax is complex
|
||||
and difficult to analyze.
|
||||
* [`Hir`](hir::Hir) is the high-level intermediate representation
|
||||
("HIR" or "high-level IR" for short) of regular expression. It corresponds to
|
||||
an intermediate state of a regular expression that sits between the abstract
|
||||
syntax and the low level compiled opcodes that are eventually responsible for
|
||||
executing a regular expression search. Given some high-level IR, it is not
|
||||
possible to produce the original concrete syntax (although it is possible to
|
||||
produce an equivalent concrete syntax, but it will likely scarcely resemble
|
||||
the original pattern). To a first approximation, the high-level IR is simple
|
||||
and easy to analyze.
|
||||
|
||||
These two types come with conversion routines:
|
||||
|
||||
* An [`ast::parse::Parser`] converts concrete syntax (a `&str`) to an
|
||||
[`Ast`](ast::Ast).
|
||||
* A [`hir::translate::Translator`] converts an [`Ast`](ast::Ast) to a
|
||||
[`Hir`](hir::Hir).
|
||||
|
||||
As a convenience, the above two conversion routines are combined into one via
|
||||
the top-level [`Parser`] type. This `Parser` will first convert your pattern to
|
||||
an `Ast` and then convert the `Ast` to an `Hir`. It's also exposed as the top-level
|
||||
[`parse`] free function.
|
||||
|
||||
|
||||
# Example
|
||||
|
||||
This example shows how to parse a pattern string into its HIR:
|
||||
|
||||
```
|
||||
use regex_syntax::{hir::Hir, parse};
|
||||
|
||||
let hir = parse("a|b")?;
|
||||
assert_eq!(hir, Hir::alternation(vec![
|
||||
Hir::literal("a".as_bytes()),
|
||||
Hir::literal("b".as_bytes()),
|
||||
]));
|
||||
# Ok::<(), Box<dyn std::error::Error>>(())
|
||||
```
|
||||
|
||||
|
||||
# Concrete syntax supported
|
||||
|
||||
The concrete syntax is documented as part of the public API of the
|
||||
[`regex` crate](https://docs.rs/regex/%2A/regex/#syntax).
|
||||
|
||||
|
||||
# Input safety
|
||||
|
||||
A key feature of this library is that it is safe to use with end user facing
|
||||
input. This plays a significant role in the internal implementation. In
|
||||
particular:
|
||||
|
||||
1. Parsers provide a `nest_limit` option that permits callers to control how
|
||||
deeply nested a regular expression is allowed to be. This makes it possible
|
||||
to do case analysis over an `Ast` or an `Hir` using recursion without
|
||||
worrying about stack overflow.
|
||||
2. Since relying on a particular stack size is brittle, this crate goes to
|
||||
great lengths to ensure that all interactions with both the `Ast` and the
|
||||
`Hir` do not use recursion. Namely, they use constant stack space and heap
|
||||
space proportional to the size of the original pattern string (in bytes).
|
||||
This includes the type's corresponding destructors. (One exception to this
|
||||
is literal extraction, but this will eventually get fixed.)
|
||||
|
||||
|
||||
# Error reporting
|
||||
|
||||
The `Display` implementations on all `Error` types exposed in this library
|
||||
provide nice human readable errors that are suitable for showing to end users
|
||||
in a monospace font.
|
||||
|
||||
|
||||
# Literal extraction
|
||||
|
||||
This crate provides limited support for [literal extraction from `Hir`
|
||||
values](hir::literal). Be warned that literal extraction uses recursion, and
|
||||
therefore, stack size proportional to the size of the `Hir`.
|
||||
|
||||
The purpose of literal extraction is to speed up searches. That is, if you
|
||||
know a regular expression must match a prefix or suffix literal, then it is
|
||||
often quicker to search for instances of that literal, and then confirm or deny
|
||||
the match using the full regular expression engine. These optimizations are
|
||||
done automatically in the `regex` crate.
|
||||
|
||||
|
||||
# Crate features
|
||||
|
||||
An important feature provided by this crate is its Unicode support. This
|
||||
includes things like case folding, boolean properties, general categories,
|
||||
scripts and Unicode-aware support for the Perl classes `\w`, `\s` and `\d`.
|
||||
However, a downside of this support is that it requires bundling several
|
||||
Unicode data tables that are substantial in size.
|
||||
|
||||
A fair number of use cases do not require full Unicode support. For this
|
||||
reason, this crate exposes a number of features to control which Unicode
|
||||
data is available.
|
||||
|
||||
If a regular expression attempts to use a Unicode feature that is not available
|
||||
because the corresponding crate feature was disabled, then translating that
|
||||
regular expression to an `Hir` will return an error. (It is still possible to
|
||||
construct an `Ast` for such a regular expression, since Unicode data is not
|
||||
used until translation to an `Hir`.) Stated differently, enabling or disabling
|
||||
any of the features below can only add or subtract from the total set of valid
|
||||
regular expressions. Enabling or disabling a feature will never modify the
|
||||
match semantics of a regular expression.
|
||||
|
||||
The following features are available:
|
||||
|
||||
* **std** -
|
||||
Enables support for the standard library. This feature is enabled by default.
|
||||
When disabled, only `core` and `alloc` are used. Otherwise, enabling `std`
|
||||
generally just enables `std::error::Error` trait impls for the various error
|
||||
types.
|
||||
* **unicode** -
|
||||
Enables all Unicode features. This feature is enabled by default, and will
|
||||
always cover all Unicode features, even if more are added in the future.
|
||||
* **unicode-age** -
|
||||
Provide the data for the
|
||||
[Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age).
|
||||
This makes it possible to use classes like `\p{Age:6.0}` to refer to all
|
||||
codepoints first introduced in Unicode 6.0.
|
||||
* **unicode-bool** -
|
||||
Provide the data for numerous Unicode boolean properties. The full list
|
||||
is not included here, but contains properties like `Alphabetic`, `Emoji`,
|
||||
`Lowercase`, `Math`, `Uppercase` and `White_Space`.
|
||||
* **unicode-case** -
|
||||
Provide the data for case insensitive matching using
|
||||
[Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches).
|
||||
* **unicode-gencat** -
|
||||
Provide the data for
|
||||
[Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values).
|
||||
This includes, but is not limited to, `Decimal_Number`, `Letter`,
|
||||
`Math_Symbol`, `Number` and `Punctuation`.
|
||||
* **unicode-perl** -
|
||||
Provide the data for supporting the Unicode-aware Perl character classes,
|
||||
corresponding to `\w`, `\s` and `\d`. This is also necessary for using
|
||||
Unicode-aware word boundary assertions. Note that if this feature is
|
||||
disabled, the `\s` and `\d` character classes are still available if the
|
||||
`unicode-bool` and `unicode-gencat` features are enabled, respectively.
|
||||
* **unicode-script** -
|
||||
Provide the data for
|
||||
[Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/).
|
||||
This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`,
|
||||
`Latin` and `Thai`.
|
||||
* **unicode-segment** -
|
||||
Provide the data necessary to provide the properties used to implement the
|
||||
[Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/).
|
||||
This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and
|
||||
`\p{sb=ATerm}`.
|
||||
* **arbitrary** -
|
||||
Enabling this feature introduces a public dependency on the
|
||||
[`arbitrary`](https://crates.io/crates/arbitrary)
|
||||
crate. Namely, it implements the `Arbitrary` trait from that crate for the
|
||||
[`Ast`](crate::ast::Ast) type. This feature is disabled by default.
|
||||
*/
|
||||
|
||||
#![no_std]
|
||||
#![forbid(unsafe_code)]
|
||||
#![deny(missing_docs, rustdoc::broken_intra_doc_links)]
|
||||
#![warn(missing_debug_implementations)]
|
||||
#![cfg_attr(docsrs, feature(doc_auto_cfg))]
|
||||
|
||||
#[cfg(any(test, feature = "std"))]
|
||||
extern crate std;
|
||||
|
||||
extern crate alloc;
|
||||
|
||||
pub use crate::{
|
||||
error::Error,
|
||||
parser::{parse, Parser, ParserBuilder},
|
||||
unicode::UnicodeWordError,
|
||||
};
|
||||
|
||||
use alloc::string::String;
|
||||
|
||||
pub mod ast;
|
||||
mod debug;
|
||||
mod either;
|
||||
mod error;
|
||||
pub mod hir;
|
||||
mod parser;
|
||||
mod rank;
|
||||
mod unicode;
|
||||
mod unicode_tables;
|
||||
pub mod utf8;
|
||||
|
||||
/// Escapes all regular expression meta characters in `text`.
|
||||
///
|
||||
/// The string returned may be safely used as a literal in a regular
|
||||
/// expression.
|
||||
pub fn escape(text: &str) -> String {
|
||||
let mut quoted = String::new();
|
||||
escape_into(text, &mut quoted);
|
||||
quoted
|
||||
}
|
||||
|
||||
/// Escapes all meta characters in `text` and writes the result into `buf`.
|
||||
///
|
||||
/// This will append escape characters into the given buffer. The characters
|
||||
/// that are appended are safe to use as a literal in a regular expression.
|
||||
pub fn escape_into(text: &str, buf: &mut String) {
|
||||
buf.reserve(text.len());
|
||||
for c in text.chars() {
|
||||
if is_meta_character(c) {
|
||||
buf.push('\\');
|
||||
}
|
||||
buf.push(c);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if the given character has significance in a regex.
///
/// Generally speaking, these are the only characters which _must_ be escaped
/// in order to match their literal meaning. For example, to match a literal
/// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For
/// example, `-` is treated as a meta character because of its significance
/// for writing ranges inside of character classes, but the regex `-` will
/// match a literal `-` because `-` has no special meaning outside of character
/// classes.
///
/// In order to determine whether a character may be escaped at all, the
/// [`is_escapeable_character`] routine should be used. The difference between
/// `is_meta_character` and `is_escapeable_character` is that the latter will
/// return true for some characters that are _not_ meta characters. For
/// example, `%` and `\%` both match a literal `%` in all contexts. In other
/// words, `is_escapeable_character` includes "superfluous" escapes.
///
/// Note that the set of characters for which this function returns `true` or
/// `false` is fixed and won't change in a semver compatible release. (In this
/// case, "semver compatible release" actually refers to the `regex` crate
/// itself, since reducing or expanding the set of meta characters would be a
/// breaking change for not just `regex-syntax` but also `regex` itself.)
///
/// # Example
///
/// ```
/// use regex_syntax::is_meta_character;
///
/// assert!(is_meta_character('?'));
/// assert!(is_meta_character('-'));
/// assert!(is_meta_character('&'));
/// assert!(is_meta_character('#'));
///
/// assert!(!is_meta_character('%'));
/// assert!(!is_meta_character('/'));
/// assert!(!is_meta_character('!'));
/// assert!(!is_meta_character('"'));
/// assert!(!is_meta_character('e'));
/// ```
pub fn is_meta_character(c: char) -> bool {
    matches!(
        c,
        '\\' | '.'
            | '+'
            | '*'
            | '?'
            | '('
            | ')'
            | '|'
            | '['
            | ']'
            | '{'
            | '}'
            | '^'
            | '$'
            | '#'
            | '&'
            | '-'
            | '~'
    )
}
|
||||
|
||||
/// Returns true if the given character can be escaped in a regex.
|
||||
///
|
||||
/// This returns true in all cases that `is_meta_character` returns true, but
|
||||
/// also returns true in some cases where `is_meta_character` returns false.
|
||||
/// For example, `%` is not a meta character, but it is escapeable. That is,
|
||||
/// `%` and `\%` both match a literal `%` in all contexts.
|
||||
///
|
||||
/// The purpose of this routine is to provide knowledge about what characters
|
||||
/// may be escaped. Namely, most regex engines permit "superfluous" escapes
|
||||
/// where characters without any special significance may be escaped even
|
||||
/// though there is no actual _need_ to do so.
|
||||
///
|
||||
/// This will return false for some characters. For example, `e` is not
|
||||
/// escapeable. Therefore, `\e` will either result in a parse error (which is
|
||||
/// true today), or it could backwards compatibly evolve into a new construct
|
||||
/// with its own meaning. Indeed, that is the purpose of banning _some_
|
||||
/// superfluous escapes: it provides a way to evolve the syntax in a compatible
|
||||
/// manner.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// use regex_syntax::is_escapeable_character;
|
||||
///
|
||||
/// assert!(is_escapeable_character('?'));
|
||||
/// assert!(is_escapeable_character('-'));
|
||||
/// assert!(is_escapeable_character('&'));
|
||||
/// assert!(is_escapeable_character('#'));
|
||||
/// assert!(is_escapeable_character('%'));
|
||||
/// assert!(is_escapeable_character('/'));
|
||||
/// assert!(is_escapeable_character('!'));
|
||||
/// assert!(is_escapeable_character('"'));
|
||||
///
|
||||
/// assert!(!is_escapeable_character('e'));
|
||||
/// ```
|
||||
pub fn is_escapeable_character(c: char) -> bool {
|
||||
// Certainly escapeable if it's a meta character.
|
||||
if is_meta_character(c) {
|
||||
return true;
|
||||
}
|
||||
// Any character that isn't ASCII is definitely not escapeable. There's
|
||||
// no real need to allow things like \☃ right?
|
||||
if !c.is_ascii() {
|
||||
return false;
|
||||
}
|
||||
// Otherwise, we basically say that everything is escapeable unless it's a
|
||||
// letter or digit. Things like \3 are either octal (when enabled) or an
|
||||
// error, and we should keep it that way. Otherwise, letters are reserved
|
||||
// for adding new syntax in a backwards compatible way.
|
||||
match c {
|
||||
'0'..='9' | 'A'..='Z' | 'a'..='z' => false,
|
||||
// While not currently supported, we keep these as not escapeable to
|
||||
// give us some flexibility with respect to supporting the \< and
|
||||
// \> word boundary assertions in the future. By rejecting them as
|
||||
// escapeable, \< and \> will result in a parse error. Thus, we can
|
||||
// turn them into something else in the future without it being a
|
||||
// backwards incompatible change.
|
||||
//
|
||||
// OK, now we support \< and \>, and we need to retain them as *not*
|
||||
// escapeable here since the escape sequence is significant.
|
||||
'<' | '>' => false,
|
||||
_ => true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if and only if the given character is a Unicode word
/// character.
///
/// A Unicode word character is defined by
/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties).
/// In particular, a character
/// is considered a word character if it is in either of the `Alphabetic` or
/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark`
/// or `Connector_Punctuation` general categories.
///
/// # Panics
///
/// If the `unicode-perl` feature is not enabled, then this function
/// panics. For this reason, it is recommended that callers use
/// [`try_is_word_character`] instead.
pub fn is_word_character(c: char) -> bool {
    // Delegates to the fallible variant; per its docs, the only error case
    // is the `unicode-perl` feature being disabled, hence the expect message.
    try_is_word_character(c).expect("unicode-perl feature must be enabled")
}
|
||||
|
||||
/// Returns true if and only if the given character is a Unicode word
/// character.
///
/// A Unicode word character is defined by
/// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties).
/// In particular, a character
/// is considered a word character if it is in either of the `Alphabetic` or
/// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark`
/// or `Connector_Punctuation` general categories.
///
/// # Errors
///
/// If the `unicode-perl` feature is not enabled, then this function always
/// returns an error.
pub fn try_is_word_character(
    c: char,
) -> core::result::Result<bool, UnicodeWordError> {
    // Thin wrapper around the internal unicode module, which holds the
    // feature-gated data tables.
    unicode::is_word_character(c)
}
|
||||
|
||||
/// Returns true if and only if the given byte is an ASCII word character.
///
/// An ASCII word character is defined by the following character class:
/// `[_0-9a-zA-Z]`.
pub fn is_word_byte(c: u8) -> bool {
    matches!(c, b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z')
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use alloc::string::ToString;

    use super::*;

    // `escape` must backslash-escape every meta character exactly once and
    // leave the characters themselves intact.
    #[test]
    fn escape_meta() {
        assert_eq!(
            escape(r"\.+*?()|[]{}^$#&-~"),
            r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~".to_string()
        );
    }

    // Sanity check the ASCII word byte classifier.
    #[test]
    fn word_byte() {
        assert!(is_word_byte(b'a'));
        assert!(!is_word_byte(b'-'));
    }

    // Spot-check word characters introduced across several Unicode versions,
    // which exercises the age of the bundled data tables.
    #[test]
    #[cfg(feature = "unicode-perl")]
    fn word_char() {
        assert!(is_word_character('a'), "ASCII");
        assert!(is_word_character('à'), "Latin-1");
        assert!(is_word_character('β'), "Greek");
        assert!(is_word_character('\u{11011}'), "Brahmi (Unicode 6.0)");
        assert!(is_word_character('\u{11611}'), "Modi (Unicode 7.0)");
        assert!(is_word_character('\u{11711}'), "Ahom (Unicode 8.0)");
        assert!(is_word_character('\u{17828}'), "Tangut (Unicode 9.0)");
        assert!(is_word_character('\u{1B1B1}'), "Nushu (Unicode 10.0)");
        assert!(is_word_character('\u{16E40}'), "Medefaidrin (Unicode 11.0)");
        assert!(!is_word_character('-'));
        assert!(!is_word_character('☃'));
    }

    // Without the unicode-perl tables, the infallible API must panic...
    #[test]
    #[should_panic]
    #[cfg(not(feature = "unicode-perl"))]
    fn word_char_disabled_panic() {
        assert!(is_word_character('a'));
    }

    // ...while the fallible API reports an error instead.
    #[test]
    #[cfg(not(feature = "unicode-perl"))]
    fn word_char_disabled_error() {
        assert!(try_is_word_character('a').is_err());
    }
}
|
||||
254
third-party/vendor/regex-syntax/src/parser.rs
vendored
Normal file
254
third-party/vendor/regex-syntax/src/parser.rs
vendored
Normal file
|
|
@ -0,0 +1,254 @@
|
|||
use crate::{ast, hir, Error};
|
||||
|
||||
/// A convenience routine for parsing a regex using default options.
///
/// This is equivalent to `Parser::new().parse(pattern)`.
///
/// If you need to set non-default options, then use a [`ParserBuilder`].
///
/// This routine returns an [`Hir`](hir::Hir) value. Namely, it automatically
/// parses the pattern as an [`Ast`](ast::Ast) and then invokes the translator
/// to convert the `Ast` into an `Hir`. If you need access to the `Ast`, then
/// you should use a [`ast::parse::Parser`].
pub fn parse(pattern: &str) -> Result<hir::Hir, Error> {
    Parser::new().parse(pattern)
}
|
||||
|
||||
/// A builder for a regular expression parser.
///
/// This builder permits modifying configuration options for the parser.
///
/// This type combines the builder options for both the [AST
/// `ParserBuilder`](ast::parse::ParserBuilder) and the [HIR
/// `TranslatorBuilder`](hir::translate::TranslatorBuilder).
#[derive(Clone, Debug, Default)]
pub struct ParserBuilder {
    // Options for the pattern-string -> Ast parsing stage.
    ast: ast::parse::ParserBuilder,
    // Options for the Ast -> Hir translation stage.
    hir: hir::translate::TranslatorBuilder,
}
|
||||
|
||||
impl ParserBuilder {
    /// Create a new parser builder with a default configuration.
    pub fn new() -> ParserBuilder {
        ParserBuilder::default()
    }

    /// Build a parser from this configuration.
    ///
    /// Note that no pattern is supplied here; patterns are given to
    /// [`Parser::parse`].
    pub fn build(&self) -> Parser {
        Parser { ast: self.ast.build(), hir: self.hir.build() }
    }

    /// Set the nesting limit for this parser.
    ///
    /// The nesting limit controls how deep the abstract syntax tree is allowed
    /// to be. If the AST exceeds the given limit (e.g., with too many nested
    /// groups), then an error is returned by the parser.
    ///
    /// The purpose of this limit is to act as a heuristic to prevent stack
    /// overflow for consumers that do structural induction on an `Ast` using
    /// explicit recursion. While this crate never does this (instead using
    /// constant stack space and moving the call stack to the heap), other
    /// crates may.
    ///
    /// This limit is not checked until the entire Ast is parsed. Therefore,
    /// if callers want to put a limit on the amount of heap space used, then
    /// they should impose a limit on the length, in bytes, of the concrete
    /// pattern string. In particular, this is viable since this parser
    /// implementation will limit itself to heap space proportional to the
    /// length of the pattern string.
    ///
    /// Note that a nest limit of `0` will return a nest limit error for most
    /// patterns but not all. For example, a nest limit of `0` permits `a` but
    /// not `ab`, since `ab` requires a concatenation, which results in a nest
    /// depth of `1`. In general, a nest limit is not something that manifests
    /// in an obvious way in the concrete syntax, therefore, it should not be
    /// used in a granular way.
    pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
        // A pure AST-stage option; translation is unaffected.
        self.ast.nest_limit(limit);
        self
    }

    /// Whether to support octal syntax or not.
    ///
    /// Octal syntax is a little-known way of uttering Unicode codepoints in
    /// a regular expression. For example, `a`, `\x61`, `\u0061` and
    /// `\141` are all equivalent regular expressions, where the last example
    /// shows octal syntax.
    ///
    /// While supporting octal syntax isn't in and of itself a problem, it does
    /// make good error messages harder. That is, in PCRE based regex engines,
    /// syntax like `\0` invokes a backreference, which is explicitly
    /// unsupported in Rust's regex engine. However, many users expect it to
    /// be supported. Therefore, when octal support is disabled, the error
    /// message will explicitly mention that backreferences aren't supported.
    ///
    /// Octal syntax is disabled by default.
    pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
        self.ast.octal(yes);
        self
    }

    /// When disabled, translation will permit the construction of a regular
    /// expression that may match invalid UTF-8.
    ///
    /// When enabled (the default), the translator is guaranteed to produce an
    /// expression that, for non-empty matches, will only ever produce spans
    /// that are entirely valid UTF-8 (otherwise, the translator will return an
    /// error).
    ///
    /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even
    /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete
    /// syntax) will be allowed even though they can produce matches that split
    /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty"
    /// matches, and it is expected that the regex engine itself must handle
    /// these cases if necessary (perhaps by suppressing any zero-width matches
    /// that split a codepoint).
    pub fn utf8(&mut self, yes: bool) -> &mut ParserBuilder {
        // A translation-stage option; the AST parser is unaffected.
        self.hir.utf8(yes);
        self
    }

    /// Enable verbose mode in the regular expression.
    ///
    /// When enabled, verbose mode permits insignificant whitespace in many
    /// places in the regular expression, as well as comments. Comments are
    /// started using `#` and continue until the end of the line.
    ///
    /// By default, this is disabled. It may be selectively enabled in the
    /// regular expression by using the `x` flag regardless of this setting.
    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
        self.ast.ignore_whitespace(yes);
        self
    }

    /// Enable or disable the case insensitive flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `i` flag.
    pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.case_insensitive(yes);
        self
    }

    /// Enable or disable the multi-line matching flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `m` flag.
    pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.multi_line(yes);
        self
    }

    /// Enable or disable the "dot matches any character" flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `s` flag.
    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.dot_matches_new_line(yes);
        self
    }

    /// Enable or disable the CRLF mode flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `R` flag.
    ///
    /// When CRLF mode is enabled, the following happens:
    ///
    /// * Unless `dot_matches_new_line` is enabled, `.` will match any character
    /// except for `\r` and `\n`.
    /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`,
    /// `\r` and `\n` as line terminators. And in particular, neither will
    /// match between a `\r` and a `\n`.
    pub fn crlf(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.crlf(yes);
        self
    }

    /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`.
    ///
    /// Namely, instead of `.` (by default) matching everything except for `\n`,
    /// this will cause `.` to match everything except for the byte given.
    ///
    /// If `.` is used in a context where Unicode mode is enabled and this byte
    /// isn't ASCII, then an error will be returned. When Unicode mode is
    /// disabled, then any byte is permitted, but will return an error if UTF-8
    /// mode is enabled and it is a non-ASCII byte.
    ///
    /// In short, any ASCII value for a line terminator is always okay. But a
    /// non-ASCII byte might result in an error depending on whether Unicode
    /// mode or UTF-8 mode are enabled.
    ///
    /// Note that if `R` mode is enabled then it always takes precedence and
    /// the line terminator will be treated as `\r` and `\n` simultaneously.
    ///
    /// Note also that this *doesn't* impact the look-around assertions
    /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional
    /// configuration in the regex engine itself.
    pub fn line_terminator(&mut self, byte: u8) -> &mut ParserBuilder {
        self.hir.line_terminator(byte);
        self
    }

    /// Enable or disable the "swap greed" flag by default.
    ///
    /// By default this is disabled. It may alternatively be selectively
    /// enabled in the regular expression itself via the `U` flag.
    pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.swap_greed(yes);
        self
    }

    /// Enable or disable the Unicode flag (`u`) by default.
    ///
    /// By default this is **enabled**. It may alternatively be selectively
    /// disabled in the regular expression itself via the `u` flag.
    ///
    /// Note that unless `utf8` is disabled (it's enabled by default), a
    /// regular expression will fail to parse if Unicode mode is disabled and a
    /// sub-expression could possibly match invalid UTF-8.
    pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder {
        self.hir.unicode(yes);
        self
    }
}
|
||||
|
||||
/// A convenience parser for regular expressions.
///
/// This parser takes as input a regular expression pattern string (the
/// "concrete syntax") and returns a high-level intermediate representation
/// (the HIR) suitable for most types of analysis. In particular, this parser
/// hides the intermediate state of producing an AST (the "abstract syntax").
/// The AST is itself far more complex than the HIR, so this parser serves as a
/// convenience for never having to deal with it at all.
///
/// If callers have more fine grained use cases that need an AST, then please
/// see the [`ast::parse`] module.
///
/// A `Parser` can be configured in more detail via a [`ParserBuilder`].
#[derive(Clone, Debug)]
pub struct Parser {
    // First stage: pattern string -> Ast.
    ast: ast::parse::Parser,
    // Second stage: Ast -> Hir.
    hir: hir::translate::Translator,
}
|
||||
|
||||
impl Parser {
    /// Create a new parser with a default configuration.
    ///
    /// The parser can be run with the `parse` method. The parse method returns
    /// a high level intermediate representation of the given regular
    /// expression.
    ///
    /// To set configuration options on the parser, use [`ParserBuilder`].
    pub fn new() -> Parser {
        ParserBuilder::new().build()
    }

    /// Parse the regular expression into a high level intermediate
    /// representation.
    pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir, Error> {
        // Two stages: concrete syntax -> Ast, then Ast -> Hir. Either stage
        // may fail, and both errors convert into the common `Error` type.
        let ast = self.ast.parse(pattern)?;
        let hir = self.hir.translate(pattern, &ast)?;
        Ok(hir)
    }
}
|
||||
258
third-party/vendor/regex-syntax/src/rank.rs
vendored
Normal file
258
third-party/vendor/regex-syntax/src/rank.rs
vendored
Normal file
|
|
@ -0,0 +1,258 @@
|
|||
pub(crate) const BYTE_FREQUENCIES: [u8; 256] = [
|
||||
55, // '\x00'
|
||||
52, // '\x01'
|
||||
51, // '\x02'
|
||||
50, // '\x03'
|
||||
49, // '\x04'
|
||||
48, // '\x05'
|
||||
47, // '\x06'
|
||||
46, // '\x07'
|
||||
45, // '\x08'
|
||||
103, // '\t'
|
||||
242, // '\n'
|
||||
66, // '\x0b'
|
||||
67, // '\x0c'
|
||||
229, // '\r'
|
||||
44, // '\x0e'
|
||||
43, // '\x0f'
|
||||
42, // '\x10'
|
||||
41, // '\x11'
|
||||
40, // '\x12'
|
||||
39, // '\x13'
|
||||
38, // '\x14'
|
||||
37, // '\x15'
|
||||
36, // '\x16'
|
||||
35, // '\x17'
|
||||
34, // '\x18'
|
||||
33, // '\x19'
|
||||
56, // '\x1a'
|
||||
32, // '\x1b'
|
||||
31, // '\x1c'
|
||||
30, // '\x1d'
|
||||
29, // '\x1e'
|
||||
28, // '\x1f'
|
||||
255, // ' '
|
||||
148, // '!'
|
||||
164, // '"'
|
||||
149, // '#'
|
||||
136, // '$'
|
||||
160, // '%'
|
||||
155, // '&'
|
||||
173, // "'"
|
||||
221, // '('
|
||||
222, // ')'
|
||||
134, // '*'
|
||||
122, // '+'
|
||||
232, // ','
|
||||
202, // '-'
|
||||
215, // '.'
|
||||
224, // '/'
|
||||
208, // '0'
|
||||
220, // '1'
|
||||
204, // '2'
|
||||
187, // '3'
|
||||
183, // '4'
|
||||
179, // '5'
|
||||
177, // '6'
|
||||
168, // '7'
|
||||
178, // '8'
|
||||
200, // '9'
|
||||
226, // ':'
|
||||
195, // ';'
|
||||
154, // '<'
|
||||
184, // '='
|
||||
174, // '>'
|
||||
126, // '?'
|
||||
120, // '@'
|
||||
191, // 'A'
|
||||
157, // 'B'
|
||||
194, // 'C'
|
||||
170, // 'D'
|
||||
189, // 'E'
|
||||
162, // 'F'
|
||||
161, // 'G'
|
||||
150, // 'H'
|
||||
193, // 'I'
|
||||
142, // 'J'
|
||||
137, // 'K'
|
||||
171, // 'L'
|
||||
176, // 'M'
|
||||
185, // 'N'
|
||||
167, // 'O'
|
||||
186, // 'P'
|
||||
112, // 'Q'
|
||||
175, // 'R'
|
||||
192, // 'S'
|
||||
188, // 'T'
|
||||
156, // 'U'
|
||||
140, // 'V'
|
||||
143, // 'W'
|
||||
123, // 'X'
|
||||
133, // 'Y'
|
||||
128, // 'Z'
|
||||
147, // '['
|
||||
138, // '\\'
|
||||
146, // ']'
|
||||
114, // '^'
|
||||
223, // '_'
|
||||
151, // '`'
|
||||
249, // 'a'
|
||||
216, // 'b'
|
||||
238, // 'c'
|
||||
236, // 'd'
|
||||
253, // 'e'
|
||||
227, // 'f'
|
||||
218, // 'g'
|
||||
230, // 'h'
|
||||
247, // 'i'
|
||||
135, // 'j'
|
||||
180, // 'k'
|
||||
241, // 'l'
|
||||
233, // 'm'
|
||||
246, // 'n'
|
||||
244, // 'o'
|
||||
231, // 'p'
|
||||
139, // 'q'
|
||||
245, // 'r'
|
||||
243, // 's'
|
||||
251, // 't'
|
||||
235, // 'u'
|
||||
201, // 'v'
|
||||
196, // 'w'
|
||||
240, // 'x'
|
||||
214, // 'y'
|
||||
152, // 'z'
|
||||
182, // '{'
|
||||
205, // '|'
|
||||
181, // '}'
|
||||
127, // '~'
|
||||
27, // '\x7f'
|
||||
212, // '\x80'
|
||||
211, // '\x81'
|
||||
210, // '\x82'
|
||||
213, // '\x83'
|
||||
228, // '\x84'
|
||||
197, // '\x85'
|
||||
169, // '\x86'
|
||||
159, // '\x87'
|
||||
131, // '\x88'
|
||||
172, // '\x89'
|
||||
105, // '\x8a'
|
||||
80, // '\x8b'
|
||||
98, // '\x8c'
|
||||
96, // '\x8d'
|
||||
97, // '\x8e'
|
||||
81, // '\x8f'
|
||||
207, // '\x90'
|
||||
145, // '\x91'
|
||||
116, // '\x92'
|
||||
115, // '\x93'
|
||||
144, // '\x94'
|
||||
130, // '\x95'
|
||||
153, // '\x96'
|
||||
121, // '\x97'
|
||||
107, // '\x98'
|
||||
132, // '\x99'
|
||||
109, // '\x9a'
|
||||
110, // '\x9b'
|
||||
124, // '\x9c'
|
||||
111, // '\x9d'
|
||||
82, // '\x9e'
|
||||
108, // '\x9f'
|
||||
118, // '\xa0'
|
||||
141, // '¡'
|
||||
113, // '¢'
|
||||
129, // '£'
|
||||
119, // '¤'
|
||||
125, // '¥'
|
||||
165, // '¦'
|
||||
117, // '§'
|
||||
92, // '¨'
|
||||
106, // '©'
|
||||
83, // 'ª'
|
||||
72, // '«'
|
||||
99, // '¬'
|
||||
93, // '\xad'
|
||||
65, // '®'
|
||||
79, // '¯'
|
||||
166, // '°'
|
||||
237, // '±'
|
||||
163, // '²'
|
||||
199, // '³'
|
||||
190, // '´'
|
||||
225, // 'µ'
|
||||
209, // '¶'
|
||||
203, // '·'
|
||||
198, // '¸'
|
||||
217, // '¹'
|
||||
219, // 'º'
|
||||
206, // '»'
|
||||
234, // '¼'
|
||||
248, // '½'
|
||||
158, // '¾'
|
||||
239, // '¿'
|
||||
255, // 'À'
|
||||
255, // 'Á'
|
||||
255, // 'Â'
|
||||
255, // 'Ã'
|
||||
255, // 'Ä'
|
||||
255, // 'Å'
|
||||
255, // 'Æ'
|
||||
255, // 'Ç'
|
||||
255, // 'È'
|
||||
255, // 'É'
|
||||
255, // 'Ê'
|
||||
255, // 'Ë'
|
||||
255, // 'Ì'
|
||||
255, // 'Í'
|
||||
255, // 'Î'
|
||||
255, // 'Ï'
|
||||
255, // 'Ð'
|
||||
255, // 'Ñ'
|
||||
255, // 'Ò'
|
||||
255, // 'Ó'
|
||||
255, // 'Ô'
|
||||
255, // 'Õ'
|
||||
255, // 'Ö'
|
||||
255, // '×'
|
||||
255, // 'Ø'
|
||||
255, // 'Ù'
|
||||
255, // 'Ú'
|
||||
255, // 'Û'
|
||||
255, // 'Ü'
|
||||
255, // 'Ý'
|
||||
255, // 'Þ'
|
||||
255, // 'ß'
|
||||
255, // 'à'
|
||||
255, // 'á'
|
||||
255, // 'â'
|
||||
255, // 'ã'
|
||||
255, // 'ä'
|
||||
255, // 'å'
|
||||
255, // 'æ'
|
||||
255, // 'ç'
|
||||
255, // 'è'
|
||||
255, // 'é'
|
||||
255, // 'ê'
|
||||
255, // 'ë'
|
||||
255, // 'ì'
|
||||
255, // 'í'
|
||||
255, // 'î'
|
||||
255, // 'ï'
|
||||
255, // 'ð'
|
||||
255, // 'ñ'
|
||||
255, // 'ò'
|
||||
255, // 'ó'
|
||||
255, // 'ô'
|
||||
255, // 'õ'
|
||||
255, // 'ö'
|
||||
255, // '÷'
|
||||
255, // 'ø'
|
||||
255, // 'ù'
|
||||
255, // 'ú'
|
||||
255, // 'û'
|
||||
255, // 'ü'
|
||||
255, // 'ý'
|
||||
255, // 'þ'
|
||||
255, // 'ÿ'
|
||||
];
|
||||
1039
third-party/vendor/regex-syntax/src/unicode.rs
vendored
Normal file
1039
third-party/vendor/regex-syntax/src/unicode.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
57
third-party/vendor/regex-syntax/src/unicode_tables/LICENSE-UNICODE
vendored
Normal file
57
third-party/vendor/regex-syntax/src/unicode_tables/LICENSE-UNICODE
vendored
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
|
||||
|
||||
Unicode Data Files include all data files under the directories
|
||||
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
|
||||
http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and
|
||||
http://www.unicode.org/utility/trac/browser/.
|
||||
|
||||
Unicode Data Files do not include PDF online code charts under the
|
||||
directory http://www.unicode.org/Public/.
|
||||
|
||||
Software includes any source code published in the Unicode Standard
|
||||
or under the directories
|
||||
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
|
||||
http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and
|
||||
http://www.unicode.org/utility/trac/browser/.
|
||||
|
||||
NOTICE TO USER: Carefully read the following legal agreement.
|
||||
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
|
||||
DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
|
||||
YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
|
||||
TERMS AND CONDITIONS OF THIS AGREEMENT.
|
||||
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
|
||||
THE DATA FILES OR SOFTWARE.
|
||||
|
||||
COPYRIGHT AND PERMISSION NOTICE
|
||||
|
||||
Copyright © 1991-2018 Unicode, Inc. All rights reserved.
|
||||
Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of the Unicode data files and any associated documentation
|
||||
(the "Data Files") or Unicode software and any associated documentation
|
||||
(the "Software") to deal in the Data Files or Software
|
||||
without restriction, including without limitation the rights to use,
|
||||
copy, modify, merge, publish, distribute, and/or sell copies of
|
||||
the Data Files or Software, and to permit persons to whom the Data Files
|
||||
or Software are furnished to do so, provided that either
|
||||
(a) this copyright and permission notice appear with all copies
|
||||
of the Data Files or Software, or
|
||||
(b) this copyright and permission notice appear in associated
|
||||
Documentation.
|
||||
|
||||
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
||||
WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT OF THIRD PARTY RIGHTS.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
|
||||
NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
|
||||
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
|
||||
DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
|
||||
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
||||
PERFORMANCE OF THE DATA FILES OR SOFTWARE.
|
||||
|
||||
Except as contained in this notice, the name of a copyright holder
|
||||
shall not be used in advertising or otherwise to promote the sale,
|
||||
use or other dealings in these Data Files or Software without prior
|
||||
written authorization of the copyright holder.
|
||||
1791
third-party/vendor/regex-syntax/src/unicode_tables/age.rs
vendored
Normal file
1791
third-party/vendor/regex-syntax/src/unicode_tables/age.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
2888
third-party/vendor/regex-syntax/src/unicode_tables/case_folding_simple.rs
vendored
Normal file
2888
third-party/vendor/regex-syntax/src/unicode_tables/case_folding_simple.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
6552
third-party/vendor/regex-syntax/src/unicode_tables/general_category.rs
vendored
Normal file
6552
third-party/vendor/regex-syntax/src/unicode_tables/general_category.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
1416
third-party/vendor/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs
vendored
Normal file
1416
third-party/vendor/regex-syntax/src/unicode_tables/grapheme_cluster_break.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
57
third-party/vendor/regex-syntax/src/unicode_tables/mod.rs
vendored
Normal file
57
third-party/vendor/regex-syntax/src/unicode_tables/mod.rs
vendored
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
#[cfg(feature = "unicode-age")]
|
||||
pub mod age;
|
||||
|
||||
#[cfg(feature = "unicode-case")]
|
||||
pub mod case_folding_simple;
|
||||
|
||||
#[cfg(feature = "unicode-gencat")]
|
||||
pub mod general_category;
|
||||
|
||||
#[cfg(feature = "unicode-segment")]
|
||||
pub mod grapheme_cluster_break;
|
||||
|
||||
#[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))]
|
||||
#[allow(dead_code)]
|
||||
pub mod perl_decimal;
|
||||
|
||||
#[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))]
|
||||
#[allow(dead_code)]
|
||||
pub mod perl_space;
|
||||
|
||||
#[cfg(feature = "unicode-perl")]
|
||||
pub mod perl_word;
|
||||
|
||||
#[cfg(feature = "unicode-bool")]
|
||||
pub mod property_bool;
|
||||
|
||||
#[cfg(any(
|
||||
feature = "unicode-age",
|
||||
feature = "unicode-bool",
|
||||
feature = "unicode-gencat",
|
||||
feature = "unicode-perl",
|
||||
feature = "unicode-script",
|
||||
feature = "unicode-segment",
|
||||
))]
|
||||
pub mod property_names;
|
||||
|
||||
#[cfg(any(
|
||||
feature = "unicode-age",
|
||||
feature = "unicode-bool",
|
||||
feature = "unicode-gencat",
|
||||
feature = "unicode-perl",
|
||||
feature = "unicode-script",
|
||||
feature = "unicode-segment",
|
||||
))]
|
||||
pub mod property_values;
|
||||
|
||||
#[cfg(feature = "unicode-script")]
|
||||
pub mod script;
|
||||
|
||||
#[cfg(feature = "unicode-script")]
|
||||
pub mod script_extension;
|
||||
|
||||
#[cfg(feature = "unicode-segment")]
|
||||
pub mod sentence_break;
|
||||
|
||||
#[cfg(feature = "unicode-segment")]
|
||||
pub mod word_break;
|
||||
77
third-party/vendor/regex-syntax/src/unicode_tables/perl_decimal.rs
vendored
Normal file
77
third-party/vendor/regex-syntax/src/unicode_tables/perl_decimal.rs
vendored
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
|
||||
//
|
||||
// ucd-generate general-category ucd-15.0.0 --chars --include decimalnumber
|
||||
//
|
||||
// Unicode version: 15.0.0.
|
||||
//
|
||||
// ucd-generate 0.2.14 is available on crates.io.
|
||||
|
||||
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] =
|
||||
&[("Decimal_Number", DECIMAL_NUMBER)];
|
||||
|
||||
pub const DECIMAL_NUMBER: &'static [(char, char)] = &[
|
||||
('0', '9'),
|
||||
('٠', '٩'),
|
||||
('۰', '۹'),
|
||||
('߀', '߉'),
|
||||
('०', '९'),
|
||||
('০', '৯'),
|
||||
('੦', '੯'),
|
||||
('૦', '૯'),
|
||||
('୦', '୯'),
|
||||
('௦', '௯'),
|
||||
('౦', '౯'),
|
||||
('೦', '೯'),
|
||||
('൦', '൯'),
|
||||
('෦', '෯'),
|
||||
('๐', '๙'),
|
||||
('໐', '໙'),
|
||||
('༠', '༩'),
|
||||
('၀', '၉'),
|
||||
('႐', '႙'),
|
||||
('០', '៩'),
|
||||
('᠐', '᠙'),
|
||||
('᥆', '᥏'),
|
||||
('᧐', '᧙'),
|
||||
('᪀', '᪉'),
|
||||
('᪐', '᪙'),
|
||||
('᭐', '᭙'),
|
||||
('᮰', '᮹'),
|
||||
('᱀', '᱉'),
|
||||
('᱐', '᱙'),
|
||||
('꘠', '꘩'),
|
||||
('꣐', '꣙'),
|
||||
('꤀', '꤉'),
|
||||
('꧐', '꧙'),
|
||||
('꧰', '꧹'),
|
||||
('꩐', '꩙'),
|
||||
('꯰', '꯹'),
|
||||
('0', '9'),
|
||||
('𐒠', '𐒩'),
|
||||
('𐴰', '𐴹'),
|
||||
('𑁦', '𑁯'),
|
||||
('𑃰', '𑃹'),
|
||||
('𑄶', '𑄿'),
|
||||
('𑇐', '𑇙'),
|
||||
('𑋰', '𑋹'),
|
||||
('𑑐', '𑑙'),
|
||||
('𑓐', '𑓙'),
|
||||
('𑙐', '𑙙'),
|
||||
('𑛀', '𑛉'),
|
||||
('𑜰', '𑜹'),
|
||||
('𑣠', '𑣩'),
|
||||
('𑥐', '𑥙'),
|
||||
('𑱐', '𑱙'),
|
||||
('𑵐', '𑵙'),
|
||||
('𑶠', '𑶩'),
|
||||
('𑽐', '𑽙'),
|
||||
('𖩠', '𖩩'),
|
||||
('𖫀', '𖫉'),
|
||||
('𖭐', '𖭙'),
|
||||
('𝟎', '𝟿'),
|
||||
('𞅀', '𞅉'),
|
||||
('𞋰', '𞋹'),
|
||||
('𞓰', '𞓹'),
|
||||
('𞥐', '𞥙'),
|
||||
('🯰', '🯹'),
|
||||
];
|
||||
23
third-party/vendor/regex-syntax/src/unicode_tables/perl_space.rs
vendored
Normal file
23
third-party/vendor/regex-syntax/src/unicode_tables/perl_space.rs
vendored
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
|
||||
//
|
||||
// ucd-generate property-bool ucd-15.0.0 --chars --include whitespace
|
||||
//
|
||||
// Unicode version: 15.0.0.
|
||||
//
|
||||
// ucd-generate 0.2.14 is available on crates.io.
|
||||
|
||||
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] =
|
||||
&[("White_Space", WHITE_SPACE)];
|
||||
|
||||
pub const WHITE_SPACE: &'static [(char, char)] = &[
|
||||
('\t', '\r'),
|
||||
(' ', ' '),
|
||||
('\u{85}', '\u{85}'),
|
||||
('\u{a0}', '\u{a0}'),
|
||||
('\u{1680}', '\u{1680}'),
|
||||
('\u{2000}', '\u{200a}'),
|
||||
('\u{2028}', '\u{2029}'),
|
||||
('\u{202f}', '\u{202f}'),
|
||||
('\u{205f}', '\u{205f}'),
|
||||
('\u{3000}', '\u{3000}'),
|
||||
];
|
||||
781
third-party/vendor/regex-syntax/src/unicode_tables/perl_word.rs
vendored
Normal file
781
third-party/vendor/regex-syntax/src/unicode_tables/perl_word.rs
vendored
Normal file
|
|
@ -0,0 +1,781 @@
|
|||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
|
||||
//
|
||||
// ucd-generate perl-word ucd-15.0.0 --chars
|
||||
//
|
||||
// Unicode version: 15.0.0.
|
||||
//
|
||||
// ucd-generate 0.2.14 is available on crates.io.
|
||||
|
||||
pub const PERL_WORD: &'static [(char, char)] = &[
|
||||
('0', '9'),
|
||||
('A', 'Z'),
|
||||
('_', '_'),
|
||||
('a', 'z'),
|
||||
('ª', 'ª'),
|
||||
('µ', 'µ'),
|
||||
('º', 'º'),
|
||||
('À', 'Ö'),
|
||||
('Ø', 'ö'),
|
||||
('ø', 'ˁ'),
|
||||
('ˆ', 'ˑ'),
|
||||
('ˠ', 'ˤ'),
|
||||
('ˬ', 'ˬ'),
|
||||
('ˮ', 'ˮ'),
|
||||
('\u{300}', 'ʹ'),
|
||||
('Ͷ', 'ͷ'),
|
||||
('ͺ', 'ͽ'),
|
||||
('Ϳ', 'Ϳ'),
|
||||
('Ά', 'Ά'),
|
||||
('Έ', 'Ί'),
|
||||
('Ό', 'Ό'),
|
||||
('Ύ', 'Ρ'),
|
||||
('Σ', 'ϵ'),
|
||||
('Ϸ', 'ҁ'),
|
||||
('\u{483}', 'ԯ'),
|
||||
('Ա', 'Ֆ'),
|
||||
('ՙ', 'ՙ'),
|
||||
('ՠ', 'ֈ'),
|
||||
('\u{591}', '\u{5bd}'),
|
||||
('\u{5bf}', '\u{5bf}'),
|
||||
('\u{5c1}', '\u{5c2}'),
|
||||
('\u{5c4}', '\u{5c5}'),
|
||||
('\u{5c7}', '\u{5c7}'),
|
||||
('א', 'ת'),
|
||||
('ׯ', 'ײ'),
|
||||
('\u{610}', '\u{61a}'),
|
||||
('ؠ', '٩'),
|
||||
('ٮ', 'ۓ'),
|
||||
('ە', '\u{6dc}'),
|
||||
('\u{6df}', '\u{6e8}'),
|
||||
('\u{6ea}', 'ۼ'),
|
||||
('ۿ', 'ۿ'),
|
||||
('ܐ', '\u{74a}'),
|
||||
('ݍ', 'ޱ'),
|
||||
('߀', 'ߵ'),
|
||||
('ߺ', 'ߺ'),
|
||||
('\u{7fd}', '\u{7fd}'),
|
||||
('ࠀ', '\u{82d}'),
|
||||
('ࡀ', '\u{85b}'),
|
||||
('ࡠ', 'ࡪ'),
|
||||
('ࡰ', 'ࢇ'),
|
||||
('ࢉ', 'ࢎ'),
|
||||
('\u{898}', '\u{8e1}'),
|
||||
('\u{8e3}', '\u{963}'),
|
||||
('०', '९'),
|
||||
('ॱ', 'ঃ'),
|
||||
('অ', 'ঌ'),
|
||||
('এ', 'ঐ'),
|
||||
('ও', 'ন'),
|
||||
('প', 'র'),
|
||||
('ল', 'ল'),
|
||||
('শ', 'হ'),
|
||||
('\u{9bc}', '\u{9c4}'),
|
||||
('ে', 'ৈ'),
|
||||
('ো', 'ৎ'),
|
||||
('\u{9d7}', '\u{9d7}'),
|
||||
('ড়', 'ঢ়'),
|
||||
('য়', '\u{9e3}'),
|
||||
('০', 'ৱ'),
|
||||
('ৼ', 'ৼ'),
|
||||
('\u{9fe}', '\u{9fe}'),
|
||||
('\u{a01}', 'ਃ'),
|
||||
('ਅ', 'ਊ'),
|
||||
('ਏ', 'ਐ'),
|
||||
('ਓ', 'ਨ'),
|
||||
('ਪ', 'ਰ'),
|
||||
('ਲ', 'ਲ਼'),
|
||||
('ਵ', 'ਸ਼'),
|
||||
('ਸ', 'ਹ'),
|
||||
('\u{a3c}', '\u{a3c}'),
|
||||
('ਾ', '\u{a42}'),
|
||||
('\u{a47}', '\u{a48}'),
|
||||
('\u{a4b}', '\u{a4d}'),
|
||||
('\u{a51}', '\u{a51}'),
|
||||
('ਖ਼', 'ੜ'),
|
||||
('ਫ਼', 'ਫ਼'),
|
||||
('੦', '\u{a75}'),
|
||||
('\u{a81}', 'ઃ'),
|
||||
('અ', 'ઍ'),
|
||||
('એ', 'ઑ'),
|
||||
('ઓ', 'ન'),
|
||||
('પ', 'ર'),
|
||||
('લ', 'ળ'),
|
||||
('વ', 'હ'),
|
||||
('\u{abc}', '\u{ac5}'),
|
||||
('\u{ac7}', 'ૉ'),
|
||||
('ો', '\u{acd}'),
|
||||
('ૐ', 'ૐ'),
|
||||
('ૠ', '\u{ae3}'),
|
||||
('૦', '૯'),
|
||||
('ૹ', '\u{aff}'),
|
||||
('\u{b01}', 'ଃ'),
|
||||
('ଅ', 'ଌ'),
|
||||
('ଏ', 'ଐ'),
|
||||
('ଓ', 'ନ'),
|
||||
('ପ', 'ର'),
|
||||
('ଲ', 'ଳ'),
|
||||
('ଵ', 'ହ'),
|
||||
('\u{b3c}', '\u{b44}'),
|
||||
('େ', 'ୈ'),
|
||||
('ୋ', '\u{b4d}'),
|
||||
('\u{b55}', '\u{b57}'),
|
||||
('ଡ଼', 'ଢ଼'),
|
||||
('ୟ', '\u{b63}'),
|
||||
('୦', '୯'),
|
||||
('ୱ', 'ୱ'),
|
||||
('\u{b82}', 'ஃ'),
|
||||
('அ', 'ஊ'),
|
||||
('எ', 'ஐ'),
|
||||
('ஒ', 'க'),
|
||||
('ங', 'ச'),
|
||||
('ஜ', 'ஜ'),
|
||||
('ஞ', 'ட'),
|
||||
('ண', 'த'),
|
||||
('ந', 'ப'),
|
||||
('ம', 'ஹ'),
|
||||
('\u{bbe}', 'ூ'),
|
||||
('ெ', 'ை'),
|
||||
('ொ', '\u{bcd}'),
|
||||
('ௐ', 'ௐ'),
|
||||
('\u{bd7}', '\u{bd7}'),
|
||||
('௦', '௯'),
|
||||
('\u{c00}', 'ఌ'),
|
||||
('ఎ', 'ఐ'),
|
||||
('ఒ', 'న'),
|
||||
('ప', 'హ'),
|
||||
('\u{c3c}', 'ౄ'),
|
||||
('\u{c46}', '\u{c48}'),
|
||||
('\u{c4a}', '\u{c4d}'),
|
||||
('\u{c55}', '\u{c56}'),
|
||||
('ౘ', 'ౚ'),
|
||||
('ౝ', 'ౝ'),
|
||||
('ౠ', '\u{c63}'),
|
||||
('౦', '౯'),
|
||||
('ಀ', 'ಃ'),
|
||||
('ಅ', 'ಌ'),
|
||||
('ಎ', 'ಐ'),
|
||||
('ಒ', 'ನ'),
|
||||
('ಪ', 'ಳ'),
|
||||
('ವ', 'ಹ'),
|
||||
('\u{cbc}', 'ೄ'),
|
||||
('\u{cc6}', 'ೈ'),
|
||||
('ೊ', '\u{ccd}'),
|
||||
('\u{cd5}', '\u{cd6}'),
|
||||
('ೝ', 'ೞ'),
|
||||
('ೠ', '\u{ce3}'),
|
||||
('೦', '೯'),
|
||||
('ೱ', 'ೳ'),
|
||||
('\u{d00}', 'ഌ'),
|
||||
('എ', 'ഐ'),
|
||||
('ഒ', '\u{d44}'),
|
||||
('െ', 'ൈ'),
|
||||
('ൊ', 'ൎ'),
|
||||
('ൔ', '\u{d57}'),
|
||||
('ൟ', '\u{d63}'),
|
||||
('൦', '൯'),
|
||||
('ൺ', 'ൿ'),
|
||||
('\u{d81}', 'ඃ'),
|
||||
('අ', 'ඖ'),
|
||||
('ක', 'න'),
|
||||
('ඳ', 'ර'),
|
||||
('ල', 'ල'),
|
||||
('ව', 'ෆ'),
|
||||
('\u{dca}', '\u{dca}'),
|
||||
('\u{dcf}', '\u{dd4}'),
|
||||
('\u{dd6}', '\u{dd6}'),
|
||||
('ෘ', '\u{ddf}'),
|
||||
('෦', '෯'),
|
||||
('ෲ', 'ෳ'),
|
||||
('ก', '\u{e3a}'),
|
||||
('เ', '\u{e4e}'),
|
||||
('๐', '๙'),
|
||||
('ກ', 'ຂ'),
|
||||
('ຄ', 'ຄ'),
|
||||
('ຆ', 'ຊ'),
|
||||
('ຌ', 'ຣ'),
|
||||
('ລ', 'ລ'),
|
||||
('ວ', 'ຽ'),
|
||||
('ເ', 'ໄ'),
|
||||
('ໆ', 'ໆ'),
|
||||
('\u{ec8}', '\u{ece}'),
|
||||
('໐', '໙'),
|
||||
('ໜ', 'ໟ'),
|
||||
('ༀ', 'ༀ'),
|
||||
('\u{f18}', '\u{f19}'),
|
||||
('༠', '༩'),
|
||||
('\u{f35}', '\u{f35}'),
|
||||
('\u{f37}', '\u{f37}'),
|
||||
('\u{f39}', '\u{f39}'),
|
||||
('༾', 'ཇ'),
|
||||
('ཉ', 'ཬ'),
|
||||
('\u{f71}', '\u{f84}'),
|
||||
('\u{f86}', '\u{f97}'),
|
||||
('\u{f99}', '\u{fbc}'),
|
||||
('\u{fc6}', '\u{fc6}'),
|
||||
('က', '၉'),
|
||||
('ၐ', '\u{109d}'),
|
||||
('Ⴀ', 'Ⴥ'),
|
||||
('Ⴧ', 'Ⴧ'),
|
||||
('Ⴭ', 'Ⴭ'),
|
||||
('ა', 'ჺ'),
|
||||
('ჼ', 'ቈ'),
|
||||
('ቊ', 'ቍ'),
|
||||
('ቐ', 'ቖ'),
|
||||
('ቘ', 'ቘ'),
|
||||
('ቚ', 'ቝ'),
|
||||
('በ', 'ኈ'),
|
||||
('ኊ', 'ኍ'),
|
||||
('ነ', 'ኰ'),
|
||||
('ኲ', 'ኵ'),
|
||||
('ኸ', 'ኾ'),
|
||||
('ዀ', 'ዀ'),
|
||||
('ዂ', 'ዅ'),
|
||||
('ወ', 'ዖ'),
|
||||
('ዘ', 'ጐ'),
|
||||
('ጒ', 'ጕ'),
|
||||
('ጘ', 'ፚ'),
|
||||
('\u{135d}', '\u{135f}'),
|
||||
('ᎀ', 'ᎏ'),
|
||||
('Ꭰ', 'Ᏽ'),
|
||||
('ᏸ', 'ᏽ'),
|
||||
('ᐁ', 'ᙬ'),
|
||||
('ᙯ', 'ᙿ'),
|
||||
('ᚁ', 'ᚚ'),
|
||||
('ᚠ', 'ᛪ'),
|
||||
('ᛮ', 'ᛸ'),
|
||||
('ᜀ', '᜕'),
|
||||
('ᜟ', '᜴'),
|
||||
('ᝀ', '\u{1753}'),
|
||||
('ᝠ', 'ᝬ'),
|
||||
('ᝮ', 'ᝰ'),
|
||||
('\u{1772}', '\u{1773}'),
|
||||
('ក', '\u{17d3}'),
|
||||
('ៗ', 'ៗ'),
|
||||
('ៜ', '\u{17dd}'),
|
||||
('០', '៩'),
|
||||
('\u{180b}', '\u{180d}'),
|
||||
('\u{180f}', '᠙'),
|
||||
('ᠠ', 'ᡸ'),
|
||||
('ᢀ', 'ᢪ'),
|
||||
('ᢰ', 'ᣵ'),
|
||||
('ᤀ', 'ᤞ'),
|
||||
('\u{1920}', 'ᤫ'),
|
||||
('ᤰ', '\u{193b}'),
|
||||
('᥆', 'ᥭ'),
|
||||
('ᥰ', 'ᥴ'),
|
||||
('ᦀ', 'ᦫ'),
|
||||
('ᦰ', 'ᧉ'),
|
||||
('᧐', '᧙'),
|
||||
('ᨀ', '\u{1a1b}'),
|
||||
('ᨠ', '\u{1a5e}'),
|
||||
('\u{1a60}', '\u{1a7c}'),
|
||||
('\u{1a7f}', '᪉'),
|
||||
('᪐', '᪙'),
|
||||
('ᪧ', 'ᪧ'),
|
||||
('\u{1ab0}', '\u{1ace}'),
|
||||
('\u{1b00}', 'ᭌ'),
|
||||
('᭐', '᭙'),
|
||||
('\u{1b6b}', '\u{1b73}'),
|
||||
('\u{1b80}', '᯳'),
|
||||
('ᰀ', '\u{1c37}'),
|
||||
('᱀', '᱉'),
|
||||
('ᱍ', 'ᱽ'),
|
||||
('ᲀ', 'ᲈ'),
|
||||
('Ა', 'Ჺ'),
|
||||
('Ჽ', 'Ჿ'),
|
||||
('\u{1cd0}', '\u{1cd2}'),
|
||||
('\u{1cd4}', 'ᳺ'),
|
||||
('ᴀ', 'ἕ'),
|
||||
('Ἐ', 'Ἕ'),
|
||||
('ἠ', 'ὅ'),
|
||||
('Ὀ', 'Ὅ'),
|
||||
('ὐ', 'ὗ'),
|
||||
('Ὑ', 'Ὑ'),
|
||||
('Ὓ', 'Ὓ'),
|
||||
('Ὕ', 'Ὕ'),
|
||||
('Ὗ', 'ώ'),
|
||||
('ᾀ', 'ᾴ'),
|
||||
('ᾶ', 'ᾼ'),
|
||||
('ι', 'ι'),
|
||||
('ῂ', 'ῄ'),
|
||||
('ῆ', 'ῌ'),
|
||||
('ῐ', 'ΐ'),
|
||||
('ῖ', 'Ί'),
|
||||
('ῠ', 'Ῥ'),
|
||||
('ῲ', 'ῴ'),
|
||||
('ῶ', 'ῼ'),
|
||||
('\u{200c}', '\u{200d}'),
|
||||
('‿', '⁀'),
|
||||
('⁔', '⁔'),
|
||||
('ⁱ', 'ⁱ'),
|
||||
('ⁿ', 'ⁿ'),
|
||||
('ₐ', 'ₜ'),
|
||||
('\u{20d0}', '\u{20f0}'),
|
||||
('ℂ', 'ℂ'),
|
||||
('ℇ', 'ℇ'),
|
||||
('ℊ', 'ℓ'),
|
||||
('ℕ', 'ℕ'),
|
||||
('ℙ', 'ℝ'),
|
||||
('ℤ', 'ℤ'),
|
||||
('Ω', 'Ω'),
|
||||
('ℨ', 'ℨ'),
|
||||
('K', 'ℭ'),
|
||||
('ℯ', 'ℹ'),
|
||||
('ℼ', 'ℿ'),
|
||||
('ⅅ', 'ⅉ'),
|
||||
('ⅎ', 'ⅎ'),
|
||||
('Ⅰ', 'ↈ'),
|
||||
('Ⓐ', 'ⓩ'),
|
||||
('Ⰰ', 'ⳤ'),
|
||||
('Ⳬ', 'ⳳ'),
|
||||
('ⴀ', 'ⴥ'),
|
||||
('ⴧ', 'ⴧ'),
|
||||
('ⴭ', 'ⴭ'),
|
||||
('ⴰ', 'ⵧ'),
|
||||
('ⵯ', 'ⵯ'),
|
||||
('\u{2d7f}', 'ⶖ'),
|
||||
('ⶠ', 'ⶦ'),
|
||||
('ⶨ', 'ⶮ'),
|
||||
('ⶰ', 'ⶶ'),
|
||||
('ⶸ', 'ⶾ'),
|
||||
('ⷀ', 'ⷆ'),
|
||||
('ⷈ', 'ⷎ'),
|
||||
('ⷐ', 'ⷖ'),
|
||||
('ⷘ', 'ⷞ'),
|
||||
('\u{2de0}', '\u{2dff}'),
|
||||
('ⸯ', 'ⸯ'),
|
||||
('々', '〇'),
|
||||
('〡', '\u{302f}'),
|
||||
('〱', '〵'),
|
||||
('〸', '〼'),
|
||||
('ぁ', 'ゖ'),
|
||||
('\u{3099}', '\u{309a}'),
|
||||
('ゝ', 'ゟ'),
|
||||
('ァ', 'ヺ'),
|
||||
('ー', 'ヿ'),
|
||||
('ㄅ', 'ㄯ'),
|
||||
('ㄱ', 'ㆎ'),
|
||||
('ㆠ', 'ㆿ'),
|
||||
('ㇰ', 'ㇿ'),
|
||||
('㐀', '䶿'),
|
||||
('一', 'ꒌ'),
|
||||
('ꓐ', 'ꓽ'),
|
||||
('ꔀ', 'ꘌ'),
|
||||
('ꘐ', 'ꘫ'),
|
||||
('Ꙁ', '\u{a672}'),
|
||||
('\u{a674}', '\u{a67d}'),
|
||||
('ꙿ', '\u{a6f1}'),
|
||||
('ꜗ', 'ꜟ'),
|
||||
('Ꜣ', 'ꞈ'),
|
||||
('Ꞌ', 'ꟊ'),
|
||||
('Ꟑ', 'ꟑ'),
|
||||
('ꟓ', 'ꟓ'),
|
||||
('ꟕ', 'ꟙ'),
|
||||
('ꟲ', 'ꠧ'),
|
||||
('\u{a82c}', '\u{a82c}'),
|
||||
('ꡀ', 'ꡳ'),
|
||||
('ꢀ', '\u{a8c5}'),
|
||||
('꣐', '꣙'),
|
||||
('\u{a8e0}', 'ꣷ'),
|
||||
('ꣻ', 'ꣻ'),
|
||||
('ꣽ', '\u{a92d}'),
|
||||
('ꤰ', '꥓'),
|
||||
('ꥠ', 'ꥼ'),
|
||||
('\u{a980}', '꧀'),
|
||||
('ꧏ', '꧙'),
|
||||
('ꧠ', 'ꧾ'),
|
||||
('ꨀ', '\u{aa36}'),
|
||||
('ꩀ', 'ꩍ'),
|
||||
('꩐', '꩙'),
|
||||
('ꩠ', 'ꩶ'),
|
||||
('ꩺ', 'ꫂ'),
|
||||
('ꫛ', 'ꫝ'),
|
||||
('ꫠ', 'ꫯ'),
|
||||
('ꫲ', '\u{aaf6}'),
|
||||
('ꬁ', 'ꬆ'),
|
||||
('ꬉ', 'ꬎ'),
|
||||
('ꬑ', 'ꬖ'),
|
||||
('ꬠ', 'ꬦ'),
|
||||
('ꬨ', 'ꬮ'),
|
||||
('ꬰ', 'ꭚ'),
|
||||
('ꭜ', 'ꭩ'),
|
||||
('ꭰ', 'ꯪ'),
|
||||
('꯬', '\u{abed}'),
|
||||
('꯰', '꯹'),
|
||||
('가', '힣'),
|
||||
('ힰ', 'ퟆ'),
|
||||
('ퟋ', 'ퟻ'),
|
||||
('豈', '舘'),
|
||||
('並', '龎'),
|
||||
('ff', 'st'),
|
||||
('ﬓ', 'ﬗ'),
|
||||
('יִ', 'ﬨ'),
|
||||
('שׁ', 'זּ'),
|
||||
('טּ', 'לּ'),
|
||||
('מּ', 'מּ'),
|
||||
('נּ', 'סּ'),
|
||||
('ףּ', 'פּ'),
|
||||
('צּ', 'ﮱ'),
|
||||
('ﯓ', 'ﴽ'),
|
||||
('ﵐ', 'ﶏ'),
|
||||
('ﶒ', 'ﷇ'),
|
||||
('ﷰ', 'ﷻ'),
|
||||
('\u{fe00}', '\u{fe0f}'),
|
||||
('\u{fe20}', '\u{fe2f}'),
|
||||
('︳', '︴'),
|
||||
('﹍', '﹏'),
|
||||
('ﹰ', 'ﹴ'),
|
||||
('ﹶ', 'ﻼ'),
|
||||
('0', '9'),
|
||||
('A', 'Z'),
|
||||
('_', '_'),
|
||||
('a', 'z'),
|
||||
('ヲ', 'ᄒ'),
|
||||
('ᅡ', 'ᅦ'),
|
||||
('ᅧ', 'ᅬ'),
|
||||
('ᅭ', 'ᅲ'),
|
||||
('ᅳ', 'ᅵ'),
|
||||
('𐀀', '𐀋'),
|
||||
('𐀍', '𐀦'),
|
||||
('𐀨', '𐀺'),
|
||||
('𐀼', '𐀽'),
|
||||
('𐀿', '𐁍'),
|
||||
('𐁐', '𐁝'),
|
||||
('𐂀', '𐃺'),
|
||||
('𐅀', '𐅴'),
|
||||
('\u{101fd}', '\u{101fd}'),
|
||||
('𐊀', '𐊜'),
|
||||
('𐊠', '𐋐'),
|
||||
('\u{102e0}', '\u{102e0}'),
|
||||
('𐌀', '𐌟'),
|
||||
('𐌭', '𐍊'),
|
||||
('𐍐', '\u{1037a}'),
|
||||
('𐎀', '𐎝'),
|
||||
('𐎠', '𐏃'),
|
||||
('𐏈', '𐏏'),
|
||||
('𐏑', '𐏕'),
|
||||
('𐐀', '𐒝'),
|
||||
('𐒠', '𐒩'),
|
||||
('𐒰', '𐓓'),
|
||||
('𐓘', '𐓻'),
|
||||
('𐔀', '𐔧'),
|
||||
('𐔰', '𐕣'),
|
||||
('𐕰', '𐕺'),
|
||||
('𐕼', '𐖊'),
|
||||
('𐖌', '𐖒'),
|
||||
('𐖔', '𐖕'),
|
||||
('𐖗', '𐖡'),
|
||||
('𐖣', '𐖱'),
|
||||
('𐖳', '𐖹'),
|
||||
('𐖻', '𐖼'),
|
||||
('𐘀', '𐜶'),
|
||||
('𐝀', '𐝕'),
|
||||
('𐝠', '𐝧'),
|
||||
('𐞀', '𐞅'),
|
||||
('𐞇', '𐞰'),
|
||||
('𐞲', '𐞺'),
|
||||
('𐠀', '𐠅'),
|
||||
('𐠈', '𐠈'),
|
||||
('𐠊', '𐠵'),
|
||||
('𐠷', '𐠸'),
|
||||
('𐠼', '𐠼'),
|
||||
('𐠿', '𐡕'),
|
||||
('𐡠', '𐡶'),
|
||||
('𐢀', '𐢞'),
|
||||
('𐣠', '𐣲'),
|
||||
('𐣴', '𐣵'),
|
||||
('𐤀', '𐤕'),
|
||||
('𐤠', '𐤹'),
|
||||
('𐦀', '𐦷'),
|
||||
('𐦾', '𐦿'),
|
||||
('𐨀', '\u{10a03}'),
|
||||
('\u{10a05}', '\u{10a06}'),
|
||||
('\u{10a0c}', '𐨓'),
|
||||
('𐨕', '𐨗'),
|
||||
('𐨙', '𐨵'),
|
||||
('\u{10a38}', '\u{10a3a}'),
|
||||
('\u{10a3f}', '\u{10a3f}'),
|
||||
('𐩠', '𐩼'),
|
||||
('𐪀', '𐪜'),
|
||||
('𐫀', '𐫇'),
|
||||
('𐫉', '\u{10ae6}'),
|
||||
('𐬀', '𐬵'),
|
||||
('𐭀', '𐭕'),
|
||||
('𐭠', '𐭲'),
|
||||
('𐮀', '𐮑'),
|
||||
('𐰀', '𐱈'),
|
||||
('𐲀', '𐲲'),
|
||||
('𐳀', '𐳲'),
|
||||
('𐴀', '\u{10d27}'),
|
||||
('𐴰', '𐴹'),
|
||||
('𐺀', '𐺩'),
|
||||
('\u{10eab}', '\u{10eac}'),
|
||||
('𐺰', '𐺱'),
|
||||
('\u{10efd}', '𐼜'),
|
||||
('𐼧', '𐼧'),
|
||||
('𐼰', '\u{10f50}'),
|
||||
('𐽰', '\u{10f85}'),
|
||||
('𐾰', '𐿄'),
|
||||
('𐿠', '𐿶'),
|
||||
('𑀀', '\u{11046}'),
|
||||
('𑁦', '𑁵'),
|
||||
('\u{1107f}', '\u{110ba}'),
|
||||
('\u{110c2}', '\u{110c2}'),
|
||||
('𑃐', '𑃨'),
|
||||
('𑃰', '𑃹'),
|
||||
('\u{11100}', '\u{11134}'),
|
||||
('𑄶', '𑄿'),
|
||||
('𑅄', '𑅇'),
|
||||
('𑅐', '\u{11173}'),
|
||||
('𑅶', '𑅶'),
|
||||
('\u{11180}', '𑇄'),
|
||||
('\u{111c9}', '\u{111cc}'),
|
||||
('𑇎', '𑇚'),
|
||||
('𑇜', '𑇜'),
|
||||
('𑈀', '𑈑'),
|
||||
('𑈓', '\u{11237}'),
|
||||
('\u{1123e}', '\u{11241}'),
|
||||
('𑊀', '𑊆'),
|
||||
('𑊈', '𑊈'),
|
||||
('𑊊', '𑊍'),
|
||||
('𑊏', '𑊝'),
|
||||
('𑊟', '𑊨'),
|
||||
('𑊰', '\u{112ea}'),
|
||||
('𑋰', '𑋹'),
|
||||
('\u{11300}', '𑌃'),
|
||||
('𑌅', '𑌌'),
|
||||
('𑌏', '𑌐'),
|
||||
('𑌓', '𑌨'),
|
||||
('𑌪', '𑌰'),
|
||||
('𑌲', '𑌳'),
|
||||
('𑌵', '𑌹'),
|
||||
('\u{1133b}', '𑍄'),
|
||||
('𑍇', '𑍈'),
|
||||
('𑍋', '𑍍'),
|
||||
('𑍐', '𑍐'),
|
||||
('\u{11357}', '\u{11357}'),
|
||||
('𑍝', '𑍣'),
|
||||
('\u{11366}', '\u{1136c}'),
|
||||
('\u{11370}', '\u{11374}'),
|
||||
('𑐀', '𑑊'),
|
||||
('𑑐', '𑑙'),
|
||||
('\u{1145e}', '𑑡'),
|
||||
('𑒀', '𑓅'),
|
||||
('𑓇', '𑓇'),
|
||||
('𑓐', '𑓙'),
|
||||
('𑖀', '\u{115b5}'),
|
||||
('𑖸', '\u{115c0}'),
|
||||
('𑗘', '\u{115dd}'),
|
||||
('𑘀', '\u{11640}'),
|
||||
('𑙄', '𑙄'),
|
||||
('𑙐', '𑙙'),
|
||||
('𑚀', '𑚸'),
|
||||
('𑛀', '𑛉'),
|
||||
('𑜀', '𑜚'),
|
||||
('\u{1171d}', '\u{1172b}'),
|
||||
('𑜰', '𑜹'),
|
||||
('𑝀', '𑝆'),
|
||||
('𑠀', '\u{1183a}'),
|
||||
('𑢠', '𑣩'),
|
||||
('𑣿', '𑤆'),
|
||||
('𑤉', '𑤉'),
|
||||
('𑤌', '𑤓'),
|
||||
('𑤕', '𑤖'),
|
||||
('𑤘', '𑤵'),
|
||||
('𑤷', '𑤸'),
|
||||
('\u{1193b}', '\u{11943}'),
|
||||
('𑥐', '𑥙'),
|
||||
('𑦠', '𑦧'),
|
||||
('𑦪', '\u{119d7}'),
|
||||
('\u{119da}', '𑧡'),
|
||||
('𑧣', '𑧤'),
|
||||
('𑨀', '\u{11a3e}'),
|
||||
('\u{11a47}', '\u{11a47}'),
|
||||
('𑩐', '\u{11a99}'),
|
||||
('𑪝', '𑪝'),
|
||||
('𑪰', '𑫸'),
|
||||
('𑰀', '𑰈'),
|
||||
('𑰊', '\u{11c36}'),
|
||||
('\u{11c38}', '𑱀'),
|
||||
('𑱐', '𑱙'),
|
||||
('𑱲', '𑲏'),
|
||||
('\u{11c92}', '\u{11ca7}'),
|
||||
('𑲩', '\u{11cb6}'),
|
||||
('𑴀', '𑴆'),
|
||||
('𑴈', '𑴉'),
|
||||
('𑴋', '\u{11d36}'),
|
||||
('\u{11d3a}', '\u{11d3a}'),
|
||||
('\u{11d3c}', '\u{11d3d}'),
|
||||
('\u{11d3f}', '\u{11d47}'),
|
||||
('𑵐', '𑵙'),
|
||||
('𑵠', '𑵥'),
|
||||
('𑵧', '𑵨'),
|
||||
('𑵪', '𑶎'),
|
||||
('\u{11d90}', '\u{11d91}'),
|
||||
('𑶓', '𑶘'),
|
||||
('𑶠', '𑶩'),
|
||||
('𑻠', '𑻶'),
|
||||
('\u{11f00}', '𑼐'),
|
||||
('𑼒', '\u{11f3a}'),
|
||||
('𑼾', '\u{11f42}'),
|
||||
('𑽐', '𑽙'),
|
||||
('𑾰', '𑾰'),
|
||||
('𒀀', '𒎙'),
|
||||
('𒐀', '𒑮'),
|
||||
('𒒀', '𒕃'),
|
||||
('𒾐', '𒿰'),
|
||||
('𓀀', '𓐯'),
|
||||
('\u{13440}', '\u{13455}'),
|
||||
('𔐀', '𔙆'),
|
||||
('𖠀', '𖨸'),
|
||||
('𖩀', '𖩞'),
|
||||
('𖩠', '𖩩'),
|
||||
('𖩰', '𖪾'),
|
||||
('𖫀', '𖫉'),
|
||||
('𖫐', '𖫭'),
|
||||
('\u{16af0}', '\u{16af4}'),
|
||||
('𖬀', '\u{16b36}'),
|
||||
('𖭀', '𖭃'),
|
||||
('𖭐', '𖭙'),
|
||||
('𖭣', '𖭷'),
|
||||
('𖭽', '𖮏'),
|
||||
('𖹀', '𖹿'),
|
||||
('𖼀', '𖽊'),
|
||||
('\u{16f4f}', '𖾇'),
|
||||
('\u{16f8f}', '𖾟'),
|
||||
('𖿠', '𖿡'),
|
||||
('𖿣', '\u{16fe4}'),
|
||||
('𖿰', '𖿱'),
|
||||
('𗀀', '𘟷'),
|
||||
('𘠀', '𘳕'),
|
||||
('𘴀', '𘴈'),
|
||||
('𚿰', '𚿳'),
|
||||
('𚿵', '𚿻'),
|
||||
('𚿽', '𚿾'),
|
||||
('𛀀', '𛄢'),
|
||||
('𛄲', '𛄲'),
|
||||
('𛅐', '𛅒'),
|
||||
('𛅕', '𛅕'),
|
||||
('𛅤', '𛅧'),
|
||||
('𛅰', '𛋻'),
|
||||
('𛰀', '𛱪'),
|
||||
('𛱰', '𛱼'),
|
||||
('𛲀', '𛲈'),
|
||||
('𛲐', '𛲙'),
|
||||
('\u{1bc9d}', '\u{1bc9e}'),
|
||||
('\u{1cf00}', '\u{1cf2d}'),
|
||||
('\u{1cf30}', '\u{1cf46}'),
|
||||
('\u{1d165}', '\u{1d169}'),
|
||||
('𝅭', '\u{1d172}'),
|
||||
('\u{1d17b}', '\u{1d182}'),
|
||||
('\u{1d185}', '\u{1d18b}'),
|
||||
('\u{1d1aa}', '\u{1d1ad}'),
|
||||
('\u{1d242}', '\u{1d244}'),
|
||||
('𝐀', '𝑔'),
|
||||
('𝑖', '𝒜'),
|
||||
('𝒞', '𝒟'),
|
||||
('𝒢', '𝒢'),
|
||||
('𝒥', '𝒦'),
|
||||
('𝒩', '𝒬'),
|
||||
('𝒮', '𝒹'),
|
||||
('𝒻', '𝒻'),
|
||||
('𝒽', '𝓃'),
|
||||
('𝓅', '𝔅'),
|
||||
('𝔇', '𝔊'),
|
||||
('𝔍', '𝔔'),
|
||||
('𝔖', '𝔜'),
|
||||
('𝔞', '𝔹'),
|
||||
('𝔻', '𝔾'),
|
||||
('𝕀', '𝕄'),
|
||||
('𝕆', '𝕆'),
|
||||
('𝕊', '𝕐'),
|
||||
('𝕒', '𝚥'),
|
||||
('𝚨', '𝛀'),
|
||||
('𝛂', '𝛚'),
|
||||
('𝛜', '𝛺'),
|
||||
('𝛼', '𝜔'),
|
||||
('𝜖', '𝜴'),
|
||||
('𝜶', '𝝎'),
|
||||
('𝝐', '𝝮'),
|
||||
('𝝰', '𝞈'),
|
||||
('𝞊', '𝞨'),
|
||||
('𝞪', '𝟂'),
|
||||
('𝟄', '𝟋'),
|
||||
('𝟎', '𝟿'),
|
||||
('\u{1da00}', '\u{1da36}'),
|
||||
('\u{1da3b}', '\u{1da6c}'),
|
||||
('\u{1da75}', '\u{1da75}'),
|
||||
('\u{1da84}', '\u{1da84}'),
|
||||
('\u{1da9b}', '\u{1da9f}'),
|
||||
('\u{1daa1}', '\u{1daaf}'),
|
||||
('𝼀', '𝼞'),
|
||||
('𝼥', '𝼪'),
|
||||
('\u{1e000}', '\u{1e006}'),
|
||||
('\u{1e008}', '\u{1e018}'),
|
||||
('\u{1e01b}', '\u{1e021}'),
|
||||
('\u{1e023}', '\u{1e024}'),
|
||||
('\u{1e026}', '\u{1e02a}'),
|
||||
('𞀰', '𞁭'),
|
||||
('\u{1e08f}', '\u{1e08f}'),
|
||||
('𞄀', '𞄬'),
|
||||
('\u{1e130}', '𞄽'),
|
||||
('𞅀', '𞅉'),
|
||||
('𞅎', '𞅎'),
|
||||
('𞊐', '\u{1e2ae}'),
|
||||
('𞋀', '𞋹'),
|
||||
('𞓐', '𞓹'),
|
||||
('𞟠', '𞟦'),
|
||||
('𞟨', '𞟫'),
|
||||
('𞟭', '𞟮'),
|
||||
('𞟰', '𞟾'),
|
||||
('𞠀', '𞣄'),
|
||||
('\u{1e8d0}', '\u{1e8d6}'),
|
||||
('𞤀', '𞥋'),
|
||||
('𞥐', '𞥙'),
|
||||
('𞸀', '𞸃'),
|
||||
('𞸅', '𞸟'),
|
||||
('𞸡', '𞸢'),
|
||||
('𞸤', '𞸤'),
|
||||
('𞸧', '𞸧'),
|
||||
('𞸩', '𞸲'),
|
||||
('𞸴', '𞸷'),
|
||||
('𞸹', '𞸹'),
|
||||
('𞸻', '𞸻'),
|
||||
('𞹂', '𞹂'),
|
||||
('𞹇', '𞹇'),
|
||||
('𞹉', '𞹉'),
|
||||
('𞹋', '𞹋'),
|
||||
('𞹍', '𞹏'),
|
||||
('𞹑', '𞹒'),
|
||||
('𞹔', '𞹔'),
|
||||
('𞹗', '𞹗'),
|
||||
('𞹙', '𞹙'),
|
||||
('𞹛', '𞹛'),
|
||||
('𞹝', '𞹝'),
|
||||
('𞹟', '𞹟'),
|
||||
('𞹡', '𞹢'),
|
||||
('𞹤', '𞹤'),
|
||||
('𞹧', '𞹪'),
|
||||
('𞹬', '𞹲'),
|
||||
('𞹴', '𞹷'),
|
||||
('𞹹', '𞹼'),
|
||||
('𞹾', '𞹾'),
|
||||
('𞺀', '𞺉'),
|
||||
('𞺋', '𞺛'),
|
||||
('𞺡', '𞺣'),
|
||||
('𞺥', '𞺩'),
|
||||
('𞺫', '𞺻'),
|
||||
('🄰', '🅉'),
|
||||
('🅐', '🅩'),
|
||||
('🅰', '🆉'),
|
||||
('🯰', '🯹'),
|
||||
('𠀀', '𪛟'),
|
||||
('𪜀', '𫜹'),
|
||||
('𫝀', '𫠝'),
|
||||
('𫠠', '𬺡'),
|
||||
('𬺰', '𮯠'),
|
||||
('丽', '𪘀'),
|
||||
('𰀀', '𱍊'),
|
||||
('𱍐', '𲎯'),
|
||||
('\u{e0100}', '\u{e01ef}'),
|
||||
];
|
||||
11367
third-party/vendor/regex-syntax/src/unicode_tables/property_bool.rs
vendored
Normal file
11367
third-party/vendor/regex-syntax/src/unicode_tables/property_bool.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
264
third-party/vendor/regex-syntax/src/unicode_tables/property_names.rs
vendored
Normal file
264
third-party/vendor/regex-syntax/src/unicode_tables/property_names.rs
vendored
Normal file
|
|
@ -0,0 +1,264 @@
|
|||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
|
||||
//
|
||||
// ucd-generate property-names ucd-15.0.0
|
||||
//
|
||||
// Unicode version: 15.0.0.
|
||||
//
|
||||
// ucd-generate 0.2.14 is available on crates.io.
|
||||
|
||||
pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[
|
||||
("age", "Age"),
|
||||
("ahex", "ASCII_Hex_Digit"),
|
||||
("alpha", "Alphabetic"),
|
||||
("alphabetic", "Alphabetic"),
|
||||
("asciihexdigit", "ASCII_Hex_Digit"),
|
||||
("bc", "Bidi_Class"),
|
||||
("bidic", "Bidi_Control"),
|
||||
("bidiclass", "Bidi_Class"),
|
||||
("bidicontrol", "Bidi_Control"),
|
||||
("bidim", "Bidi_Mirrored"),
|
||||
("bidimirrored", "Bidi_Mirrored"),
|
||||
("bidimirroringglyph", "Bidi_Mirroring_Glyph"),
|
||||
("bidipairedbracket", "Bidi_Paired_Bracket"),
|
||||
("bidipairedbrackettype", "Bidi_Paired_Bracket_Type"),
|
||||
("blk", "Block"),
|
||||
("block", "Block"),
|
||||
("bmg", "Bidi_Mirroring_Glyph"),
|
||||
("bpb", "Bidi_Paired_Bracket"),
|
||||
("bpt", "Bidi_Paired_Bracket_Type"),
|
||||
("canonicalcombiningclass", "Canonical_Combining_Class"),
|
||||
("cased", "Cased"),
|
||||
("casefolding", "Case_Folding"),
|
||||
("caseignorable", "Case_Ignorable"),
|
||||
("ccc", "Canonical_Combining_Class"),
|
||||
("ce", "Composition_Exclusion"),
|
||||
("cf", "Case_Folding"),
|
||||
("changeswhencasefolded", "Changes_When_Casefolded"),
|
||||
("changeswhencasemapped", "Changes_When_Casemapped"),
|
||||
("changeswhenlowercased", "Changes_When_Lowercased"),
|
||||
("changeswhennfkccasefolded", "Changes_When_NFKC_Casefolded"),
|
||||
("changeswhentitlecased", "Changes_When_Titlecased"),
|
||||
("changeswhenuppercased", "Changes_When_Uppercased"),
|
||||
("ci", "Case_Ignorable"),
|
||||
("cjkaccountingnumeric", "kAccountingNumeric"),
|
||||
("cjkcompatibilityvariant", "kCompatibilityVariant"),
|
||||
("cjkiicore", "kIICore"),
|
||||
("cjkirggsource", "kIRG_GSource"),
|
||||
("cjkirghsource", "kIRG_HSource"),
|
||||
("cjkirgjsource", "kIRG_JSource"),
|
||||
("cjkirgkpsource", "kIRG_KPSource"),
|
||||
("cjkirgksource", "kIRG_KSource"),
|
||||
("cjkirgmsource", "kIRG_MSource"),
|
||||
("cjkirgssource", "kIRG_SSource"),
|
||||
("cjkirgtsource", "kIRG_TSource"),
|
||||
("cjkirguksource", "kIRG_UKSource"),
|
||||
("cjkirgusource", "kIRG_USource"),
|
||||
("cjkirgvsource", "kIRG_VSource"),
|
||||
("cjkothernumeric", "kOtherNumeric"),
|
||||
("cjkprimarynumeric", "kPrimaryNumeric"),
|
||||
("cjkrsunicode", "kRSUnicode"),
|
||||
("compex", "Full_Composition_Exclusion"),
|
||||
("compositionexclusion", "Composition_Exclusion"),
|
||||
("cwcf", "Changes_When_Casefolded"),
|
||||
("cwcm", "Changes_When_Casemapped"),
|
||||
("cwkcf", "Changes_When_NFKC_Casefolded"),
|
||||
("cwl", "Changes_When_Lowercased"),
|
||||
("cwt", "Changes_When_Titlecased"),
|
||||
("cwu", "Changes_When_Uppercased"),
|
||||
("dash", "Dash"),
|
||||
("decompositionmapping", "Decomposition_Mapping"),
|
||||
("decompositiontype", "Decomposition_Type"),
|
||||
("defaultignorablecodepoint", "Default_Ignorable_Code_Point"),
|
||||
("dep", "Deprecated"),
|
||||
("deprecated", "Deprecated"),
|
||||
("di", "Default_Ignorable_Code_Point"),
|
||||
("dia", "Diacritic"),
|
||||
("diacritic", "Diacritic"),
|
||||
("dm", "Decomposition_Mapping"),
|
||||
("dt", "Decomposition_Type"),
|
||||
("ea", "East_Asian_Width"),
|
||||
("eastasianwidth", "East_Asian_Width"),
|
||||
("ebase", "Emoji_Modifier_Base"),
|
||||
("ecomp", "Emoji_Component"),
|
||||
("emod", "Emoji_Modifier"),
|
||||
("emoji", "Emoji"),
|
||||
("emojicomponent", "Emoji_Component"),
|
||||
("emojimodifier", "Emoji_Modifier"),
|
||||
("emojimodifierbase", "Emoji_Modifier_Base"),
|
||||
("emojipresentation", "Emoji_Presentation"),
|
||||
("epres", "Emoji_Presentation"),
|
||||
("equideo", "Equivalent_Unified_Ideograph"),
|
||||
("equivalentunifiedideograph", "Equivalent_Unified_Ideograph"),
|
||||
("expandsonnfc", "Expands_On_NFC"),
|
||||
("expandsonnfd", "Expands_On_NFD"),
|
||||
("expandsonnfkc", "Expands_On_NFKC"),
|
||||
("expandsonnfkd", "Expands_On_NFKD"),
|
||||
("ext", "Extender"),
|
||||
("extendedpictographic", "Extended_Pictographic"),
|
||||
("extender", "Extender"),
|
||||
("extpict", "Extended_Pictographic"),
|
||||
("fcnfkc", "FC_NFKC_Closure"),
|
||||
("fcnfkcclosure", "FC_NFKC_Closure"),
|
||||
("fullcompositionexclusion", "Full_Composition_Exclusion"),
|
||||
("gc", "General_Category"),
|
||||
("gcb", "Grapheme_Cluster_Break"),
|
||||
("generalcategory", "General_Category"),
|
||||
("graphemebase", "Grapheme_Base"),
|
||||
("graphemeclusterbreak", "Grapheme_Cluster_Break"),
|
||||
("graphemeextend", "Grapheme_Extend"),
|
||||
("graphemelink", "Grapheme_Link"),
|
||||
("grbase", "Grapheme_Base"),
|
||||
("grext", "Grapheme_Extend"),
|
||||
("grlink", "Grapheme_Link"),
|
||||
("hangulsyllabletype", "Hangul_Syllable_Type"),
|
||||
("hex", "Hex_Digit"),
|
||||
("hexdigit", "Hex_Digit"),
|
||||
("hst", "Hangul_Syllable_Type"),
|
||||
("hyphen", "Hyphen"),
|
||||
("idc", "ID_Continue"),
|
||||
("idcontinue", "ID_Continue"),
|
||||
("ideo", "Ideographic"),
|
||||
("ideographic", "Ideographic"),
|
||||
("ids", "ID_Start"),
|
||||
("idsb", "IDS_Binary_Operator"),
|
||||
("idsbinaryoperator", "IDS_Binary_Operator"),
|
||||
("idst", "IDS_Trinary_Operator"),
|
||||
("idstart", "ID_Start"),
|
||||
("idstrinaryoperator", "IDS_Trinary_Operator"),
|
||||
("indicpositionalcategory", "Indic_Positional_Category"),
|
||||
("indicsyllabiccategory", "Indic_Syllabic_Category"),
|
||||
("inpc", "Indic_Positional_Category"),
|
||||
("insc", "Indic_Syllabic_Category"),
|
||||
("isc", "ISO_Comment"),
|
||||
("jamoshortname", "Jamo_Short_Name"),
|
||||
("jg", "Joining_Group"),
|
||||
("joinc", "Join_Control"),
|
||||
("joincontrol", "Join_Control"),
|
||||
("joininggroup", "Joining_Group"),
|
||||
("joiningtype", "Joining_Type"),
|
||||
("jsn", "Jamo_Short_Name"),
|
||||
("jt", "Joining_Type"),
|
||||
("kaccountingnumeric", "kAccountingNumeric"),
|
||||
("kcompatibilityvariant", "kCompatibilityVariant"),
|
||||
("kiicore", "kIICore"),
|
||||
("kirggsource", "kIRG_GSource"),
|
||||
("kirghsource", "kIRG_HSource"),
|
||||
("kirgjsource", "kIRG_JSource"),
|
||||
("kirgkpsource", "kIRG_KPSource"),
|
||||
("kirgksource", "kIRG_KSource"),
|
||||
("kirgmsource", "kIRG_MSource"),
|
||||
("kirgssource", "kIRG_SSource"),
|
||||
("kirgtsource", "kIRG_TSource"),
|
||||
("kirguksource", "kIRG_UKSource"),
|
||||
("kirgusource", "kIRG_USource"),
|
||||
("kirgvsource", "kIRG_VSource"),
|
||||
("kothernumeric", "kOtherNumeric"),
|
||||
("kprimarynumeric", "kPrimaryNumeric"),
|
||||
("krsunicode", "kRSUnicode"),
|
||||
("lb", "Line_Break"),
|
||||
("lc", "Lowercase_Mapping"),
|
||||
("linebreak", "Line_Break"),
|
||||
("loe", "Logical_Order_Exception"),
|
||||
("logicalorderexception", "Logical_Order_Exception"),
|
||||
("lower", "Lowercase"),
|
||||
("lowercase", "Lowercase"),
|
||||
("lowercasemapping", "Lowercase_Mapping"),
|
||||
("math", "Math"),
|
||||
("na", "Name"),
|
||||
("na1", "Unicode_1_Name"),
|
||||
("name", "Name"),
|
||||
("namealias", "Name_Alias"),
|
||||
("nchar", "Noncharacter_Code_Point"),
|
||||
("nfcqc", "NFC_Quick_Check"),
|
||||
("nfcquickcheck", "NFC_Quick_Check"),
|
||||
("nfdqc", "NFD_Quick_Check"),
|
||||
("nfdquickcheck", "NFD_Quick_Check"),
|
||||
("nfkccasefold", "NFKC_Casefold"),
|
||||
("nfkccf", "NFKC_Casefold"),
|
||||
("nfkcqc", "NFKC_Quick_Check"),
|
||||
("nfkcquickcheck", "NFKC_Quick_Check"),
|
||||
("nfkdqc", "NFKD_Quick_Check"),
|
||||
("nfkdquickcheck", "NFKD_Quick_Check"),
|
||||
("noncharactercodepoint", "Noncharacter_Code_Point"),
|
||||
("nt", "Numeric_Type"),
|
||||
("numerictype", "Numeric_Type"),
|
||||
("numericvalue", "Numeric_Value"),
|
||||
("nv", "Numeric_Value"),
|
||||
("oalpha", "Other_Alphabetic"),
|
||||
("ocomment", "ISO_Comment"),
|
||||
("odi", "Other_Default_Ignorable_Code_Point"),
|
||||
("ogrext", "Other_Grapheme_Extend"),
|
||||
("oidc", "Other_ID_Continue"),
|
||||
("oids", "Other_ID_Start"),
|
||||
("olower", "Other_Lowercase"),
|
||||
("omath", "Other_Math"),
|
||||
("otheralphabetic", "Other_Alphabetic"),
|
||||
("otherdefaultignorablecodepoint", "Other_Default_Ignorable_Code_Point"),
|
||||
("othergraphemeextend", "Other_Grapheme_Extend"),
|
||||
("otheridcontinue", "Other_ID_Continue"),
|
||||
("otheridstart", "Other_ID_Start"),
|
||||
("otherlowercase", "Other_Lowercase"),
|
||||
("othermath", "Other_Math"),
|
||||
("otheruppercase", "Other_Uppercase"),
|
||||
("oupper", "Other_Uppercase"),
|
||||
("patsyn", "Pattern_Syntax"),
|
||||
("patternsyntax", "Pattern_Syntax"),
|
||||
("patternwhitespace", "Pattern_White_Space"),
|
||||
("patws", "Pattern_White_Space"),
|
||||
("pcm", "Prepended_Concatenation_Mark"),
|
||||
("prependedconcatenationmark", "Prepended_Concatenation_Mark"),
|
||||
("qmark", "Quotation_Mark"),
|
||||
("quotationmark", "Quotation_Mark"),
|
||||
("radical", "Radical"),
|
||||
("regionalindicator", "Regional_Indicator"),
|
||||
("ri", "Regional_Indicator"),
|
||||
("sb", "Sentence_Break"),
|
||||
("sc", "Script"),
|
||||
("scf", "Simple_Case_Folding"),
|
||||
("script", "Script"),
|
||||
("scriptextensions", "Script_Extensions"),
|
||||
("scx", "Script_Extensions"),
|
||||
("sd", "Soft_Dotted"),
|
||||
("sentencebreak", "Sentence_Break"),
|
||||
("sentenceterminal", "Sentence_Terminal"),
|
||||
("sfc", "Simple_Case_Folding"),
|
||||
("simplecasefolding", "Simple_Case_Folding"),
|
||||
("simplelowercasemapping", "Simple_Lowercase_Mapping"),
|
||||
("simpletitlecasemapping", "Simple_Titlecase_Mapping"),
|
||||
("simpleuppercasemapping", "Simple_Uppercase_Mapping"),
|
||||
("slc", "Simple_Lowercase_Mapping"),
|
||||
("softdotted", "Soft_Dotted"),
|
||||
("space", "White_Space"),
|
||||
("stc", "Simple_Titlecase_Mapping"),
|
||||
("sterm", "Sentence_Terminal"),
|
||||
("suc", "Simple_Uppercase_Mapping"),
|
||||
("tc", "Titlecase_Mapping"),
|
||||
("term", "Terminal_Punctuation"),
|
||||
("terminalpunctuation", "Terminal_Punctuation"),
|
||||
("titlecasemapping", "Titlecase_Mapping"),
|
||||
("uc", "Uppercase_Mapping"),
|
||||
("uideo", "Unified_Ideograph"),
|
||||
("unicode1name", "Unicode_1_Name"),
|
||||
("unicoderadicalstroke", "kRSUnicode"),
|
||||
("unifiedideograph", "Unified_Ideograph"),
|
||||
("upper", "Uppercase"),
|
||||
("uppercase", "Uppercase"),
|
||||
("uppercasemapping", "Uppercase_Mapping"),
|
||||
("urs", "kRSUnicode"),
|
||||
("variationselector", "Variation_Selector"),
|
||||
("verticalorientation", "Vertical_Orientation"),
|
||||
("vo", "Vertical_Orientation"),
|
||||
("vs", "Variation_Selector"),
|
||||
("wb", "Word_Break"),
|
||||
("whitespace", "White_Space"),
|
||||
("wordbreak", "Word_Break"),
|
||||
("wspace", "White_Space"),
|
||||
("xidc", "XID_Continue"),
|
||||
("xidcontinue", "XID_Continue"),
|
||||
("xids", "XID_Start"),
|
||||
("xidstart", "XID_Start"),
|
||||
("xonfc", "Expands_On_NFC"),
|
||||
("xonfd", "Expands_On_NFD"),
|
||||
("xonfkc", "Expands_On_NFKC"),
|
||||
("xonfkd", "Expands_On_NFKD"),
|
||||
];
|
||||
924
third-party/vendor/regex-syntax/src/unicode_tables/property_values.rs
vendored
Normal file
924
third-party/vendor/regex-syntax/src/unicode_tables/property_values.rs
vendored
Normal file
|
|
@ -0,0 +1,924 @@
|
|||
// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY:
|
||||
//
|
||||
// ucd-generate property-values ucd-15.0.0 --include gc,script,scx,age,gcb,wb,sb
|
||||
//
|
||||
// Unicode version: 15.0.0.
|
||||
//
|
||||
// ucd-generate 0.2.14 is available on crates.io.
|
||||
|
||||
pub const PROPERTY_VALUES: &'static [(
|
||||
&'static str,
|
||||
&'static [(&'static str, &'static str)],
|
||||
)] = &[
|
||||
(
|
||||
"Age",
|
||||
&[
|
||||
("1.1", "V1_1"),
|
||||
("10.0", "V10_0"),
|
||||
("11.0", "V11_0"),
|
||||
("12.0", "V12_0"),
|
||||
("12.1", "V12_1"),
|
||||
("13.0", "V13_0"),
|
||||
("14.0", "V14_0"),
|
||||
("15.0", "V15_0"),
|
||||
("2.0", "V2_0"),
|
||||
("2.1", "V2_1"),
|
||||
("3.0", "V3_0"),
|
||||
("3.1", "V3_1"),
|
||||
("3.2", "V3_2"),
|
||||
("4.0", "V4_0"),
|
||||
("4.1", "V4_1"),
|
||||
("5.0", "V5_0"),
|
||||
("5.1", "V5_1"),
|
||||
("5.2", "V5_2"),
|
||||
("6.0", "V6_0"),
|
||||
("6.1", "V6_1"),
|
||||
("6.2", "V6_2"),
|
||||
("6.3", "V6_3"),
|
||||
("7.0", "V7_0"),
|
||||
("8.0", "V8_0"),
|
||||
("9.0", "V9_0"),
|
||||
("na", "Unassigned"),
|
||||
("unassigned", "Unassigned"),
|
||||
("v100", "V10_0"),
|
||||
("v11", "V1_1"),
|
||||
("v110", "V11_0"),
|
||||
("v120", "V12_0"),
|
||||
("v121", "V12_1"),
|
||||
("v130", "V13_0"),
|
||||
("v140", "V14_0"),
|
||||
("v150", "V15_0"),
|
||||
("v20", "V2_0"),
|
||||
("v21", "V2_1"),
|
||||
("v30", "V3_0"),
|
||||
("v31", "V3_1"),
|
||||
("v32", "V3_2"),
|
||||
("v40", "V4_0"),
|
||||
("v41", "V4_1"),
|
||||
("v50", "V5_0"),
|
||||
("v51", "V5_1"),
|
||||
("v52", "V5_2"),
|
||||
("v60", "V6_0"),
|
||||
("v61", "V6_1"),
|
||||
("v62", "V6_2"),
|
||||
("v63", "V6_3"),
|
||||
("v70", "V7_0"),
|
||||
("v80", "V8_0"),
|
||||
("v90", "V9_0"),
|
||||
],
|
||||
),
|
||||
(
|
||||
"General_Category",
|
||||
&[
|
||||
("c", "Other"),
|
||||
("casedletter", "Cased_Letter"),
|
||||
("cc", "Control"),
|
||||
("cf", "Format"),
|
||||
("closepunctuation", "Close_Punctuation"),
|
||||
("cn", "Unassigned"),
|
||||
("cntrl", "Control"),
|
||||
("co", "Private_Use"),
|
||||
("combiningmark", "Mark"),
|
||||
("connectorpunctuation", "Connector_Punctuation"),
|
||||
("control", "Control"),
|
||||
("cs", "Surrogate"),
|
||||
("currencysymbol", "Currency_Symbol"),
|
||||
("dashpunctuation", "Dash_Punctuation"),
|
||||
("decimalnumber", "Decimal_Number"),
|
||||
("digit", "Decimal_Number"),
|
||||
("enclosingmark", "Enclosing_Mark"),
|
||||
("finalpunctuation", "Final_Punctuation"),
|
||||
("format", "Format"),
|
||||
("initialpunctuation", "Initial_Punctuation"),
|
||||
("l", "Letter"),
|
||||
("lc", "Cased_Letter"),
|
||||
("letter", "Letter"),
|
||||
("letternumber", "Letter_Number"),
|
||||
("lineseparator", "Line_Separator"),
|
||||
("ll", "Lowercase_Letter"),
|
||||
("lm", "Modifier_Letter"),
|
||||
("lo", "Other_Letter"),
|
||||
("lowercaseletter", "Lowercase_Letter"),
|
||||
("lt", "Titlecase_Letter"),
|
||||
("lu", "Uppercase_Letter"),
|
||||
("m", "Mark"),
|
||||
("mark", "Mark"),
|
||||
("mathsymbol", "Math_Symbol"),
|
||||
("mc", "Spacing_Mark"),
|
||||
("me", "Enclosing_Mark"),
|
||||
("mn", "Nonspacing_Mark"),
|
||||
("modifierletter", "Modifier_Letter"),
|
||||
("modifiersymbol", "Modifier_Symbol"),
|
||||
("n", "Number"),
|
||||
("nd", "Decimal_Number"),
|
||||
("nl", "Letter_Number"),
|
||||
("no", "Other_Number"),
|
||||
("nonspacingmark", "Nonspacing_Mark"),
|
||||
("number", "Number"),
|
||||
("openpunctuation", "Open_Punctuation"),
|
||||
("other", "Other"),
|
||||
("otherletter", "Other_Letter"),
|
||||
("othernumber", "Other_Number"),
|
||||
("otherpunctuation", "Other_Punctuation"),
|
||||
("othersymbol", "Other_Symbol"),
|
||||
("p", "Punctuation"),
|
||||
("paragraphseparator", "Paragraph_Separator"),
|
||||
("pc", "Connector_Punctuation"),
|
||||
("pd", "Dash_Punctuation"),
|
||||
("pe", "Close_Punctuation"),
|
||||
("pf", "Final_Punctuation"),
|
||||
("pi", "Initial_Punctuation"),
|
||||
("po", "Other_Punctuation"),
|
||||
("privateuse", "Private_Use"),
|
||||
("ps", "Open_Punctuation"),
|
||||
("punct", "Punctuation"),
|
||||
("punctuation", "Punctuation"),
|
||||
("s", "Symbol"),
|
||||
("sc", "Currency_Symbol"),
|
||||
("separator", "Separator"),
|
||||
("sk", "Modifier_Symbol"),
|
||||
("sm", "Math_Symbol"),
|
||||
("so", "Other_Symbol"),
|
||||
("spaceseparator", "Space_Separator"),
|
||||
("spacingmark", "Spacing_Mark"),
|
||||
("surrogate", "Surrogate"),
|
||||
("symbol", "Symbol"),
|
||||
("titlecaseletter", "Titlecase_Letter"),
|
||||
("unassigned", "Unassigned"),
|
||||
("uppercaseletter", "Uppercase_Letter"),
|
||||
("z", "Separator"),
|
||||
("zl", "Line_Separator"),
|
||||
("zp", "Paragraph_Separator"),
|
||||
("zs", "Space_Separator"),
|
||||
],
|
||||
),
|
||||
(
|
||||
"Grapheme_Cluster_Break",
|
||||
&[
|
||||
("cn", "Control"),
|
||||
("control", "Control"),
|
||||
("cr", "CR"),
|
||||
("eb", "E_Base"),
|
||||
("ebase", "E_Base"),
|
||||
("ebasegaz", "E_Base_GAZ"),
|
||||
("ebg", "E_Base_GAZ"),
|
||||
("em", "E_Modifier"),
|
||||
("emodifier", "E_Modifier"),
|
||||
("ex", "Extend"),
|
||||
("extend", "Extend"),
|
||||
("gaz", "Glue_After_Zwj"),
|
||||
("glueafterzwj", "Glue_After_Zwj"),
|
||||
("l", "L"),
|
||||
("lf", "LF"),
|
||||
("lv", "LV"),
|
||||
("lvt", "LVT"),
|
||||
("other", "Other"),
|
||||
("pp", "Prepend"),
|
||||
("prepend", "Prepend"),
|
||||
("regionalindicator", "Regional_Indicator"),
|
||||
("ri", "Regional_Indicator"),
|
||||
("sm", "SpacingMark"),
|
||||
("spacingmark", "SpacingMark"),
|
||||
("t", "T"),
|
||||
("v", "V"),
|
||||
("xx", "Other"),
|
||||
("zwj", "ZWJ"),
|
||||
],
|
||||
),
|
||||
(
|
||||
"Script",
|
||||
&[
|
||||
("adlam", "Adlam"),
|
||||
("adlm", "Adlam"),
|
||||
("aghb", "Caucasian_Albanian"),
|
||||
("ahom", "Ahom"),
|
||||
("anatolianhieroglyphs", "Anatolian_Hieroglyphs"),
|
||||
("arab", "Arabic"),
|
||||
("arabic", "Arabic"),
|
||||
("armenian", "Armenian"),
|
||||
("armi", "Imperial_Aramaic"),
|
||||
("armn", "Armenian"),
|
||||
("avestan", "Avestan"),
|
||||
("avst", "Avestan"),
|
||||
("bali", "Balinese"),
|
||||
("balinese", "Balinese"),
|
||||
("bamu", "Bamum"),
|
||||
("bamum", "Bamum"),
|
||||
("bass", "Bassa_Vah"),
|
||||
("bassavah", "Bassa_Vah"),
|
||||
("batak", "Batak"),
|
||||
("batk", "Batak"),
|
||||
("beng", "Bengali"),
|
||||
("bengali", "Bengali"),
|
||||
("bhaiksuki", "Bhaiksuki"),
|
||||
("bhks", "Bhaiksuki"),
|
||||
("bopo", "Bopomofo"),
|
||||
("bopomofo", "Bopomofo"),
|
||||
("brah", "Brahmi"),
|
||||
("brahmi", "Brahmi"),
|
||||
("brai", "Braille"),
|
||||
("braille", "Braille"),
|
||||
("bugi", "Buginese"),
|
||||
("buginese", "Buginese"),
|
||||
("buhd", "Buhid"),
|
||||
("buhid", "Buhid"),
|
||||
("cakm", "Chakma"),
|
||||
("canadianaboriginal", "Canadian_Aboriginal"),
|
||||
("cans", "Canadian_Aboriginal"),
|
||||
("cari", "Carian"),
|
||||
("carian", "Carian"),
|
||||
("caucasianalbanian", "Caucasian_Albanian"),
|
||||
("chakma", "Chakma"),
|
||||
("cham", "Cham"),
|
||||
("cher", "Cherokee"),
|
||||
("cherokee", "Cherokee"),
|
||||
("chorasmian", "Chorasmian"),
|
||||
("chrs", "Chorasmian"),
|
||||
("common", "Common"),
|
||||
("copt", "Coptic"),
|
||||
("coptic", "Coptic"),
|
||||
("cpmn", "Cypro_Minoan"),
|
||||
("cprt", "Cypriot"),
|
||||
("cuneiform", "Cuneiform"),
|
||||
("cypriot", "Cypriot"),
|
||||
("cyprominoan", "Cypro_Minoan"),
|
||||
("cyrillic", "Cyrillic"),
|
||||
("cyrl", "Cyrillic"),
|
||||
("deseret", "Deseret"),
|
||||
("deva", "Devanagari"),
|
||||
("devanagari", "Devanagari"),
|
||||
("diak", "Dives_Akuru"),
|
||||
("divesakuru", "Dives_Akuru"),
|
||||
("dogr", "Dogra"),
|
||||
("dogra", "Dogra"),
|
||||
("dsrt", "Deseret"),
|
||||
("dupl", "Duployan"),
|
||||
("duployan", "Duployan"),
|
||||
("egyp", "Egyptian_Hieroglyphs"),
|
||||
("egyptianhieroglyphs", "Egyptian_Hieroglyphs"),
|
||||
("elba", "Elbasan"),
|
||||
("elbasan", "Elbasan"),
|
||||
("elym", "Elymaic"),
|
||||
("elymaic", "Elymaic"),
|
||||
("ethi", "Ethiopic"),
|
||||
("ethiopic", "Ethiopic"),
|
||||
("geor", "Georgian"),
|
||||
("georgian", "Georgian"),
|
||||
("glag", "Glagolitic"),
|
||||
("glagolitic", "Glagolitic"),
|
||||
("gong", "Gunjala_Gondi"),
|
||||
("gonm", "Masaram_Gondi"),
|
||||
("goth", "Gothic"),
|
||||
("gothic", "Gothic"),
|
||||
("gran", "Grantha"),
|
||||
("grantha", "Grantha"),
|
||||
("greek", "Greek"),
|
||||
("grek", "Greek"),
|
||||
("gujarati", "Gujarati"),
|
||||
("gujr", "Gujarati"),
|
||||
("gunjalagondi", "Gunjala_Gondi"),
|
||||
("gurmukhi", "Gurmukhi"),
|
||||
("guru", "Gurmukhi"),
|
||||
("han", "Han"),
|
||||
("hang", "Hangul"),
|
||||
("hangul", "Hangul"),
|
||||
("hani", "Han"),
|
||||
("hanifirohingya", "Hanifi_Rohingya"),
|
||||
("hano", "Hanunoo"),
|
||||
("hanunoo", "Hanunoo"),
|
||||
("hatr", "Hatran"),
|
||||
("hatran", "Hatran"),
|
||||
("hebr", "Hebrew"),
|
||||
("hebrew", "Hebrew"),
|
||||
("hira", "Hiragana"),
|
||||
("hiragana", "Hiragana"),
|
||||
("hluw", "Anatolian_Hieroglyphs"),
|
||||
("hmng", "Pahawh_Hmong"),
|
||||
("hmnp", "Nyiakeng_Puachue_Hmong"),
|
||||
("hrkt", "Katakana_Or_Hiragana"),
|
||||
("hung", "Old_Hungarian"),
|
||||
("imperialaramaic", "Imperial_Aramaic"),
|
||||
("inherited", "Inherited"),
|
||||
("inscriptionalpahlavi", "Inscriptional_Pahlavi"),
|
||||
("inscriptionalparthian", "Inscriptional_Parthian"),
|
||||
("ital", "Old_Italic"),
|
||||
("java", "Javanese"),
|
||||
("javanese", "Javanese"),
|
||||
("kaithi", "Kaithi"),
|
||||
("kali", "Kayah_Li"),
|
||||
("kana", "Katakana"),
|
||||
("kannada", "Kannada"),
|
||||
("katakana", "Katakana"),
|
||||
("katakanaorhiragana", "Katakana_Or_Hiragana"),
|
||||
("kawi", "Kawi"),
|
||||
("kayahli", "Kayah_Li"),
|
||||
("khar", "Kharoshthi"),
|
||||
("kharoshthi", "Kharoshthi"),
|
||||
("khitansmallscript", "Khitan_Small_Script"),
|
||||
("khmer", "Khmer"),
|
||||
("khmr", "Khmer"),
|
||||
("khoj", "Khojki"),
|
||||
("khojki", "Khojki"),
|
||||
("khudawadi", "Khudawadi"),
|
||||
("kits", "Khitan_Small_Script"),
|
||||
("knda", "Kannada"),
|
||||
("kthi", "Kaithi"),
|
||||
("lana", "Tai_Tham"),
|
||||
("lao", "Lao"),
|
||||
("laoo", "Lao"),
|
||||
("latin", "Latin"),
|
||||
("latn", "Latin"),
|
||||
("lepc", "Lepcha"),
|
||||
("lepcha", "Lepcha"),
|
||||
("limb", "Limbu"),
|
||||
("limbu", "Limbu"),
|
||||
("lina", "Linear_A"),
|
||||
("linb", "Linear_B"),
|
||||
("lineara", "Linear_A"),
|
||||
("linearb", "Linear_B"),
|
||||
("lisu", "Lisu"),
|
||||
("lyci", "Lycian"),
|
||||
("lycian", "Lycian"),
|
||||
("lydi", "Lydian"),
|
||||
("lydian", "Lydian"),
|
||||
("mahajani", "Mahajani"),
|
||||
("mahj", "Mahajani"),
|
||||
("maka", "Makasar"),
|
||||
("makasar", "Makasar"),
|
||||
("malayalam", "Malayalam"),
|
||||
("mand", "Mandaic"),
|
||||
("mandaic", "Mandaic"),
|
||||
("mani", "Manichaean"),
|
||||
("manichaean", "Manichaean"),
|
||||
("marc", "Marchen"),
|
||||
("marchen", "Marchen"),
|
||||
("masaramgondi", "Masaram_Gondi"),
|
||||
("medefaidrin", "Medefaidrin"),
|
||||
("medf", "Medefaidrin"),
|
||||
("meeteimayek", "Meetei_Mayek"),
|
||||
("mend", "Mende_Kikakui"),
|
||||
("mendekikakui", "Mende_Kikakui"),
|
||||
("merc", "Meroitic_Cursive"),
|
||||
("mero", "Meroitic_Hieroglyphs"),
|
||||
("meroiticcursive", "Meroitic_Cursive"),
|
||||
("meroitichieroglyphs", "Meroitic_Hieroglyphs"),
|
||||
("miao", "Miao"),
|
||||
("mlym", "Malayalam"),
|
||||
("modi", "Modi"),
|
||||
("mong", "Mongolian"),
|
||||
("mongolian", "Mongolian"),
|
||||
("mro", "Mro"),
|
||||
("mroo", "Mro"),
|
||||
("mtei", "Meetei_Mayek"),
|
||||
("mult", "Multani"),
|
||||
("multani", "Multani"),
|
||||
("myanmar", "Myanmar"),
|
||||
("mymr", "Myanmar"),
|
||||
("nabataean", "Nabataean"),
|
||||
("nagm", "Nag_Mundari"),
|
||||
("nagmundari", "Nag_Mundari"),
|
||||
("nand", "Nandinagari"),
|
||||
("nandinagari", "Nandinagari"),
|
||||
("narb", "Old_North_Arabian"),
|
||||
("nbat", "Nabataean"),
|
||||
("newa", "Newa"),
|
||||
("newtailue", "New_Tai_Lue"),
|
||||
("nko", "Nko"),
|
||||
("nkoo", "Nko"),
|
||||
("nshu", "Nushu"),
|
||||
("nushu", "Nushu"),
|
||||
("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"),
|
||||
("ogam", "Ogham"),
|
||||
("ogham", "Ogham"),
|
||||
("olchiki", "Ol_Chiki"),
|
||||
("olck", "Ol_Chiki"),
|
||||
("oldhungarian", "Old_Hungarian"),
|
||||
("olditalic", "Old_Italic"),
|
||||
("oldnortharabian", "Old_North_Arabian"),
|
||||
("oldpermic", "Old_Permic"),
|
||||
("oldpersian", "Old_Persian"),
|
||||
("oldsogdian", "Old_Sogdian"),
|
||||
("oldsoutharabian", "Old_South_Arabian"),
|
||||
("oldturkic", "Old_Turkic"),
|
||||
("olduyghur", "Old_Uyghur"),
|
||||
("oriya", "Oriya"),
|
||||
("orkh", "Old_Turkic"),
|
||||
("orya", "Oriya"),
|
||||
("osage", "Osage"),
|
||||
("osge", "Osage"),
|
||||
("osma", "Osmanya"),
|
||||
("osmanya", "Osmanya"),
|
||||
("ougr", "Old_Uyghur"),
|
||||
("pahawhhmong", "Pahawh_Hmong"),
|
||||
("palm", "Palmyrene"),
|
||||
("palmyrene", "Palmyrene"),
|
||||
("pauc", "Pau_Cin_Hau"),
|
||||
("paucinhau", "Pau_Cin_Hau"),
|
||||
("perm", "Old_Permic"),
|
||||
("phag", "Phags_Pa"),
|
||||
("phagspa", "Phags_Pa"),
|
||||
("phli", "Inscriptional_Pahlavi"),
|
||||
("phlp", "Psalter_Pahlavi"),
|
||||
("phnx", "Phoenician"),
|
||||
("phoenician", "Phoenician"),
|
||||
("plrd", "Miao"),
|
||||
("prti", "Inscriptional_Parthian"),
|
||||
("psalterpahlavi", "Psalter_Pahlavi"),
|
||||
("qaac", "Coptic"),
|
||||
("qaai", "Inherited"),
|
||||
("rejang", "Rejang"),
|
||||
("rjng", "Rejang"),
|
||||
("rohg", "Hanifi_Rohingya"),
|
||||
("runic", "Runic"),
|
||||
("runr", "Runic"),
|
||||
("samaritan", "Samaritan"),
|
||||
("samr", "Samaritan"),
|
||||
("sarb", "Old_South_Arabian"),
|
||||
("saur", "Saurashtra"),
|
||||
("saurashtra", "Saurashtra"),
|
||||
("sgnw", "SignWriting"),
|
||||
("sharada", "Sharada"),
|
||||
("shavian", "Shavian"),
|
||||
("shaw", "Shavian"),
|
||||
("shrd", "Sharada"),
|
||||
("sidd", "Siddham"),
|
||||
("siddham", "Siddham"),
|
||||
("signwriting", "SignWriting"),
|
||||
("sind", "Khudawadi"),
|
||||
("sinh", "Sinhala"),
|
||||
("sinhala", "Sinhala"),
|
||||
("sogd", "Sogdian"),
|
||||
("sogdian", "Sogdian"),
|
||||
("sogo", "Old_Sogdian"),
|
||||
("sora", "Sora_Sompeng"),
|
||||
("sorasompeng", "Sora_Sompeng"),
|
||||
("soyo", "Soyombo"),
|
||||
("soyombo", "Soyombo"),
|
||||
("sund", "Sundanese"),
|
||||
("sundanese", "Sundanese"),
|
||||
("sylo", "Syloti_Nagri"),
|
||||
("sylotinagri", "Syloti_Nagri"),
|
||||
("syrc", "Syriac"),
|
||||
("syriac", "Syriac"),
|
||||
("tagalog", "Tagalog"),
|
||||
("tagb", "Tagbanwa"),
|
||||
("tagbanwa", "Tagbanwa"),
|
||||
("taile", "Tai_Le"),
|
||||
("taitham", "Tai_Tham"),
|
||||
("taiviet", "Tai_Viet"),
|
||||
("takr", "Takri"),
|
||||
("takri", "Takri"),
|
||||
("tale", "Tai_Le"),
|
||||
("talu", "New_Tai_Lue"),
|
||||
("tamil", "Tamil"),
|
||||
("taml", "Tamil"),
|
||||
("tang", "Tangut"),
|
||||
("tangsa", "Tangsa"),
|
||||
("tangut", "Tangut"),
|
||||
("tavt", "Tai_Viet"),
|
||||
("telu", "Telugu"),
|
||||
("telugu", "Telugu"),
|
||||
("tfng", "Tifinagh"),
|
||||
("tglg", "Tagalog"),
|
||||
("thaa", "Thaana"),
|
||||
("thaana", "Thaana"),
|
||||
("thai", "Thai"),
|
||||
("tibetan", "Tibetan"),
|
||||
("tibt", "Tibetan"),
|
||||
("tifinagh", "Tifinagh"),
|
||||
("tirh", "Tirhuta"),
|
||||
("tirhuta", "Tirhuta"),
|
||||
("tnsa", "Tangsa"),
|
||||
("toto", "Toto"),
|
||||
("ugar", "Ugaritic"),
|
||||
("ugaritic", "Ugaritic"),
|
||||
("unknown", "Unknown"),
|
||||
("vai", "Vai"),
|
||||
("vaii", "Vai"),
|
||||
("vith", "Vithkuqi"),
|
||||
("vithkuqi", "Vithkuqi"),
|
||||
("wancho", "Wancho"),
|
||||
("wara", "Warang_Citi"),
|
||||
("warangciti", "Warang_Citi"),
|
||||
("wcho", "Wancho"),
|
||||
("xpeo", "Old_Persian"),
|
||||
("xsux", "Cuneiform"),
|
||||
("yezi", "Yezidi"),
|
||||
("yezidi", "Yezidi"),
|
||||
("yi", "Yi"),
|
||||
("yiii", "Yi"),
|
||||
("zanabazarsquare", "Zanabazar_Square"),
|
||||
("zanb", "Zanabazar_Square"),
|
||||
("zinh", "Inherited"),
|
||||
("zyyy", "Common"),
|
||||
("zzzz", "Unknown"),
|
||||
],
|
||||
),
|
||||
(
|
||||
"Script_Extensions",
|
||||
&[
|
||||
("adlam", "Adlam"),
|
||||
("adlm", "Adlam"),
|
||||
("aghb", "Caucasian_Albanian"),
|
||||
("ahom", "Ahom"),
|
||||
("anatolianhieroglyphs", "Anatolian_Hieroglyphs"),
|
||||
("arab", "Arabic"),
|
||||
("arabic", "Arabic"),
|
||||
("armenian", "Armenian"),
|
||||
("armi", "Imperial_Aramaic"),
|
||||
("armn", "Armenian"),
|
||||
("avestan", "Avestan"),
|
||||
("avst", "Avestan"),
|
||||
("bali", "Balinese"),
|
||||
("balinese", "Balinese"),
|
||||
("bamu", "Bamum"),
|
||||
("bamum", "Bamum"),
|
||||
("bass", "Bassa_Vah"),
|
||||
("bassavah", "Bassa_Vah"),
|
||||
("batak", "Batak"),
|
||||
("batk", "Batak"),
|
||||
("beng", "Bengali"),
|
||||
("bengali", "Bengali"),
|
||||
("bhaiksuki", "Bhaiksuki"),
|
||||
("bhks", "Bhaiksuki"),
|
||||
("bopo", "Bopomofo"),
|
||||
("bopomofo", "Bopomofo"),
|
||||
("brah", "Brahmi"),
|
||||
("brahmi", "Brahmi"),
|
||||
("brai", "Braille"),
|
||||
("braille", "Braille"),
|
||||
("bugi", "Buginese"),
|
||||
("buginese", "Buginese"),
|
||||
("buhd", "Buhid"),
|
||||
("buhid", "Buhid"),
|
||||
("cakm", "Chakma"),
|
||||
("canadianaboriginal", "Canadian_Aboriginal"),
|
||||
("cans", "Canadian_Aboriginal"),
|
||||
("cari", "Carian"),
|
||||
("carian", "Carian"),
|
||||
("caucasianalbanian", "Caucasian_Albanian"),
|
||||
("chakma", "Chakma"),
|
||||
("cham", "Cham"),
|
||||
("cher", "Cherokee"),
|
||||
("cherokee", "Cherokee"),
|
||||
("chorasmian", "Chorasmian"),
|
||||
("chrs", "Chorasmian"),
|
||||
("common", "Common"),
|
||||
("copt", "Coptic"),
|
||||
("coptic", "Coptic"),
|
||||
("cpmn", "Cypro_Minoan"),
|
||||
("cprt", "Cypriot"),
|
||||
("cuneiform", "Cuneiform"),
|
||||
("cypriot", "Cypriot"),
|
||||
("cyprominoan", "Cypro_Minoan"),
|
||||
("cyrillic", "Cyrillic"),
|
||||
("cyrl", "Cyrillic"),
|
||||
("deseret", "Deseret"),
|
||||
("deva", "Devanagari"),
|
||||
("devanagari", "Devanagari"),
|
||||
("diak", "Dives_Akuru"),
|
||||
("divesakuru", "Dives_Akuru"),
|
||||
("dogr", "Dogra"),
|
||||
("dogra", "Dogra"),
|
||||
("dsrt", "Deseret"),
|
||||
("dupl", "Duployan"),
|
||||
("duployan", "Duployan"),
|
||||
("egyp", "Egyptian_Hieroglyphs"),
|
||||
("egyptianhieroglyphs", "Egyptian_Hieroglyphs"),
|
||||
("elba", "Elbasan"),
|
||||
("elbasan", "Elbasan"),
|
||||
("elym", "Elymaic"),
|
||||
("elymaic", "Elymaic"),
|
||||
("ethi", "Ethiopic"),
|
||||
("ethiopic", "Ethiopic"),
|
||||
("geor", "Georgian"),
|
||||
("georgian", "Georgian"),
|
||||
("glag", "Glagolitic"),
|
||||
("glagolitic", "Glagolitic"),
|
||||
("gong", "Gunjala_Gondi"),
|
||||
("gonm", "Masaram_Gondi"),
|
||||
("goth", "Gothic"),
|
||||
("gothic", "Gothic"),
|
||||
("gran", "Grantha"),
|
||||
("grantha", "Grantha"),
|
||||
("greek", "Greek"),
|
||||
("grek", "Greek"),
|
||||
("gujarati", "Gujarati"),
|
||||
("gujr", "Gujarati"),
|
||||
("gunjalagondi", "Gunjala_Gondi"),
|
||||
("gurmukhi", "Gurmukhi"),
|
||||
("guru", "Gurmukhi"),
|
||||
("han", "Han"),
|
||||
("hang", "Hangul"),
|
||||
("hangul", "Hangul"),
|
||||
("hani", "Han"),
|
||||
("hanifirohingya", "Hanifi_Rohingya"),
|
||||
("hano", "Hanunoo"),
|
||||
("hanunoo", "Hanunoo"),
|
||||
("hatr", "Hatran"),
|
||||
("hatran", "Hatran"),
|
||||
("hebr", "Hebrew"),
|
||||
("hebrew", "Hebrew"),
|
||||
("hira", "Hiragana"),
|
||||
("hiragana", "Hiragana"),
|
||||
("hluw", "Anatolian_Hieroglyphs"),
|
||||
("hmng", "Pahawh_Hmong"),
|
||||
("hmnp", "Nyiakeng_Puachue_Hmong"),
|
||||
("hrkt", "Katakana_Or_Hiragana"),
|
||||
("hung", "Old_Hungarian"),
|
||||
("imperialaramaic", "Imperial_Aramaic"),
|
||||
("inherited", "Inherited"),
|
||||
("inscriptionalpahlavi", "Inscriptional_Pahlavi"),
|
||||
("inscriptionalparthian", "Inscriptional_Parthian"),
|
||||
("ital", "Old_Italic"),
|
||||
("java", "Javanese"),
|
||||
("javanese", "Javanese"),
|
||||
("kaithi", "Kaithi"),
|
||||
("kali", "Kayah_Li"),
|
||||
("kana", "Katakana"),
|
||||
("kannada", "Kannada"),
|
||||
("katakana", "Katakana"),
|
||||
("katakanaorhiragana", "Katakana_Or_Hiragana"),
|
||||
("kawi", "Kawi"),
|
||||
("kayahli", "Kayah_Li"),
|
||||
("khar", "Kharoshthi"),
|
||||
("kharoshthi", "Kharoshthi"),
|
||||
("khitansmallscript", "Khitan_Small_Script"),
|
||||
("khmer", "Khmer"),
|
||||
("khmr", "Khmer"),
|
||||
("khoj", "Khojki"),
|
||||
("khojki", "Khojki"),
|
||||
("khudawadi", "Khudawadi"),
|
||||
("kits", "Khitan_Small_Script"),
|
||||
("knda", "Kannada"),
|
||||
("kthi", "Kaithi"),
|
||||
("lana", "Tai_Tham"),
|
||||
("lao", "Lao"),
|
||||
("laoo", "Lao"),
|
||||
("latin", "Latin"),
|
||||
("latn", "Latin"),
|
||||
("lepc", "Lepcha"),
|
||||
("lepcha", "Lepcha"),
|
||||
("limb", "Limbu"),
|
||||
("limbu", "Limbu"),
|
||||
("lina", "Linear_A"),
|
||||
("linb", "Linear_B"),
|
||||
("lineara", "Linear_A"),
|
||||
("linearb", "Linear_B"),
|
||||
("lisu", "Lisu"),
|
||||
("lyci", "Lycian"),
|
||||
("lycian", "Lycian"),
|
||||
("lydi", "Lydian"),
|
||||
("lydian", "Lydian"),
|
||||
("mahajani", "Mahajani"),
|
||||
("mahj", "Mahajani"),
|
||||
("maka", "Makasar"),
|
||||
("makasar", "Makasar"),
|
||||
("malayalam", "Malayalam"),
|
||||
("mand", "Mandaic"),
|
||||
("mandaic", "Mandaic"),
|
||||
("mani", "Manichaean"),
|
||||
("manichaean", "Manichaean"),
|
||||
("marc", "Marchen"),
|
||||
("marchen", "Marchen"),
|
||||
("masaramgondi", "Masaram_Gondi"),
|
||||
("medefaidrin", "Medefaidrin"),
|
||||
("medf", "Medefaidrin"),
|
||||
("meeteimayek", "Meetei_Mayek"),
|
||||
("mend", "Mende_Kikakui"),
|
||||
("mendekikakui", "Mende_Kikakui"),
|
||||
("merc", "Meroitic_Cursive"),
|
||||
("mero", "Meroitic_Hieroglyphs"),
|
||||
("meroiticcursive", "Meroitic_Cursive"),
|
||||
("meroitichieroglyphs", "Meroitic_Hieroglyphs"),
|
||||
("miao", "Miao"),
|
||||
("mlym", "Malayalam"),
|
||||
("modi", "Modi"),
|
||||
("mong", "Mongolian"),
|
||||
("mongolian", "Mongolian"),
|
||||
("mro", "Mro"),
|
||||
("mroo", "Mro"),
|
||||
("mtei", "Meetei_Mayek"),
|
||||
("mult", "Multani"),
|
||||
("multani", "Multani"),
|
||||
("myanmar", "Myanmar"),
|
||||
("mymr", "Myanmar"),
|
||||
("nabataean", "Nabataean"),
|
||||
("nagm", "Nag_Mundari"),
|
||||
("nagmundari", "Nag_Mundari"),
|
||||
("nand", "Nandinagari"),
|
||||
("nandinagari", "Nandinagari"),
|
||||
("narb", "Old_North_Arabian"),
|
||||
("nbat", "Nabataean"),
|
||||
("newa", "Newa"),
|
||||
("newtailue", "New_Tai_Lue"),
|
||||
("nko", "Nko"),
|
||||
("nkoo", "Nko"),
|
||||
("nshu", "Nushu"),
|
||||
("nushu", "Nushu"),
|
||||
("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"),
|
||||
("ogam", "Ogham"),
|
||||
("ogham", "Ogham"),
|
||||
("olchiki", "Ol_Chiki"),
|
||||
("olck", "Ol_Chiki"),
|
||||
("oldhungarian", "Old_Hungarian"),
|
||||
("olditalic", "Old_Italic"),
|
||||
("oldnortharabian", "Old_North_Arabian"),
|
||||
("oldpermic", "Old_Permic"),
|
||||
("oldpersian", "Old_Persian"),
|
||||
("oldsogdian", "Old_Sogdian"),
|
||||
("oldsoutharabian", "Old_South_Arabian"),
|
||||
("oldturkic", "Old_Turkic"),
|
||||
("olduyghur", "Old_Uyghur"),
|
||||
("oriya", "Oriya"),
|
||||
("orkh", "Old_Turkic"),
|
||||
("orya", "Oriya"),
|
||||
("osage", "Osage"),
|
||||
("osge", "Osage"),
|
||||
("osma", "Osmanya"),
|
||||
("osmanya", "Osmanya"),
|
||||
("ougr", "Old_Uyghur"),
|
||||
("pahawhhmong", "Pahawh_Hmong"),
|
||||
("palm", "Palmyrene"),
|
||||
("palmyrene", "Palmyrene"),
|
||||
("pauc", "Pau_Cin_Hau"),
|
||||
("paucinhau", "Pau_Cin_Hau"),
|
||||
("perm", "Old_Permic"),
|
||||
("phag", "Phags_Pa"),
|
||||
("phagspa", "Phags_Pa"),
|
||||
("phli", "Inscriptional_Pahlavi"),
|
||||
("phlp", "Psalter_Pahlavi"),
|
||||
("phnx", "Phoenician"),
|
||||
("phoenician", "Phoenician"),
|
||||
("plrd", "Miao"),
|
||||
("prti", "Inscriptional_Parthian"),
|
||||
("psalterpahlavi", "Psalter_Pahlavi"),
|
||||
("qaac", "Coptic"),
|
||||
("qaai", "Inherited"),
|
||||
("rejang", "Rejang"),
|
||||
("rjng", "Rejang"),
|
||||
("rohg", "Hanifi_Rohingya"),
|
||||
("runic", "Runic"),
|
||||
("runr", "Runic"),
|
||||
("samaritan", "Samaritan"),
|
||||
("samr", "Samaritan"),
|
||||
("sarb", "Old_South_Arabian"),
|
||||
("saur", "Saurashtra"),
|
||||
("saurashtra", "Saurashtra"),
|
||||
("sgnw", "SignWriting"),
|
||||
("sharada", "Sharada"),
|
||||
("shavian", "Shavian"),
|
||||
("shaw", "Shavian"),
|
||||
("shrd", "Sharada"),
|
||||
("sidd", "Siddham"),
|
||||
("siddham", "Siddham"),
|
||||
("signwriting", "SignWriting"),
|
||||
("sind", "Khudawadi"),
|
||||
("sinh", "Sinhala"),
|
||||
("sinhala", "Sinhala"),
|
||||
("sogd", "Sogdian"),
|
||||
("sogdian", "Sogdian"),
|
||||
("sogo", "Old_Sogdian"),
|
||||
("sora", "Sora_Sompeng"),
|
||||
("sorasompeng", "Sora_Sompeng"),
|
||||
("soyo", "Soyombo"),
|
||||
("soyombo", "Soyombo"),
|
||||
("sund", "Sundanese"),
|
||||
("sundanese", "Sundanese"),
|
||||
("sylo", "Syloti_Nagri"),
|
||||
("sylotinagri", "Syloti_Nagri"),
|
||||
("syrc", "Syriac"),
|
||||
("syriac", "Syriac"),
|
||||
("tagalog", "Tagalog"),
|
||||
("tagb", "Tagbanwa"),
|
||||
("tagbanwa", "Tagbanwa"),
|
||||
("taile", "Tai_Le"),
|
||||
("taitham", "Tai_Tham"),
|
||||
("taiviet", "Tai_Viet"),
|
||||
("takr", "Takri"),
|
||||
("takri", "Takri"),
|
||||
("tale", "Tai_Le"),
|
||||
("talu", "New_Tai_Lue"),
|
||||
("tamil", "Tamil"),
|
||||
("taml", "Tamil"),
|
||||
("tang", "Tangut"),
|
||||
("tangsa", "Tangsa"),
|
||||
("tangut", "Tangut"),
|
||||
("tavt", "Tai_Viet"),
|
||||
("telu", "Telugu"),
|
||||
("telugu", "Telugu"),
|
||||
("tfng", "Tifinagh"),
|
||||
("tglg", "Tagalog"),
|
||||
("thaa", "Thaana"),
|
||||
("thaana", "Thaana"),
|
||||
("thai", "Thai"),
|
||||
("tibetan", "Tibetan"),
|
||||
("tibt", "Tibetan"),
|
||||
("tifinagh", "Tifinagh"),
|
||||
("tirh", "Tirhuta"),
|
||||
("tirhuta", "Tirhuta"),
|
||||
("tnsa", "Tangsa"),
|
||||
("toto", "Toto"),
|
||||
("ugar", "Ugaritic"),
|
||||
("ugaritic", "Ugaritic"),
|
||||
("unknown", "Unknown"),
|
||||
("vai", "Vai"),
|
||||
("vaii", "Vai"),
|
||||
("vith", "Vithkuqi"),
|
||||
("vithkuqi", "Vithkuqi"),
|
||||
("wancho", "Wancho"),
|
||||
("wara", "Warang_Citi"),
|
||||
("warangciti", "Warang_Citi"),
|
||||
("wcho", "Wancho"),
|
||||
("xpeo", "Old_Persian"),
|
||||
("xsux", "Cuneiform"),
|
||||
("yezi", "Yezidi"),
|
||||
("yezidi", "Yezidi"),
|
||||
("yi", "Yi"),
|
||||
("yiii", "Yi"),
|
||||
("zanabazarsquare", "Zanabazar_Square"),
|
||||
("zanb", "Zanabazar_Square"),
|
||||
("zinh", "Inherited"),
|
||||
("zyyy", "Common"),
|
||||
("zzzz", "Unknown"),
|
||||
],
|
||||
),
|
||||
(
|
||||
"Sentence_Break",
|
||||
&[
|
||||
("at", "ATerm"),
|
||||
("aterm", "ATerm"),
|
||||
("cl", "Close"),
|
||||
("close", "Close"),
|
||||
("cr", "CR"),
|
||||
("ex", "Extend"),
|
||||
("extend", "Extend"),
|
||||
("fo", "Format"),
|
||||
("format", "Format"),
|
||||
("le", "OLetter"),
|
||||
("lf", "LF"),
|
||||
("lo", "Lower"),
|
||||
("lower", "Lower"),
|
||||
("nu", "Numeric"),
|
||||
("numeric", "Numeric"),
|
||||
("oletter", "OLetter"),
|
||||
("other", "Other"),
|
||||
("sc", "SContinue"),
|
||||
("scontinue", "SContinue"),
|
||||
("se", "Sep"),
|
||||
("sep", "Sep"),
|
||||
("sp", "Sp"),
|
||||
("st", "STerm"),
|
||||
("sterm", "STerm"),
|
||||
("up", "Upper"),
|
||||
("upper", "Upper"),
|
||||
("xx", "Other"),
|
||||
],
|
||||
),
|
||||
(
|
||||
"Word_Break",
|
||||
&[
|
||||
("aletter", "ALetter"),
|
||||
("cr", "CR"),
|
||||
("doublequote", "Double_Quote"),
|
||||
("dq", "Double_Quote"),
|
||||
("eb", "E_Base"),
|
||||
("ebase", "E_Base"),
|
||||
("ebasegaz", "E_Base_GAZ"),
|
||||
("ebg", "E_Base_GAZ"),
|
||||
("em", "E_Modifier"),
|
||||
("emodifier", "E_Modifier"),
|
||||
("ex", "ExtendNumLet"),
|
||||
("extend", "Extend"),
|
||||
("extendnumlet", "ExtendNumLet"),
|
||||
("fo", "Format"),
|
||||
("format", "Format"),
|
||||
("gaz", "Glue_After_Zwj"),
|
||||
("glueafterzwj", "Glue_After_Zwj"),
|
||||
("hebrewletter", "Hebrew_Letter"),
|
||||
("hl", "Hebrew_Letter"),
|
||||
("ka", "Katakana"),
|
||||
("katakana", "Katakana"),
|
||||
("le", "ALetter"),
|
||||
("lf", "LF"),
|
||||
("mb", "MidNumLet"),
|
||||
("midletter", "MidLetter"),
|
||||
("midnum", "MidNum"),
|
||||
("midnumlet", "MidNumLet"),
|
||||
("ml", "MidLetter"),
|
||||
("mn", "MidNum"),
|
||||
("newline", "Newline"),
|
||||
("nl", "Newline"),
|
||||
("nu", "Numeric"),
|
||||
("numeric", "Numeric"),
|
||||
("other", "Other"),
|
||||
("regionalindicator", "Regional_Indicator"),
|
||||
("ri", "Regional_Indicator"),
|
||||
("singlequote", "Single_Quote"),
|
||||
("sq", "Single_Quote"),
|
||||
("wsegspace", "WSegSpace"),
|
||||
("xx", "Other"),
|
||||
("zwj", "ZWJ"),
|
||||
],
|
||||
),
|
||||
];
|
||||
1263
third-party/vendor/regex-syntax/src/unicode_tables/script.rs
vendored
Normal file
1263
third-party/vendor/regex-syntax/src/unicode_tables/script.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
1457
third-party/vendor/regex-syntax/src/unicode_tables/script_extension.rs
vendored
Normal file
1457
third-party/vendor/regex-syntax/src/unicode_tables/script_extension.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
2477
third-party/vendor/regex-syntax/src/unicode_tables/sentence_break.rs
vendored
Normal file
2477
third-party/vendor/regex-syntax/src/unicode_tables/sentence_break.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
1120
third-party/vendor/regex-syntax/src/unicode_tables/word_break.rs
vendored
Normal file
1120
third-party/vendor/regex-syntax/src/unicode_tables/word_break.rs
vendored
Normal file
File diff suppressed because it is too large
Load diff
592
third-party/vendor/regex-syntax/src/utf8.rs
vendored
Normal file
592
third-party/vendor/regex-syntax/src/utf8.rs
vendored
Normal file
|
|
@ -0,0 +1,592 @@
|
|||
/*!
|
||||
Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.
|
||||
|
||||
This is sub-module is useful for constructing byte based automatons that need
|
||||
to embed UTF-8 decoding. The most common use of this module is in conjunction
|
||||
with the [`hir::ClassUnicodeRange`](crate::hir::ClassUnicodeRange) type.
|
||||
|
||||
See the documentation on the `Utf8Sequences` iterator for more details and
|
||||
an example.
|
||||
|
||||
# Wait, what is this?
|
||||
|
||||
This is simplest to explain with an example. Let's say you wanted to test
|
||||
whether a particular byte sequence was a Cyrillic character. One possible
|
||||
scalar value range is `[0400-04FF]`. The set of allowed bytes for this
|
||||
range can be expressed as a sequence of byte ranges:
|
||||
|
||||
```text
|
||||
[D0-D3][80-BF]
|
||||
```
|
||||
|
||||
This is simple enough: simply encode the boundaries, `0400` encodes to
|
||||
`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
|
||||
corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
|
||||
|
||||
However, what if you wanted to add the Cyrillic Supplementary characters to
|
||||
your range? Your range might then become `[0400-052F]`. The same procedure
|
||||
as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
|
||||
you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
|
||||
this isn't quite correct because this range doesn't capture many characters,
|
||||
for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
|
||||
|
||||
Instead, you need multiple sequences of byte ranges:
|
||||
|
||||
```text
|
||||
[D0-D3][80-BF] # matches codepoints 0400-04FF
|
||||
[D4][80-AF] # matches codepoints 0500-052F
|
||||
```
|
||||
|
||||
This gets even more complicated if you want bigger ranges, particularly if
|
||||
they naively contain surrogate codepoints. For example, the sequence of byte
|
||||
ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
|
||||
|
||||
```text
|
||||
[0-7F]
|
||||
[C2-DF][80-BF]
|
||||
[E0][A0-BF][80-BF]
|
||||
[E1-EC][80-BF][80-BF]
|
||||
[ED][80-9F][80-BF]
|
||||
[EE-EF][80-BF][80-BF]
|
||||
```
|
||||
|
||||
Note that the byte ranges above will *not* match any erroneous encoding of
|
||||
UTF-8, including encodings of surrogate codepoints.
|
||||
|
||||
And, of course, for all of Unicode (`[000000-10FFFF]`):
|
||||
|
||||
```text
|
||||
[0-7F]
|
||||
[C2-DF][80-BF]
|
||||
[E0][A0-BF][80-BF]
|
||||
[E1-EC][80-BF][80-BF]
|
||||
[ED][80-9F][80-BF]
|
||||
[EE-EF][80-BF][80-BF]
|
||||
[F0][90-BF][80-BF][80-BF]
|
||||
[F1-F3][80-BF][80-BF][80-BF]
|
||||
[F4][80-8F][80-BF][80-BF]
|
||||
```
|
||||
|
||||
This module automates the process of creating these byte ranges from ranges of
|
||||
Unicode scalar values.
|
||||
|
||||
# Lineage
|
||||
|
||||
I got the idea and general implementation strategy from Russ Cox in his
|
||||
[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
|
||||
Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
|
||||
I also got the idea from
|
||||
[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
|
||||
which uses it for executing automata on their term index.
|
||||
*/
|
||||
|
||||
use core::{char, fmt, iter::FusedIterator, slice};
|
||||
|
||||
use alloc::{vec, vec::Vec};
|
||||
|
||||
const MAX_UTF8_BYTES: usize = 4;
|
||||
|
||||
/// Utf8Sequence represents a sequence of byte ranges.
|
||||
///
|
||||
/// To match a Utf8Sequence, a candidate byte sequence must match each
|
||||
/// successive range.
|
||||
///
|
||||
/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
|
||||
/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
|
||||
#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
|
||||
pub enum Utf8Sequence {
|
||||
/// One byte range.
|
||||
One(Utf8Range),
|
||||
/// Two successive byte ranges.
|
||||
Two([Utf8Range; 2]),
|
||||
/// Three successive byte ranges.
|
||||
Three([Utf8Range; 3]),
|
||||
/// Four successive byte ranges.
|
||||
Four([Utf8Range; 4]),
|
||||
}
|
||||
|
||||
impl Utf8Sequence {
|
||||
/// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
|
||||
/// range.
|
||||
///
|
||||
/// This assumes that `start` and `end` have the same length.
|
||||
fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
|
||||
assert_eq!(start.len(), end.len());
|
||||
match start.len() {
|
||||
2 => Utf8Sequence::Two([
|
||||
Utf8Range::new(start[0], end[0]),
|
||||
Utf8Range::new(start[1], end[1]),
|
||||
]),
|
||||
3 => Utf8Sequence::Three([
|
||||
Utf8Range::new(start[0], end[0]),
|
||||
Utf8Range::new(start[1], end[1]),
|
||||
Utf8Range::new(start[2], end[2]),
|
||||
]),
|
||||
4 => Utf8Sequence::Four([
|
||||
Utf8Range::new(start[0], end[0]),
|
||||
Utf8Range::new(start[1], end[1]),
|
||||
Utf8Range::new(start[2], end[2]),
|
||||
Utf8Range::new(start[3], end[3]),
|
||||
]),
|
||||
n => unreachable!("invalid encoded length: {}", n),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the underlying sequence of byte ranges as a slice.
|
||||
pub fn as_slice(&self) -> &[Utf8Range] {
|
||||
use self::Utf8Sequence::*;
|
||||
match *self {
|
||||
One(ref r) => slice::from_ref(r),
|
||||
Two(ref r) => &r[..],
|
||||
Three(ref r) => &r[..],
|
||||
Four(ref r) => &r[..],
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of byte ranges in this sequence.
|
||||
///
|
||||
/// The length is guaranteed to be in the closed interval `[1, 4]`.
|
||||
pub fn len(&self) -> usize {
|
||||
self.as_slice().len()
|
||||
}
|
||||
|
||||
/// Reverses the ranges in this sequence.
|
||||
///
|
||||
/// For example, if this corresponds to the following sequence:
|
||||
///
|
||||
/// ```text
|
||||
/// [D0-D3][80-BF]
|
||||
/// ```
|
||||
///
|
||||
/// Then after reversal, it will be
|
||||
///
|
||||
/// ```text
|
||||
/// [80-BF][D0-D3]
|
||||
/// ```
|
||||
///
|
||||
/// This is useful when one is constructing a UTF-8 automaton to match
|
||||
/// character classes in reverse.
|
||||
pub fn reverse(&mut self) {
|
||||
match *self {
|
||||
Utf8Sequence::One(_) => {}
|
||||
Utf8Sequence::Two(ref mut x) => x.reverse(),
|
||||
Utf8Sequence::Three(ref mut x) => x.reverse(),
|
||||
Utf8Sequence::Four(ref mut x) => x.reverse(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if and only if a prefix of `bytes` matches this sequence
|
||||
/// of byte ranges.
|
||||
pub fn matches(&self, bytes: &[u8]) -> bool {
|
||||
if bytes.len() < self.len() {
|
||||
return false;
|
||||
}
|
||||
for (&b, r) in bytes.iter().zip(self) {
|
||||
if !r.matches(b) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> IntoIterator for &'a Utf8Sequence {
|
||||
type IntoIter = slice::Iter<'a, Utf8Range>;
|
||||
type Item = &'a Utf8Range;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.as_slice().iter()
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Utf8Sequence {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
use self::Utf8Sequence::*;
|
||||
match *self {
|
||||
One(ref r) => write!(f, "{:?}", r),
|
||||
Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
|
||||
Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
|
||||
Four(ref r) => {
|
||||
write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3])
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A single inclusive range of UTF-8 bytes.
|
||||
#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
|
||||
pub struct Utf8Range {
|
||||
/// Start of byte range (inclusive).
|
||||
pub start: u8,
|
||||
/// End of byte range (inclusive).
|
||||
pub end: u8,
|
||||
}
|
||||
|
||||
impl Utf8Range {
|
||||
fn new(start: u8, end: u8) -> Self {
|
||||
Utf8Range { start, end }
|
||||
}
|
||||
|
||||
/// Returns true if and only if the given byte is in this range.
|
||||
pub fn matches(&self, b: u8) -> bool {
|
||||
self.start <= b && b <= self.end
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Utf8Range {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
if self.start == self.end {
|
||||
write!(f, "[{:X}]", self.start)
|
||||
} else {
|
||||
write!(f, "[{:X}-{:X}]", self.start, self.end)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over ranges of matching UTF-8 byte sequences.
|
||||
///
|
||||
/// The iteration represents an alternation of comprehensive byte sequences
|
||||
/// that match precisely the set of UTF-8 encoded scalar values.
|
||||
///
|
||||
/// A byte sequence corresponds to one of the scalar values in the range given
|
||||
/// if and only if it completely matches exactly one of the sequences of byte
|
||||
/// ranges produced by this iterator.
|
||||
///
|
||||
/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
|
||||
/// sequences will match the same bytes.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// This shows how to match an arbitrary byte sequence against a range of
|
||||
/// scalar values.
|
||||
///
|
||||
/// ```rust
|
||||
/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
|
||||
///
|
||||
/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
|
||||
/// for range in seqs {
|
||||
/// if range.matches(bytes) {
|
||||
/// return true;
|
||||
/// }
|
||||
/// }
|
||||
/// false
|
||||
/// }
|
||||
///
|
||||
/// // Test the basic multilingual plane.
|
||||
/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
|
||||
///
|
||||
/// // UTF-8 encoding of 'a'.
|
||||
/// assert!(matches(&seqs, &[0x61]));
|
||||
/// // UTF-8 encoding of '☃' (`\u{2603}`).
|
||||
/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
|
||||
/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
|
||||
/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
|
||||
/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
|
||||
/// // which is invalid UTF-8, and therefore fails, despite the fact that
|
||||
/// // the corresponding codepoint (0xD800) falls in the range given.
|
||||
/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
|
||||
/// // And fails against plain old invalid UTF-8.
|
||||
/// assert!(!matches(&seqs, &[0xFF, 0xFF]));
|
||||
/// ```
|
||||
///
|
||||
/// If this example seems circuitous, that's because it is! It's meant to be
|
||||
/// illustrative. In practice, you could just try to decode your byte sequence
|
||||
/// and compare it with the scalar value range directly. However, this is not
|
||||
/// always possible (for example, in a byte based automaton).
|
||||
#[derive(Debug)]
|
||||
pub struct Utf8Sequences {
|
||||
range_stack: Vec<ScalarRange>,
|
||||
}
|
||||
|
||||
impl Utf8Sequences {
|
||||
/// Create a new iterator over UTF-8 byte ranges for the scalar value range
|
||||
/// given.
|
||||
pub fn new(start: char, end: char) -> Self {
|
||||
let mut it = Utf8Sequences { range_stack: vec![] };
|
||||
it.push(u32::from(start), u32::from(end));
|
||||
it
|
||||
}
|
||||
|
||||
/// reset resets the scalar value range.
|
||||
/// Any existing state is cleared, but resources may be reused.
|
||||
///
|
||||
/// N.B. Benchmarks say that this method is dubious.
|
||||
#[doc(hidden)]
|
||||
pub fn reset(&mut self, start: char, end: char) {
|
||||
self.range_stack.clear();
|
||||
self.push(u32::from(start), u32::from(end));
|
||||
}
|
||||
|
||||
fn push(&mut self, start: u32, end: u32) {
|
||||
self.range_stack.push(ScalarRange { start, end });
|
||||
}
|
||||
}
|
||||
|
||||
struct ScalarRange {
|
||||
start: u32,
|
||||
end: u32,
|
||||
}
|
||||
|
||||
impl fmt::Debug for ScalarRange {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for Utf8Sequences {
|
||||
type Item = Utf8Sequence;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
'TOP: while let Some(mut r) = self.range_stack.pop() {
|
||||
'INNER: loop {
|
||||
if let Some((r1, r2)) = r.split() {
|
||||
self.push(r2.start, r2.end);
|
||||
r.start = r1.start;
|
||||
r.end = r1.end;
|
||||
continue 'INNER;
|
||||
}
|
||||
if !r.is_valid() {
|
||||
continue 'TOP;
|
||||
}
|
||||
for i in 1..MAX_UTF8_BYTES {
|
||||
let max = max_scalar_value(i);
|
||||
if r.start <= max && max < r.end {
|
||||
self.push(max + 1, r.end);
|
||||
r.end = max;
|
||||
continue 'INNER;
|
||||
}
|
||||
}
|
||||
if let Some(ascii_range) = r.as_ascii() {
|
||||
return Some(Utf8Sequence::One(ascii_range));
|
||||
}
|
||||
for i in 1..MAX_UTF8_BYTES {
|
||||
let m = (1 << (6 * i)) - 1;
|
||||
if (r.start & !m) != (r.end & !m) {
|
||||
if (r.start & m) != 0 {
|
||||
self.push((r.start | m) + 1, r.end);
|
||||
r.end = r.start | m;
|
||||
continue 'INNER;
|
||||
}
|
||||
if (r.end & m) != m {
|
||||
self.push(r.end & !m, r.end);
|
||||
r.end = (r.end & !m) - 1;
|
||||
continue 'INNER;
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut start = [0; MAX_UTF8_BYTES];
|
||||
let mut end = [0; MAX_UTF8_BYTES];
|
||||
let n = r.encode(&mut start, &mut end);
|
||||
return Some(Utf8Sequence::from_encoded_range(
|
||||
&start[0..n],
|
||||
&end[0..n],
|
||||
));
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl FusedIterator for Utf8Sequences {}
|
||||
|
||||
impl ScalarRange {
|
||||
/// split splits this range if it overlaps with a surrogate codepoint.
|
||||
///
|
||||
/// Either or both ranges may be invalid.
|
||||
fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
|
||||
if self.start < 0xE000 && self.end > 0xD7FF {
|
||||
Some((
|
||||
ScalarRange { start: self.start, end: 0xD7FF },
|
||||
ScalarRange { start: 0xE000, end: self.end },
|
||||
))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// is_valid returns true if and only if start <= end.
|
||||
fn is_valid(&self) -> bool {
|
||||
self.start <= self.end
|
||||
}
|
||||
|
||||
/// as_ascii returns this range as a Utf8Range if and only if all scalar
|
||||
/// values in this range can be encoded as a single byte.
|
||||
fn as_ascii(&self) -> Option<Utf8Range> {
|
||||
if self.is_ascii() {
|
||||
let start = u8::try_from(self.start).unwrap();
|
||||
let end = u8::try_from(self.end).unwrap();
|
||||
Some(Utf8Range::new(start, end))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// is_ascii returns true if the range is ASCII only (i.e., takes a single
|
||||
/// byte to encode any scalar value).
|
||||
fn is_ascii(&self) -> bool {
|
||||
self.is_valid() && self.end <= 0x7f
|
||||
}
|
||||
|
||||
/// encode writes the UTF-8 encoding of the start and end of this range
|
||||
/// to the corresponding destination slices, and returns the number of
|
||||
/// bytes written.
|
||||
///
|
||||
/// The slices should have room for at least `MAX_UTF8_BYTES`.
|
||||
fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
|
||||
let cs = char::from_u32(self.start).unwrap();
|
||||
let ce = char::from_u32(self.end).unwrap();
|
||||
let ss = cs.encode_utf8(start);
|
||||
let se = ce.encode_utf8(end);
|
||||
assert_eq!(ss.len(), se.len());
|
||||
ss.len()
|
||||
}
|
||||
}
|
||||
|
||||
fn max_scalar_value(nbytes: usize) -> u32 {
|
||||
match nbytes {
|
||||
1 => 0x007F,
|
||||
2 => 0x07FF,
|
||||
3 => 0xFFFF,
|
||||
4 => 0x0010_FFFF,
|
||||
_ => unreachable!("invalid UTF-8 byte sequence size"),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use core::char;
|
||||
|
||||
use alloc::{vec, vec::Vec};
|
||||
|
||||
use crate::utf8::{Utf8Range, Utf8Sequences};
|
||||
|
||||
fn rutf8(s: u8, e: u8) -> Utf8Range {
|
||||
Utf8Range::new(s, e)
|
||||
}
|
||||
|
||||
fn never_accepts_surrogate_codepoints(start: char, end: char) {
|
||||
for cp in 0xD800..0xE000 {
|
||||
let buf = encode_surrogate(cp);
|
||||
for r in Utf8Sequences::new(start, end) {
|
||||
if r.matches(&buf) {
|
||||
panic!(
|
||||
"Sequence ({:X}, {:X}) contains range {:?}, \
|
||||
which matches surrogate code point {:X} \
|
||||
with encoded bytes {:?}",
|
||||
u32::from(start),
|
||||
u32::from(end),
|
||||
r,
|
||||
cp,
|
||||
buf,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn codepoints_no_surrogates() {
|
||||
never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
|
||||
never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
|
||||
never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
|
||||
never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
|
||||
never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn single_codepoint_one_sequence() {
|
||||
// Tests that every range of scalar values that contains a single
|
||||
// scalar value is recognized by one sequence of byte ranges.
|
||||
for i in 0x0..=0x0010_FFFF {
|
||||
let c = match char::from_u32(i) {
|
||||
None => continue,
|
||||
Some(c) => c,
|
||||
};
|
||||
let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
|
||||
assert_eq!(seqs.len(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bmp() {
|
||||
use crate::utf8::Utf8Sequence::*;
|
||||
|
||||
let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
|
||||
assert_eq!(
|
||||
seqs,
|
||||
vec![
|
||||
One(rutf8(0x0, 0x7F)),
|
||||
Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
|
||||
Three([
|
||||
rutf8(0xE0, 0xE0),
|
||||
rutf8(0xA0, 0xBF),
|
||||
rutf8(0x80, 0xBF)
|
||||
]),
|
||||
Three([
|
||||
rutf8(0xE1, 0xEC),
|
||||
rutf8(0x80, 0xBF),
|
||||
rutf8(0x80, 0xBF)
|
||||
]),
|
||||
Three([
|
||||
rutf8(0xED, 0xED),
|
||||
rutf8(0x80, 0x9F),
|
||||
rutf8(0x80, 0xBF)
|
||||
]),
|
||||
Three([
|
||||
rutf8(0xEE, 0xEF),
|
||||
rutf8(0x80, 0xBF),
|
||||
rutf8(0x80, 0xBF)
|
||||
]),
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn reverse() {
|
||||
use crate::utf8::Utf8Sequence::*;
|
||||
|
||||
let mut s = One(rutf8(0xA, 0xB));
|
||||
s.reverse();
|
||||
assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);
|
||||
|
||||
let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
|
||||
s.reverse();
|
||||
assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);
|
||||
|
||||
let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
|
||||
s.reverse();
|
||||
assert_eq!(
|
||||
s.as_slice(),
|
||||
&[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
|
||||
);
|
||||
|
||||
let mut s = Four([
|
||||
rutf8(0xA, 0xB),
|
||||
rutf8(0xB, 0xC),
|
||||
rutf8(0xC, 0xD),
|
||||
rutf8(0xD, 0xE),
|
||||
]);
|
||||
s.reverse();
|
||||
assert_eq!(
|
||||
s.as_slice(),
|
||||
&[
|
||||
rutf8(0xD, 0xE),
|
||||
rutf8(0xC, 0xD),
|
||||
rutf8(0xB, 0xC),
|
||||
rutf8(0xA, 0xB)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
fn encode_surrogate(cp: u32) -> [u8; 3] {
|
||||
const TAG_CONT: u8 = 0b1000_0000;
|
||||
const TAG_THREE_B: u8 = 0b1110_0000;
|
||||
|
||||
assert!(0xD800 <= cp && cp < 0xE000);
|
||||
let mut dst = [0; 3];
|
||||
dst[0] = u8::try_from(cp >> 12 & 0x0F).unwrap() | TAG_THREE_B;
|
||||
dst[1] = u8::try_from(cp >> 6 & 0x3F).unwrap() | TAG_CONT;
|
||||
dst[2] = u8::try_from(cp & 0x3F).unwrap() | TAG_CONT;
|
||||
dst
|
||||
}
|
||||
}
|
||||
30
third-party/vendor/regex-syntax/test
vendored
Executable file
30
third-party/vendor/regex-syntax/test
vendored
Executable file
|
|
@ -0,0 +1,30 @@
|
|||
#!/bin/bash
|
||||
|
||||
set -e
|
||||
|
||||
# cd to the directory containing this crate's Cargo.toml so that we don't need
|
||||
# to pass --manifest-path to every `cargo` command.
|
||||
cd "$(dirname "$0")"
|
||||
|
||||
# This is a convenience script for running a broad swath of the syntax tests.
|
||||
echo "===== DEFAULT FEATURES ==="
|
||||
cargo test
|
||||
|
||||
features=(
|
||||
std
|
||||
unicode
|
||||
unicode-age
|
||||
unicode-bool
|
||||
unicode-case
|
||||
unicode-gencat
|
||||
unicode-perl
|
||||
unicode-script
|
||||
unicode-segment
|
||||
)
|
||||
for f in "${features[@]}"; do
|
||||
echo "=== FEATURE: $f ==="
|
||||
# We only run library tests because I couldn't figure out how to easily
|
||||
# make doc tests run in 'no_std' mode. In particular, without the Error
|
||||
# trait, using '?' in doc tests seems tricky.
|
||||
cargo test --no-default-features --lib --features "$f"
|
||||
done
|
||||
Loading…
Add table
Add a link
Reference in a new issue