Vendor things

This commit is contained in:
John Doty 2024-03-08 11:03:01 -08:00
parent 5deceec006
commit 977e3c17e5
19434 changed files with 10682014 additions and 0 deletions

View file

@ -0,0 +1,85 @@
#![forbid(unsafe_code)]
use std::cmp;
use std::collections::HashSet;
use std::env;
use std::fs::File;
use std::io::{self, BufReader, Read};
use xml::reader::XmlEvent;
use xml::ParserConfig;
/// Reads an XML document and prints summary statistics about its contents.
///
/// The document comes from the file named by the first command-line argument,
/// or from standard input when no argument is given. An error is returned if
/// the file cannot be opened or if the parser reports an error.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Both possible inputs are declared up front so that whichever one gets
    // initialised outlives the `&mut dyn Read` borrowed from it.
    let mut file;
    let mut stdin;
    let source: &mut dyn Read = match env::args().nth(1) {
        Some(file_name) => {
            file = File::open(file_name).map_err(|e| format!("Cannot open input file: {e}"))?;
            &mut file
        }
        None => {
            stdin = io::stdin();
            &mut stdin
        }
    };

    let config = ParserConfig::new()
        .whitespace_to_characters(true)
        .ignore_comments(false);
    let reader = config.create_reader(BufReader::new(source));

    // Running totals, updated once per parser event.
    let mut processing_instructions = 0;
    let mut elements = 0;
    let mut character_blocks = 0;
    let mut cdata_blocks = 0;
    let mut characters = 0;
    let mut comment_blocks = 0;
    let mut comment_characters = 0;
    let mut namespaces = HashSet::new();
    let mut depth = 0;
    let mut max_depth = 0;

    for event in reader {
        let event = event.map_err(|e| format!("Error parsing XML document: {e}"))?;
        match event {
            XmlEvent::StartDocument { version, encoding, standalone } => {
                let standalone_prefix = if standalone.unwrap_or(false) { "" } else { "not " };
                println!(
                    "XML document version {}, encoded in {}, {}standalone",
                    version, encoding, standalone_prefix
                );
            }
            XmlEvent::EndDocument => println!("Document finished"),
            XmlEvent::ProcessingInstruction { .. } => processing_instructions += 1,
            // Can't happen: the reader is configured to report whitespace as characters.
            XmlEvent::Whitespace(_) => {}
            XmlEvent::Characters(text) => {
                character_blocks += 1;
                characters += text.len();
            }
            XmlEvent::CData(text) => {
                cdata_blocks += 1;
                characters += text.len();
            }
            XmlEvent::Comment(text) => {
                comment_blocks += 1;
                comment_characters += text.len();
            }
            XmlEvent::StartElement { namespace, .. } => {
                elements += 1;
                depth += 1;
                max_depth = cmp::max(max_depth, depth);
                // Collect every namespace URI visible on this element.
                namespaces.extend(namespace.0.into_values());
            }
            XmlEvent::EndElement { .. } => depth -= 1,
        }
    }

    // Built-in URIs are not counted as document namespaces.
    for builtin_uri in [
        xml::namespace::NS_EMPTY_URI,
        xml::namespace::NS_XMLNS_URI,
        xml::namespace::NS_XML_URI,
    ] {
        namespaces.remove(builtin_uri);
    }

    println!("Elements: {elements}, maximum depth: {max_depth}");
    println!("Namespaces (excluding built-in): {}", namespaces.len());
    println!("Characters: {characters}, characters blocks: {character_blocks}, CDATA blocks: {cdata_blocks}");
    println!("Comment blocks: {comment_blocks}, comment characters: {comment_characters}");
    println!("Processing instructions (excluding built-in): {processing_instructions}");
    Ok(())
}

View file

@ -0,0 +1,103 @@
//! Contains XML attributes manipulation types and functions.
//!
use std::fmt;
use crate::escape::{AttributeEscapes, Escaped};
use crate::name::{Name, OwnedName};
/// A borrowed version of an XML attribute.
///
/// Consists of a borrowed qualified name and a borrowed string value.
/// Both fields are borrowed, so the struct is `Copy` and can be passed by value.
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
pub struct Attribute<'a> {
/// Attribute name.
pub name: Name<'a>,
/// Attribute value.
///
/// Stored as given; the `Display` implementation escapes it when the
/// attribute is formatted.
pub value: &'a str,
}
impl fmt::Display for Attribute<'_> {
    /// Formats the attribute as `name="value"`, escaping the value with the
    /// attribute escaping rules.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let escaped_value = Escaped::<AttributeEscapes>::new(self.value);
        write!(f, "{}=\"{}\"", self.name, escaped_value)
    }
}
impl<'a> Attribute<'a> {
/// Creates an owned attribute out of this borrowed one.
#[inline]
#[must_use]
pub fn to_owned(&self) -> OwnedAttribute {
OwnedAttribute {
name: self.name.into(),
value: self.value.into(),
}
}
/// Creates a borrowed attribute using the provided borrowed name and a borrowed string value.
#[inline]
#[must_use]
pub fn new(name: Name<'a>, value: &'a str) -> Attribute<'a> {
Attribute { name, value }
}
}
/// An owned version of an XML attribute.
///
/// Consists of an owned qualified name and an owned string value.
/// Use `borrow()` to obtain the borrowed `Attribute` form.
#[derive(Clone, Eq, PartialEq, Hash, Debug)]
pub struct OwnedAttribute {
/// Attribute name.
pub name: OwnedName,
/// Attribute value.
///
/// Stored as given; the `Display` implementation escapes it when the
/// attribute is formatted.
pub value: String,
}
impl OwnedAttribute {
/// Returns a borrowed `Attribute` out of this owned one.
#[must_use]
#[inline]
pub fn borrow(&self) -> Attribute<'_> {
Attribute {
name: self.name.borrow(),
value: &self.value,
}
}
/// Creates a new owned attribute using the provided owned name and an owned string value.
#[inline]
pub fn new<S: Into<String>>(name: OwnedName, value: S) -> OwnedAttribute {
OwnedAttribute {
name,
value: value.into(),
}
}
}
impl fmt::Display for OwnedAttribute {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}=\"{}\"", self.name, Escaped::<AttributeEscapes>::new(&self.value))
}
}
#[cfg(test)]
mod tests {
    use super::Attribute;
    use crate::name::Name;

    /// A displayed attribute shows the namespace in braces, then
    /// `prefix:local`, then the quoted value with markup characters escaped.
    #[test]
    fn attribute_display() {
        let name = Name::qualified("attribute", "urn:namespace", Some("n"));
        let attr = Attribute::new(name, "its value with > & \" ' < weird symbols");
        let rendered = attr.to_string();
        assert_eq!(
            rendered,
            "{urn:namespace}n:attribute=\"its value with &gt; &amp; &quot; &apos; &lt; weird symbols\""
        );
    }
}

157
third-party/vendor/xml-rs/src/common.rs vendored Normal file
View file

@ -0,0 +1,157 @@
//! Contains common types and functions used throughout the library.
use std::fmt;
/// Represents a position inside some textual document.
///
/// Both coordinates are stored zero-based.
#[derive(Copy, Clone, PartialEq, Eq)]
pub struct TextPosition {
    /// Row, counting from 0
    pub row: u64,
    /// Column, counting from 0
    pub column: u64,
}

impl TextPosition {
    /// Creates a new position initialized to the beginning of the document
    #[inline]
    #[must_use]
    pub fn new() -> TextPosition {
        TextPosition { row: 0, column: 0 }
    }

    /// Advances the position in a line by `count` columns.
    #[inline]
    pub fn advance(&mut self, count: u8) {
        self.column += u64::from(count);
    }

    /// Advances the position in a line to the next tab position.
    ///
    /// Tab stops are the multiples of `width`; the column always moves
    /// forward, by a full `width` when it is already on a tab stop.
    /// A `width` of zero defines no tab stops and leaves the position
    /// unchanged.
    #[inline]
    pub fn advance_to_tab(&mut self, width: u8) {
        // Guard first: `column % 0` below would panic with a division by zero.
        if width == 0 {
            return;
        }
        let width = u64::from(width);
        self.column += width - self.column % width;
    }

    /// Advances the position to the beginning of the next line
    #[inline]
    pub fn new_line(&mut self) {
        self.column = 0;
        self.row += 1;
    }
}
impl fmt::Debug for TextPosition {
    /// Formats the position as `row:column`, with both numbers one-based.
    #[cold]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let (line, column) = (self.row + 1, self.column + 1);
        write!(f, "{}:{}", line, column)
    }
}
impl fmt::Display for TextPosition {
    /// Formats the position as `row:column`, converting the stored
    /// zero-based coordinates to one-based numbers.
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let row = self.row + 1;
        let column = self.column + 1;
        write!(f, "{}:{}", row, column)
    }
}
/// Get the position in the document corresponding to the object
///
/// This trait is implemented by parsers, lexers and errors.
/// `TextPosition` implements it too, returning itself.
pub trait Position {
/// Returns the current position or a position corresponding to the object.
///
/// The position is returned by value; `TextPosition` is `Copy`.
fn position(&self) -> TextPosition;
}
// A position is trivially its own position, so a `TextPosition` can be used
// wherever something implementing `Position` is expected.
impl Position for TextPosition {
/// Returns a copy of this position.
#[inline]
fn position(&self) -> TextPosition {
*self
}
}
/// XML version enumeration.
///
/// The derived ordering follows declaration order, so `Version10` compares
/// less than `Version11`.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum XmlVersion {
    /// XML version 1.0.
    Version10,
    /// XML version 1.1.
    Version11,
}

impl fmt::Display for XmlVersion {
    /// Writes the version number, `1.0` or `1.1`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let text = match self {
            Self::Version10 => "1.0",
            Self::Version11 => "1.1",
        };
        // Formatting through `str` keeps width and alignment flags working.
        fmt::Display::fmt(text, f)
    }
}

impl fmt::Debug for XmlVersion {
    /// Prints the same text as `Display`.
    #[cold]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        <Self as fmt::Display>::fmt(self, f)
    }
}
/// Tells whether `c` is one of the four white space characters (`S`)
/// defined by the XML 1.1 specification, [section 2.3][1]: space, tab,
/// line feed or carriage return.
///
/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn
#[must_use]
#[inline]
pub fn is_whitespace_char(c: char) -> bool {
    matches!(c, ' ' | '\t' | '\n' | '\r')
}
/// Checks whether the given string is compound only by white space
/// characters (`S`) using the previous `is_whitespace_char` to check
/// all characters of this string
pub fn is_whitespace_str(s: &str) -> bool {
s.chars().all(is_whitespace_char)
}
/// Tells whether `c` falls in the character ranges accepted for XML 1.0:
/// tab, line feed, carriage return, `U+0020..=U+D7FF`, `U+E000..=U+FFFD`,
/// and everything from `U+10000` upwards.
#[must_use]
pub fn is_xml10_char(c: char) -> bool {
    match c {
        '\t' | '\n' | '\r' => true,
        ' '..='\u{D7FF}' => true,
        '\u{E000}'..='\u{FFFD}' => true,
        _ => c >= '\u{10000}',
    }
}
/// Tells whether `c` falls in the character ranges accepted for XML 1.1:
/// `U+0001..=U+D7FF`, `U+E000..=U+FFFD`, and everything from `U+10000`
/// upwards.
#[must_use]
pub fn is_xml11_char(c: char) -> bool {
    match c {
        '\u{01}'..='\u{D7FF}' => true,
        '\u{E000}'..='\u{FFFD}' => true,
        _ => c >= '\u{10000}',
    }
}
/// Tells whether `c` is accepted by `is_xml11_char` and lies outside the
/// restricted control ranges `U+0001..=U+0008`, `U+000B..=U+000C`,
/// `U+000E..=U+001F`, `U+007F..=U+0084` and `U+0086..=U+009F`.
#[must_use]
pub fn is_xml11_char_not_restricted(c: char) -> bool {
    let restricted = matches!(
        c,
        '\u{01}'..='\u{08}'
            | '\u{0B}'..='\u{0C}'
            | '\u{0E}'..='\u{1F}'
            | '\u{7F}'..='\u{84}'
            | '\u{86}'..='\u{9F}'
    );
    !restricted && is_xml11_char(c)
}
/// Tells whether `c` may begin an XML name (`NameStartChar`) as defined by
/// the XML 1.1 specification, [section 2.3][1].
///
/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn
#[must_use]
pub fn is_name_start_char(c: char) -> bool {
    matches!(
        c,
        ':' | '_'
            | 'A'..='Z'
            | 'a'..='z'
            | '\u{C0}'..='\u{D6}'
            | '\u{D8}'..='\u{F6}'
            | '\u{F8}'..='\u{2FF}'
            | '\u{370}'..='\u{37D}'
            | '\u{37F}'..='\u{1FFF}'
            | '\u{200C}'..='\u{200D}'
            | '\u{2070}'..='\u{218F}'
            | '\u{2C00}'..='\u{2FEF}'
            | '\u{3001}'..='\u{D7FF}'
            | '\u{F900}'..='\u{FDCF}'
            | '\u{FDF0}'..='\u{FFFD}'
            | '\u{10000}'..='\u{EFFFF}'
    )
}
/// Tells whether `c` may appear inside an XML name (`NameChar`) as defined
/// by the XML 1.1 specification, [section 2.3][1].
///
/// Every name start character qualifies, plus `-`, `.`, the ASCII digits,
/// `U+00B7`, `U+0300..=U+036F` and `U+203F..=U+2040`.
///
/// [1]: http://www.w3.org/TR/2006/REC-xml11-20060816/#sec-common-syn
#[must_use]
pub fn is_name_char(c: char) -> bool {
    is_name_start_char(c)
        || matches!(
            c,
            '-' | '.' | '0'..='9' | '\u{B7}' | '\u{300}'..='\u{36F}' | '\u{203F}'..='\u{2040}'
        )
}

162
third-party/vendor/xml-rs/src/escape.rs vendored Normal file
View file

@ -0,0 +1,162 @@
//! Contains functions for performing XML special characters escaping.
use std::{borrow::Cow, marker::PhantomData, fmt::{Display, Result, Formatter}};
/// A set of escaping rules that map single bytes to replacement text.
pub(crate) trait Escapes {
    /// Returns the replacement text for byte `c`, or `None` when `c` is
    /// written unchanged.
    fn escape(c: u8) -> Option<&'static str>;

    /// Tells whether byte `c` has a replacement.
    fn byte_needs_escaping(c: u8) -> bool {
        Self::escape(c).is_some()
    }

    /// Tells whether at least one byte of `s` has a replacement.
    fn str_needs_escaping(s: &str) -> bool {
        s.bytes().find_map(Self::escape).is_some()
    }
}
/// A string slice paired with the escaping rules `E` to apply to it.
///
/// The escaping happens in the `Display` implementation, when the value is
/// formatted.
pub(crate) struct Escaped<'a, E: Escapes> {
// Carries the rule set `E` in the type without storing a value of it.
_escape_phantom: PhantomData<E>,
// The unescaped input text.
to_escape: &'a str,
}
impl<'a, E: Escapes> Escaped<'a, E> {
    /// Wraps `s` so that it can be formatted with the escaping rules `E`.
    pub fn new(s: &'a str) -> Self {
        Self {
            to_escape: s,
            _escape_phantom: PhantomData,
        }
    }
}
impl<'a, E: Escapes> Display for Escaped<'a, E> {
    /// Writes the wrapped string, replacing every byte that `E` escapes with
    /// its replacement text and copying everything else through unchanged.
    fn fmt(&self, f: &mut Formatter<'_>) -> Result {
        let text = self.to_escape;
        // Start of the stretch of input that has not been written yet.
        let mut pending_from = 0;
        for (idx, byte) in text.bytes().enumerate() {
            if !E::byte_needs_escaping(byte) {
                continue;
            }
            // Flush the untouched text before this occurrence, then the replacement.
            f.write_str(&text[pending_from..idx])?;
            // unwrap is safe because `byte_needs_escaping` just reported a replacement
            let replacement = E::escape(byte).unwrap();
            f.write_str(replacement)?;
            pending_from = idx + 1;
        }
        f.write_str(&text[pending_from..])
    }
}
/// Escapes `s` with the rules `E`.
///
/// The input is returned borrowed when nothing needs escaping; a new
/// string is allocated only otherwise.
fn escape_str<E: Escapes>(s: &str) -> Cow<'_, str> {
    if !E::str_needs_escaping(s) {
        return Cow::Borrowed(s);
    }
    Cow::Owned(Escaped::<E>::new(s).to_string())
}
// Declares a unit struct `$name` and implements `Escapes` for it from a list
// of `byte => replacement` pairs. Bytes that are not listed map to `None`.
macro_rules! escapes {
{
$name: ident,
$($k: expr => $v: expr),* $(,)?
} => {
pub(crate) struct $name;
impl Escapes for $name {
fn escape(c: u8) -> Option<&'static str> {
match c {
// One match arm per listed byte.
$( $k => Some($v),)*
_ => None
}
}
}
};
}
// Escaping rules for attribute values: the five markup characters, plus line
// feed and carriage return as numeric character references.
escapes!(
AttributeEscapes,
b'<' => "&lt;",
b'>' => "&gt;",
b'"' => "&quot;",
b'\'' => "&apos;",
b'&' => "&amp;",
b'\n' => "&#xA;",
b'\r' => "&#xD;",
);
// Escaping rules for PCDATA: only `<` and `&` are replaced.
escapes!(
PcDataEscapes,
b'<' => "&lt;",
b'&' => "&amp;",
);
/// Performs escaping of common XML characters inside an attribute value.
///
/// This function replaces several important markup characters with their
/// entity equivalents:
///
/// * `<` → `&lt;`
/// * `>` → `&gt;`
/// * `"` → `&quot;`
/// * `'` → `&apos;`
/// * `&` → `&amp;`
///
/// The following characters are escaped so that attributes are printed on
/// a single line:
/// * `\n` → `&#xA;`
/// * `\r` → `&#xD;`
///
/// The resulting string is safe to use inside XML attribute values or in PCDATA sections.
///
/// Does not perform allocations if the given string does not contain escapable characters;
/// in that case the input is returned as a borrowed `Cow`.
#[inline]
#[must_use]
pub fn escape_str_attribute(s: &str) -> Cow<'_, str> {
escape_str::<AttributeEscapes>(s)
}
/// Performs escaping of common XML characters inside PCDATA.
///
/// This function replaces several important markup characters with their
/// entity equivalents:
///
/// * `<` → `&lt;`
/// * `&` → `&amp;`
///
/// The resulting string is safe to use inside PCDATA sections but NOT inside attribute values.
///
/// Does not perform allocations if the given string does not contain escapable characters;
/// in that case the input is returned as a borrowed `Cow`.
#[inline]
#[must_use]
pub fn escape_str_pcdata(s: &str) -> Cow<'_, str> {
escape_str::<PcDataEscapes>(s)
}
#[cfg(test)]
mod tests {
    use super::{escape_str_attribute, escape_str_pcdata};

    /// Attribute escaping replaces all seven special characters and leaves
    /// other text alone.
    #[test]
    fn test_escape_str_attribute() {
        let cases = [
            ("<>'\"&\n\r", "&lt;&gt;&apos;&quot;&amp;&#xA;&#xD;"),
            ("no_escapes", "no_escapes"),
        ];
        for (input, expected) in cases {
            assert_eq!(escape_str_attribute(input), expected);
        }
    }

    /// PCDATA escaping replaces only `<` and `&`.
    #[test]
    fn test_escape_str_pcdata() {
        let cases = [("<&", "&lt;&amp;"), ("no_escapes", "no_escapes")];
        for (input, expected) in cases {
            assert_eq!(escape_str_pcdata(input), expected);
        }
    }

    /// Multi-byte code points pass through both escapers untouched.
    #[test]
    fn test_escape_multibyte_code_points() {
        let input = "☃<";
        let expected = "☃&lt;";
        assert_eq!(escape_str_attribute(input), expected);
        assert_eq!(escape_str_pcdata(input), expected);
    }
}

30
third-party/vendor/xml-rs/src/lib.rs vendored Normal file
View file

@ -0,0 +1,30 @@
//#![warn(missing_doc)]
#![forbid(non_camel_case_types)]
#![forbid(unsafe_code)]
#![allow(clippy::redundant_closure_for_method_calls)]
#![allow(clippy::module_name_repetitions)]
//! This crate currently provides an almost XML 1.0/1.1-compliant pull parser.
//!
//! Please note that functions of this parser may panic.
//! If a panic could cause a Denial Of Service in your codebase, *you're* responsible for wrapping access to this library in `catch_unwind`.
//!
#![cfg_attr(doctest, doc = include_str!("../README.md"))]
pub use crate::reader::EventReader;
pub use crate::reader::ParserConfig;
pub use crate::util::Encoding;
pub use crate::writer::EmitterConfig;
pub use crate::writer::EventWriter;
pub mod attribute;
pub mod common;
pub mod escape;
#[doc(hidden)] // FIXME: not supposed to be public
pub mod macros;
pub mod name;
pub mod namespace;
pub mod reader;
mod util;
pub mod writer;

60
third-party/vendor/xml-rs/src/macros.rs vendored Normal file
View file

@ -0,0 +1,60 @@
#![macro_use]
//! Contains several macros used in this crate.
// Generates one builder-style setter method named after the field it sets.
// The keyword after the colon (`into`, `val`, `delegate` or `c2`) selects the
// flavour of setter; each arm re-emits the attributes captured in `$comments`
// and appends a pointer to the config type's field docs.
macro_rules! gen_setter {
// `into`: accepts anything convertible into the field type and stores the
// converted value in `self.$field`.
// NOTE(review): unlike the other arms, this one carries no `#[must_use]` — confirm
// whether that is intentional.
($(#[$comments:meta])* $field:ident : into $t:ty) => {
$(#[$comments])*
///
/// <small>See [`ParserConfig`][crate::ParserConfig] fields docs for details</small>
#[inline]
pub fn $field<T: Into<$t>>(mut self, value: T) -> Self {
self.$field = value.into();
self
}
};
// `val`: takes the field type directly and stores it in `self.$field`.
($(#[$comments:meta])* $field:ident : val $t:ty) => {
$(#[$comments])*
///
/// <small>See [`ParserConfig`][crate::ParserConfig] fields docs for details</small>
#[inline]
#[must_use] pub fn $field(mut self, value: $t) -> Self {
self.$field = value;
self
}
};
// `delegate`: stores the value in the same-named field of the inner `self.c`.
($(#[$comments:meta])* $field:ident : delegate $t:ty) => {
$(#[$comments])*
///
/// <small>See [`ParserConfig`][crate::ParserConfig] fields docs for details</small>
#[inline]
#[must_use] pub fn $field(mut self, value: $t) -> Self {
self.c.$field = value;
self
}
};
// `c2`: wraps `self` in a `ParserConfig2` with all other fields defaulted, then
// calls the same-named setter on the wrapper, so the result is a `ParserConfig2`.
($(#[$comments:meta])* $field:ident : c2 $t:ty) => {
$(#[$comments])*
///
/// <small>See [`ParserConfig2`][crate::reader::ParserConfig2] fields docs for details</small>
#[inline]
#[must_use]
pub fn $field(self, value: $t) -> ParserConfig2 {
ParserConfig2 {
c: self,
..Default::default()
}
.$field(value)
}
};
}
// Emits a single `impl $target { ... }` block containing one `gen_setter!`
// setter per comma-separated `field : kind type` entry. At least one entry is
// required, and attributes written before an entry are forwarded to its setter.
macro_rules! gen_setters {
($target:ident, $($(#[$comments:meta])* $field:ident : $k:tt $tpe:ty),+) => (
impl $target {$(
gen_setter! { $(#[$comments])* $field : $k $tpe }
)+
})
}

312
third-party/vendor/xml-rs/src/name.rs vendored Normal file
View file

@ -0,0 +1,312 @@
//! Contains XML qualified names manipulation types and functions.
//!
use std::fmt;
use std::str::FromStr;
use crate::namespace::NS_NO_PREFIX;
/// Represents a qualified XML name.
///
/// A qualified name always consists at least of a local name. It can optionally contain
/// a prefix; when reading an XML document, if it contains a prefix, it must also contain a
/// namespace URI, but this is not enforced statically; see below. The name can contain a
/// namespace without a prefix; in that case a default, empty prefix is assumed.
///
/// When writing XML documents, it is possible to omit the namespace URI, leaving only
/// the prefix. In this case the writer will check that the specified prefix is bound to some
/// URI in the current namespace context. If both prefix and namespace URI are specified,
/// it is checked that the current namespace context contains this exact correspondence
/// between prefix and namespace URI.
///
/// # Prefixes and URIs
///
/// A qualified name with a prefix must always contain a proper namespace URI --- names with
/// a prefix but without a namespace associated with that prefix are meaningless. However,
/// it is impossible to obtain a proper namespace URI from a prefix without a context, and such
/// a context is only available when parsing a document (or it can be constructed manually
/// when writing a document). Tying a name to a context statically seems impractical. This
/// may change in the future, though.
///
/// # Conversions
///
/// `Name` implements some `From` instances for conversion from strings and tuples. For example:
///
/// ```rust
/// # use xml::name::Name;
/// let n1: Name = "p:some-name".into();
/// let n2: Name = ("p", "some-name").into();
///
/// assert_eq!(n1, n2);
/// assert_eq!(n1.local_name, "some-name");
/// assert_eq!(n1.prefix, Some("p"));
/// assert!(n1.namespace.is_none());
/// ```
///
/// This is added to support easy specification of XML elements when writing XML documents.
#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
pub struct Name<'a> {
/// A local name, e.g. `string` in `xsi:string`.
pub local_name: &'a str,
/// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`.
pub namespace: Option<&'a str>,
/// A name prefix, e.g. `xsi` in `xsi:string`.
pub prefix: Option<&'a str>,
}
impl<'a> From<&'a str> for Name<'a> {
    /// Splits `s` at its first `:` into a prefix and a local name.
    ///
    /// Without a colon the whole string becomes the local name. The
    /// namespace is never set by this conversion.
    fn from(s: &'a str) -> Name<'a> {
        match s.split_once(':') {
            Some((prefix, local_name)) => Name::prefixed(local_name, prefix),
            None => Name::local(s),
        }
    }
}
impl<'a> From<(&'a str, &'a str)> for Name<'a> {
    /// Builds a prefixed name from a `(prefix, local_name)` pair.
    fn from(pair: (&'a str, &'a str)) -> Name<'a> {
        let (prefix, local_name) = pair;
        Name::prefixed(local_name, prefix)
    }
}
impl<'a> fmt::Display for Name<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if let Some(namespace) = self.namespace {
write!(f, "{{{namespace}}}")?;
}
if let Some(prefix) = self.prefix {
write!(f, "{prefix}:")?;
}
f.write_str(self.local_name)
}
}
impl<'a> Name<'a> {
    /// Copies every part of this name into a new [`OwnedName`].
    #[must_use]
    pub fn to_owned(&self) -> OwnedName {
        OwnedName {
            local_name: String::from(self.local_name),
            namespace: self.namespace.map(String::from),
            prefix: self.prefix.map(String::from),
        }
    }

    /// Builds a `Name` that has only a local name: no prefix, no namespace.
    #[inline]
    #[must_use]
    pub fn local(local_name: &str) -> Name<'_> {
        Name {
            local_name,
            namespace: None,
            prefix: None,
        }
    }

    /// Builds a `Name` with the given local name and prefix, without a namespace.
    #[inline]
    #[must_use]
    pub fn prefixed(local_name: &'a str, prefix: &'a str) -> Name<'a> {
        Name {
            local_name,
            prefix: Some(prefix),
            namespace: None,
        }
    }

    /// Builds a qualified `Name`: a local name with a namespace URI and an
    /// optional prefix.
    #[inline]
    #[must_use]
    pub fn qualified(local_name: &'a str, namespace: &'a str, prefix: Option<&'a str>) -> Name<'a> {
        Name {
            local_name,
            prefix,
            namespace: Some(namespace),
        }
    }

    /// Returns the XML representation of this name, `prefix:local_name` or
    /// just `local_name`.
    ///
    /// Unlike `to_string()`, the result does not include the namespace URI.
    #[must_use]
    pub fn to_repr(&self) -> String {
        ReprDisplay(self).to_string()
    }

    /// Returns a value whose `Display` output is the prefix and local name
    /// of this name.
    ///
    /// Use it with the `std::fmt` machinery to avoid building an
    /// intermediate `String`.
    #[inline]
    #[must_use]
    pub fn repr_display(&self) -> ReprDisplay<'_, '_> {
        ReprDisplay(self)
    }

    /// Returns the prefix of this name, or the `namespace::NS_NO_PREFIX`
    /// constant when there is none.
    #[inline]
    #[must_use]
    pub fn prefix_repr(&self) -> &str {
        match self.prefix {
            Some(prefix) => prefix,
            None => NS_NO_PREFIX,
        }
    }
}
/// A wrapper around `Name` whose `Display` implementation prints the wrapped name as it is
/// displayed in an XML document.
///
/// The output is `prefix:local_name`, or just `local_name` when the name has no prefix;
/// the namespace URI is never printed.
pub struct ReprDisplay<'a, 'b>(&'a Name<'b>);
impl<'a, 'b: 'a> fmt::Display for ReprDisplay<'a, 'b> {
    /// Writes `prefix:local_name`, or only the local name when the wrapped
    /// name has no prefix.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = self.0;
        if let Some(prefix) = name.prefix {
            write!(f, "{}:{}", prefix, name.local_name)
        } else {
            // Formatting through `str` keeps width and alignment flags working.
            fmt::Display::fmt(name.local_name, f)
        }
    }
}
/// An owned variant of `Name`.
///
/// Everything about `Name` applies to this structure as well.
/// Use `borrow()` to view it as a `Name`.
#[derive(Clone, PartialEq, Eq, Hash, Debug)]
pub struct OwnedName {
/// A local name, e.g. `string` in `xsi:string`.
pub local_name: String,
/// A namespace URI, e.g. `http://www.w3.org/2000/xmlns/`.
pub namespace: Option<String>,
/// A name prefix, e.g. `xsi` in `xsi:string`.
pub prefix: Option<String>,
}
impl fmt::Display for OwnedName {
    /// Formats the name exactly as its borrowed `Name` form is formatted.
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let borrowed = self.borrow();
        fmt::Display::fmt(&borrowed, f)
    }
}
impl OwnedName {
    /// Borrows this owned name as a `Name` that points into `self`.
    #[must_use]
    #[inline]
    pub fn borrow(&self) -> Name<'_> {
        Name {
            local_name: self.local_name.as_str(),
            namespace: self.namespace_ref(),
            prefix: self.prefix_ref(),
        }
    }

    /// Builds an `OwnedName` that has only a local name: no prefix, no namespace.
    #[inline]
    pub fn local<S>(local_name: S) -> OwnedName where S: Into<String> {
        let local_name = local_name.into();
        OwnedName {
            local_name,
            namespace: None,
            prefix: None,
        }
    }

    /// Builds a qualified `OwnedName`: a local name with a namespace URI
    /// and an optional prefix.
    #[inline]
    pub fn qualified<S1, S2, S3>(local_name: S1, namespace: S2, prefix: Option<S3>) -> OwnedName
        where S1: Into<String>, S2: Into<String>, S3: Into<String>
    {
        let local_name = local_name.into();
        let namespace = Some(namespace.into());
        let prefix = prefix.map(Into::into);
        OwnedName { local_name, namespace, prefix }
    }

    /// Returns the optional prefix by reference, equivalent to
    /// `self.borrow().prefix` but avoids extra work.
    #[inline]
    #[must_use]
    pub fn prefix_ref(&self) -> Option<&str> {
        self.prefix.as_ref().map(String::as_str)
    }

    /// Returns the optional namespace by reference, equivalent to
    /// `self.borrow().namespace` but avoids extra work.
    #[inline]
    #[must_use]
    pub fn namespace_ref(&self) -> Option<&str> {
        self.namespace.as_ref().map(String::as_str)
    }
}
impl<'a> From<Name<'a>> for OwnedName {
#[inline]
fn from(n: Name<'a>) -> OwnedName {
n.to_owned()
}
}
impl FromStr for OwnedName {
    type Err = ();

    /// Parses the given string slice into a qualified name.
    ///
    /// Accepted forms are `local_name` and `prefix:local_name`; both parts
    /// must be non-empty and at most one colon is allowed.
    ///
    /// When this function finishes successfully, it always returns a
    /// qualified name without a namespace (`name.namespace == None`). It
    /// should be filled in later using a proper `NamespaceStack`.
    ///
    /// It is assumed that all characters in the argument string are correct
    /// as defined by the XML specification. No checks are done other than
    /// those for emptiness and for the number of colons.
    fn from_str(s: &str) -> Result<OwnedName, ()> {
        let mut parts = s.split(':');
        let (local_name, prefix) = match (parts.next(), parts.next(), parts.next()) {
            (Some(local_name), None, None) => (local_name, None),
            (Some(prefix), Some(local_name), None) => (local_name, Some(prefix)),
            // More than one colon.
            _ => return Err(()),
        };
        if local_name.is_empty() || prefix.map_or(false, |p| p.is_empty()) {
            return Err(());
        }
        Ok(OwnedName {
            local_name: local_name.into(),
            namespace: None,
            prefix: prefix.map(Into::into),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::OwnedName;

    /// Well-formed names parse into their parts; empty parts and extra
    /// colons are rejected.
    #[test]
    fn test_owned_name_from_str() {
        let prefixed = OwnedName {
            local_name: "name".into(),
            namespace: None,
            prefix: Some("prefix".into()),
        };
        assert_eq!("prefix:name".parse::<OwnedName>(), Ok(prefixed));

        let local = OwnedName {
            local_name: "name".into(),
            namespace: None,
            prefix: None,
        };
        assert_eq!("name".parse::<OwnedName>(), Ok(local));

        for malformed in ["", ":", ":a", "a:", "a:b:c"] {
            assert_eq!(malformed.parse::<OwnedName>(), Err(()));
        }
    }
}

View file

@ -0,0 +1,508 @@
//! Contains namespace manipulation types and functions.
use std::borrow::Cow;
use std::collections::btree_map::Iter as Entries;
use std::collections::btree_map::{BTreeMap, Entry};
use std::collections::HashSet;
use std::iter::{Map, Rev};
use std::slice::Iter;
/// Designates prefix for namespace definitions.
///
/// See [Namespaces in XML][namespace] spec for more information.
///
/// [namespace]: http://www.w3.org/TR/xml-names/#ns-decl
pub const NS_XMLNS_PREFIX: &str = "xmlns";
/// Designates the standard URI for `xmlns` prefix.
///
/// See [A Namespace Name for xmlns Attributes][namespace] for more information.
///
/// [namespace]: http://www.w3.org/2000/xmlns/
pub const NS_XMLNS_URI: &str = "http://www.w3.org/2000/xmlns/";
/// Designates prefix for a namespace containing several special predefined attributes.
///
/// See [2.10 White Space handling][1], [2.1 Language Identification][2],
/// [XML Base specification][3] and [xml:id specification][4] for more information.
///
/// [1]: http://www.w3.org/TR/REC-xml/#sec-white-space
/// [2]: http://www.w3.org/TR/REC-xml/#sec-lang-tag
/// [3]: http://www.w3.org/TR/xmlbase/
/// [4]: http://www.w3.org/TR/xml-id/
pub const NS_XML_PREFIX: &str = "xml";
/// Designates the standard URI for `xml` prefix.
///
/// See `NS_XML_PREFIX` documentation for more information.
pub const NS_XML_URI: &str = "http://www.w3.org/XML/1998/namespace";
/// Designates the absence of prefix in a qualified name.
///
/// This constant should be used to define or query the default namespace, which is used
/// for element or attribute names without a prefix. For example, if a namespace mapping
/// at a particular point in the document contains a correspondence like
///
/// ```none
/// NS_NO_PREFIX --> urn:some:namespace
/// ```
///
/// then for all names declared without an explicit prefix, `urn:some:namespace` is assumed
/// as the namespace URI.
///
/// By default the empty prefix corresponds to the absence of a namespace, but this can change
/// either when writing an XML document (manually) or when reading an XML document (based on
/// namespace declarations).
pub const NS_NO_PREFIX: &str = "";
/// Designates an empty namespace URI, which is equivalent to absence of namespace.
///
/// This constant should not usually be used directly; it is used to designate that
/// empty prefix corresponds to absent namespace in `NamespaceStack` instances created with
/// `NamespaceStack::default()`. Therefore, it can be used to restore `NS_NO_PREFIX` mapping
/// in a namespace back to its default value.
pub const NS_EMPTY_URI: &str = "";
/// Namespace is a map from prefixes to namespace URIs.
///
/// No prefix (i.e. default namespace) is designated by `NS_NO_PREFIX` constant.
///
/// The underlying map is public: keys are prefixes and values are the URIs
/// they are bound to.
#[derive(PartialEq, Eq, Clone, Debug)]
pub struct Namespace(pub BTreeMap<String, String>);
impl Namespace {
/// Returns an empty namespace.
#[inline]
#[must_use]
pub fn empty() -> Namespace {
Namespace(BTreeMap::new())
}
/// Checks whether this namespace is empty.
#[inline]
#[must_use]
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
/// Checks whether this namespace is essentially empty, that is, it does not contain
/// anything but default mappings.
#[must_use]
pub fn is_essentially_empty(&self) -> bool {
// a shortcut for a namespace which is definitely not empty
if self.0.len() > 3 { return false; }
self.0.iter().all(|(k, v)| match (&**k, &**v) {
(NS_NO_PREFIX, NS_EMPTY_URI) => true,
(NS_XMLNS_PREFIX, NS_XMLNS_URI) => true,
(NS_XML_PREFIX, NS_XML_URI) => true,
_ => false
})
}
/// Checks whether this namespace mapping contains the given prefix.
///
/// # Parameters
/// * `prefix` --- namespace prefix.
///
/// # Return value
/// `true` if this namespace contains the given prefix, `false` otherwise.
#[inline]
pub fn contains<P: ?Sized + AsRef<str>>(&self, prefix: &P) -> bool {
self.0.contains_key(prefix.as_ref())
}
/// Puts a mapping into this namespace.
///
/// This method does not override any already existing mappings.
///
/// Returns a boolean flag indicating whether the map already contained
/// the given prefix.
///
/// # Parameters
/// * `prefix` --- namespace prefix;
/// * `uri` --- namespace URI.
///
/// # Return value
/// `true` if `prefix` has been inserted successfully; `false` if the `prefix`
/// was already present in the namespace.
pub fn put<P, U>(&mut self, prefix: P, uri: U) -> bool
where P: Into<String>, U: Into<String>
{
match self.0.entry(prefix.into()) {
Entry::Occupied(_) => false,
Entry::Vacant(ve) => {
ve.insert(uri.into());
true
}
}
}
/// Puts a mapping into this namespace forcefully.
///
/// This method, unlike `put()`, does replace an already existing mapping.
///
/// Returns previous URI which was assigned to the given prefix, if it is present.
///
/// # Parameters
/// * `prefix` --- namespace prefix;
/// * `uri` --- namespace URI.
///
/// # Return value
/// `Some(uri)` with `uri` being a previous URI assigned to the `prefix`, or
/// `None` if such prefix was not present in the namespace before.
pub fn force_put<P, U>(&mut self, prefix: P, uri: U) -> Option<String>
where P: Into<String>, U: Into<String>
{
self.0.insert(prefix.into(), uri.into())
}
/// Queries the namespace for the given prefix.
///
/// # Parameters
/// * `prefix` --- namespace prefix.
///
/// # Return value
/// Namespace URI corresponding to the given prefix, if it is present.
pub fn get<'a, P: ?Sized + AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> {
self.0.get(prefix.as_ref()).map(|s| &**s)
}
/// Borrowed namespace for the writer
#[must_use]
pub fn borrow(&self) -> Cow<'_, Self> {
Cow::Borrowed(self)
}
}
/// An alias for iterator type for namespace mappings contained in a namespace.
///
/// It is the map's entry iterator adapted by a plain function pointer that turns each
/// `(&String, &String)` entry into a `UriMapping`.
// NOTE(review): `UriMapping` is declared outside the visible part of this file — confirm
// its definition before relying on the element type.
pub type NamespaceMappings<'a> = Map<
Entries<'a, String, String>,
for<'b> fn((&'b String, &'b String)) -> UriMapping<'b>
>;
// Iterating over `&Namespace` yields one `UriMapping` per entry of the
// underlying map.
impl<'a> IntoIterator for &'a Namespace {
type Item = UriMapping<'a>;
type IntoIter = NamespaceMappings<'a>;
fn into_iter(self) -> Self::IntoIter {
// A named function rather than a closure: `NamespaceMappings` names a
// function-pointer type as the adapter.
fn mapper<'a>((prefix, uri): (&'a String, &'a String)) -> UriMapping<'a> {
(prefix, uri)
}
self.0.iter().map(mapper)
}
}
/// Namespace stack is a sequence of namespaces.
///
/// Namespace stack is used to represent cumulative namespace consisting of
/// combined namespaces from nested elements.
///
/// The underlying vector is public; its last element is the top of the stack.
#[derive(Clone, Eq, PartialEq, Debug)]
pub struct NamespaceStack(pub Vec<Namespace>);
impl NamespaceStack {
/// Returns an empty namespace stack.
#[inline]
#[must_use]
pub fn empty() -> NamespaceStack {
NamespaceStack(Vec::with_capacity(2))
}
/// Returns a namespace stack with default items in it.
///
/// Default items are the following:
///
/// * `xml` → `http://www.w3.org/XML/1998/namespace`;
/// * `xmlns` → `http://www.w3.org/2000/xmlns/`.
#[inline]
#[must_use]
pub fn default() -> NamespaceStack {
let mut nst = NamespaceStack::empty();
nst.push_empty();
// xml namespace
nst.put(NS_XML_PREFIX, NS_XML_URI);
// xmlns namespace
nst.put(NS_XMLNS_PREFIX, NS_XMLNS_URI);
// empty namespace
nst.put(NS_NO_PREFIX, NS_EMPTY_URI);
nst
}
/// Adds an empty namespace to the top of this stack.
#[inline]
pub fn push_empty(&mut self) -> &mut NamespaceStack {
self.0.push(Namespace::empty());
self
}
/// Removes the topmost namespace in this stack.
///
/// Panics if the stack is empty.
#[inline]
pub fn pop(&mut self) -> Namespace {
self.0.pop().unwrap()
}
/// Removes the topmost namespace in this stack.
///
/// Returns `Some(namespace)` if this stack is not empty and `None` otherwise.
#[inline]
pub fn try_pop(&mut self) -> Option<Namespace> {
self.0.pop()
}
/// Borrows the topmost namespace mutably, leaving the stack intact.
///
/// Panics if the stack is empty.
#[inline]
pub fn peek_mut(&mut self) -> &mut Namespace {
self.0.last_mut().unwrap()
}
/// Borrows the topmost namespace immutably, leaving the stack intact.
///
/// Panics if the stack is empty.
#[inline]
#[must_use]
pub fn peek(&self) -> &Namespace {
self.0.last().unwrap()
}
/// Puts a mapping into the topmost namespace unless the exact same mapping
/// is already present somewhere in this stack.
///
/// Both the prefix and the URI are compared: every namespace in the stack is
/// searched for `prefix` bound to exactly `uri`. Only when no such binding
/// exists is the pair handed to `put()`, which targets the topmost namespace.
///
/// # Parameters
/// * `prefix` --- namespace prefix;
/// * `uri` --- namespace URI.
///
/// # Return value
/// `false` if some namespace in the stack already maps `prefix` to `uri`;
/// `true` otherwise. Note that `true` only means `put()` was attempted: as
/// documented on `put()`, an existing binding of `prefix` in the topmost
/// namespace is not overridden.
pub fn put_checked<P, U>(&mut self, prefix: P, uri: U) -> bool
    where P: Into<String> + AsRef<str>,
          U: Into<String> + AsRef<str>
{
    let already_bound = self.0
        .iter()
        .any(|ns| ns.get(&prefix) == Some(uri.as_ref()));
    if already_bound {
        return false;
    }
    self.put(prefix, uri);
    true
}
/// Puts a mapping into the topmost namespace in this stack.
///
/// Only the topmost namespace is consulted: a mapping that already exists
/// there is left as it is, while mappings in lower namespaces are ignored,
/// so a prefix bound further down the stack can be bound again on top.
///
/// # Parameters
/// * `prefix` --- namespace prefix;
/// * `uri` --- namespace URI.
///
/// # Return value
/// Whatever the topmost namespace's own `put()` reports (`true` if `prefix`
/// has been inserted, `false` if it was already present there), or `false`
/// when the stack holds no namespace at all.
#[inline]
pub fn put<P, U>(&mut self, prefix: P, uri: U) -> bool
    where P: Into<String>, U: Into<String>
{
    match self.0.last_mut() {
        Some(top) => top.put(prefix, uri),
        None => false,
    }
}
/// Looks the given prefix up in the whole stack.
///
/// Namespaces are queried from the top of the stack down to the bottom and
/// the first URI found wins, so bindings in upper namespaces shadow those
/// below. `None` is returned when no namespace contains the prefix.
///
/// # Parameters
/// * `prefix` --- namespace prefix.
#[inline]
pub fn get<'a, P: ?Sized + AsRef<str>>(&'a self, prefix: &P) -> Option<&'a str> {
    let wanted = prefix.as_ref();
    self.0.iter().rev().find_map(|ns| ns.get(wanted))
}
/// Flattens this stack of namespaces into one namespace.
///
/// Namespaces are merged from the bottom of the stack to the top, so when a
/// prefix is bound more than once the binding closest to the top wins.
#[must_use]
pub fn squash(&self) -> Namespace {
    let mut merged = BTreeMap::new();
    let all_mappings = self.0.iter().flat_map(|ns| ns.0.iter());
    for (prefix, uri) in all_mappings {
        // Later (upper) namespaces overwrite earlier (lower) ones.
        merged.insert(prefix.clone(), uri.clone());
    }
    Namespace(merged)
}
/// Returns an object which implements `Extend` using `put_checked()` instead of `put()`.
///
/// See `CheckedTarget` for more information.
#[inline]
pub fn checked_target(&mut self) -> CheckedTarget<'_> {
CheckedTarget(self)
}
/// Iterates over all mappings in this namespace stack.
///
/// This is the same iterator that `IntoIterator` for `&NamespaceStack` produces.
#[inline]
#[must_use]
pub fn iter(&self) -> NamespaceStackMappings<'_> {
    IntoIterator::into_iter(self)
}
}
/// An iterator over mappings from prefixes to URIs in a namespace stack.
///
/// Namespaces are visited from the top of the stack to the bottom. A prefix that
/// is bound in several namespaces is yielded only once, with the binding from the
/// topmost namespace that contains it.
///
/// # Example
/// ```
/// # use xml::namespace::NamespaceStack;
/// let mut nst = NamespaceStack::empty();
/// nst.push_empty();
/// nst.put("a", "urn:A");
/// nst.put("b", "urn:B");
/// nst.push_empty();
/// nst.put("c", "urn:C");
///
/// assert_eq!(vec![("c", "urn:C"), ("a", "urn:A"), ("b", "urn:B")], nst.iter().collect::<Vec<_>>());
/// ```
pub struct NamespaceStackMappings<'a> {
// Namespaces not yet visited, walked from the top of the stack downwards.
namespaces: Rev<Iter<'a, Namespace>>,
// Mappings of the namespace currently being drained; `None` before the first
// namespace is entered and after the last one is exhausted.
current_namespace: Option<NamespaceMappings<'a>>,
// Prefixes already yielded, used to skip shadowed bindings in lower namespaces.
used_keys: HashSet<&'a str>,
}
impl<'a> NamespaceStackMappings<'a> {
    /// Advances to the next namespace down the stack.
    ///
    /// Replaces `current_namespace` with the mappings of the next namespace,
    /// or with `None` when no namespace is left. Returns `true` if a
    /// namespace was entered.
    fn go_to_next_namespace(&mut self) -> bool {
        let upcoming = self.namespaces.next();
        let entered = upcoming.is_some();
        self.current_namespace = upcoming.map(|ns| ns.into_iter());
        entered
    }
}
impl<'a> Iterator for NamespaceStackMappings<'a> {
    type Item = UriMapping<'a>;

    /// Yields the next mapping whose prefix has not been reported yet.
    ///
    /// Mappings come from the current namespace first; once it is exhausted
    /// the next namespace down the stack is entered. Prefixes that were
    /// already yielded from an upper namespace are skipped. Returns `None`
    /// once every namespace has been drained.
    ///
    /// This is a loop rather than a self-recursive call so that a long run of
    /// shadowed prefixes or empty namespaces cannot grow the call stack.
    fn next(&mut self) -> Option<UriMapping<'a>> {
        loop {
            // If there is no current namespace and no next namespace, we're finished
            if self.current_namespace.is_none() && !self.go_to_next_namespace() {
                return None;
            }
            match self.current_namespace.as_mut()?.next() {
                // There is an element in the current namespace
                Some((k, v)) => {
                    // `insert` returns `true` only for a prefix not seen before;
                    // an already used prefix is shadowed, so move on to the next one.
                    if self.used_keys.insert(k) {
                        return Some((k, v));
                    }
                }
                // Current namespace is exhausted
                None => {
                    if !self.go_to_next_namespace() {
                        // No next namespace, exiting
                        return None;
                    }
                    // Otherwise continue from the namespace just entered
                }
            }
        }
    }
}
impl<'a> IntoIterator for &'a NamespaceStack {
type Item = UriMapping<'a>;
type IntoIter = NamespaceStackMappings<'a>;
fn into_iter(self) -> Self::IntoIter {
NamespaceStackMappings {
namespaces: self.0.iter().rev(),
current_namespace: None,
used_keys: HashSet::new(),
}
}
}
/// A type alias for a pair of `(prefix, uri)` values returned by namespace iterators.
///
/// Both string slices are borrowed for the lifetime `'a`. The same pair type is
/// accepted by the `Extend` implementations for the namespace types in this module.
pub type UriMapping<'a> = (&'a str, &'a str);
impl<'a> Extend<UriMapping<'a>> for Namespace {
    /// Feeds every `(prefix, uri)` pair of `iterable` to `put()`, in order.
    ///
    /// The result of each individual `put()` is discarded.
    fn extend<T>(&mut self, iterable: T) where T: IntoIterator<Item=UriMapping<'a>> {
        iterable.into_iter().for_each(|(prefix, uri)| {
            self.put(prefix, uri);
        });
    }
}
impl<'a> Extend<UriMapping<'a>> for NamespaceStack {
    /// Puts every `(prefix, uri)` pair of `iterable` into the topmost
    /// namespace via `put()`, in order.
    ///
    /// The result of each individual `put()` is discarded.
    fn extend<T>(&mut self, iterable: T) where T: IntoIterator<Item=UriMapping<'a>> {
        let mut mappings = iterable.into_iter();
        while let Some((prefix, uri)) = mappings.next() {
            self.put(prefix, uri);
        }
    }
}
/// A wrapper around `NamespaceStack` which implements `Extend` using `put_checked()`.
///
/// Obtained from `NamespaceStack::checked_target()`. It borrows the stack mutably,
/// so mappings added through it end up in the wrapped stack itself.
///
/// # Example
///
/// ```
/// # use xml::namespace::NamespaceStack;
///
/// let mut nst = NamespaceStack::empty();
/// nst.push_empty();
/// nst.put("a", "urn:A");
/// nst.put("b", "urn:B");
/// nst.push_empty();
/// nst.put("c", "urn:C");
///
/// nst.checked_target().extend(vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:Y"), ("d", "urn:D")]);
/// assert_eq!(
/// vec![("a", "urn:Z"), ("c", "urn:C"), ("d", "urn:D"), ("b", "urn:B")],
/// nst.iter().collect::<Vec<_>>()
/// );
/// ```
///
/// Compare:
///
/// ```
/// # use xml::namespace::NamespaceStack;
/// # let mut nst = NamespaceStack::empty();
/// # nst.push_empty();
/// # nst.put("a", "urn:A");
/// # nst.put("b", "urn:B");
/// # nst.push_empty();
/// # nst.put("c", "urn:C");
///
/// nst.extend(vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:Y"), ("d", "urn:D")]);
/// assert_eq!(
/// vec![("a", "urn:Z"), ("b", "urn:B"), ("c", "urn:C"), ("d", "urn:D")],
/// nst.iter().collect::<Vec<_>>()
/// );
/// ```
pub struct CheckedTarget<'a>(&'a mut NamespaceStack);
impl<'a, 'b> Extend<UriMapping<'b>> for CheckedTarget<'a> {
fn extend<T>(&mut self, iterable: T) where T: IntoIterator<Item=UriMapping<'b>> {
for (prefix, uri) in iterable {
self.0.put_checked(prefix, uri);
}
}
}

157
third-party/vendor/xml-rs/src/reader.rs vendored Normal file
View file

@ -0,0 +1,157 @@
//! Contains high-level interface for a pull-based XML parser.
//!
//! The most important type in this module is `EventReader`, which provides an iterator
//! view for events in XML document.
use std::io::Read;
use std::iter::FusedIterator;
use std::result;
use crate::common::{Position, TextPosition};
pub use self::config::ParserConfig;
pub use self::config::ParserConfig2;
pub use self::error::{Error, ErrorKind};
pub use self::events::XmlEvent;
use self::parser::PullParser;
mod config;
mod events;
mod lexer;
mod parser;
mod indexset;
mod error;
/// A result type yielded by `EventReader`.
///
/// The error type defaults to this module's `Error`.
pub type Result<T, E = Error> = result::Result<T, E>;
/// A wrapper around an `std::io::Read` instance which provides pull-based XML parsing.
pub struct EventReader<R: Read> {
source: R,
parser: PullParser,
}
impl<R: Read> EventReader<R> {
/// Creates a new reader, consuming the given stream.
#[inline]
pub fn new(source: R) -> EventReader<R> {
EventReader::new_with_config(source, ParserConfig2::new())
}
/// Creates a new reader with the provded configuration, consuming the given stream.
#[inline]
pub fn new_with_config(source: R, config: impl Into<ParserConfig2>) -> EventReader<R> {
EventReader { source, parser: PullParser::new(config) }
}
/// Pulls and returns next XML event from the stream.
///
/// If returned event is `XmlEvent::Error` or `XmlEvent::EndDocument`, then
/// further calls to this method will return this event again.
#[inline]
pub fn next(&mut self) -> Result<XmlEvent> {
self.parser.next(&mut self.source)
}
/// Skips all XML events until the next end tag at the current level.
///
/// Convenience function that is useful for the case where you have
/// encountered a start tag that is of no interest and want to
/// skip the entire XML subtree until the corresponding end tag.
#[inline]
pub fn skip(&mut self) -> Result<()> {
let mut depth = 1;
while depth > 0 {
match self.next()? {
XmlEvent::StartElement { .. } => depth += 1,
XmlEvent::EndElement { .. } => depth -= 1,
XmlEvent::EndDocument => unreachable!(),
_ => {}
}
}
Ok(())
}
pub fn source(&self) -> &R { &self.source }
pub fn source_mut(&mut self) -> &mut R { &mut self.source }
/// Unwraps this `EventReader`, returning the underlying reader.
///
/// Note that this operation is destructive; unwrapping the reader and wrapping it
/// again with `EventReader::new()` will create a fresh reader which will attempt
/// to parse an XML document from the beginning.
pub fn into_inner(self) -> R {
self.source
}
}
impl<B: Read> Position for EventReader<B> {
/// Returns the position of the last event produced by the reader.
#[inline]
fn position(&self) -> TextPosition {
self.parser.position()
}
}
impl<R: Read> IntoIterator for EventReader<R> {
    type Item = Result<XmlEvent>;
    type IntoIter = Events<R>;

    /// Turns this reader into an iterator over its events.
    fn into_iter(self) -> Events<R> {
        let reader = self;
        // The iterator starts out not finished.
        Events { finished: false, reader }
    }
}
/// An iterator over XML events created from some type implementing `Read`.
///
/// When the next item is an error or `XmlEvent::EndDocument`, it will be returned
/// by the iterator once, and then the iterator will stop producing events
/// (unless the parser reports that it is ignoring the end of stream, in which
/// case the reader keeps being polled).
pub struct Events<R: Read> {
// The reader the events are pulled from.
reader: EventReader<R>,
// Set once an error or `EndDocument` has been yielded.
finished: bool,
}
impl<R: Read> Events<R> {
/// Unwraps the iterator, returning the internal `EventReader`.
#[inline]
pub fn into_inner(self) -> EventReader<R> {
self.reader
}
pub fn source(&self) -> &R { &self.reader.source }
pub fn source_mut(&mut self) -> &mut R { &mut self.reader.source }
}
// Marker impl: `Events::next` returns `None` only when `finished` is set and the
// parser is not ignoring the end of stream; `finished` is never reset.
// NOTE(review): this assumes the parser's ignore-end-of-stream flag does not
// change after construction — confirm against `PullParser`.
impl<R: Read> FusedIterator for Events<R> {
}
impl<R: Read> Iterator for Events<R> {
    type Item = Result<XmlEvent>;

    /// Pulls the next event from the wrapped reader.
    ///
    /// An error or `XmlEvent::EndDocument` marks the iterator as finished.
    /// A finished iterator yields `None`, unless the parser reports that it
    /// is ignoring the end of stream, in which case the reader is polled again.
    #[inline]
    fn next(&mut self) -> Option<Result<XmlEvent>> {
        let exhausted = self.finished && !self.reader.parser.is_ignoring_end_of_stream();
        if exhausted {
            return None;
        }

        let event = self.reader.next();
        if matches!(event, Ok(XmlEvent::EndDocument) | Err(_)) {
            self.finished = true;
        }
        Some(event)
    }
}
impl<'r> EventReader<&'r [u8]> {
/// A convenience method to create an `XmlReader` from a string slice.
#[inline]
#[must_use]
pub fn from_str(source: &'r str) -> EventReader<&'r [u8]> {
EventReader::new(source.as_bytes())
}
}

View file

@ -0,0 +1,369 @@
//! Contains parser configuration structure.
use std::collections::HashMap;
use std::io::Read;
use crate::reader::EventReader;
use crate::util::Encoding;
/// Limits to defend from billion laughs attack
///
/// Default for `ParserConfig2::max_entity_expansion_length`.
const DEFAULT_MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
/// Default for `ParserConfig2::max_entity_expansion_depth`; also part of the
/// defence against the billion laughs attack.
const DEFAULT_MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
/// Parser configuration structure. **There are more config methods than public fields — see methods below**.
///
/// This structure contains various configuration options which affect
/// behavior of the parser.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct ParserConfig {
/// Whether or not should whitespace in textual events be removed. Default is false.
///
/// When true, all standalone whitespace will be removed (this means no
/// `Whitespace` events will be emitted), and leading and trailing whitespace
/// from `Character` events will be deleted. If after trimming `Characters`
/// event will be empty, it will also be omitted from output stream. This is
/// possible, however, only if `whitespace_to_characters` or
/// `cdata_to_characters` options are set.
///
/// This option does not affect CDATA events, unless `cdata_to_characters`
/// option is also set. In that case CDATA content will also be trimmed.
pub trim_whitespace: bool,
/// Whether or not should whitespace be converted to characters.
/// Default is false.
///
/// If true, instead of `Whitespace` events `Characters` events with the
/// same content will be emitted. If `trim_whitespace` is also true, these
/// events will be trimmed to nothing and, consequently, not emitted.
pub whitespace_to_characters: bool,
/// Whether or not should CDATA be converted to characters.
/// Default is false.
///
/// If true, instead of `CData` events `Characters` events with the same
/// content will be emitted. If `trim_whitespace` is also true, these events
/// will be trimmed. If corresponding CDATA contained nothing but whitespace,
/// this event will be omitted from the stream.
pub cdata_to_characters: bool,
/// Whether or not should comments be omitted. Default is true.
///
/// If true, `Comment` events will not be emitted at all.
pub ignore_comments: bool,
/// Whether or not should sequential `Characters` events be merged.
/// Default is true.
///
/// If true, multiple sequential `Characters` events will be merged into
/// a single event, that is, their data will be concatenated.
///
/// Multiple sequential `Characters` events are only possible if either
/// `cdata_to_characters` or `ignore_comments` are set. Otherwise character
/// events will always be separated by other events.
pub coalesce_characters: bool,
/// A map of extra entities recognized by the parser. Default is an empty map.
///
/// By default the XML parser recognizes the entities defined in the XML spec. Sometimes,
/// however, it is convenient to make the parser recognize additional entities which
/// are also not available through the DTD definitions (especially given that at the moment
/// DTD parsing is not supported).
pub extra_entities: HashMap<String, String>,
/// Whether or not the parser should ignore the end of stream. Default is false.
///
/// By default the parser will either error out when it encounters a premature end of
/// stream or complete normally if the end of stream was expected. If you want to continue
/// reading from a stream whose input is supplied progressively, you can set this option to true.
/// In this case the parser will allow you to invoke the next() method even if a supposed end
/// of stream has happened.
///
/// Note that support for this functionality is incomplete; for example, the parser will fail if
/// the premature end of stream happens inside PCDATA. Therefore, use this option at your own risk.
pub ignore_end_of_stream: bool,
/// Whether or not non-unicode entity references get replaced with the replacement character.
/// Default is false.
///
/// When true, any decimal or hexadecimal character reference that cannot be converted from a
/// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html)
/// will be converted into the unicode REPLACEMENT CHARACTER (U+FFFD).
pub replace_unknown_entity_references: bool,
/// Whether or not whitespace at the root level of the document is ignored. Default is true.
///
/// By default any whitespace that is not enclosed within at least one level of elements will be
/// ignored. Setting this value to false will cause root level whitespace events to be emitted.
///
/// **There are more configuration options — see methods below**
pub ignore_root_level_whitespace: bool,
}
impl ParserConfig {
    /// Creates a config in which every option has its default value.
    ///
    /// Individual options can then be adjusted in a builder-like fashion:
    ///
    /// ```rust
    /// use xml::reader::ParserConfig;
    ///
    /// let config = ParserConfig::new()
    ///     .trim_whitespace(true)
    ///     .ignore_comments(true)
    ///     .coalesce_characters(false);
    /// ```
    #[must_use]
    #[inline]
    pub fn new() -> ParserConfig {
        Self {
            // Options that are on by default.
            ignore_comments: true,
            coalesce_characters: true,
            ignore_root_level_whitespace: true,
            // Options that are off by default.
            trim_whitespace: false,
            whitespace_to_characters: false,
            cdata_to_characters: false,
            ignore_end_of_stream: false,
            replace_unknown_entity_references: false,
            // No extra entities are known initially.
            extra_entities: HashMap::new(),
        }
    }

    /// Builds an XML reader over `source` that uses this configuration.
    ///
    /// Handy for configuring and creating a reader in a single expression:
    ///
    /// ```rust
    /// use xml::reader::ParserConfig;
    ///
    /// let mut source: &[u8] = b"...";
    ///
    /// let reader = ParserConfig::new()
    ///     .trim_whitespace(true)
    ///     .ignore_comments(true)
    ///     .coalesce_characters(false)
    ///     .create_reader(&mut source);
    /// ```
    ///
    /// Calling this method is exactly equivalent to calling
    /// `EventReader::new_with_config()` with this configuration object.
    #[inline]
    pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
        let config = self;
        EventReader::new_with_config(source, config)
    }

    /// Registers an extra entity and returns the updated config object.
    ///
    /// The mapping is stored in `extra_entities`; registering the same entity
    /// name again replaces the earlier value. An example:
    ///
    /// ```rust
    /// use xml::reader::ParserConfig;
    ///
    /// let mut source: &[u8] = b"...";
    ///
    /// let reader = ParserConfig::new()
    ///     .add_entity("nbsp", " ")
    ///     .add_entity("copy", "©")
    ///     .add_entity("reg", "®")
    ///     .create_reader(&mut source);
    /// ```
    pub fn add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> ParserConfig {
        let name: String = entity.into();
        let replacement: String = value.into();
        self.extra_entities.insert(name, replacement);
        self
    }
}
impl Default for ParserConfig {
#[inline]
fn default() -> ParserConfig {
ParserConfig::new()
}
}
// Generates the chainable setter methods of `ParserConfig` for its boolean options
// (the ones used as `ParserConfig::new().trim_whitespace(true)` in the examples above).
// NOTE(review): the macro is defined elsewhere; `val` presumably means the setter
// stores the value directly into the field of the same name — confirm against the macro.
gen_setters! { ParserConfig,
trim_whitespace: val bool,
whitespace_to_characters: val bool,
cdata_to_characters: val bool,
ignore_comments: val bool,
coalesce_characters: val bool,
ignore_end_of_stream: val bool,
replace_unknown_entity_references: val bool,
ignore_root_level_whitespace: val bool
}
/// Backwards-compatible extension of `ParserConfig`, which will eventually be merged into the original `ParserConfig` struct
///
/// Default values are set by the `Default` implementation of this struct.
#[derive(Clone, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub struct ParserConfig2 {
// The original `ParserConfig` options this struct extends.
pub(crate) c: ParserConfig,
/// Use this encoding as the default. Necessary for UTF-16 files without BOM.
pub override_encoding: Option<Encoding>,
/// Allow `<?xml encoding="…">` to contain unsupported encoding names,
/// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing.
pub ignore_invalid_encoding_declarations: bool,
/// Documents with multiple root elements are ill-formed
pub allow_multiple_root_elements: bool,
/// Abort if custom entities create a string longer than this
pub max_entity_expansion_length: usize,
/// Entities can expand into other entities this many times (be careful about exponential cost!)
pub max_entity_expansion_depth: u8,
/// Maximum length of tag name or attribute name
pub max_name_length: usize,
/// Max number of attributes per element
pub max_attributes: usize,
/// Max number of bytes in each attribute
pub max_attribute_length: usize,
/// Maximum length of strings representing characters, comments, and processing instructions
pub max_data_length: usize,
}
impl Default for ParserConfig2 {
fn default() -> Self {
ParserConfig2 {
c: Default::default(),
override_encoding: None,
ignore_invalid_encoding_declarations: false,
allow_multiple_root_elements: true,
max_entity_expansion_length: DEFAULT_MAX_ENTITY_EXPANSION_LENGTH,
max_entity_expansion_depth: DEFAULT_MAX_ENTITY_EXPANSION_DEPTH,
max_attributes: 1<<16,
max_attribute_length: 1<<30,
max_data_length: 1<<30,
max_name_length: 1<<18,
}
}
}
impl ParserConfig2 {
    /// Returns a new config with default values (same as `ParserConfig2::default()`).
    #[inline]
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Read character encoding from `Content-Type` header.
    /// Set this when parsing XML documents fetched over HTTP.
    ///
    /// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback.
    ///
    /// The header value is split into its `;`-separated parameters and the
    /// first parameter named `charset` (compared ASCII case-insensitively) is
    /// used. Whitespace and double quotes around its value are removed.
    /// Parameters following `charset` (e.g. `charset=utf-8; boundary=x`) do
    /// not affect the result.
    ///
    /// If there is no `charset` parameter, or its value is not a recognised
    /// encoding name, the configuration is returned unchanged.
    #[must_use] pub fn content_type(mut self, mime_type: &str) -> Self {
        let charset = mime_type
            .split(';')
            .skip(1) // the first segment is the media type itself, not a parameter
            .filter_map(|param| param.split_once('='))
            .find(|(name, _)| name.trim().eq_ignore_ascii_case("charset"))
            .map(|(_, value)| value.trim().trim_matches('"'));
        if let Some(name) = charset {
            // An unknown encoding name is deliberately ignored (best effort):
            // the previously configured encoding stays in effect.
            if let Ok(enc) = name.parse() {
                self.override_encoding = Some(enc);
            }
        }
        self
    }

    /// Creates an XML reader with this configuration.
    ///
    /// This is a convenience method for configuring and creating a reader at the same time:
    ///
    /// ```rust
    /// use xml::reader::ParserConfig;
    ///
    /// let mut source: &[u8] = b"...";
    ///
    /// let reader = ParserConfig::new()
    ///     .trim_whitespace(true)
    ///     .ignore_comments(true)
    ///     .coalesce_characters(false)
    ///     .create_reader(&mut source);
    /// ```
    ///
    /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
    /// this configuration object.
    #[inline]
    pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
        EventReader::new_with_config(source, self)
    }
}
impl From<ParserConfig> for ParserConfig2 {
#[inline]
fn from(c: ParserConfig) -> Self {
Self {
c,
..Default::default()
}
}
}
// Generates the chainable setter methods of `ParserConfig2` for its own fields
// (used as `ParserConfig2::new().max_entity_expansion_depth(3)` in the test below).
// NOTE(review): the macro is defined elsewhere; `val` presumably means the setter
// stores the value directly into the field of the same name — confirm against the macro.
gen_setters! { ParserConfig2,
/// Set if you got one in the HTTP header
override_encoding: val Option<Encoding>,
/// Allows invalid documents. There should be only a single root element in XML.
allow_multiple_root_elements: val bool,
/// Abort if custom entities create a string longer than this
max_entity_expansion_length: val usize,
/// Entities can expand into other entities this many times (be careful about exponential cost!)
max_entity_expansion_depth: val u8,
/// Max number of attributes per element
max_attributes: val usize,
/// Maximum length of tag name or attribute name
max_name_length: val usize,
/// Max number of bytes in each attribute
max_attribute_length: val usize,
/// Maximum length of strings representing characters, comments, and processing instructions
max_data_length: val usize,
/// Allow `<?xml encoding="bogus"?>`
ignore_invalid_encoding_declarations: val bool
}
// Generates methods on the plain `ParserConfig` that are named after `ParserConfig2`
// options (plus `content_type`).
// NOTE(review): the macro is defined elsewhere; `c2` presumably converts the
// `ParserConfig` into a `ParserConfig2` and forwards to its setter — confirm against the macro.
gen_setters! { ParserConfig,
/// Set if you got one in the HTTP header (see `content_type`)
override_encoding: c2 Option<Encoding>,
/// Allow `<?xml encoding="bogus"?>`
ignore_invalid_encoding_declarations: c2 bool,
/// Allows invalid documents. There should be only a single root element in XML.
allow_multiple_root_elements: c2 bool,
/// Abort if custom entities create a string longer than this
max_entity_expansion_length: c2 usize,
/// Entities can expand into other entities this many times (be careful about exponential cost!)
max_entity_expansion_depth: c2 u8,
/// Max number of attributes per element
max_attributes: c2 usize,
/// Maximum length of tag name or attribute name
max_name_length: c2 usize,
/// Max number of bytes in each attribute
max_attribute_length: c2 usize,
/// Maximum length of strings representing characters, comments, and processing instructions
max_data_length: c2 usize,
/// Set encoding from the MIME type. Important for HTTP compatibility.
content_type: c2 &str
}
// Generates methods on `ParserConfig2` named after the options of the plain `ParserConfig`.
// NOTE(review): the macro is defined elsewhere; `delegate` presumably writes the
// value into the wrapped `ParserConfig` (field `c`) — confirm against the macro.
gen_setters! { ParserConfig2,
trim_whitespace: delegate bool,
whitespace_to_characters: delegate bool,
cdata_to_characters: delegate bool,
ignore_comments: delegate bool,
coalesce_characters: delegate bool,
ignore_end_of_stream: delegate bool,
replace_unknown_entity_references: delegate bool,
/// Whether or not whitespace at the root level of the document is ignored. Default is true.
ignore_root_level_whitespace: delegate bool
}
/// `content_type` picks the charset out of a MIME type regardless of letter
/// case, surrounding whitespace and quotes, and independently of the order
/// in which other setters are applied.
#[test]
fn mime_parse() {
    let ascii = ParserConfig2::new()
        .content_type("text/xml;charset=Us-AScii")
        .max_entity_expansion_length(1000);
    assert_eq!(ascii.override_encoding, Some(Encoding::Ascii));

    let utf16 = ParserConfig2::new()
        .max_entity_expansion_depth(3)
        .content_type("text/xml;charset = \"UTF-16\"");
    assert_eq!(utf16.override_encoding, Some(Encoding::Utf16));
}

View file

@ -0,0 +1,251 @@
use crate::Encoding;
use crate::reader::lexer::Token;
use std::borrow::Cow;
use std::error;
use std::error::Error as _;
use std::fmt;
use std::io;
use std::str;
use crate::common::{Position, TextPosition};
use crate::util;
/// The cause of a reader `Error`.
///
/// `Clone`, `PartialEq` and `Eq` are implemented by hand further down in this
/// file rather than derived.
#[derive(Debug)]
pub enum ErrorKind {
/// A syntax error, described by a message.
Syntax(Cow<'static, str>),
/// An I/O error.
Io(io::Error),
/// A UTF-8 decoding error.
Utf8(str::Utf8Error),
/// The input ended unexpectedly.
UnexpectedEof,
}
/// Crate-internal list of syntax errors.
///
/// Each variant is turned into its user-facing message by `to_cow()`; variants
/// with a payload carry the offending token or text that is interpolated into
/// that message. The `err_size` test in this file keeps the type at 24 bytes or less.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub(crate) enum SyntaxError {
CannotRedefineXmlnsPrefix,
CannotRedefineXmlPrefix,
/// Recursive custom entity expanded to too many chars, it could be DoS
EntityTooBig,
EmptyEntity,
NoRootElement,
ProcessingInstructionWithoutName,
UnbalancedRootElement,
UnexpectedEof,
UnexpectedOpeningTag,
/// Missing `]]>`
UnclosedCdata,
UnexpectedQualifiedName(Token),
UnexpectedTokenOutsideRoot(Token),
UnexpectedToken(Token),
UnexpectedTokenInEntity(Token),
UnexpectedTokenInClosingTag(Token),
UnexpectedTokenInOpeningTag(Token),
InvalidQualifiedName(Box<str>),
UnboundAttribute(Box<str>),
UnboundElementPrefix(Box<str>),
UnexpectedClosingTag(Box<str>),
UnexpectedName(Box<str>),
/// Found <?xml-like PI not at the beginning of a document,
/// which is an error, see section 2.6 of XML 1.1 spec
UnexpectedProcessingInstruction(Box<str>, Token),
CannotUndefinePrefix(Box<str>),
InvalidCharacterEntity(u32),
InvalidDefaultNamespace(Box<str>),
InvalidNamePrefix(Box<str>),
InvalidNumericEntity(Box<str>),
InvalidStandaloneDeclaration(Box<str>),
InvalidXmlProcessingInstruction(Box<str>),
RedefinedAttribute(Box<str>),
UndefinedEntity(Box<str>),
UnexpectedEntity(Box<str>),
UnexpectedNameInsideXml(Box<str>),
UnsupportedEncoding(Box<str>),
/// In DTD
UnknownMarkupDeclaration(Box<str>),
UnexpectedXmlVersion(Box<str>),
ConflictingEncoding(Encoding, Encoding),
UnexpectedTokenBefore(&'static str, char),
/// Document has more stuff than `ParserConfig` allows
ExceededConfiguredLimit,
}
impl fmt::Display for SyntaxError {
    /// Formats the error as the message produced by `to_cow()`.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let message = self.to_cow();
        fmt::Display::fmt(&message, f)
    }
}
impl SyntaxError {
/// Renders this error as a human-readable message.
///
/// Variants without a payload yield a static string; variants with a payload
/// build the message with `format!`. The function is marked `#[cold]` and
/// `#[inline(never)]`, i.e. it is treated as an error path.
#[inline(never)]
#[cold]
pub(crate) fn to_cow(&self) -> Cow<'static, str> {
match *self {
// Fixed messages.
Self::CannotRedefineXmlnsPrefix => "Cannot redefine XMLNS prefix".into(),
Self::CannotRedefineXmlPrefix => "Default XMLNS prefix cannot be rebound to another value".into(),
Self::EmptyEntity => "Encountered empty entity".into(),
Self::EntityTooBig => "Entity too big".into(),
Self::NoRootElement => "Unexpected end of stream: no root element found".into(),
Self::ProcessingInstructionWithoutName => "Encountered processing instruction without a name".into(),
Self::UnbalancedRootElement => "Unexpected end of stream: still inside the root element".into(),
Self::UnclosedCdata => "Unclosed <![CDATA[".into(),
Self::UnexpectedEof => "Unexpected end of stream".into(),
Self::UnexpectedOpeningTag => "'<' is not allowed in attributes".into(),
// Messages that interpolate the variant's payload.
Self::CannotUndefinePrefix(ref ln) => format!("Cannot undefine prefix '{ln}'").into(),
Self::ConflictingEncoding(a, b) => format!("Declared encoding {a}, but uses {b}").into(),
Self::InvalidCharacterEntity(num) => format!("Invalid character U+{num:04X}").into(),
Self::InvalidDefaultNamespace(ref name) => format!( "Namespace '{name}' cannot be default").into(),
Self::InvalidNamePrefix(ref prefix) => format!("'{prefix}' cannot be an element name prefix").into(),
Self::InvalidNumericEntity(ref v) => format!("Invalid numeric entity: {v}").into(),
Self::InvalidQualifiedName(ref e) => format!("Qualified name is invalid: {e}").into(),
Self::InvalidStandaloneDeclaration(ref value) => format!("Invalid standalone declaration value: {value}").into(),
Self::InvalidXmlProcessingInstruction(ref name) => format!("Invalid processing instruction: <?{name} - \"<?xml\"-like PI is only valid at the beginning of the document").into(),
Self::RedefinedAttribute(ref name) => format!("Attribute '{name}' is redefined").into(),
Self::UnboundAttribute(ref name) => format!("Attribute {name} prefix is unbound").into(),
Self::UnboundElementPrefix(ref name) => format!("Element {name} prefix is unbound").into(),
Self::UndefinedEntity(ref v) => format!("Undefined entity: {v}").into(),
Self::UnexpectedClosingTag(ref expected_got) => format!("Unexpected closing tag: {expected_got}").into(),
Self::UnexpectedEntity(ref name) => format!("Unexpected entity: {name}").into(),
Self::UnexpectedName(ref name) => format!("Unexpected name: {name}").into(),
Self::UnexpectedNameInsideXml(ref name) => format!("Unexpected name inside XML declaration: {name}").into(),
Self::UnexpectedProcessingInstruction(ref buf, token) => format!("Unexpected token inside processing instruction: <?{buf}{token}").into(),
Self::UnexpectedQualifiedName(e) => format!("Unexpected token inside qualified name: {e}").into(),
Self::UnexpectedToken(token) => format!("Unexpected token: {token}").into(),
Self::UnexpectedTokenBefore(before, c) => format!("Unexpected token '{before}' before '{c}'").into(),
Self::UnexpectedTokenInClosingTag(token) => format!("Unexpected token inside closing tag: {token}").into(),
Self::UnexpectedTokenInEntity(token) => format!("Unexpected token inside entity: {token}").into(),
Self::UnexpectedTokenInOpeningTag(token) => format!("Unexpected token inside opening tag: {token}").into(),
Self::UnexpectedTokenOutsideRoot(token) => format!("Unexpected characters outside the root element: {token}").into(),
Self::UnexpectedXmlVersion(ref version) => format!("Invalid XML version: {version}").into(),
Self::UnknownMarkupDeclaration(ref v) => format!("Unknown markup declaration: {v}").into(),
Self::UnsupportedEncoding(ref v) => format!("Unsupported encoding: {v}").into(),
Self::ExceededConfiguredLimit => "This document is larger/more complex than allowed by the parser's configuration".into(),
}
}
}
/// An XML parsing error.
///
/// Consists of a 2D position in a document and an `ErrorKind` describing the error.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct Error {
// Position associated with the error; exposed through the `Position` trait.
pub(crate) pos: TextPosition,
// What went wrong; exposed through `kind()`.
pub(crate) kind: ErrorKind,
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8};
write!(f, "{} ", self.pos)?;
match &self.kind {
Io(io_error) => io_error.fmt(f),
Utf8(reason) => reason.fmt(f),
Syntax(msg) => f.write_str(msg),
UnexpectedEof => f.write_str("Unexpected EOF"),
}
}
}
impl Position for Error {
#[inline]
fn position(&self) -> TextPosition { self.pos }
}
impl Error {
/// Returns a reference to a message which is contained inside this error.
#[cold]
#[doc(hidden)]
#[allow(deprecated)]
#[must_use] pub fn msg(&self) -> &str {
use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8};
match &self.kind {
Io(io_error) => io_error.description(),
Utf8(reason) => reason.description(),
Syntax(msg) => msg.as_ref(),
UnexpectedEof => "Unexpected EOF",
}
}
#[must_use]
#[inline]
pub fn kind(&self) -> &ErrorKind {
&self.kind
}
}
impl error::Error for Error {
    /// Legacy description; same text as `Error::msg()`.
    #[allow(deprecated)]
    #[cold]
    fn description(&self) -> &str {
        Self::msg(self)
    }
}
impl<'a, P, M> From<(&'a P, M)> for Error where P: Position, M: Into<Cow<'static, str>> {
#[cold]
fn from(orig: (&'a P, M)) -> Self {
Error {
pos: orig.0.position(),
kind: ErrorKind::Syntax(orig.1.into()),
}
}
}
impl From<util::CharReadError> for Error {
#[cold]
fn from(e: util::CharReadError) -> Self {
use crate::util::CharReadError::{Io, UnexpectedEof, Utf8};
Error {
pos: TextPosition::new(),
kind: match e {
UnexpectedEof => ErrorKind::UnexpectedEof,
Utf8(reason) => ErrorKind::Utf8(reason),
Io(io_error) => ErrorKind::Io(io_error),
},
}
}
}
impl From<io::Error> for Error {
#[cold]
fn from(e: io::Error) -> Self {
Error {
pos: TextPosition::new(),
kind: ErrorKind::Io(e),
}
}
}
impl Clone for ErrorKind {
#[cold]
fn clone(&self) -> Self {
use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8};
match self {
UnexpectedEof => UnexpectedEof,
Utf8(reason) => Utf8(*reason),
Io(io_error) => Io(io::Error::new(io_error.kind(), io_error.to_string())),
Syntax(msg) => Syntax(msg.clone()),
}
}
}
impl PartialEq for ErrorKind {
#[allow(deprecated)]
fn eq(&self, other: &ErrorKind) -> bool {
use self::ErrorKind::{Io, Syntax, UnexpectedEof, Utf8};
match (self, other) {
(UnexpectedEof, UnexpectedEof) => true,
(Utf8(left), Utf8(right)) => left == right,
(Io(left), Io(right)) =>
left.kind() == right.kind() &&
left.description() == right.description(),
(Syntax(left), Syntax(right)) =>
left == right,
(_, _) => false,
}
}
}
// Marker impl that goes with the hand-written `PartialEq` above; the `Eq`
// derive on `Error` depends on it.
impl Eq for ErrorKind {}
// Guards against `SyntaxError` growing beyond 24 bytes.
#[test]
fn err_size() {
assert!(std::mem::size_of::<SyntaxError>() <= 24);
}

View file

@ -0,0 +1,219 @@
//! Contains `XmlEvent` datatype, instances of which are emitted by the parser.
use std::fmt;
use crate::attribute::OwnedAttribute;
use crate::common::XmlVersion;
use crate::name::OwnedName;
use crate::namespace::Namespace;
/// An element of an XML input stream.
///
/// Items of this enum are emitted by `reader::EventReader`. They correspond to different
/// elements of an XML document.
#[derive(PartialEq, Clone)]
pub enum XmlEvent {
/// Corresponds to XML document declaration.
///
/// This event is always emitted before any other event. It is emitted
/// even if the actual declaration is not present in the document.
StartDocument {
/// XML version.
///
/// If XML declaration is not present, defaults to `Version10`.
version: XmlVersion,
/// XML document encoding.
///
/// If XML declaration is not present or does not contain `encoding` attribute,
/// defaults to `"UTF-8"`. This field is currently used for no other purpose than
/// informational.
encoding: String,
/// XML standalone declaration.
///
/// If XML declaration is not present or does not contain `standalone` attribute,
/// defaults to `None`. This field is currently used for no other purpose than
/// informational.
standalone: Option<bool>,
},
/// Denotes the end of the document stream.
///
/// This event is always emitted after any other event (except `Error`). After it
/// is emitted for the first time, it will always be emitted on next event pull attempts.
EndDocument,
/// Denotes an XML processing instruction.
///
/// This event contains a processing instruction target (`name`) and opaque `data`. It
/// is up to the application to process them.
ProcessingInstruction {
/// Processing instruction target.
name: String,
/// Processing instruction content.
data: Option<String>,
},
/// Denotes the beginning of an XML element.
///
/// This event is emitted after parsing opening tags or after parsing bodiless tags. In the
/// latter case `EndElement` event immediately follows.
StartElement {
/// Qualified name of the element.
name: OwnedName,
/// A list of attributes associated with the element.
///
/// Currently attributes are not checked for duplicates (TODO)
attributes: Vec<OwnedAttribute>,
/// Contents of the namespace mapping at this point of the document.
namespace: Namespace,
},
/// Denotes the end of an XML element.
///
/// This event is emitted after parsing closing tags or after parsing bodiless tags. In the
/// latter case it is emitted immediately after corresponding `StartElement` event.
EndElement {
/// Qualified name of the element.
name: OwnedName,
},
/// Denotes CDATA content.
///
/// This event contains unparsed data. No unescaping will be performed.
///
/// It is possible to configure a parser to emit `Characters` event instead of `CData`. See
/// `pull::ParserConfiguration` structure for more information.
CData(String),
/// Denotes a comment.
///
/// It is possible to configure a parser to ignore comments, so this event will never be emitted.
/// See `pull::ParserConfiguration` structure for more information.
Comment(String),
/// Denotes character data outside of tags.
///
/// Contents of this event will always be unescaped, so no entities like `&lt;` or `&amp;` or `&#123;`
/// will appear in it.
///
/// It is possible to configure a parser to trim leading and trailing whitespace for this event.
/// See `pull::ParserConfiguration` structure for more information.
Characters(String),
/// Denotes a chunk of whitespace outside of tags.
///
/// It is possible to configure a parser to emit `Characters` event instead of `Whitespace`.
/// See `pull::ParserConfiguration` structure for more information. When combined with whitespace
/// trimming, it will eliminate standalone whitespace from the event stream completely.
Whitespace(String),
}
impl fmt::Debug for XmlEvent {
#[cold]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
XmlEvent::StartDocument { ref version, ref encoding, standalone } =>
write!(f, "StartDocument({}, {}, {:?})", version, *encoding, standalone),
XmlEvent::EndDocument =>
write!(f, "EndDocument"),
XmlEvent::ProcessingInstruction { ref name, ref data } =>
write!(f, "ProcessingInstruction({}{})", *name, match *data {
Some(ref data) => format!(", {data}"),
None => String::new()
}),
XmlEvent::StartElement { ref name, ref attributes, namespace: Namespace(ref namespace) } =>
write!(f, "StartElement({}, {:?}{})", name, namespace, if attributes.is_empty() {
String::new()
} else {
let attributes: Vec<String> = attributes.iter().map(
|a| format!("{} -> {}", a.name, a.value)
).collect();
format!(", [{}]", attributes.join(", "))
}),
XmlEvent::EndElement { ref name } =>
write!(f, "EndElement({name})"),
XmlEvent::Comment(ref data) =>
write!(f, "Comment({data})"),
XmlEvent::CData(ref data) =>
write!(f, "CData({data})"),
XmlEvent::Characters(ref data) =>
write!(f, "Characters({data})"),
XmlEvent::Whitespace(ref data) =>
write!(f, "Whitespace({data})")
}
}
}
impl XmlEvent {
    /// Converts this reader event into the corresponding borrowed writer event.
    ///
    /// Every variant except `EndDocument` has a writer counterpart; `EndDocument`
    /// yields `None`. `Whitespace` is mapped to the writer's `Characters` event.
    ///
    /// This is useful for streaming processing where the output is also an XML
    /// document: some events can be handled specially while the rest are passed
    /// through to the writer unchanged:
    ///
    /// ```rust
    /// use std::str;
    ///
    /// use xml::{EventReader, EventWriter};
    /// use xml::reader::XmlEvent as ReaderEvent;
    /// use xml::writer::XmlEvent as WriterEvent;
    ///
    /// let mut input: &[u8] = b"<hello>world</hello>";
    /// let mut output: Vec<u8> = Vec::new();
    ///
    /// {
    ///     let mut reader = EventReader::new(&mut input);
    ///     let mut writer = EventWriter::new(&mut output);
    ///
    ///     for e in reader {
    ///         match e.unwrap() {
    ///             ReaderEvent::Characters(s) =>
    ///                 writer.write(WriterEvent::characters(&s.to_uppercase())).unwrap(),
    ///             e => if let Some(e) = e.as_writer_event() {
    ///                 writer.write(e).unwrap()
    ///             }
    ///         }
    ///     }
    /// }
    ///
    /// assert_eq!(
    ///     str::from_utf8(&output).unwrap(),
    ///     r#"<?xml version="1.0" encoding="UTF-8"?><hello>WORLD</hello>"#
    /// );
    /// ```
    ///
    /// Note that this API may change or get additions in future to improve its ergonomics.
    #[must_use]
    pub fn as_writer_event(&self) -> Option<crate::writer::events::XmlEvent<'_>> {
        use crate::writer::events::XmlEvent as WriterEvent;

        let converted = match self {
            // The end of the document has no writer-side event.
            Self::EndDocument => return None,
            Self::StartDocument { version, encoding, standalone } => WriterEvent::StartDocument {
                version: *version,
                encoding: Some(encoding),
                standalone: *standalone,
            },
            Self::ProcessingInstruction { name, data } => WriterEvent::ProcessingInstruction {
                name,
                data: data.as_deref(),
            },
            Self::StartElement { name, attributes, namespace } => WriterEvent::StartElement {
                name: name.borrow(),
                attributes: attributes.iter().map(|attr| attr.borrow()).collect(),
                namespace: namespace.borrow(),
            },
            Self::EndElement { name } => WriterEvent::EndElement { name: Some(name.borrow()) },
            Self::Comment(text) => WriterEvent::Comment(text),
            Self::CData(text) => WriterEvent::CData(text),
            Self::Characters(text) | Self::Whitespace(text) => WriterEvent::Characters(text),
        };
        Some(converted)
    }
}

View file

@ -0,0 +1,116 @@
use crate::attribute::OwnedAttribute;
use crate::name::OwnedName;
use std::collections::hash_map::RandomState;
use std::collections::HashSet;
use std::hash::BuildHasher;
use std::hash::Hash;
use std::hash::Hasher;
/// An ordered set
///
/// Attributes are kept in insertion order in `vec`; `may_contain` is an auxiliary
/// set of name hashes that `contains()` consults before scanning `vec`.
pub(crate) struct AttributesSet {
/// The attributes, in the order they were pushed.
vec: Vec<OwnedAttribute>,
/// Uses a no-op hasher, because these u64s are hashes already
may_contain: HashSet<u64, U64HasherBuilder>,
/// This is real hasher for the `OwnedName`
hasher: RandomState,
}
/// Use linear search and don't allocate `HashSet` if there are few attributes,
/// because allocation costs more than a few comparisons.
///
/// `push()` starts filling the hash set once the vector already holds this many attributes.
const HASH_THRESHOLD: usize = 8;
impl AttributesSet {
    /// Creates an empty set with a freshly seeded name hasher.
    pub fn new() -> Self {
        Self {
            vec: Vec::new(),
            may_contain: HashSet::default(),
            hasher: RandomState::new(),
        }
    }

    /// Hashes an attribute name with this set's own hasher.
    fn hash(&self, val: &OwnedName) -> u64 {
        let mut state = self.hasher.build_hasher();
        val.hash(&mut state);
        state.finish()
    }

    /// Number of attributes stored.
    pub fn len(&self) -> usize {
        self.vec.len()
    }

    /// Returns `true` if an attribute with the given name has been pushed.
    pub fn contains(&self, name: &OwnedName) -> bool {
        // Once the hash set is in use, a missing hash proves absence without scanning.
        let indexed = self.vec.len() >= HASH_THRESHOLD;
        if indexed && !self.may_contain.contains(&self.hash(name)) {
            return false;
        }
        // Linear scan: small sets, or a hash hit (duplicate or collision).
        self.vec.iter().any(|attr| attr.name == *name)
    }

    /// Appends an attribute, maintaining the hash set once the threshold is reached.
    pub fn push(&mut self, attr: OwnedAttribute) {
        let count = self.vec.len();
        if count >= HASH_THRESHOLD {
            if count == HASH_THRESHOLD {
                // First time at the threshold: index everything stored so far.
                self.may_contain.reserve(HASH_THRESHOLD * 2);
                for existing in &self.vec {
                    let digest = self.hash(&existing.name);
                    self.may_contain.insert(digest);
                }
            }
            let digest = self.hash(&attr.name);
            self.may_contain.insert(digest);
        }
        self.vec.push(attr);
    }

    /// Consumes the set and returns the attributes in insertion order.
    pub fn into_vec(self) -> Vec<OwnedAttribute> {
        self.vec
    }
}
// Pushes 50000 distinct attributes, checking before every push that the name is
// not yet present and after every push that an unrelated name is still absent.
#[test]
fn indexset() {
let mut s = AttributesSet::new();
// Same local name as one of the pushed attributes, but with a namespace set.
let not_here = OwnedName {
local_name: "attr1000".into(),
namespace: Some("test".into()),
prefix: None,
};
// this test will take a lot of time if the `contains()` is linear, and the loop is quadratic
for i in 0..50000 {
let name = OwnedName {
local_name: format!("attr{i}"), namespace: None, prefix: None,
};
assert!(!s.contains(&name));
s.push(OwnedAttribute { name, value: String::new() });
assert!(!s.contains(&not_here));
}
// Names from the middle, the start and the end of the range must all be found.
assert!(s.contains(&OwnedName {
local_name: "attr1234".into(), namespace: None, prefix: None,
}));
assert!(s.contains(&OwnedName {
local_name: "attr0".into(), namespace: None, prefix: None,
}));
assert!(s.contains(&OwnedName {
local_name: "attr49999".into(), namespace: None, prefix: None,
}));
}
/// Hasher that does nothing except pass a `u64` through.
///
/// The state starts at zero and every written value is XOR-ed into it, so a
/// single `write_u64(x)` makes `finish()` return `x`.
struct U64Hasher(u64);

impl Hasher for U64Hasher {
    fn finish(&self) -> u64 {
        self.0
    }

    fn write(&mut self, slice: &[u8]) {
        // unused in practice
        self.0 = slice.iter().fold(self.0, |acc, &byte| acc ^ u64::from(byte));
    }

    fn write_u64(&mut self, i: u64) {
        self.0 ^= i;
    }
}

/// Builds [`U64Hasher`]s that start from a zero state.
#[derive(Default)]
struct U64HasherBuilder;

impl BuildHasher for U64HasherBuilder {
    type Hasher = U64Hasher;

    fn build_hasher(&self) -> U64Hasher {
        U64Hasher(0)
    }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,799 @@
//! Contains an implementation of pull-based XML parser.
use crate::common::{is_xml10_char, is_xml11_char, is_xml11_char_not_restricted, is_name_char, is_name_start_char, is_whitespace_char};
use crate::common::{Position, TextPosition, XmlVersion};
use crate::name::OwnedName;
use crate::namespace::NamespaceStack;
use crate::reader::config::ParserConfig2;
use crate::reader::error::SyntaxError;
use crate::reader::events::XmlEvent;
use crate::reader::indexset::AttributesSet;
use crate::reader::lexer::{Lexer, Token};
use super::{Error, ErrorKind};
use std::collections::HashMap;
use std::io::Read;
// Generates one "take" method on `MarkupData` per `field -> method, type, default` entry.
// Each generated method replaces the field with the given default and returns the
// previous value.
macro_rules! gen_takes(
($($field:ident -> $method:ident, $t:ty, $def:expr);+) => (
$(
impl MarkupData {
#[inline]
#[allow(clippy::mem_replace_option_with_none)]
fn $method(&mut self) -> $t {
std::mem::replace(&mut self.$field, $def)
}
}
)+
)
);
// Entries are separated by `;`, with no trailing separator after the last one.
gen_takes!(
name -> take_name, String, String::new();
ref_data -> take_ref_data, String, String::new();
encoding -> take_encoding, Option<String>, None;
element_name -> take_element_name, Option<OwnedName>, None;
attr_name -> take_attr_name, Option<OwnedName>, None;
attributes -> take_attributes, AttributesSet, AttributesSet::new()
);
mod inside_cdata;
mod inside_closing_tag_name;
mod inside_comment;
mod inside_declaration;
mod inside_doctype;
mod inside_opening_tag;
mod inside_processing_instruction;
mod inside_reference;
mod outside_tag;
/// XML version reported in the `StartDocument` event that `set_encountered` synthesizes.
static DEFAULT_VERSION: XmlVersion = XmlVersion::Version10;
/// `standalone` value reported in the `StartDocument` event that `set_encountered` synthesizes.
static DEFAULT_STANDALONE: Option<bool> = None;
/// Names of the currently open elements; pushed on start tags and popped on end tags.
type ElementStack = Vec<OwnedName>;
/// Result of pulling one event from the parser.
pub type Result = super::Result<XmlEvent>;
/// Pull-based XML parser.
pub(crate) struct PullParser {
/// Parser configuration (limits, comment handling, end-of-stream behaviour, ...).
config: ParserConfig2,
/// Tokenizer that turns the input into `Token`s.
lexer: Lexer,
/// Current state; selects the handler in `dispatch_token`.
st: State,
/// State saved before entering `State::InsideReference`.
// presumably restored once the reference has been parsed -- the handler is not in this file
state_after_reference: State,
/// Accumulates text of the item being parsed (names, attribute values, comments, CDATA).
buf: String,
/// From DTD internal subset
entities: HashMap<String, String>,
/// Namespace scopes; consulted to resolve element and attribute prefixes.
nst: NamespaceStack,
/// Scratch data for the markup item currently being parsed.
data: MarkupData,
/// Terminal result; once set, `next()` returns a clone of it on every call.
final_result: Option<Result>,
/// Event queued to be returned by the following `next()` call.
next_event: Option<Result>,
/// Stack of open element names; its length is the current depth.
est: ElementStack,
/// Queue of event positions; `position()` reports the first entry.
pos: Vec<TextPosition>,
/// Furthest kind of content seen so far in the document.
encountered: Encountered,
/// Starts as `true`; cleared when a non-whitespace character is read.
inside_whitespace: bool,
/// Whether the `:` prefix separator of the qualified name being read was already seen.
read_prefix_separator: bool,
/// When set, the next `next()` call pops one namespace scope before parsing.
pop_namespace: bool,
}
// Keeps track when XML declaration can happen
//
// Variant order is significant: the derived ordering is used by `set_encountered`,
// which only ever moves `PullParser::encountered` forward, and by `handle_eof`,
// which treats anything below `Element` as "no root element".
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
enum Encountered {
None = 0,
AnyChars, // whitespace before <?xml is not allowed
Declaration,
Comment,
Doctype,
Element,
}
impl PullParser {
    /// Returns a new parser using the given config.
    #[inline]
    pub fn new(config: impl Into<ParserConfig2>) -> PullParser {
        Self::new_with_config2(config.into())
    }

    /// Builds the parser in its initial state (`State::DocumentStart`, nothing encountered).
    #[inline]
    fn new_with_config2(config: ParserConfig2) -> PullParser {
        let mut lexer = Lexer::new(&config);
        if let Some(enc) = config.override_encoding {
            lexer.set_encoding(enc);
        }

        // `push_pos` relies on this preallocated capacity; the queue always starts
        // with one entry.
        let mut pos = Vec::with_capacity(16);
        pos.push(TextPosition::new());

        let data = MarkupData {
            name: String::new(),
            ref_data: String::new(),
            version: None,
            encoding: None,
            standalone: None,
            element_name: None,
            quote: None,
            attr_name: None,
            attributes: AttributesSet::new(),
        };

        PullParser {
            config,
            lexer,
            st: State::DocumentStart,
            state_after_reference: State::OutsideTag,
            buf: String::new(),
            entities: HashMap::new(),
            nst: NamespaceStack::default(),
            data,
            final_result: None,
            next_event: None,
            est: Vec::new(),
            pos,
            encountered: Encountered::None,
            inside_whitespace: true,
            read_prefix_separator: false,
            pop_namespace: false,
        }
    }

    /// Checks if this parser ignores the end of stream errors.
    pub fn is_ignoring_end_of_stream(&self) -> bool {
        self.config.c.ignore_end_of_stream
    }

    /// Records that `new_encounter` has been seen.
    ///
    /// Returns a synthesized `StartDocument` event when this is the first thing
    /// encountered at all (i.e. no declaration was parsed); otherwise `None`.
    /// Encounters that do not advance the current value are ignored.
    #[inline(never)]
    fn set_encountered(&mut self, new_encounter: Encountered) -> Option<Result> {
        if new_encounter <= self.encountered {
            return None;
        }
        let previous = std::mem::replace(&mut self.encountered, new_encounter);
        if previous != Encountered::None {
            return None;
        }

        // If declaration was not parsed and we have encountered an element,
        // emit this declaration as the next event.
        self.push_pos();
        Some(Ok(XmlEvent::StartDocument {
            version: DEFAULT_VERSION,
            encoding: self.lexer.encoding().to_string(),
            standalone: DEFAULT_STANDALONE,
        }))
    }
}
impl Position for PullParser {
/// Returns the position of the last event produced by the parser
// Indexing cannot fail: `pos` is seeded with one entry on construction and neither
// `next_pos` nor `push_pos` ever removes the last remaining entry.
#[inline]
fn position(&self) -> TextPosition {
self.pos[0]
}
}
/// Top-level parser state.
///
/// `PullParser::dispatch_token` routes every token to the handler matching the
/// current state; variants carrying a payload pass their sub-state to that handler.
#[derive(Copy, Clone, PartialEq)]
pub enum State {
OutsideTag,
InsideOpeningTag(OpeningTagSubstate),
InsideClosingTag(ClosingTagSubstate),
InsideProcessingInstruction(ProcessingInstructionSubstate),
InsideComment,
InsideCData,
InsideDeclaration(DeclarationSubstate),
InsideDoctype(DoctypeSubstate),
InsideReference,
/// Initial state of a freshly created parser.
DocumentStart,
}
/// Sub-states used while the parser is in `State::InsideDoctype`.
#[derive(Copy, Clone, PartialEq)]
pub enum DoctypeSubstate {
Outside,
String,
InsideName,
BeforeEntityName,
EntityName,
BeforeEntityValue,
EntityValue,
NumericReferenceStart,
NumericReference,
/// expansion
PEReferenceInValue,
PEReferenceInDtd,
/// name definition
PEReferenceDefinitionStart,
PEReferenceDefinition,
SkipDeclaration,
Comment,
}
/// Sub-states used while the parser is in `State::InsideOpeningTag`.
#[derive(Copy, Clone, PartialEq)]
pub enum OpeningTagSubstate {
InsideName,
InsideTag,
InsideAttributeName,
AfterAttributeName,
InsideAttributeValue,
AfterAttributeValue,
}
/// Sub-states used while the parser is in `State::InsideClosingTag`.
#[derive(Copy, Clone, PartialEq)]
pub enum ClosingTagSubstate {
/// Reading the qualified name of the closing tag.
CTInsideName,
/// The name is complete; only whitespace or `>` may follow.
CTAfterName,
}
/// Sub-states used while the parser is in `State::InsideProcessingInstruction`.
#[derive(Copy, Clone, PartialEq)]
pub enum ProcessingInstructionSubstate {
PIInsideName,
PIInsideData,
}
/// Sub-states used while the parser is in `State::InsideDeclaration`.
///
/// The variants are grouped by the three declaration attributes: version,
/// encoding and standalone.
#[derive(Copy, Clone, PartialEq)]
pub enum DeclarationSubstate {
BeforeVersion,
InsideVersion,
AfterVersion,
InsideVersionValue,
AfterVersionValue,
BeforeEncoding,
InsideEncoding,
AfterEncoding,
InsideEncodingValue,
AfterEncodingValue,
BeforeStandaloneDecl,
InsideStandaloneDecl,
AfterStandaloneDecl,
InsideStandaloneDeclValue,
AfterStandaloneDeclValue,
}
/// Tells `PullParser::read_qualified_name` what kind of name is being read, which
/// determines the tokens that may terminate it.
#[derive(PartialEq)]
enum QualifiedNameTarget {
AttributeNameTarget,
OpeningTagNameTarget,
ClosingTagNameTarget,
}
/// The kind of quote that opened an attribute value.
#[derive(Copy, Clone, PartialEq, Eq)]
enum QuoteToken {
    SingleQuoteToken,
    DoubleQuoteToken,
}

impl QuoteToken {
    /// Maps a quote token to its `QuoteToken`.
    ///
    /// # Panics
    /// Panics if `t` is neither `Token::SingleQuote` nor `Token::DoubleQuote`.
    fn from_token(t: &Token) -> QuoteToken {
        match t {
            Token::DoubleQuote => Self::DoubleQuoteToken,
            Token::SingleQuote => Self::SingleQuoteToken,
            _ => panic!("Unexpected token: {t}"),
        }
    }

    /// Returns the lexer token corresponding to this quote.
    fn as_token(self) -> Token {
        match self {
            Self::DoubleQuoteToken => Token::DoubleQuote,
            Self::SingleQuoteToken => Token::SingleQuote,
        }
    }
}
/// Scratch storage for the pieces of the markup item currently being parsed.
///
/// The `take_*` accessors generated by `gen_takes!` move values out of these
/// fields, leaving an empty default behind.
struct MarkupData {
name: String, // used for processing instruction name
ref_data: String, // used for reference content
version: Option<XmlVersion>, // used for XML declaration version
encoding: Option<String>, // used for XML declaration encoding
standalone: Option<bool>, // used for XML declaration standalone parameter
element_name: Option<OwnedName>, // used for element name
quote: Option<QuoteToken>, // used to hold opening quote for attribute value
attr_name: Option<OwnedName>, // used to hold attribute name
attributes: AttributesSet, // used to hold all accumulated attributes
}
impl PullParser {
/// Returns next event read from the given buffer.
///
/// This method should be always called with the same buffer. If you call it
/// providing different buffers each time, the result will be undefined.
pub fn next<R: Read>(&mut self, r: &mut R) -> Result {
if let Some(ref ev) = self.final_result {
return ev.clone();
}
if let Some(ev) = self.next_event.take() {
return ev;
}
if self.pop_namespace {
self.pop_namespace = false;
self.nst.pop();
}
loop {
debug_assert!(self.next_event.is_none());
debug_assert!(!self.pop_namespace);
// While lexer gives us Ok(maybe_token) -- we loop.
// Upon having a complete XML-event -- we return from the whole function.
match self.lexer.next_token(r) {
Ok(Some(token)) => {
match self.dispatch_token(token) {
None => {} // continue
Some(Ok(xml_event)) => {
self.next_pos();
return Ok(xml_event)
},
Some(Err(xml_error)) => {
self.next_pos();
return self.set_final_result(Err(xml_error))
},
}
},
Ok(None) => break,
Err(lexer_error) => {
return self.set_final_result(Err(lexer_error))
},
}
}
self.handle_eof()
}
/// Handle end of stream
fn handle_eof(&mut self) -> std::result::Result<XmlEvent, super::Error> {
// Forward pos to the lexer head
self.next_pos();
let ev = if self.depth() == 0 {
if self.encountered == Encountered::Element && self.st == State::OutsideTag { // all is ok
Ok(XmlEvent::EndDocument)
} else if self.encountered < Encountered::Element {
self.error(SyntaxError::NoRootElement)
} else { // self.st != State::OutsideTag
self.error(SyntaxError::UnexpectedEof) // TODO: add expected hint?
}
} else if self.config.c.ignore_end_of_stream {
self.final_result = None;
self.lexer.reset_eof_handled();
return self.error(SyntaxError::UnbalancedRootElement);
} else {
self.error(SyntaxError::UnbalancedRootElement)
};
self.set_final_result(ev)
}
// To be called when a terminal event (end of document or an error) is reached.
// Stores a clone of `result` in `self.final_result` and returns `result` itself,
// so that later `next()` calls keep returning the same outcome.
#[inline]
fn set_final_result(&mut self, result: Result) -> Result {
self.final_result = Some(result.clone());
result
}
#[cold]
fn error(&self, e: SyntaxError) -> Result {
Err(Error {
pos: self.lexer.position(),
kind: ErrorKind::Syntax(e.to_cow()),
})
}
/// Advances the position queue after an event has been produced.
///
/// With several queued positions the oldest is dropped; with exactly one it is
/// overwritten by the lexer's current position.
#[inline]
fn next_pos(&mut self) {
    // unfortunately calls to next_pos will never be perfectly balanced with push_pos,
    // at very least because parse errors and EOF can happen unexpectedly without a prior push.
    match self.pos.len() {
        0 => {}
        1 => self.pos[0] = self.lexer.position(),
        _ => {
            self.pos.remove(0);
        }
    }
}
/// Queues the lexer's current position for an event that is about to be emitted.
///
/// The queue never grows beyond its preallocated capacity: when it is full, the
/// oldest entry is dropped instead (and debug builds assert).
#[inline]
#[track_caller]
fn push_pos(&mut self) {
debug_assert!(self.pos.len() != self.pos.capacity(), "You've found a bug in xml-rs, caused by calls to push_pos() in states that don't end up emitting events.
This case is ignored in release mode, and merely causes document positions to be out of sync.
Please file a bug and include the XML document that triggers this assert.");
// it has capacity preallocated for more than it ever needs, so this reduces code size
if self.pos.len() != self.pos.capacity() {
self.pos.push(self.lexer.position());
} else if self.pos.len() > 1 {
self.pos.remove(0); // this mitigates the excessive push_pos() call
}
}
/// Routes a token to the handler for the current parser state.
///
/// Returns whatever the handler returns: `None` to keep reading tokens, or
/// `Some(result)` when an event or an error is ready.
#[inline(never)]
fn dispatch_token(&mut self, t: Token) -> Option<Result> {
    match self.st {
        State::DocumentStart => self.document_start(t),
        State::OutsideTag => self.outside_tag(t),
        State::InsideOpeningTag(sub) => self.inside_opening_tag(t, sub),
        State::InsideClosingTag(sub) => self.inside_closing_tag_name(t, sub),
        State::InsideProcessingInstruction(sub) => self.inside_processing_instruction(t, sub),
        State::InsideDeclaration(sub) => self.inside_declaration(t, sub),
        State::InsideDoctype(sub) => self.inside_doctype(t, sub),
        State::InsideComment => self.inside_comment(t),
        State::InsideCData => self.inside_cdata(t),
        State::InsideReference => self.inside_reference(t),
    }
}
/// Number of currently open elements.
#[inline]
fn depth(&self) -> usize {
self.est.len()
}
/// Returns `true` if the text buffer is not empty.
#[inline]
fn buf_has_data(&self) -> bool {
!self.buf.is_empty()
}
/// Moves the accumulated text out of the buffer, leaving an empty string behind.
#[inline]
fn take_buf(&mut self) -> String {
std::mem::take(&mut self.buf)
}
/// Switches to state `st` and returns `ev` unchanged.
#[inline]
fn into_state(&mut self, st: State, ev: Option<Result>) -> Option<Result> {
self.st = st;
ev
}
/// Switches to state `st` without producing an event (returns `None`).
#[inline]
fn into_state_continue(&mut self, st: State) -> Option<Result> {
self.into_state(st, None)
}
/// Switches to state `st` and returns `ev` wrapped in `Some`.
#[inline]
fn into_state_emit(&mut self, st: State, ev: Result) -> Option<Result> {
self.into_state(st, Some(ev))
}
/// Dispatches tokens in order to process qualified name. If qualified name cannot be parsed,
/// an error is returned.
///
/// Name characters are accumulated in `self.buf`; at most one `:` is accepted and
/// never as the first character.
///
/// # Parameters
/// * `t` --- next token;
/// * `target` --- kind of name being read; decides which tokens may terminate it;
/// * `on_name` --- a callback which is executed with the terminating token and the parsed
///   name once the name ends: on whitespace, on `=` for attribute names, on `/>` for
///   opening tag names, or on `>` for opening and closing tag names.
fn read_qualified_name<F>(&mut self, t: Token, target: QualifiedNameTarget, on_name: F) -> Option<Result>
where F: Fn(&mut PullParser, Token, OwnedName) -> Option<Result> {
// We can get here for the first time only when self.buf contains zero or one character,
// but first character cannot be a colon anyway
if self.buf.len() <= 1 {
self.read_prefix_separator = false;
}
// Takes the buffered text, parses it as a qualified name and hands it to `on_name`.
let invoke_callback = move |this: &mut PullParser, t| {
let name = this.take_buf();
match name.parse() {
Ok(name) => on_name(this, t, name),
Err(_) => Some(this.error(SyntaxError::InvalidQualifiedName(name.into()))),
}
};
match t {
// There can be only one colon, and not as the first character
Token::Character(':') if self.buf_has_data() && !self.read_prefix_separator => {
self.buf.push(':');
self.read_prefix_separator = true;
None
}
// Ordinary name character: a start character is required at the first position.
Token::Character(c) if c != ':' && (self.buf.is_empty() && is_name_start_char(c) ||
self.buf_has_data() && is_name_char(c)) => {
if self.buf.len() > self.config.max_name_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.buf.push(c);
None
},
// Terminators; which of them are accepted depends on `target`.
Token::EqualsSign if target == QualifiedNameTarget::AttributeNameTarget => invoke_callback(self, t),
Token::EmptyTagEnd if target == QualifiedNameTarget::OpeningTagNameTarget => invoke_callback(self, t),
Token::TagEnd if target == QualifiedNameTarget::OpeningTagNameTarget ||
target == QualifiedNameTarget::ClosingTagNameTarget => invoke_callback(self, t),
Token::Character(c) if is_whitespace_char(c) => invoke_callback(self, t),
_ => Some(self.error(SyntaxError::UnexpectedQualifiedName(t))),
}
}
/// Dispatches tokens in order to process attribute value.
///
/// Whitespace before the opening quote is skipped. Inside the quotes, characters are
/// accumulated in `self.buf`, a reference start switches to `State::InsideReference`
/// (remembering the current state), and `<` is rejected. The value ends at the
/// quote matching the opening one.
///
/// # Parameters
/// * `t` --- next token;
/// * `on_value` --- a callback which is called when terminating quote is encountered.
///
/// # Errors
/// Yields `Some(Err(..))` for `<` inside the value, for characters that are not valid
/// in the document's XML version, when the value exceeds `max_attribute_length`, and
/// for any other token seen before the opening quote.
fn read_attribute_value<F>(&mut self, t: Token, on_value: F) -> Option<Result>
    where F: Fn(&mut PullParser, String) -> Option<Result> {
    match t {
        Token::Character(c) if self.data.quote.is_none() && is_whitespace_char(c) => None, // skip leading whitespace

        Token::DoubleQuote | Token::SingleQuote => match self.data.quote {
            None => { // Entered attribute value
                self.data.quote = Some(QuoteToken::from_token(&t));
                None
            }
            Some(q) if q.as_token() == t => {
                self.data.quote = None;
                let value = self.take_buf();
                on_value(self, value)
            }
            _ => {
                // The other kind of quote inside a quoted value is ordinary content.
                // `t` can only be a quote token here, so no character validation applies.
                if self.buf.len() > self.config.max_attribute_length {
                    return Some(self.error(SyntaxError::ExceededConfiguredLimit));
                }
                t.push_to_string(&mut self.buf);
                None
            }
        },

        Token::ReferenceStart if self.data.quote.is_some() => {
            self.state_after_reference = self.st;
            self.into_state_continue(State::InsideReference)
        },

        Token::OpeningTagStart => Some(self.error(SyntaxError::UnexpectedOpeningTag)),

        Token::Character(c) if !self.is_valid_xml_char_not_restricted(c) => {
            Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
        },

        // Every character except " and ' and < is okay
        _ if self.data.quote.is_some() => {
            if self.buf.len() > self.config.max_attribute_length {
                return Some(self.error(SyntaxError::ExceededConfiguredLimit));
            }
            t.push_to_string(&mut self.buf);
            None
        }

        _ => Some(self.error(SyntaxError::UnexpectedToken(t))),
    }
}
fn emit_start_element(&mut self, emit_end_element: bool) -> Option<Result> {
let mut name = self.data.take_element_name()?;
let mut attributes = self.data.take_attributes().into_vec();
// check whether the name prefix is bound and fix its namespace
match self.nst.get(name.borrow().prefix_repr()) {
Some("") => name.namespace = None, // default namespace
Some(ns) => name.namespace = Some(ns.into()),
None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into())))
}
// check and fix accumulated attributes prefixes
for attr in &mut attributes {
if let Some(ref pfx) = attr.name.prefix {
let new_ns = match self.nst.get(pfx) {
Some("") => None, // default namespace
Some(ns) => Some(ns.into()),
None => return Some(self.error(SyntaxError::UnboundAttribute(attr.name.to_string().into())))
};
attr.name.namespace = new_ns;
}
}
if emit_end_element {
self.pop_namespace = true;
self.next_event = Some(Ok(XmlEvent::EndElement {
name: name.clone()
}));
} else {
self.est.push(name.clone());
}
let namespace = self.nst.squash();
self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartElement {
name,
attributes,
namespace
}))
}
fn emit_end_element(&mut self) -> Option<Result> {
let mut name = self.data.take_element_name()?;
// check whether the name prefix is bound and fix its namespace
match self.nst.get(name.borrow().prefix_repr()) {
Some("") => name.namespace = None, // default namespace
Some(ns) => name.namespace = Some(ns.into()),
None => return Some(self.error(SyntaxError::UnboundElementPrefix(name.to_string().into())))
}
let op_name = self.est.pop()?;
if name == op_name {
self.pop_namespace = true;
self.into_state_emit(State::OutsideTag, Ok(XmlEvent::EndElement { name }))
} else {
Some(self.error(SyntaxError::UnexpectedClosingTag(format!("{name} != {op_name}").into())))
}
}
/// Checks `c` against the character rules of the declared XML version.
///
/// XML 1.1 rules apply only when the declaration said so; otherwise XML 1.0 rules are used.
#[inline]
fn is_valid_xml_char(&self, c: char) -> bool {
    match self.data.version {
        Some(XmlVersion::Version11) => is_xml11_char(c),
        _ => is_xml10_char(c),
    }
}
/// Like `is_valid_xml_char`, but uses the "not restricted" XML 1.1 character check
/// when the declared version is 1.1.
#[inline]
fn is_valid_xml_char_not_restricted(&self, c: char) -> bool {
    match self.data.version {
        Some(XmlVersion::Version11) => is_xml11_char_not_restricted(c),
        _ => is_xml10_char(c),
    }
}
}
// Unit tests driving `PullParser` directly over in-memory documents.
#[cfg(test)]
mod tests {
use std::io::BufReader;
use crate::attribute::OwnedAttribute;
use crate::common::TextPosition;
use crate::name::OwnedName;
use crate::reader::events::XmlEvent;
use crate::reader::parser::PullParser;
use crate::reader::ParserConfig;
// Parser with the default configuration.
fn new_parser() -> PullParser {
PullParser::new(ParserConfig::new())
}
// Pulls the next event from parser `$p` over reader `$r` and panics unless it
// matches pattern `$t` (and, in the second form, also satisfies condition `$c`).
macro_rules! expect_event(
($r:expr, $p:expr, $t:pat) => (
match $p.next(&mut $r) {
$t => {}
e => panic!("Unexpected event: {e:?}\nExpected: {}", stringify!($t))
}
);
($r:expr, $p:expr, $t:pat => $c:expr ) => (
match $p.next(&mut $r) {
$t if $c => {}
e => panic!("Unexpected event: {e:?}\nExpected: {} if {}", stringify!($t), stringify!($c))
}
)
);
// Produces a `(reader, parser)` pair for the given document text.
macro_rules! test_data(
($d:expr) => ({
static DATA: &'static str = $d;
let r = BufReader::new(DATA.as_bytes());
let p = new_parser();
(r, p)
})
);
// A semicolon inside an attribute value must be kept verbatim.
#[test]
fn issue_3_semicolon_in_attribute_value() {
let (mut r, mut p) = test_data!(r#"
<a attr="zzz;zzz" />
"#);
expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, ref attributes, ref namespace }) =>
*name == OwnedName::local("a") &&
attributes.len() == 1 &&
attributes[0] == OwnedAttribute::new(OwnedName::local("attr"), "zzz;zzz") &&
namespace.is_essentially_empty()
);
expect_event!(r, p, Ok(XmlEvent::EndElement { ref name }) => *name == OwnedName::local("a"));
expect_event!(r, p, Ok(XmlEvent::EndDocument));
}
// A numeric character reference in element content is decoded into `Characters`.
#[test]
fn issue_140_entity_reference_inside_tag() {
let (mut r, mut p) = test_data!(r#"
<bla>&#9835;</bla>
"#);
expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
expect_event!(r, p, Ok(XmlEvent::StartElement { ref name, .. }) => *name == OwnedName::local("bla"));
expect_event!(r, p, Ok(XmlEvent::Characters(ref s)) => s == "\u{266b}");
expect_event!(r, p, Ok(XmlEvent::EndElement { ref name, .. }) => *name == OwnedName::local("bla"));
expect_event!(r, p, Ok(XmlEvent::EndDocument));
}
// Comment edge cases: `<!--` inside a comment, a forbidden `--->` terminator, and
// markup-like text inside an emitted comment.
#[test]
fn issue_220_comment() {
let (mut r, mut p) = test_data!(r#"<x><!-- <!--></x>"#);
expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
expect_event!(r, p, Ok(XmlEvent::EndElement { .. }));
expect_event!(r, p, Ok(XmlEvent::EndDocument));
let (mut r, mut p) = test_data!(r#"<x><!-- <!---></x>"#);
expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
expect_event!(r, p, Err(_)); // ---> is forbidden in comments
let (mut r, mut p) = test_data!(r#"<x><!--<text&x;> <!--></x>"#);
p.config.c.ignore_comments = false;
expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
expect_event!(r, p, Ok(XmlEvent::Comment(s)) => s == "<text&x;> <!");
expect_event!(r, p, Ok(XmlEvent::EndElement { .. }));
expect_event!(r, p, Ok(XmlEvent::EndDocument));
}
// Malformed XML declarations must be rejected on the first event.
#[test]
fn malformed_declaration_attrs() {
let (mut r, mut p) = test_data!(r#"<?xml version x="1.0"?>"#);
expect_event!(r, p, Err(_));
let (mut r, mut p) = test_data!(r#"<?xml version="1.0" version="1.0"?>"#);
expect_event!(r, p, Err(_));
let (mut r, mut p) = test_data!(r#"<?xml version="1.0"encoding="utf-8"?>"#);
expect_event!(r, p, Err(_));
let (mut r, mut p) = test_data!(r#"<?xml version="1.0"standalone="yes"?>"#);
expect_event!(r, p, Err(_));
let (mut r, mut p) = test_data!(r#"<?xml version="1.0" encoding="utf-8"standalone="yes"?>"#);
expect_event!(r, p, Err(_));
}
// `<` inside an attribute value is reported as `UnexpectedOpeningTag` at the expected position.
#[test]
fn opening_tag_in_attribute_value() {
use crate::reader::error::{SyntaxError, Error, ErrorKind};
let (mut r, mut p) = test_data!(r#"
<a attr="zzz<zzz" />
"#);
expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
expect_event!(r, p, Err(ref e) =>
*e == Error {
kind: ErrorKind::Syntax(SyntaxError::UnexpectedOpeningTag.to_cow()),
pos: TextPosition { row: 1, column: 24 }
}
);
}
// A bare `&` directly followed by another reference is an error.
#[test]
fn reference_err() {
let (mut r, mut p) = test_data!(r#"
<a>&&amp;</a>
"#);
expect_event!(r, p, Ok(XmlEvent::StartDocument { .. }));
expect_event!(r, p, Ok(XmlEvent::StartElement { .. }));
expect_event!(r, p, Err(_));
}
// Pins the in-memory size of the state enums.
#[test]
fn state_size() {
assert_eq!(2, std::mem::size_of::<super::State>());
assert_eq!(1, std::mem::size_of::<super::DoctypeSubstate>());
}
}

View file

@ -0,0 +1,38 @@
use crate::reader::error::SyntaxError;
use crate::reader::lexer::Token;
use crate::{common::is_whitespace_char, reader::events::XmlEvent};
use super::{PullParser, Result, State};
impl PullParser {
pub fn inside_cdata(&mut self, t: Token) -> Option<Result> {
match t {
Token::CDataEnd => {
let event = if self.config.c.cdata_to_characters {
// start called push_pos, but there will be no event to pop it
if self.buf.is_empty() {
self.next_pos();
}
None
} else {
let data = self.take_buf();
Some(Ok(XmlEvent::CData(data)))
};
self.into_state(State::OutsideTag, event)
}
Token::Character(c) if !self.is_valid_xml_char(c) => {
Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
},
Token::Character(c) => {
if !is_whitespace_char(c) {
self.inside_whitespace = false;
}
self.buf.push(c);
None
}
_ => unreachable!(),
}
}
}

View file

@ -0,0 +1,31 @@
use crate::reader::error::SyntaxError;
use crate::{common::is_whitespace_char, namespace};
use crate::reader::lexer::Token;
use super::{ClosingTagSubstate, PullParser, QualifiedNameTarget, Result, State};
impl PullParser {
pub fn inside_closing_tag_name(&mut self, t: Token, s: ClosingTagSubstate) -> Option<Result> {
match s {
ClosingTagSubstate::CTInsideName => self.read_qualified_name(t, QualifiedNameTarget::ClosingTagNameTarget, |this, token, name| {
match name.prefix_ref() {
Some(prefix) if prefix == namespace::NS_XML_PREFIX ||
prefix == namespace::NS_XMLNS_PREFIX =>
Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))),
_ => {
this.data.element_name = Some(name.clone());
match token {
Token::TagEnd => this.emit_end_element(),
Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideClosingTag(ClosingTagSubstate::CTAfterName)),
_ => Some(this.error(SyntaxError::UnexpectedTokenInClosingTag(token)))
}
}
}
}),
ClosingTagSubstate::CTAfterName => match t {
Token::TagEnd => self.emit_end_element(),
Token::Character(c) if is_whitespace_char(c) => None, // Skip whitespace
_ => Some(self.error(SyntaxError::UnexpectedTokenInClosingTag(t)))
}
}
}
}

View file

@ -0,0 +1,34 @@
use crate::reader::error::SyntaxError;
use crate::reader::events::XmlEvent;
use crate::reader::lexer::Token;
use super::{PullParser, Result, State};
impl PullParser {
    /// Consumes one token while the parser is inside a comment.
    ///
    /// When comments are ignored nothing is buffered and no event is emitted;
    /// otherwise the comment text is collected in `self.buf` (bounded by
    /// `max_data_length`) and emitted as `XmlEvent::Comment` at the end.
    /// Invalid XML characters are rejected in both modes.
    pub fn inside_comment(&mut self, t: Token) -> Option<Result> {
        match t {
            Token::CommentEnd => {
                if self.config.c.ignore_comments {
                    self.into_state_continue(State::OutsideTag)
                } else {
                    let data = self.take_buf();
                    self.into_state_emit(State::OutsideTag, Ok(XmlEvent::Comment(data)))
                }
            }
            Token::Character(c) if !self.is_valid_xml_char(c) => {
                Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
            }
            _ => {
                // Do not modify buffer if ignoring the comment
                if self.config.c.ignore_comments {
                    return None;
                }
                if self.buf.len() > self.config.max_data_length {
                    return Some(self.error(SyntaxError::ExceededConfiguredLimit));
                }
                t.push_to_string(&mut self.buf);
                None
            }
        }
    }
}

View file

@ -0,0 +1,180 @@
use crate::common::{is_whitespace_char, XmlVersion};
use crate::reader::error::SyntaxError;
use crate::reader::events::XmlEvent;
use crate::reader::lexer::Token;
use crate::util::Encoding;
use super::{
DeclarationSubstate, Encountered, PullParser, QualifiedNameTarget, Result, State,
DEFAULT_VERSION,
};
impl PullParser {
/// Finishes the XML declaration and emits `XmlEvent::StartDocument`.
///
/// Marks the declaration as encountered, reconciles the encoding named in the
/// declaration (if any) with the encoding the lexer is currently using, and
/// then switches to `State::OutsideTag`.
///
/// Returns an error event when the declared encoding cannot be parsed or
/// conflicts with the lexer's encoding, unless
/// `ignore_invalid_encoding_declarations` is set.
#[inline(never)]
fn emit_start_document(&mut self) -> Option<Result> {
debug_assert!(self.encountered == Encountered::None);
self.encountered = Encountered::Declaration;
let version = self.data.version;
let encoding = self.data.take_encoding();
let standalone = self.data.standalone;
if let Some(new_encoding) = encoding.as_deref() {
// An unparsable encoding name falls back to Latin1 only when invalid
// declarations are being ignored; otherwise it is a hard error.
let new_encoding = match new_encoding.parse() {
Ok(e) => e,
Err(_) if self.config.ignore_invalid_encoding_declarations => Encoding::Latin1,
Err(_) => return Some(self.error(SyntaxError::UnsupportedEncoding(new_encoding.into()))),
};
let current_encoding = self.lexer.encoding();
if current_encoding != new_encoding {
// Decide which encoding the lexer should use from here on:
// - lexer encoding still Unknown/Default: adopt the declared one (except Utf16),
// - lexer already on Utf16Be/Utf16Le and the declaration says Utf16: keep the lexer's,
// - anything else is a conflict (kept as-is when invalid declarations are ignored).
let set = match (current_encoding, new_encoding) {
(Encoding::Unknown | Encoding::Default, new) if new != Encoding::Utf16 => new,
(Encoding::Utf16Be | Encoding::Utf16Le, Encoding::Utf16) => current_encoding,
_ if self.config.ignore_invalid_encoding_declarations => current_encoding,
_ => return Some(self.error(SyntaxError::ConflictingEncoding(new_encoding, current_encoding))),
};
self.lexer.set_encoding(set);
}
}
let current_encoding = self.lexer.encoding();
// The event reports the declared encoding string when present, otherwise
// the name of the encoding the lexer ended up with.
self.into_state_emit(State::OutsideTag, Ok(XmlEvent::StartDocument {
version: version.unwrap_or(DEFAULT_VERSION),
encoding: encoding.unwrap_or_else(move || current_encoding.to_string()),
standalone
}))
}
// TODO: remove redundancy via macros or extra methods
/// Consumes one token of the XML declaration (`<?xml version=... ?>`),
/// dispatching on the current `DeclarationSubstate`.
///
/// The pseudo-attributes are accepted in the fixed order `version`,
/// `encoding`, `standalone`; the latter two are optional. The first letter of
/// each pseudo-attribute name is matched in the `Before*` substate, so the
/// name compared in the following `Inside*` substate is the remainder
/// (`ersion`, `ncoding`, `tandalone`).
pub fn inside_declaration(&mut self, t: Token, s: DeclarationSubstate) -> Option<Result> {
match s {
// Skip whitespace until the `v` of `version`.
DeclarationSubstate::BeforeVersion => match t {
Token::Character('v') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersion)),
Token::Character(c) if is_whitespace_char(c) => None, // continue
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
DeclarationSubstate::InsideVersion => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
match &*name.local_name {
"ersion" if name.namespace.is_none() =>
this.into_state_continue(State::InsideDeclaration(
// The name may be terminated directly by `=`, or by something
// else, in which case the `=` is awaited in `AfterVersion`.
if token == Token::EqualsSign {
DeclarationSubstate::InsideVersionValue
} else {
DeclarationSubstate::AfterVersion
}
)),
_ => Some(this.error(SyntaxError::UnexpectedNameInsideXml(name.to_string().into()))),
}
}),
DeclarationSubstate::AfterVersion => match t {
Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideVersionValue)),
Token::Character(c) if is_whitespace_char(c) => None,
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
// Only the literal values "1.0" and "1.1" are accepted.
DeclarationSubstate::InsideVersionValue => self.read_attribute_value(t, |this, value| {
this.data.version = match &*value {
"1.0" => Some(XmlVersion::Version10),
"1.1" => Some(XmlVersion::Version11),
_ => None
};
if this.data.version.is_some() {
this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterVersionValue))
} else {
Some(this.error(SyntaxError::UnexpectedXmlVersion(value.into())))
}
}),
// Whitespace is required before any further pseudo-attribute; `?>` ends the declaration.
DeclarationSubstate::AfterVersionValue => match t {
Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeEncoding)),
Token::ProcessingInstructionEnd => self.emit_start_document(),
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
// After the version either `encoding` (`e`) or `standalone` (`s`) may follow.
DeclarationSubstate::BeforeEncoding => match t {
Token::Character('e') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncoding)),
Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)),
Token::ProcessingInstructionEnd => self.emit_start_document(),
Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
DeclarationSubstate::InsideEncoding => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
match &*name.local_name {
"ncoding" if name.namespace.is_none() =>
this.into_state_continue(State::InsideDeclaration(
if token == Token::EqualsSign { DeclarationSubstate::InsideEncodingValue } else { DeclarationSubstate::AfterEncoding }
)),
_ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into())))
}
}),
DeclarationSubstate::AfterEncoding => match t {
Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideEncodingValue)),
Token::Character(c) if is_whitespace_char(c) => None,
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
// The encoding string is stored as-is here; it is parsed in `emit_start_document`.
DeclarationSubstate::InsideEncodingValue => self.read_attribute_value(t, |this, value| {
this.data.encoding = Some(value);
this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterEncodingValue))
}),
DeclarationSubstate::AfterEncodingValue => match t {
Token::Character(c) if is_whitespace_char(c) => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeStandaloneDecl)),
Token::ProcessingInstructionEnd => self.emit_start_document(),
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
// After the encoding only `standalone` (`s`) or the end of the declaration may follow.
DeclarationSubstate::BeforeStandaloneDecl => match t {
Token::Character('s') => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDecl)),
Token::ProcessingInstructionEnd => self.emit_start_document(),
Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
DeclarationSubstate::InsideStandaloneDecl => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
match &*name.local_name {
"tandalone" if name.namespace.is_none() =>
this.into_state_continue(State::InsideDeclaration(
if token == Token::EqualsSign {
DeclarationSubstate::InsideStandaloneDeclValue
} else {
DeclarationSubstate::AfterStandaloneDecl
}
)),
_ => Some(this.error(SyntaxError::UnexpectedName(name.to_string().into()))),
}
}),
DeclarationSubstate::AfterStandaloneDecl => match t {
Token::EqualsSign => self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::InsideStandaloneDeclValue)),
Token::Character(c) if is_whitespace_char(c) => None,
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
// Only the literal values "yes" and "no" are accepted.
DeclarationSubstate::InsideStandaloneDeclValue => self.read_attribute_value(t, |this, value| {
let standalone = match &*value {
"yes" => Some(true),
"no" => Some(false),
_ => None
};
if standalone.is_some() {
this.data.standalone = standalone;
this.into_state_continue(State::InsideDeclaration(DeclarationSubstate::AfterStandaloneDeclValue))
} else {
Some(this.error(SyntaxError::InvalidStandaloneDeclaration(value.into())))
}
}),
// Nothing but whitespace and `?>` may follow the standalone value.
DeclarationSubstate::AfterStandaloneDeclValue => match t {
Token::ProcessingInstructionEnd => self.emit_start_document(),
Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
}
}
}

View file

@ -0,0 +1,244 @@
use crate::reader::error::SyntaxError;
use crate::common::{is_name_char, is_name_start_char, is_whitespace_char};
use crate::reader::lexer::Token;
use super::{DoctypeSubstate, PullParser, QuoteToken, Result, State};
impl PullParser {
/// Consumes one token while the parser is inside a `<!DOCTYPE ...>` block,
/// dispatching on the current `DoctypeSubstate`.
///
/// Entity declarations are recorded in `self.entities`; `NOTATION`, `ELEMENT`
/// and `ATTLIST` declarations, comments and string literals are skipped.
/// Every arm yields either `None`, a state transition via
/// `into_state_continue`, or an error; no event is constructed here.
pub fn inside_doctype(&mut self, t: Token, substate: DoctypeSubstate) -> Option<Result> {
match substate {
// Top level of the doctype, between markup declarations.
DoctypeSubstate::Outside => match t {
Token::TagEnd => self.into_state_continue(State::OutsideTag),
Token::MarkupDeclarationStart => {
self.buf.clear();
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::InsideName))
},
Token::Character('%') => {
self.data.ref_data.clear();
self.data.ref_data.push('%');
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInDtd))
},
Token::CommentStart => {
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Comment))
},
Token::SingleQuote | Token::DoubleQuote => {
// just discard string literals
self.data.quote = Some(super::QuoteToken::from_token(&t));
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::String))
},
Token::CDataEnd | Token::CDataStart => Some(self.error(SyntaxError::UnexpectedToken(t))),
// TODO: parse SYSTEM, and [
_ => None,
},
// Inside a quoted literal: ignore everything up to the quote that matches the opening one.
DoctypeSubstate::String => match t {
Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => None,
Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => None,
Token::SingleQuote | Token::DoubleQuote => {
self.data.quote = None;
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
},
_ => None,
},
// Inside a comment: ignore everything up to the comment end.
DoctypeSubstate::Comment => match t {
Token::CommentEnd => {
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
},
_ => None,
},
// Reading the keyword of a markup declaration; only ASCII uppercase letters are collected.
DoctypeSubstate::InsideName => match t {
Token::Character(c @ 'A'..='Z') => {
self.buf.push(c);
None
},
Token::Character(c) if is_whitespace_char(c) => {
let buf = self.take_buf();
match buf.as_str() {
"ENTITY" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityName)),
"NOTATION" | "ELEMENT" | "ATTLIST" => self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)),
_ => Some(self.error(SyntaxError::UnknownMarkupDeclaration(buf.into()))),
}
},
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
},
// Whitespace between `ENTITY` and the entity name. The name buffer is
// cleared on every token seen in this substate.
DoctypeSubstate::BeforeEntityName => {
self.data.name.clear();
match t {
Token::Character(c) if is_whitespace_char(c) => None,
Token::Character('%') => { // % is for PEDecl
self.data.name.push('%');
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinitionStart))
},
Token::Character(c) if is_name_start_char(c) => {
// NOTE(review): the name was cleared just above, so this limit check
// appears unable to trigger here — confirm.
if self.data.name.len() > self.config.max_name_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.data.name.push(c);
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityName))
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
}
},
// Collecting the entity name until the first whitespace.
DoctypeSubstate::EntityName => match t {
Token::Character(c) if is_whitespace_char(c) => {
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue))
},
Token::Character(c) if is_name_char(c) => {
if self.data.name.len() > self.config.max_name_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.data.name.push(c);
None
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
// Between the entity name and its value. The value buffer is cleared on
// every token seen in this substate.
DoctypeSubstate::BeforeEntityValue => {
self.buf.clear();
match t {
Token::Character(c) if is_whitespace_char(c) => None,
// SYSTEM/PUBLIC not supported
Token::Character('S' | 'P') => {
let name = self.data.take_name();
self.entities.entry(name).or_insert_with(String::new); // Dummy value, but at least the name is recognized
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration))
},
Token::SingleQuote | Token::DoubleQuote => {
self.data.quote = Some(super::QuoteToken::from_token(&t));
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
}
},
// Inside the quoted entity value, which is accumulated in `self.buf`.
// NOTE(review): no length limit is checked while accumulating the value
// in this substate — confirm whether a limit is enforced elsewhere.
DoctypeSubstate::EntityValue => match t {
// A quote of the other kind is literal text inside the value.
Token::SingleQuote if self.data.quote != Some(QuoteToken::SingleQuoteToken) => { self.buf.push('\''); None },
Token::DoubleQuote if self.data.quote != Some(QuoteToken::DoubleQuoteToken) => { self.buf.push('"'); None },
// The matching quote ends the value and records the entity.
Token::SingleQuote | Token::DoubleQuote => {
self.data.quote = None;
let name = self.data.take_name();
let val = self.take_buf();
self.entities.entry(name).or_insert(val); // First wins
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::SkipDeclaration)) // FIXME
},
Token::ReferenceStart | Token::Character('&') => {
self.data.ref_data.clear();
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReferenceStart))
},
Token::Character('%') => {
self.data.ref_data.clear();
self.data.ref_data.push('%'); // include literal % in the name to distinguish from regular entities
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceInValue))
},
Token::Character(c) if !self.is_valid_xml_char(c) => {
Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
},
Token::Character(c) => {
self.buf.push(c);
None
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
// After `<!ENTITY %`: whitespace, then the first character of the parameter entity name.
DoctypeSubstate::PEReferenceDefinitionStart => match t {
Token::Character(c) if is_whitespace_char(c) => {
None
},
Token::Character(c) if is_name_start_char(c) => {
debug_assert_eq!(self.data.name, "%");
self.data.name.push(c);
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::PEReferenceDefinition))
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
// Collecting the rest of the parameter entity name (stored with its leading `%`).
DoctypeSubstate::PEReferenceDefinition => match t {
Token::Character(c) if is_name_char(c) => {
if self.data.name.len() > self.config.max_name_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.data.name.push(c);
None
},
Token::Character(c) if is_whitespace_char(c) => {
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::BeforeEntityValue))
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
// `%name;` at the top level of the doctype: the entity's value is handed
// back to the lexer via `reparse`.
// NOTE(review): the reference name is accumulated without a length check
// here, unlike entity names above — confirm intentional.
DoctypeSubstate::PEReferenceInDtd => match t {
Token::Character(c) if is_name_char(c) => {
self.data.ref_data.push(c);
None
},
Token::ReferenceEnd | Token::Character(';') => {
let name = self.data.take_ref_data();
match self.entities.get(&name) {
Some(ent) => {
if let Err(e) = self.lexer.reparse(ent) {
return Some(Err(e));
}
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
},
None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))),
}
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
// `%name;` inside an entity value: the entity's value is appended to the
// value being built rather than reparsed.
DoctypeSubstate::PEReferenceInValue => match t {
Token::Character(c) if is_name_char(c) => {
self.data.ref_data.push(c);
None
},
Token::ReferenceEnd | Token::Character(';') => {
let name = self.data.take_ref_data();
match self.entities.get(&name) {
Some(ent) => {
self.buf.push_str(ent);
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
},
None => Some(self.error(SyntaxError::UndefinedEntity(name.into()))),
}
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
// Just after `&` inside an entity value: `#` starts a numeric reference;
// any other valid character means a named reference, which is kept verbatim.
DoctypeSubstate::NumericReferenceStart => match t {
Token::Character('#') => {
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::NumericReference))
},
Token::Character(c) if !self.is_valid_xml_char(c) => {
Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
},
Token::Character(c) => {
self.buf.push('&');
self.buf.push(c);
// named entities are not expanded inside doctype
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
// Collecting the digits of `&#...;`; the reference is decoded at `;` and
// the resulting character is appended to the entity value.
DoctypeSubstate::NumericReference => match t {
Token::ReferenceEnd | Token::Character(';') => {
let r = self.data.take_ref_data();
// https://www.w3.org/TR/xml/#sec-entexpand
match self.numeric_reference_from_str(&r) {
Ok(c) => {
self.buf.push(c);
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::EntityValue))
}
Err(e) => Some(self.error(e)),
}
},
Token::Character(c) if !self.is_valid_xml_char(c) => {
Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
},
Token::Character(c) => {
self.data.ref_data.push(c);
None
},
_ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
},
// Ignore the remainder of a declaration up to its closing `>`.
DoctypeSubstate::SkipDeclaration => match t {
Token::TagEnd => {
self.into_state_continue(State::InsideDoctype(DoctypeSubstate::Outside))
},
_ => None,
},
}
}
}

View file

@ -0,0 +1,120 @@
use crate::reader::error::SyntaxError;
use crate::common::is_name_start_char;
use crate::namespace;
use crate::{attribute::OwnedAttribute, common::is_whitespace_char};
use crate::reader::lexer::Token;
use super::{OpeningTagSubstate, PullParser, QualifiedNameTarget, Result, State};
impl PullParser {
pub fn inside_opening_tag(&mut self, t: Token, s: OpeningTagSubstate) -> Option<Result> {
let max_attrs = self.config.max_attributes;
match s {
OpeningTagSubstate::InsideName => self.read_qualified_name(t, QualifiedNameTarget::OpeningTagNameTarget, |this, token, name| {
match name.prefix_ref() {
Some(prefix) if prefix == namespace::NS_XML_PREFIX ||
prefix == namespace::NS_XMLNS_PREFIX =>
Some(this.error(SyntaxError::InvalidNamePrefix(prefix.into()))),
_ => {
this.data.element_name = Some(name.clone());
match token {
Token::TagEnd => this.emit_start_element(false),
Token::EmptyTagEnd => this.emit_start_element(true),
Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag)),
_ => unreachable!()
}
}
}
}),
OpeningTagSubstate::InsideTag => match t {
Token::TagEnd => self.emit_start_element(false),
Token::EmptyTagEnd => self.emit_start_element(true),
Token::Character(c) if is_whitespace_char(c) => None, // skip whitespace
Token::Character(c) if is_name_start_char(c) => {
if self.buf.len() > self.config.max_name_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.buf.push(c);
self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeName))
}
_ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))),
},
OpeningTagSubstate::InsideAttributeName => self.read_qualified_name(t, QualifiedNameTarget::AttributeNameTarget, |this, token, name| {
// check that no attribute with such name is already present
// if there is one, XML is not well-formed
if this.data.attributes.contains(&name) {
return Some(this.error(SyntaxError::RedefinedAttribute(name.to_string().into())))
}
this.data.attr_name = Some(name);
match token {
Token::EqualsSign => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)),
Token::Character(c) if is_whitespace_char(c) => this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeName)),
_ => Some(this.error(SyntaxError::UnexpectedTokenInOpeningTag(t))) // likely unreachable
}
}),
OpeningTagSubstate::AfterAttributeName => match t {
Token::EqualsSign => self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideAttributeValue)),
Token::Character(c) if is_whitespace_char(c) => None,
_ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t)))
},
OpeningTagSubstate::InsideAttributeValue => self.read_attribute_value(t, |this, value| {
let name = this.data.take_attr_name()?; // will always succeed here
match name.prefix_ref() {
// declaring a new prefix; it is sufficient to check prefix only
// because "xmlns" prefix is reserved
Some(namespace::NS_XMLNS_PREFIX) => {
let ln = &*name.local_name;
if ln == namespace::NS_XMLNS_PREFIX {
Some(this.error(SyntaxError::CannotRedefineXmlnsPrefix))
} else if ln == namespace::NS_XML_PREFIX && &*value != namespace::NS_XML_URI {
Some(this.error(SyntaxError::CannotRedefineXmlPrefix))
} else if value.is_empty() {
Some(this.error(SyntaxError::CannotUndefinePrefix(ln.into())))
} else {
this.nst.put(name.local_name.clone(), value);
this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
}
}
// declaring default namespace
None if &*name.local_name == namespace::NS_XMLNS_PREFIX =>
match &*value {
namespace::NS_XMLNS_PREFIX | namespace::NS_XML_PREFIX | namespace::NS_XML_URI | namespace::NS_XMLNS_URI =>
Some(this.error(SyntaxError::InvalidDefaultNamespace(value.into()))),
_ => {
this.nst.put(namespace::NS_NO_PREFIX, value.clone());
this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
}
},
// regular attribute
_ => {
if this.data.attributes.len() >= max_attrs {
return Some(this.error(SyntaxError::ExceededConfiguredLimit));
}
this.data.attributes.push(OwnedAttribute {
name,
value
});
this.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::AfterAttributeValue))
}
}
}),
OpeningTagSubstate::AfterAttributeValue => match t {
Token::Character(c) if is_whitespace_char(c) => {
self.into_state_continue(State::InsideOpeningTag(OpeningTagSubstate::InsideTag))
},
Token::TagEnd => self.emit_start_element(false),
Token::EmptyTagEnd => self.emit_start_element(true),
_ => Some(self.error(SyntaxError::UnexpectedTokenInOpeningTag(t))),
},
}
}
}

View file

@ -0,0 +1,116 @@
use crate::reader::error::SyntaxError;
use crate::common::{is_name_char, is_name_start_char, is_whitespace_char};
use crate::reader::events::XmlEvent;
use crate::reader::lexer::Token;
use super::{DeclarationSubstate, ProcessingInstructionSubstate, PullParser, Result, State, Encountered};
impl PullParser {
pub fn inside_processing_instruction(&mut self, t: Token, s: ProcessingInstructionSubstate) -> Option<Result> {
match s {
ProcessingInstructionSubstate::PIInsideName => match t {
Token::Character(c) if self.buf.is_empty() && is_name_start_char(c) ||
self.buf_has_data() && is_name_char(c) => {
if self.buf.len() > self.config.max_name_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.buf.push(c);
None
},
Token::ProcessingInstructionEnd => {
// self.buf contains PI name
let name = self.take_buf();
// Don't need to check for declaration because it has mandatory attributes
// but there is none
match &*name {
// Name is empty, it is an error
"" => Some(self.error(SyntaxError::ProcessingInstructionWithoutName)),
// Found <?xml-like PI not at the beginning of a document,
// it is an error - see section 2.6 of XML 1.1 spec
n if "xml".eq_ignore_ascii_case(n) =>
Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))),
// All is ok, emitting event
_ => {
debug_assert!(self.next_event.is_none(), "{:?}", self.next_event);
// can't have a PI before `<?xml`
let event1 = self.set_encountered(Encountered::Declaration);
let event2 = Some(Ok(XmlEvent::ProcessingInstruction {
name,
data: None
}));
// emitting two events at once is cumbersome
let event1 = if event1.is_some() {
self.next_event = event2;
event1
} else {
event2
};
self.into_state(State::OutsideTag, event1)
}
}
}
Token::Character(c) if is_whitespace_char(c) => {
// self.buf contains PI name
let name = self.take_buf();
match &*name {
// We have not ever encountered an element and have not parsed XML declaration
"xml" if self.encountered == Encountered::None =>
self.into_state_continue(State::InsideDeclaration(DeclarationSubstate::BeforeVersion)),
// Found <?xml-like PI after the beginning of a document,
// it is an error - see section 2.6 of XML 1.1 spec
n if "xml".eq_ignore_ascii_case(n) =>
Some(self.error(SyntaxError::InvalidXmlProcessingInstruction(name.into()))),
// All is ok, starting parsing PI data
_ => {
self.data.name = name;
// can't have a PI before `<?xml`
let next_event = self.set_encountered(Encountered::Declaration);
self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideData), next_event)
}
}
}
_ => {
let buf = self.take_buf();
Some(self.error(SyntaxError::UnexpectedProcessingInstruction(buf.into(), t)))
}
},
ProcessingInstructionSubstate::PIInsideData => match t {
Token::ProcessingInstructionEnd => {
let name = self.data.take_name();
let data = self.take_buf();
self.into_state_emit(
State::OutsideTag,
Ok(XmlEvent::ProcessingInstruction {
name,
data: Some(data),
}),
)
},
Token::Character(c) if !self.is_valid_xml_char(c) => {
Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)))
},
// Any other token should be treated as plain characters
_ => {
if self.buf.len() > self.config.max_data_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
t.push_to_string(&mut self.buf);
None
}
},
}
}
}

View file

@ -0,0 +1,78 @@
use crate::reader::error::SyntaxError;
use std::char;
use crate::common::{is_name_char, is_name_start_char, is_whitespace_char};
use crate::reader::lexer::Token;
use super::{PullParser, Result, State};
impl PullParser {
    /// Consumes one token of an entity or character reference (`&name;`,
    /// `&#NN;`, `&#xHH;`) and, at the terminating `;`, appends its expansion.
    ///
    /// Resolution order: the five predefined entities and numeric references,
    /// then `config.c.extra_entities`, then entities declared in the document.
    /// Document-declared entities referenced from `State::OutsideTag` are fed
    /// back to the lexer for a full reparse; everywhere else their text is
    /// appended to `self.buf` verbatim. Unknown names are reported as
    /// `SyntaxError::UnexpectedEntity`.
    pub fn inside_reference(&mut self, t: Token) -> Option<Result> {
        match t {
            // A reference name starts with a name-start character or `#`, and
            // continues with name characters.
            Token::Character(c) if !self.data.ref_data.is_empty() && is_name_char(c) ||
                             self.data.ref_data.is_empty() && (is_name_start_char(c) || c == '#') => {
                self.data.ref_data.push(c);
                None
            }

            Token::ReferenceEnd => {
                let name = self.data.take_ref_data();
                if name.is_empty() {
                    return Some(self.error(SyntaxError::EmptyEntity));
                }

                let c = match &*name {
                    "lt" => Some('<'),
                    "gt" => Some('>'),
                    "amp" => Some('&'),
                    "apos" => Some('\''),
                    "quot" => Some('"'),
                    _ => match name.strip_prefix('#') {
                        Some(num) => match self.numeric_reference_from_str(num) {
                            Ok(c) => Some(c),
                            Err(e) => return Some(self.error(e)),
                        },
                        None => None,
                    },
                };

                if let Some(c) = c {
                    self.buf.push(c);
                } else if let Some(v) = self.config.c.extra_entities.get(&name) {
                    self.buf.push_str(v);
                } else if let Some(v) = self.entities.get(&name) {
                    if self.state_after_reference == State::OutsideTag {
                        // an entity can expand to *elements*, so outside of a tag it needs a full reparse
                        if let Err(e) = self.lexer.reparse(v) {
                            return Some(Err(e));
                        }
                    } else {
                        // however, inside attributes it's not allowed to affect attribute quoting,
                        // so it can't be fed to the lexer
                        self.buf.push_str(v);
                    }
                } else {
                    return Some(self.error(SyntaxError::UnexpectedEntity(name.into())));
                }

                let prev_st = self.state_after_reference;
                // In text, an expansion whose last buffered character is not
                // whitespace ends the "whitespace only" run.
                if prev_st == State::OutsideTag && !is_whitespace_char(self.buf.chars().last().unwrap_or('\0')) {
                    self.inside_whitespace = false;
                }
                self.into_state_continue(prev_st)
            }

            _ => Some(self.error(SyntaxError::UnexpectedTokenInEntity(t))),
        }
    }

    /// Decodes the body of a numeric character reference: decimal digits, or
    /// `x` followed by hexadecimal digits (the leading `&#` and trailing `;`
    /// are not part of `num_str`).
    ///
    /// # Errors
    ///
    /// * `SyntaxError::InvalidNumericEntity` when `num_str` is empty, contains
    ///   anything other than digits of the selected radix, or overflows `u32`.
    /// * `SyntaxError::InvalidCharacterEntity` when the code point is not a
    ///   valid XML character and `replace_unknown_entity_references` is off;
    ///   with that option on, U+FFFD is returned instead.
    pub(crate) fn numeric_reference_from_str(&self, num_str: &str) -> std::result::Result<char, SyntaxError> {
        let (digits, radix) = match num_str.strip_prefix('x') {
            Some(hex) => (hex, 16),
            None => (num_str, 10),
        };
        // `u32::from_str_radix` tolerates a leading `+`, but a character
        // reference may consist of digits only, so reject anything else first.
        if !digits.chars().all(|ch| ch.is_digit(radix)) {
            return Err(SyntaxError::InvalidNumericEntity(num_str.into()));
        }
        let val = u32::from_str_radix(digits, radix)
            .map_err(|_| SyntaxError::InvalidNumericEntity(num_str.into()))?;

        match char::from_u32(val) {
            Some(c) if self.is_valid_xml_char(c) => Ok(c),
            _ if self.config.c.replace_unknown_entity_references => Ok('\u{fffd}'),
            _ => Err(SyntaxError::InvalidCharacterEntity(val)),
        }
    }
}

View file

@ -0,0 +1,205 @@
use crate::reader::error::SyntaxError;
use crate::common::is_whitespace_char;
use crate::reader::events::XmlEvent;
use crate::reader::lexer::Token;
use super::{
ClosingTagSubstate, DoctypeSubstate, Encountered, OpeningTagSubstate,
ProcessingInstructionSubstate, PullParser, Result, State,
};
impl PullParser {
/// Consumes one token while the parser is outside of any markup construct.
///
/// Text tokens are accumulated in `self.buf` (bounded by `max_data_length`).
/// When a markup token arrives, buffered text is flushed as a `Characters` or
/// `Whitespace` event (or dropped, depending on the whitespace options) and
/// the parser switches to the state matching the markup that starts.
///
/// Non-whitespace text and most tokens are rejected with an error while
/// `self.depth()` is 0, i.e. outside of the root element.
pub fn outside_tag(&mut self, t: Token) -> Option<Result> {
match t {
Token::Character(c) => {
if is_whitespace_char(c) {
// skip whitespace outside of the root element
if (self.config.c.trim_whitespace && self.buf.is_empty()) ||
(self.depth() == 0 && self.config.c.ignore_root_level_whitespace) {
return None;
}
} else {
self.inside_whitespace = false;
if self.depth() == 0 {
return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
}
}
if !self.is_valid_xml_char_not_restricted(c) {
return Some(self.error(SyntaxError::InvalidCharacterEntity(c as u32)));
}
// The first buffered character records a position; later ones only check the size limit.
if self.buf.is_empty() {
self.push_pos();
} else if self.buf.len() > self.config.max_data_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.buf.push(c);
None
},
// Tokens that are markup elsewhere but plain text here; they are
// appended to the buffer in their textual form.
Token::CommentEnd | Token::TagEnd | Token::EqualsSign |
Token::DoubleQuote | Token::SingleQuote |
Token::ProcessingInstructionEnd | Token::EmptyTagEnd => {
if self.depth() == 0 {
return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
}
self.inside_whitespace = false;
if let Some(s) = t.as_static_str() {
if self.buf.is_empty() {
self.push_pos();
} else if self.buf.len() > self.config.max_data_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
self.buf.push_str(s);
}
None
},
// `&` inside the root element: remember where to return to and parse the reference.
Token::ReferenceStart if self.depth() > 0 => {
self.state_after_reference = State::OutsideTag;
self.into_state_continue(State::InsideReference)
},
// NOTE(review): unlike the text branches above, this branch does not call
// `push_pos` when the buffer is empty — confirm this is intentional.
Token::ReferenceEnd if self.depth() > 0 => { // Semi-colon in a text outside an entity
self.inside_whitespace = false;
if self.buf.len() > self.config.max_data_length {
return Some(self.error(SyntaxError::ExceededConfiguredLimit));
}
Token::ReferenceEnd.push_to_string(&mut self.buf);
None
},
// With coalescing enabled, an ignored comment does not flush the buffered text.
Token::CommentStart if self.config.c.coalesce_characters && self.config.c.ignore_comments => {
let next_event = self.set_encountered(Encountered::Comment);
// We need to switch the lexer into a comment mode inside comments
self.into_state(State::InsideComment, next_event)
}
// With coalescing enabled, CDATA converted to characters continues the same text buffer.
Token::CDataStart if self.depth() > 0 && self.config.c.coalesce_characters && self.config.c.cdata_to_characters => {
if self.buf.is_empty() {
self.push_pos();
}
self.into_state_continue(State::InsideCData)
},
_ => {
// Encountered some markup event, flush the buffer as characters
// or a whitespace
let mut next_event = if self.buf_has_data() {
let buf = self.take_buf();
if self.inside_whitespace && self.config.c.trim_whitespace {
None
} else if self.inside_whitespace && !self.config.c.whitespace_to_characters {
debug_assert!(buf.chars().all(|ch| ch.is_whitespace()), "ws={buf:?}");
Some(Ok(XmlEvent::Whitespace(buf)))
} else if self.config.c.trim_whitespace {
Some(Ok(XmlEvent::Characters(buf.trim_matches(is_whitespace_char).into())))
} else {
Some(Ok(XmlEvent::Characters(buf)))
}
} else { None };
self.inside_whitespace = true; // Reset inside_whitespace flag
// pos is popped whenever an event is emitted, so pushes must happen only if there will be an event to balance it
// and ignored comments don't pop
if t != Token::CommentStart || !self.config.c.ignore_comments {
self.push_pos();
}
// In the arms below, an event returned by `set_encountered` replaces the
// flushed text event computed above.
match t {
Token::OpeningTagStart if self.depth() > 0 || self.encountered < Encountered::Element || self.config.allow_multiple_root_elements => {
if let Some(e) = self.set_encountered(Encountered::Element) {
next_event = Some(e);
}
self.nst.push_empty();
self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event)
},
Token::ClosingTagStart if self.depth() > 0 =>
self.into_state(State::InsideClosingTag(ClosingTagSubstate::CTInsideName), next_event),
Token::CommentStart => {
if let Some(e) = self.set_encountered(Encountered::Comment) {
next_event = Some(e);
}
// We need to switch the lexer into a comment mode inside comments
self.into_state(State::InsideComment, next_event)
},
Token::DoctypeStart if self.encountered < Encountered::Doctype => {
if let Some(e) = self.set_encountered(Encountered::Doctype) {
next_event = Some(e);
}
// We don't have a doctype event so skip this position
// FIXME: update when we have a doctype event
self.next_pos();
self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event)
},
Token::ProcessingInstructionStart =>
self.into_state(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName), next_event),
Token::CDataStart if self.depth() > 0 => {
self.into_state(State::InsideCData, next_event)
},
_ => Some(self.error(SyntaxError::UnexpectedToken(t)))
}
}
}
}
/// Handles a token seen at the very start of the document, while nothing at or past the
/// XML declaration has been recorded in `self.encountered`.
///
/// Every accepted token moves the parser into the state that matches it; any token not
/// listed below produces `SyntaxError::UnexpectedToken`.
pub fn document_start(&mut self, t: Token) -> Option<Result> {
// This handler is only meant to run before a declaration has been encountered.
debug_assert!(self.encountered < Encountered::Declaration);
match t {
Token::Character(c) => {
// Record that character data was seen before checking whether it is acceptable.
let next_event = self.set_encountered(Encountered::AnyChars);
// Only whitespace is tolerated here; anything else is an error.
if !is_whitespace_char(c) {
return Some(self.error(SyntaxError::UnexpectedTokenOutsideRoot(t)));
}
self.inside_whitespace = true;
// skip whitespace outside of the root element
if (self.config.c.trim_whitespace && self.buf.is_empty()) ||
(self.depth() == 0 && self.config.c.ignore_root_level_whitespace) {
return self.into_state(State::OutsideTag, next_event);
}
// The whitespace is kept: remember where it starts and buffer it.
self.push_pos();
self.buf.push(c);
self.into_state(State::OutsideTag, next_event)
},
Token::CommentStart => {
let next_event = self.set_encountered(Encountered::Comment);
self.into_state(State::InsideComment, next_event)
}
Token::OpeningTagStart => {
let next_event = self.set_encountered(Encountered::Element);
// Open a fresh (empty) namespace scope for the element that starts here.
self.nst.push_empty();
self.into_state(State::InsideOpeningTag(OpeningTagSubstate::InsideName), next_event)
},
Token::DoctypeStart => {
let next_event = self.set_encountered(Encountered::Doctype);
// We don't have a doctype event so skip this position
// FIXME: update when we have a doctype event
self.next_pos();
self.into_state(State::InsideDoctype(DoctypeSubstate::Outside), next_event)
},
Token::ProcessingInstructionStart => {
// NOTE(review): unlike the other arms, this one does not call `set_encountered`;
// presumably that happens once the instruction's name is known - confirm.
self.push_pos();
self.into_state_continue(State::InsideProcessingInstruction(ProcessingInstructionSubstate::PIInsideName))
},
_ => Some(self.error(SyntaxError::UnexpectedToken(t))),
}
}
}

305
third-party/vendor/xml-rs/src/util.rs vendored Normal file
View file

@ -0,0 +1,305 @@
use std::fmt;
use std::io::{self, Read};
use std::str::{self, FromStr};
/// Error produced while decoding a single character from a byte stream.
#[derive(Debug)]
pub enum CharReadError {
/// The stream ended in the middle of a character (some of its bytes were already read).
UnexpectedEof,
/// The bytes read are not valid UTF-8.
Utf8(str::Utf8Error),
/// The underlying reader failed, or the bytes are invalid for the selected
/// non-UTF-8 encoding (reported with `io::ErrorKind::InvalidData`).
Io(io::Error),
}
impl From<str::Utf8Error> for CharReadError {
#[cold]
fn from(e: str::Utf8Error) -> CharReadError {
CharReadError::Utf8(e)
}
}
impl From<io::Error> for CharReadError {
#[cold]
fn from(e: io::Error) -> CharReadError {
CharReadError::Io(e)
}
}
impl fmt::Display for CharReadError {
#[cold]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use self::CharReadError::{Io, UnexpectedEof, Utf8};
match *self {
UnexpectedEof => write!(f, "unexpected end of stream"),
Utf8(ref e) => write!(f, "UTF-8 decoding error: {e}"),
Io(ref e) => write!(f, "I/O error: {e}"),
}
}
}
/// Character encoding used for parsing.
///
/// `Unknown` and `Utf16` are transient: `CharReader` replaces them with a concrete
/// encoding once it has looked at the first bytes of the input.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
#[non_exhaustive]
pub enum Encoding {
/// Explicitly UTF-8 only
Utf8,
/// UTF-8 fallback, but can be any 8-bit encoding
/// (selected by the reader when no byte-order mark is found)
Default,
/// ISO-8859-1
Latin1,
/// US-ASCII
Ascii,
/// UTF-16, big-endian
Utf16Be,
/// UTF-16, little-endian
Utf16Le,
/// UTF-16 of not yet known endianness; it will be sniffed from the first bytes
Utf16,
/// Not determined yet, may be sniffed to be anything
Unknown,
}
// Rustc inlines eq_ignore_ascii_case and creates kilobytes of code!
/// ASCII case-insensitive comparison of an already lower-case `lower` with `varcase`.
///
/// The two strings must have the same length: comparing only the zipped bytes would make
/// every prefix of a label (including the empty string) and every longer name that merely
/// starts with a label compare equal.
#[inline(never)]
fn icmp(lower: &str, varcase: &str) -> bool {
    lower.len() == varcase.len()
        && lower.bytes().zip(varcase.bytes()).all(|(l, v)| l == v.to_ascii_lowercase())
}

impl FromStr for Encoding {
    type Err = &'static str;

    /// Parses an encoding label, as found in an XML declaration, ignoring ASCII case.
    ///
    /// Only complete labels are recognized. `utf-16le` and `utf-16be` are listed explicitly
    /// and map to [`Encoding::Utf16`], the value they produced before matching became exact
    /// (when they were accepted as an accidental prefix match of `utf-16`).
    ///
    /// # Errors
    ///
    /// Returns `"unknown encoding name"` for every other label, including the empty string.
    fn from_str(val: &str) -> Result<Self, Self::Err> {
        // Lower-case labels in priority order.
        const LABELS: &[(&str, Encoding)] = &[
            ("utf-8", Encoding::Utf8),
            ("utf8", Encoding::Utf8),
            ("iso-8859-1", Encoding::Latin1),
            ("latin1", Encoding::Latin1),
            ("utf-16", Encoding::Utf16),
            ("utf16", Encoding::Utf16),
            ("utf-16le", Encoding::Utf16),
            ("utf-16be", Encoding::Utf16),
            ("ascii", Encoding::Ascii),
            ("us-ascii", Encoding::Ascii),
        ];

        LABELS
            .iter()
            .find(|&&(label, _)| icmp(label, val))
            .map(|&(_, encoding)| encoding)
            .ok_or("unknown encoding name")
    }
}
impl fmt::Display for Encoding {
#[cold]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(match self {
Encoding::Utf8 => "UTF-8",
Encoding::Default => "UTF-8",
Encoding::Latin1 => "ISO-8859-1",
Encoding::Ascii => "US-ASCII",
Encoding::Utf16Be => "UTF-16",
Encoding::Utf16Le => "UTF-16",
Encoding::Utf16 => "UTF-16",
Encoding::Unknown => "(unknown)",
})
}
}
pub(crate) struct CharReader {
pub encoding: Encoding,
}
impl CharReader {
pub fn new() -> Self {
Self {
encoding: Encoding::Unknown,
}
}
pub fn next_char_from<R: Read>(&mut self, source: &mut R) -> Result<Option<char>, CharReadError> {
let mut bytes = source.bytes();
const MAX_CODEPOINT_LEN: usize = 4;
let mut buf = [0u8; MAX_CODEPOINT_LEN];
let mut pos = 0;
loop {
let next = match bytes.next() {
Some(Ok(b)) => b,
Some(Err(e)) => return Err(e.into()),
None if pos == 0 => return Ok(None),
None => return Err(CharReadError::UnexpectedEof),
};
match self.encoding {
Encoding::Utf8 | Encoding::Default => {
// fast path for ASCII subset
if pos == 0 && next.is_ascii() {
return Ok(Some(next.into()));
}
buf[pos] = next;
pos += 1;
match str::from_utf8(&buf[..pos]) {
Ok(s) => return Ok(s.chars().next()), // always Some(..)
Err(_) if pos < MAX_CODEPOINT_LEN => continue,
Err(e) => return Err(e.into()),
}
},
Encoding::Latin1 => {
return Ok(Some(next.into()));
},
Encoding::Ascii => {
if next.is_ascii() {
return Ok(Some(next.into()));
} else {
return Err(CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, "char is not ASCII")));
}
},
Encoding::Unknown | Encoding::Utf16 => {
buf[pos] = next;
pos += 1;
// sniff BOM
if pos <= 3 && buf[..pos] == [0xEF, 0xBB, 0xBF][..pos] {
if pos == 3 && self.encoding != Encoding::Utf16 {
pos = 0;
self.encoding = Encoding::Utf8;
}
} else if pos <= 2 && buf[..pos] == [0xFE, 0xFF][..pos] {
if pos == 2 {
pos = 0;
self.encoding = Encoding::Utf16Be;
}
} else if pos <= 2 && buf[..pos] == [0xFF, 0xFE][..pos] {
if pos == 2 {
pos = 0;
self.encoding = Encoding::Utf16Le;
}
} else if pos == 1 && self.encoding == Encoding::Utf16 {
// sniff ASCII char in UTF-16
self.encoding = if next == 0 { Encoding::Utf16Be } else { Encoding::Utf16Le };
} else {
// UTF-8 is the default, but XML decl can change it to other 8-bit encoding
self.encoding = Encoding::Default;
if pos == 1 && next.is_ascii() {
return Ok(Some(next.into()));
}
}
},
Encoding::Utf16Be => {
buf[pos] = next;
pos += 1;
if pos == 2 {
if let Some(Ok(c)) = char::decode_utf16([u16::from_be_bytes(buf[..2].try_into().unwrap())]).next() {
return Ok(Some(c));
}
} else if pos == 4 { // surrogate
return char::decode_utf16([u16::from_be_bytes(buf[..2].try_into().unwrap()), u16::from_be_bytes(buf[2..4].try_into().unwrap())])
.next().transpose()
.map_err(|e| CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, e)));
}
},
Encoding::Utf16Le => {
buf[pos] = next;
pos += 1;
if pos == 2 {
if let Some(Ok(c)) = char::decode_utf16([u16::from_le_bytes(buf[..2].try_into().unwrap())]).next() {
return Ok(Some(c));
}
} else if pos == 4 { // surrogate
return char::decode_utf16([u16::from_le_bytes(buf[..2].try_into().unwrap()), u16::from_le_bytes(buf[2..4].try_into().unwrap())])
.next().transpose()
.map_err(|e| CharReadError::Io(io::Error::new(io::ErrorKind::InvalidData, e)));
}
},
}
}
}
}
#[cfg(test)]
mod tests {
use super::{CharReadError, CharReader, Encoding};
/// Exercises `CharReader::next_char_from` on the first character of various inputs:
/// BOM sniffing, forced encodings, truncated input, invalid input and reader failures.
#[test]
fn test_next_char_from() {
use std::io;
let mut bytes: &[u8] = "correct".as_bytes(); // correct ASCII
assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('c'));
let mut bytes: &[u8] = b"\xEF\xBB\xBF\xE2\x80\xA2!"; // UTF-8 BOM, then a 3-byte character
assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('•'));
let mut bytes: &[u8] = b"\xEF\xBB\xBFx123"; // UTF-8 BOM, then ASCII
assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('x'));
let mut bytes: &[u8] = b"\xEF\xBB\xBF"; // Nothing after BOM
assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None);
let mut bytes: &[u8] = b"\xEF\xBB"; // truncated BOM: only two of its three bytes
assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(CharReadError::UnexpectedEof)));
let mut bytes: &[u8] = b"\xEF\xBB\x42"; // BOM prefix followed by a byte that does not complete it
assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(_)));
let mut bytes: &[u8] = b"\xFE\xFF\x00\x42"; // UTF-16 big-endian BOM
assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('B'));
let mut bytes: &[u8] = b"\xFF\xFE\x42\x00"; // UTF-16 little-endian BOM
assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('B'));
let mut bytes: &[u8] = b"\xFF\xFE"; // UTF-16 BOM with nothing after it
assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None);
let mut bytes: &[u8] = b"\xFF\xFE\x00"; // UTF-16 BOM, then half a code unit
assert!(matches!(CharReader::new().next_char_from(&mut bytes), Err(CharReadError::UnexpectedEof)));
let mut bytes: &[u8] = "правильно".as_bytes(); // correct BMP
assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('п'));
// The same UTF-8 bytes read as UTF-16 decode to a different character.
let mut bytes: &[u8] = "правильно".as_bytes();
assert_eq!(CharReader { encoding: Encoding::Utf16Be }.next_char_from(&mut bytes).unwrap(), Some('킿'));
let mut bytes: &[u8] = "правильно".as_bytes();
assert_eq!(CharReader { encoding: Encoding::Utf16Le }.next_char_from(&mut bytes).unwrap(), Some('뿐'));
// UTF-16 of unknown endianness: a surrogate with no partner before the input ends
let mut bytes: &[u8] = b"\xD8\xD8\x80";
assert!(matches!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes), Err(_)));
// UTF-16 of unknown endianness: endianness is taken from the first byte
let mut bytes: &[u8] = b"\x00\x42";
assert_eq!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).unwrap(), Some('B'));
let mut bytes: &[u8] = b"\x42\x00";
assert_eq!(CharReader { encoding: Encoding::Utf16 }.next_char_from(&mut bytes).unwrap(), Some('B'));
let mut bytes: &[u8] = b"\x00"; // half a UTF-16 code unit
assert!(matches!(CharReader { encoding: Encoding::Utf16Be }.next_char_from(&mut bytes), Err(_)));
let mut bytes: &[u8] = "😊".as_bytes(); // correct non-BMP
assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), Some('😊'));
let mut bytes: &[u8] = b""; // empty
assert_eq!(CharReader::new().next_char_from(&mut bytes).unwrap(), None);
let mut bytes: &[u8] = b"\xf0\x9f\x98"; // incomplete code point
match CharReader::new().next_char_from(&mut bytes).unwrap_err() {
super::CharReadError::UnexpectedEof => {},
e => panic!("Unexpected result: {e:?}")
};
let mut bytes: &[u8] = b"\xff\x9f\x98\x32"; // invalid code point
match CharReader::new().next_char_from(&mut bytes).unwrap_err() {
super::CharReadError::Utf8(_) => {},
e => panic!("Unexpected result: {e:?}")
};
// error during read
struct ErrorReader;
impl io::Read for ErrorReader {
fn read(&mut self, _: &mut [u8]) -> io::Result<usize> {
Err(io::Error::new(io::ErrorKind::Other, "test error"))
}
}
let mut r = ErrorReader;
match CharReader::new().next_char_from(&mut r).unwrap_err() {
super::CharReadError::Io(ref e) if e.kind() == io::ErrorKind::Other &&
e.to_string().contains("test error") => {},
e => panic!("Unexpected result: {e:?}")
}
}
}

90
third-party/vendor/xml-rs/src/writer.rs vendored Normal file
View file

@ -0,0 +1,90 @@
//! Contains high-level interface for an events-based XML emitter.
//!
//! The most important type in this module is `EventWriter` which allows writing an XML document
//! to some output stream.
pub use self::config::EmitterConfig;
pub use self::emitter::EmitterError as Error;
pub use self::emitter::Result;
pub use self::events::XmlEvent;
use self::emitter::Emitter;
use std::io::prelude::*;
mod config;
mod emitter;
pub mod events;
/// Event-driven XML writer: owns an `std::io::Write` sink and serializes the events it
/// receives into it.
pub struct EventWriter<W> {
    sink: W,
    emitter: Emitter,
}

impl<W: Write> EventWriter<W> {
    /// Wraps `sink` in an `EventWriter` that uses the default configuration.
    #[inline]
    pub fn new(sink: W) -> Self {
        Self::new_with_config(sink, EmitterConfig::new())
    }

    /// Wraps `sink` in an `EventWriter` that uses the given configuration.
    #[inline]
    pub fn new_with_config(sink: W, config: EmitterConfig) -> Self {
        let emitter = Emitter::new(config);
        Self { sink, emitter }
    }

    /// Serializes one event into the underlying sink.
    ///
    /// The bytes written do not always mirror the event one-to-one, because configuration
    /// options influence the output: an `XmlEvent::EndElement` may produce a closing tag or
    /// turn the preceding start tag into an empty element, and `XmlEvent::CData` may be
    /// written as plain characters.
    pub fn write<'a, E>(&mut self, event: E) -> Result<()>
    where
        E: Into<XmlEvent<'a>>,
    {
        let Self { sink, emitter } = self;
        match event.into() {
            XmlEvent::StartDocument { version, encoding, standalone } => {
                let encoding = encoding.unwrap_or("UTF-8");
                emitter.emit_start_document(sink, version, encoding, standalone)
            }
            XmlEvent::ProcessingInstruction { name, data } => {
                emitter.emit_processing_instruction(sink, name, data)
            }
            XmlEvent::StartElement { name, attributes, namespace } => {
                // Open a namespace scope for the element before its start tag is written.
                emitter.namespace_stack_mut().push_empty().checked_target().extend(namespace.as_ref());
                emitter.emit_start_element(sink, name, &attributes)
            }
            XmlEvent::EndElement { name } => {
                // The namespace scope is closed whether or not writing the tag succeeded.
                let outcome = emitter.emit_end_element(sink, name);
                emitter.namespace_stack_mut().try_pop();
                outcome
            }
            XmlEvent::Comment(text) => emitter.emit_comment(sink, text),
            XmlEvent::CData(text) => emitter.emit_cdata(sink, text),
            XmlEvent::Characters(text) => emitter.emit_characters(sink, text),
        }
    }

    /// Gives mutable access to the underlying sink.
    ///
    /// Writing to the sink directly makes it very easy to produce an invalid document, so
    /// use this with care. It is meant for operations such as `Write::flush`, which change
    /// the state of the stream without emitting data.
    pub fn inner_mut(&mut self) -> &mut W {
        &mut self.sink
    }

    /// Consumes the writer and returns the underlying sink.
    ///
    /// This is destructive: wrapping the sink again with `EventWriter::new()` creates a
    /// fresh writer with blank state; for example, accumulated namespaces are reset.
    pub fn into_inner(self) -> W {
        let Self { sink, .. } = self;
        sink
    }
}

View file

@ -0,0 +1,157 @@
//! Contains emitter configuration structure.
use std::borrow::Cow;
use std::io::Write;
use crate::writer::EventWriter;
/// Emitter configuration structure.
///
/// This structure contains various options which control XML document emitter behavior.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct EmitterConfig {
/// Line separator used to separate lines in formatted output. Default is `"\n"`.
pub line_separator: Cow<'static, str>,
/// A string which will be used for a single level of indentation. Default is `"  "`
/// (two spaces).
pub indent_string: Cow<'static, str>,
/// Whether or not the emitted document should be indented. Default is false.
///
/// The emitter is capable of performing automatic indentation of the emitted XML document.
/// It is done in stream-like fashion and does not require the knowledge of the whole
/// document in advance.
///
/// Sometimes, however, automatic indentation is undesirable, e.g. when you want to keep
/// existing layout when processing an existing XML document. Also the indentation algorithm
/// is not thoroughly tested. Hence by default it is disabled.
pub perform_indent: bool,
/// Whether or not characters in output events will be escaped. Default is true.
///
/// The emitter can automatically escape characters which can't appear in PCDATA sections
/// or element attributes of an XML document, like `<` or `"` (in attributes). This may
/// introduce some overhead because then every corresponding piece of character data
/// should be scanned for invalid characters.
///
/// If this option is disabled, the XML writer may produce non-well-formed documents, so
/// use `false` value for this option with care.
pub perform_escaping: bool,
/// Whether or not to write XML document declaration at the beginning of a document.
/// Default is true.
///
/// This option controls whether the document declaration should be emitted automatically
/// before a root element is written if it was not emitted explicitly by the user.
pub write_document_declaration: bool,
/// Whether or not to convert elements with empty content to empty elements. Default is true.
///
/// This option allows turning elements like `<a></a>` (an element with empty content)
/// into `<a />` (an empty element).
pub normalize_empty_elements: bool,
/// Whether or not to emit CDATA events as plain characters. Default is false.
///
/// This option forces the emitter to convert CDATA events into regular character events,
/// performing all the necessary escaping beforehand. This may be occasionally useful
/// for feeding the document into incorrect parsers which do not support CDATA.
pub cdata_to_characters: bool,
/// Whether or not to keep element names to support `EndElement` events without explicit names.
/// Default is true.
///
/// This option makes the emitter keep the names of written elements in order to allow
/// omitting names when writing closing element tags. This could incur some memory overhead.
pub keep_element_names_stack: bool,
/// Whether or not to automatically insert leading and trailing spaces in emitted comments,
/// if necessary. Default is true.
///
/// This is a convenience option so that the user does not have to add spaces before and
/// after the comment text to get prettier comments: `<!-- something -->` instead of
/// `<!--something-->`.
pub autopad_comments: bool,
/// Whether or not to automatically insert spaces before the trailing `/>` in self-closing
/// elements. Default is true.
///
/// This option is only meaningful if `normalize_empty_elements` is true. For example, the
/// element `<a></a>` would be unaffected. When `normalize_empty_elements` is true, then when
/// this option is also true, the same element would appear `<a />`. If this option is false,
/// then the same element would appear `<a/>`.
pub pad_self_closing: bool,
}
impl EmitterConfig {
    /// Creates an emitter configuration with default values.
    ///
    /// The defaults are: `"\n"` line separator, two-space indentation string, no automatic
    /// indentation, escaping enabled, automatic document declaration, empty-element
    /// normalization, CDATA kept as CDATA, element names tracked, comments padded and
    /// self-closing tags padded.
    ///
    /// You can tweak default options with builder-like pattern:
    ///
    /// ```rust
    /// use xml::writer::EmitterConfig;
    ///
    /// let config = EmitterConfig::new()
    ///     .line_separator("\r\n")
    ///     .perform_indent(true)
    ///     .normalize_empty_elements(false);
    /// ```
    #[inline]
    #[must_use]
    pub fn new() -> EmitterConfig {
        EmitterConfig {
            line_separator: "\n".into(),
            // Two spaces, as the documentation of `indent_string` promises.
            indent_string: "  ".into(),
            perform_indent: false,
            perform_escaping: true,
            write_document_declaration: true,
            normalize_empty_elements: true,
            cdata_to_characters: false,
            keep_element_names_stack: true,
            autopad_comments: true,
            pad_self_closing: true,
        }
    }

    /// Creates an XML writer with this configuration.
    ///
    /// This is a convenience method for configuring and creating a writer at the same time:
    ///
    /// ```rust
    /// use xml::writer::EmitterConfig;
    ///
    /// let mut target: Vec<u8> = Vec::new();
    ///
    /// let writer = EmitterConfig::new()
    ///     .line_separator("\r\n")
    ///     .perform_indent(true)
    ///     .normalize_empty_elements(false)
    ///     .create_writer(&mut target);
    /// ```
    ///
    /// This method is exactly equivalent to calling `EventWriter::new_with_config()` with
    /// this configuration object.
    #[inline]
    pub fn create_writer<W: Write>(self, sink: W) -> EventWriter<W> {
        EventWriter::new_with_config(sink, self)
    }
}
impl Default for EmitterConfig {
#[inline]
fn default() -> EmitterConfig {
EmitterConfig::new()
}
}
// Builder-style setters, one per public option of `EmitterConfig`. `perform_escaping` is
// listed as well, so that it can be changed in a builder chain like every other option
// instead of only through direct field access.
gen_setters!(EmitterConfig,
    line_separator: into Cow<'static, str>,
    indent_string: into Cow<'static, str>,
    perform_indent: val bool,
    perform_escaping: val bool,
    write_document_declaration: val bool,
    normalize_empty_elements: val bool,
    cdata_to_characters: val bool,
    keep_element_names_stack: val bool,
    autopad_comments: val bool,
    pad_self_closing: val bool
);

View file

@ -0,0 +1,437 @@
use std::error::Error;
use std::fmt;
use std::io;
use std::io::prelude::*;
use std::result;
use crate::attribute::Attribute;
use crate::common;
use crate::common::XmlVersion;
use crate::escape::{AttributeEscapes, Escaped, PcDataEscapes};
use crate::name::{Name, OwnedName};
use crate::namespace::{NamespaceStack, NS_EMPTY_URI, NS_NO_PREFIX, NS_XMLNS_PREFIX, NS_XML_PREFIX};
use crate::writer::config::EmitterConfig;
/// An error which may be returned by `EventWriter` when writing XML events.
#[derive(Debug)]
pub enum EmitterError {
/// An I/O error occurred in the underlying `Write` instance.
Io(io::Error),
/// Document declaration has already been written to the output stream.
DocumentStartAlreadyEmitted,
/// The name of the last opening element is not available.
///
/// Returned when element names are tracked but no element is currently open.
LastElementNameNotAvailable,
/// The name of the last opening element is not equal to the name of the provided
/// closing element.
EndElementNameIsNotEqualToLastStartElementName,
/// End element name is not specified when it is needed, for example, when automatic
/// closing is not enabled in configuration.
EndElementNameIsNotSpecified,
}
impl From<io::Error> for EmitterError {
#[cold]
fn from(err: io::Error) -> EmitterError {
EmitterError::Io(err)
}
}
impl fmt::Display for EmitterError {
#[cold]
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str("emitter error: ")?;
match self {
EmitterError::Io(e) => write!(f, "I/O error: {e}"),
EmitterError::DocumentStartAlreadyEmitted => f.write_str("document start event has already been emitted"),
EmitterError::LastElementNameNotAvailable => f.write_str("last element name is not available"),
EmitterError::EndElementNameIsNotEqualToLastStartElementName => f.write_str("end element name is not equal to last start element name"),
EmitterError::EndElementNameIsNotSpecified => f.write_str("end element name is not specified and can't be inferred"),
}
}
}
// Only the default methods of `Error` are used: no `source()` is provided, the wrapped
// I/O error is already included in the `Display` output.
impl Error for EmitterError {
}
/// A result type yielded by the XML writer; the error type defaults to `EmitterError`.
pub type Result<T, E = EmitterError> = result::Result<T, E>;
// TODO: split into a low-level fast writer without any checks and formatting logic and a
// high-level indenting validating writer
/// Stateful XML serializer: formats individual pieces of a document into a `Write` target
/// while tracking indentation, open elements and namespaces.
pub struct Emitter {
// Options controlling formatting and escaping.
config: EmitterConfig,
// Namespace scopes; exposed to the owner through `namespace_stack_mut`.
nst: NamespaceStack,
// Current nesting depth: incremented after a start tag, decremented after an end tag.
indent_level: usize,
// What was last written at each nesting level; starts with a single entry.
indent_stack: Vec<IndentFlags>,
// Names of the currently open elements; filled only if `keep_element_names_stack` is set.
element_names: Vec<OwnedName>,
// Set once the document declaration has been written (explicitly or automatically).
start_document_emitted: bool,
// True right after a start tag; with `normalize_empty_elements` the tag is then still
// missing its closing `>`.
just_wrote_start_element: bool,
}
impl Emitter {
    /// Creates an emitter that has written nothing yet and uses the given configuration.
    pub fn new(config: EmitterConfig) -> Emitter {
        // The top level of the document gets its own flag entry.
        let mut flags = Vec::with_capacity(16);
        flags.push(IndentFlags::WroteNothing);

        Self {
            config,
            nst: NamespaceStack::empty(),
            element_names: Vec::new(),
            indent_stack: flags,
            indent_level: 0,
            start_document_emitted: false,
            just_wrote_start_element: false,
        }
    }
}
/// What the emitter wrote last at one nesting level; used to decide where line breaks
/// and indentation may be inserted.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
enum IndentFlags {
/// Nothing has been written at this level yet.
WroteNothing,
/// The last thing written at this level was markup.
WroteMarkup,
/// The last thing written at this level was text.
WroteText,
}
impl Emitter {
/// Returns a mutable reference to the emitter's namespace stack, so that the caller can
/// open and close namespace scopes around elements.
#[inline]
pub fn namespace_stack_mut(&mut self) -> &mut NamespaceStack {
&mut self.nst
}
/// Tells whether the last thing written at the current nesting level was text.
#[inline]
fn wrote_text(&self) -> bool {
    matches!(self.indent_stack.last(), Some(IndentFlags::WroteText))
}
/// Tells whether the last thing written at the current nesting level was markup.
#[inline]
fn wrote_markup(&self) -> bool {
    matches!(self.indent_stack.last(), Some(IndentFlags::WroteMarkup))
}
/// Records that text was written at the current nesting level (no-op if there is none).
#[inline]
fn set_wrote_text(&mut self) {
    match self.indent_stack.last_mut() {
        Some(current) => *current = IndentFlags::WroteText,
        None => {}
    }
}
/// Records that markup was written at the current nesting level (no-op if there is none).
#[inline]
fn set_wrote_markup(&mut self) {
    match self.indent_stack.last_mut() {
        Some(current) => *current = IndentFlags::WroteMarkup,
        None => {}
    }
}
/// Writes the configured line separator followed by `level` copies of the indent string.
fn write_newline<W: Write>(&mut self, target: &mut W, level: usize) -> Result<()> {
    target.write_all(self.config.line_separator.as_bytes())?;
    let indent = self.config.indent_string.as_bytes();
    (0..level).try_for_each(|_| target.write_all(indent))?;
    Ok(())
}
/// Starts a new indented line before a piece of markup, when indentation is enabled and
/// the current level did not just receive text.
fn before_markup<W: Write>(&mut self, target: &mut W) -> Result<()> {
    if !self.config.perform_indent || self.wrote_text() {
        return Ok(());
    }
    // At the top level a line break is only needed after some earlier markup.
    if self.indent_level == 0 && !self.wrote_markup() {
        return Ok(());
    }

    let level = self.indent_level;
    self.write_newline(target, level)?;
    if level > 0 && !self.config.indent_string.is_empty() {
        self.after_markup();
    }
    Ok(())
}
fn after_markup(&mut self) {
self.set_wrote_markup();
}
/// Prepares for a start tag: performs the usual pre-markup indentation and, if that
/// succeeded, opens a fresh flag entry for the content of the new element.
fn before_start_element<W: Write>(&mut self, target: &mut W) -> Result<()> {
    let prepared = self.before_markup(target);
    if prepared.is_ok() {
        self.indent_stack.push(IndentFlags::WroteNothing);
    }
    prepared
}
/// Bookkeeping after a start tag: one level deeper, and markup was written last.
fn after_start_element(&mut self) {
    self.indent_level += 1;
    self.after_markup();
}
/// Puts a closing tag on its own line, indented one level less than the element content,
/// when indentation is enabled and the element content ended with markup.
fn before_end_element<W: Write>(&mut self, target: &mut W) -> Result<()> {
    let on_own_line = self.config.perform_indent
        && self.indent_level > 0
        && self.wrote_markup()
        && !self.wrote_text();
    if !on_own_line {
        return Ok(());
    }
    let parent_level = self.indent_level - 1;
    self.write_newline(target, parent_level)
}
/// Bookkeeping after an end tag: leaves the element's nesting level (if any is open) and
/// records markup as the last thing written at the level that is now current.
fn after_end_element(&mut self) {
    if let Some(parent_level) = self.indent_level.checked_sub(1) {
        self.indent_level = parent_level;
        self.indent_stack.pop();
    }
    self.set_wrote_markup();
}
fn after_text(&mut self) {
self.set_wrote_text();
}
pub fn emit_start_document<W: Write>(&mut self, target: &mut W,
version: XmlVersion,
encoding: &str,
standalone: Option<bool>) -> Result<()> {
if self.start_document_emitted {
return Err(EmitterError::DocumentStartAlreadyEmitted);
}
self.start_document_emitted = true;
self.before_markup(target)?;
let result = {
let mut write = move || {
write!(target, "<?xml version=\"{version}\" encoding=\"{encoding}\"")?;
if let Some(standalone) = standalone {
write!(target, " standalone=\"{}\"", if standalone { "yes" } else { "no" })?;
}
write!(target, "?>")?;
Ok(())
};
write()
};
self.after_markup();
result
}
/// Writes a default declaration (XML 1.0, `utf-8`, no `standalone`) if none has been
/// written yet and the configuration asks for one; otherwise does nothing.
fn check_document_started<W: Write>(&mut self, target: &mut W) -> Result<()> {
    if self.start_document_emitted || !self.config.write_document_declaration {
        return Ok(());
    }
    self.emit_start_document(target, common::XmlVersion::Version10, "utf-8", None)
}
/// Closes a pending start tag with `>`.
///
/// With `normalize_empty_elements` a start tag is left open until it is known whether the
/// element has content; this is called before any content is written.
fn fix_non_empty_element<W: Write>(&mut self, target: &mut W) -> Result<()> {
    let tag_is_pending = self.config.normalize_empty_elements && self.just_wrote_start_element;
    if !tag_is_pending {
        return Ok(());
    }
    self.just_wrote_start_element = false;
    target.write_all(b">")?;
    Ok(())
}
pub fn emit_processing_instruction<W: Write>(&mut self,
target: &mut W,
name: &str,
data: Option<&str>) -> Result<()> {
self.check_document_started(target)?;
self.fix_non_empty_element(target)?;
self.before_markup(target)?;
let result = {
let mut write = move || {
write!(target, "<?{name}")?;
if let Some(data) = data {
write!(target, " {data}")?;
}
write!(target, "?>")?;
Ok(())
};
write()
};
self.after_markup();
result
}
/// Writes a start tag up to, but not including, its closing `>`: the element name, the
/// namespace declarations of the current scope and the attributes.
fn emit_start_element_initial<W: Write>(&mut self, target: &mut W,
                                        name: Name<'_>,
                                        attributes: &[Attribute<'_>]) -> Result<()> {
    self.check_document_started(target)?;
    self.fix_non_empty_element(target)?;
    self.before_start_element(target)?;

    let tag_name = name.repr_display();
    write!(target, "<{tag_name}")?;
    self.emit_current_namespace_attributes(target)?;
    self.emit_attributes(target, attributes)?;

    self.after_start_element();
    Ok(())
}
/// Writes the start tag of an element.
///
/// The name is remembered when `keep_element_names_stack` is set. With
/// `normalize_empty_elements` the closing `>` is not written here: it is deferred until
/// it is known whether the element has content.
pub fn emit_start_element<W: Write>(&mut self, target: &mut W,
                                    name: Name<'_>,
                                    attributes: &[Attribute<'_>]) -> Result<()> {
    let track_names = self.config.keep_element_names_stack;
    if track_names {
        self.element_names.push(name.to_owned());
    }

    self.emit_start_element_initial(target, name, attributes)?;
    self.just_wrote_start_element = true;

    let close_tag_now = !self.config.normalize_empty_elements;
    if close_tag_now {
        write!(target, ">")?;
    }
    Ok(())
}
pub fn emit_current_namespace_attributes<W>(&mut self, target: &mut W) -> Result<()>
where W: Write
{
for (prefix, uri) in self.nst.peek() {
match prefix {
// internal namespaces are not emitted
NS_XMLNS_PREFIX | NS_XML_PREFIX => Ok(()),
//// there is already a namespace binding with this prefix in scope
//prefix if self.nst.get(prefix) == Some(uri) => Ok(()),
// emit xmlns only if it is overridden
NS_NO_PREFIX => if uri != NS_EMPTY_URI {
write!(target, " xmlns=\"{uri}\"")
} else { Ok(()) },
// everything else
prefix => write!(target, " xmlns:{prefix}=\"{uri}\"")
}?;
}
Ok(())
}
/// Writes each attribute as ` name="value"`, escaping the value when `perform_escaping`
/// is enabled.
pub fn emit_attributes<W: Write>(&mut self, target: &mut W,
                                 attributes: &[Attribute<'_>]) -> Result<()> {
    let escape_values = self.config.perform_escaping;
    for attribute in attributes {
        write!(target, " {}=\"", attribute.name.repr_display())?;
        let value = attribute.value;
        if escape_values {
            write!(target, "{}", Escaped::<AttributeEscapes>::new(value))?;
        } else {
            write!(target, "{}", value)?;
        }
        write!(target, "\"")?;
    }
    Ok(())
}
/// Ends the current element.
///
/// If the start tag is still pending (see `normalize_empty_elements`) the element is
/// written as an empty element (` />` or `/>`), otherwise a closing tag is written. The
/// name comes from the tracked element names when tracking is enabled, else from `name`.
///
/// # Errors
///
/// * `EmitterError::LastElementNameNotAvailable` if names are tracked but none is left;
/// * `EmitterError::EndElementNameIsNotEqualToLastStartElementName` if `name` is given
///   and differs from the tracked name;
/// * `EmitterError::EndElementNameIsNotSpecified` if no name is available at all;
/// * `EmitterError::Io` if writing to `target` fails.
pub fn emit_end_element<W: Write>(&mut self, target: &mut W,
                                  name: Option<Name<'_>>) -> Result<()> {
    let tracked_name = if self.config.keep_element_names_stack {
        match self.element_names.pop() {
            Some(last) => Some(last),
            None => return Err(EmitterError::LastElementNameNotAvailable),
        }
    } else {
        None
    };

    // Check that last started element name equals to the provided name, if there are both
    if let (Some(expected), Some(given)) = (&tracked_name, &name) {
        if expected.borrow() != *given {
            return Err(EmitterError::EndElementNameIsNotEqualToLastStartElementName);
        }
    }

    // The tracked name takes precedence over the one passed in.
    let closing_name = match tracked_name.as_ref().map(|n| n.borrow()).or(name) {
        Some(resolved) => resolved,
        None => return Err(EmitterError::EndElementNameIsNotSpecified),
    };

    let start_tag_pending = self.config.normalize_empty_elements && self.just_wrote_start_element;
    self.just_wrote_start_element = false;

    let written = if start_tag_pending {
        let termination = if self.config.pad_self_closing { " />" } else { "/>" };
        target.write_all(termination.as_bytes())
    } else {
        self.before_end_element(target)?;
        write!(target, "</{}>", closing_name.repr_display())
    };
    self.after_end_element();
    written.map_err(EmitterError::from)
}
/// Writes `content` as a CDATA section, or as ordinary (escaped) characters when
/// `cdata_to_characters` is enabled.
///
/// A CDATA section cannot contain the sequence `]]>`, since that is its terminator. When
/// `content` contains it, the data is split across adjacent CDATA sections between `]]`
/// and `>`, so that the document stays well-formed and a reader gets the original text
/// back by concatenating the sections.
pub fn emit_cdata<W: Write>(&mut self, target: &mut W, content: &str) -> Result<()> {
    self.fix_non_empty_element(target)?;
    if self.config.cdata_to_characters {
        return self.emit_characters(target, content);
    }

    target.write_all(b"<![CDATA[")?;
    let mut rest = content;
    while let Some(terminator) = rest.find("]]>") {
        // Keep `]]` in this section and start the next one with `>`.
        let split = terminator + 2;
        target.write_all(rest[..split].as_bytes())?;
        target.write_all(b"]]><![CDATA[")?;
        rest = &rest[split..];
    }
    target.write_all(rest.as_bytes())?;
    target.write_all(b"]]>")?;

    self.after_text();
    Ok(())
}
/// Writes character data, escaped when `perform_escaping` is enabled.
///
/// A default document declaration is written first if needed, and a pending start tag
/// is closed.
pub fn emit_characters<W: Write>(&mut self, target: &mut W, content: &str) -> Result<()> {
    self.check_document_started(target)?;
    self.fix_non_empty_element(target)?;

    let written = if self.config.perform_escaping {
        write!(target, "{}", Escaped::<PcDataEscapes>::new(content))
    } else {
        target.write_all(content.as_bytes())
    };
    written?;

    self.after_text();
    Ok(())
}
pub fn emit_comment<W: Write>(&mut self, target: &mut W, content: &str) -> Result<()> {
self.fix_non_empty_element(target)?;
// TODO: add escaping dashes at the end of the comment
let autopad_comments = self.config.autopad_comments;
let write = move |target: &mut W| -> Result<()> {
target.write_all(b"<!--")?;
if autopad_comments && !content.starts_with(char::is_whitespace) {
target.write_all(b" ")?;
}
target.write_all(content.as_bytes())?;
if autopad_comments && !content.ends_with(char::is_whitespace) {
target.write_all(b" ")?;
}
target.write_all(b"-->")?;
Ok(())
};
self.before_markup(target)?;
let result = write(target);
self.after_markup();
result
}
}

View file

@ -0,0 +1,256 @@
//! Contains `XmlEvent` datatype, instances of which are consumed by the writer.
use std::borrow::Cow;
use crate::attribute::Attribute;
use crate::common::XmlVersion;
use crate::name::Name;
use crate::namespace::{Namespace, NS_NO_PREFIX};
/// A part of an XML output stream.
///
/// Objects of this enum are consumed by `EventWriter`. They correspond to different parts of
/// an XML document.
#[derive(Debug, Clone)]
pub enum XmlEvent<'a> {
/// Corresponds to XML document declaration.
///
/// This event should always be written before any other event. If it is not written
/// at all, a default XML declaration will be output if the corresponding option
/// is set in the configuration. Otherwise the document is written without a declaration.
StartDocument {
/// XML version.
///
/// Defaults to `XmlVersion::Version10`.
version: XmlVersion,
/// XML document encoding.
///
/// Defaults to `Some("UTF-8")`.
encoding: Option<&'a str>,
/// XML standalone declaration.
///
/// Defaults to `None`.
standalone: Option<bool>,
},
/// Denotes an XML processing instruction.
ProcessingInstruction {
/// Processing instruction target.
name: &'a str,
/// Processing instruction content.
data: Option<&'a str>,
},
/// Denotes a beginning of an XML element.
StartElement {
/// Qualified name of the element.
name: Name<'a>,
/// A list of attributes associated with the element.
///
/// Currently attributes are not checked for duplicates (TODO). Attribute values
/// will be escaped, and all characters invalid for attribute values like `"` or `<`
/// will be changed into character entities.
attributes: Cow<'a, [Attribute<'a>]>,
/// Contents of the namespace mapping at this point of the document.
///
/// This mapping will be inspected for "new" entries, and if at this point of the document
/// a particular pair of prefix and namespace URI is already defined, no namespace
/// attributes will be emitted.
namespace: Cow<'a, Namespace>,
},
/// Denotes an end of an XML element.
EndElement {
/// Optional qualified name of the element.
///
/// If `None`, then it is assumed that the element name should be the last valid one.
/// If `Some` and element names tracking is enabled, then the writer will check it for
/// correctness.
name: Option<Name<'a>>,
},
/// Denotes CDATA content.
///
/// This event contains unparsed data, and no escaping will be performed when writing it
/// to the output stream, unless the writer is configured to emit CDATA as characters.
CData(&'a str),
/// Denotes a comment.
///
/// NOTE(review): the emitter currently writes the text verbatim (optionally padded with
/// spaces); sequences that are invalid inside a comment, such as `--`, are not rejected.
Comment(&'a str),
/// Denotes character data outside of tags.
///
/// Contents of this event will be escaped if `perform_escaping` option is enabled,
/// that is, every character invalid for PCDATA will appear as a character entity.
Characters(&'a str),
}
impl<'a> XmlEvent<'a> {
    /// Creates a writer event for a processing instruction.
    ///
    /// `name` is the instruction target and `data` its optional content.
    #[inline]
    #[must_use]
    pub fn processing_instruction(name: &'a str, data: Option<&'a str>) -> Self {
        Self::ProcessingInstruction { name, data }
    }

    /// Creates a builder for a starting element called `name`.
    ///
    /// The builder starts out with no attributes and an empty namespace mapping; use its
    /// methods to add attributes and namespace bindings that take effect at this element.
    #[inline]
    pub fn start_element<S>(name: S) -> StartElementBuilder<'a>
    where
        S: Into<Name<'a>>,
    {
        let name = name.into();
        StartElementBuilder {
            name,
            attributes: Vec::new(),
            namespace: Namespace::empty(),
        }
    }

    /// Creates a builder for a closing element.
    ///
    /// Unlike `start_element()`, this method takes no name, because by default the writer
    /// is able to determine it automatically. When that functionality is disabled, the name
    /// can be supplied through the builder's `name()` method.
    #[inline]
    #[must_use]
    pub fn end_element() -> EndElementBuilder<'a> {
        EndElementBuilder { name: None }
    }

    /// Creates a CDATA event.
    ///
    /// The provided string won't be escaped, except for the closing CDATA token `]]>`
    /// (depending on the configuration).
    #[inline]
    #[must_use]
    pub fn cdata(data: &'a str) -> Self {
        Self::CData(data)
    }

    /// Creates a regular characters (PCDATA) event.
    ///
    /// Offending symbols, in particular `&` and `<`, are escaped by the writer when the
    /// `perform_escaping` option is enabled.
    #[inline]
    #[must_use]
    pub fn characters(data: &'a str) -> Self {
        Self::Characters(data)
    }

    /// Creates a comment event carrying `data` as the comment text.
    #[inline]
    #[must_use]
    pub fn comment(data: &'a str) -> Self {
        Self::Comment(data)
    }
}
impl<'a> From<&'a str> for XmlEvent<'a> {
#[inline]
fn from(s: &'a str) -> XmlEvent<'a> {
XmlEvent::Characters(s)
}
}
/// A builder for a closing element event.
///
/// Obtained from `XmlEvent::end_element()`; convert it into an `XmlEvent::EndElement`
/// through the `From`/`Into` implementation.
pub struct EndElementBuilder<'a> {
    /// Explicitly requested element name; `None` leaves the choice to the writer.
    name: Option<Name<'a>>,
}

impl<'a> EndElementBuilder<'a> {
    /// Sets the name of this closing element.
    ///
    /// Usually the writer is able to determine closing element names automatically. If
    /// this functionality is enabled (by default it is), then this name is checked for
    /// correctness. It is possible, however, to disable such behavior; then the user must
    /// ensure manually that the closing element name is correct.
    ///
    /// Consumes the builder and returns it with the name set, so calls can be chained.
    #[inline]
    pub fn name<N>(mut self, name: N) -> EndElementBuilder<'a>
    where
        N: Into<Name<'a>>,
    {
        self.name = Some(name.into());
        self
    }
}
impl<'a> From<EndElementBuilder<'a>> for XmlEvent<'a> {
fn from(b: EndElementBuilder<'a>) -> XmlEvent<'a> {
XmlEvent::EndElement { name: b.name }
}
}
/// A builder for a starting element event.
///
/// Created by `XmlEvent::start_element()`. Attributes and namespace bindings are collected
/// through the builder's methods, and the result is converted into an
/// `XmlEvent::StartElement` through the `From`/`Into` implementation.
pub struct StartElementBuilder<'a> {
/// Qualified name of the element being started.
name: Name<'a>,
/// Attributes added via `attr()`, kept in the order they were added.
attributes: Vec<Attribute<'a>>,
/// Namespace mappings added via `ns()` and `default_ns()`.
namespace: Namespace,
}
impl<'a> StartElementBuilder<'a> {
    /// Adds an attribute with the given string value to this element.
    ///
    /// `name` is a qualified name; its namespace is ignored, but its prefix is checked for
    /// correctness, that is, it is checked that the prefix is bound to some namespace in
    /// the current context.
    ///
    /// Attributes are currently not checked for duplicates. Note that duplicate attributes
    /// violate XML document well-formedness.
    ///
    /// The writer checks that reserved prefix names, for example `xmlns`, are not specified.
    #[inline]
    pub fn attr<N>(mut self, name: N, value: &'a str) -> Self
    where
        N: Into<Name<'a>>,
    {
        let attribute = Attribute::new(name.into(), value);
        self.attributes.push(attribute);
        self
    }

    /// Adds a namespace binding to the current namespace context.
    ///
    /// * If no namespace URI was bound to `prefix` at this point of the document, the
    ///   mapping from `prefix` to `uri` is written as part of this element's attribute set.
    /// * If the same namespace URI was already bound to `prefix`, no namespace attributes
    ///   are emitted.
    /// * If a different namespace URI was bound to `prefix`, another binding is added as
    ///   part of this element's attribute set, shadowing the outer binding.
    #[inline]
    #[must_use]
    pub fn ns<S1, S2>(mut self, prefix: S1, uri: S2) -> Self
    where
        S1: Into<String>,
        S2: Into<String>,
    {
        self.namespace.put(prefix, uri);
        self
    }

    /// Adds a default namespace mapping to the current namespace context.
    ///
    /// The rules described for `ns()` apply to the default namespace mapping as well.
    #[inline]
    #[must_use]
    pub fn default_ns<S>(mut self, uri: S) -> Self
    where
        S: Into<String>,
    {
        self.namespace.put(NS_NO_PREFIX, uri);
        self
    }
}
impl<'a> From<StartElementBuilder<'a>> for XmlEvent<'a> {
#[inline]
fn from(b: StartElementBuilder<'a>) -> XmlEvent<'a> {
XmlEvent::StartElement {
name: b.name,
attributes: Cow::Owned(b.attributes),
namespace: Cow::Owned(b.namespace),
}
}
}