Skip to content

Commit

Permalink
Change API for providing custom entities
Browse files Browse the repository at this point in the history
Instead of providing unescaping functions with an entity mapping
via a data structure, instead provide a closure which maps the entity
with replacement text.
  • Loading branch information
dralley committed Jul 14, 2022
1 parent 4656f3b commit a4a0f8d
Show file tree
Hide file tree
Showing 9 changed files with 118 additions and 158 deletions.
7 changes: 6 additions & 1 deletion Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,11 @@
|`read_event_unbuffered` |`read_event`
|`read_to_end_unbuffered` |`read_to_end`
- [#412]: Change `read_to_end*` and `read_text_into` to accept `QName` instead of `AsRef<[u8]>`

- [#415]: Changed custom entity unescaping API to accept closures rather than a mapping of entity to
replacement text. This avoids needing to allocate a map and provides the user with more flexibility.
- [#415]: Renamed many functions following the pattern `unescape_and_decode*` to `decode_and_unescape*`
to better communicate their function. Renamed functions following the pattern `*_with_custom_entities`
to `decode_and_unescape_with` to be more consistent across the API.
- [#416]: `BytesStart::to_borrowed` renamed to `BytesStart::borrow`, the same method
added to all events

Expand Down Expand Up @@ -137,6 +141,7 @@
[#403]: https://github.com/tafia/quick-xml/pull/403
[#407]: https://github.com/tafia/quick-xml/pull/407
[#412]: https://github.com/tafia/quick-xml/pull/412
[#415]: https://github.com/tafia/quick-xml/pull/415
[#416]: https://github.com/tafia/quick-xml/pull/416
[#418]: https://github.com/tafia/quick-xml/pull/418

Expand Down
39 changes: 23 additions & 16 deletions examples/custom_entities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
//! * the regex in this example is simple but brittle;
//! * it does not support the use of entities in entity declaration.
use std::collections::HashMap;

use quick_xml::events::Event;
use quick_xml::Reader;
use regex::bytes::Regex;
use std::collections::HashMap;

const DATA: &str = r#"
Expand All @@ -27,35 +28,41 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
reader.trim_text(true);

let mut buf = Vec::new();
let mut custom_entities = HashMap::new();
let mut custom_entities: HashMap<Vec<u8>, String> = HashMap::new();
let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::DocType(ref e)) => {
for cap in entity_re.captures_iter(&e) {
custom_entities.insert(cap[1].to_vec(), cap[2].to_vec());
custom_entities.insert(
cap[1].to_vec(),
reader.decoder().decode(&cap[2])?.into_owned(),
);
}
}
Ok(Event::Start(ref e)) => match e.name().as_ref() {
b"test" => println!(
"attributes values: {:?}",
e.attributes()
.map(|a| a
.unwrap()
.unescape_and_decode_value_with_custom_entities(
&reader,
&custom_entities
)
.unwrap())
.collect::<Vec<_>>()
),
b"test" => {
let attributes = e
.attributes()
.map(|a| {
a.unwrap()
.decode_and_unescape_value_with(&reader, |ent| {
custom_entities.get(ent).map(|s| s.as_str())
})
.unwrap()
})
.collect::<Vec<_>>();
println!("attributes values: {:?}", attributes);
}
_ => (),
},
Ok(Event::Text(ref e)) => {
println!(
"text value: {}",
e.unescape_and_decode_with_custom_entities(&reader, &custom_entities)
e.decode_and_unescape_with(&reader, |ent| custom_entities
.get(ent)
.map(|s| s.as_str()))
.unwrap()
);
}
Expand Down
86 changes: 32 additions & 54 deletions src/escapei.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
use memchr;
use std::borrow::Cow;
use std::collections::HashMap;
use std::ops::Range;

#[cfg(test)]
Expand Down Expand Up @@ -66,31 +65,15 @@ impl std::error::Error for EscapeError {}
/// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their
/// corresponding xml escaped value.
pub fn escape(raw: &[u8]) -> Cow<[u8]> {
#[inline]
fn to_escape(b: u8) -> bool {
match b {
b'<' | b'>' | b'\'' | b'&' | b'"' => true,
_ => false,
}
}

_escape(raw, to_escape)
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
}

/// Should only be used for escaping text content. In xml text content, it is allowed
/// (though not recommended) to leave the quote special characters " and ' unescaped.
/// This function escapes a `&[u8]` and replaces xml special characters (<, >, &) with
/// their corresponding xml escaped value, but does not escape quote characters.
pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> {
#[inline]
fn to_escape(b: u8) -> bool {
match b {
b'<' | b'>' | b'&' => true,
_ => false,
}
}

_escape(raw, to_escape)
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
}

/// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their
Expand Down Expand Up @@ -130,32 +113,23 @@ fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value
pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
do_unescape(raw, None)
}

/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value, using a dictionnary of custom entities.
///
/// # Pre-condition
///
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
pub fn unescape_with<'a>(
raw: &'a [u8],
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
) -> Result<Cow<'a, [u8]>, EscapeError> {
do_unescape(raw, Some(custom_entities))
unescape_with(raw, |_| None)
}

/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value, using an optional dictionary of custom entities.
/// value, using a resolver function for custom entities.
///
/// # Pre-condition
///
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
pub fn do_unescape<'a>(
raw: &'a [u8],
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> Result<Cow<'a, [u8]>, EscapeError> {
/// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs.
pub fn unescape_with<'input, 'entity, F>(
raw: &'input [u8],
resolve_entity: F,
) -> Result<Cow<'input, [u8]>, EscapeError>
where
// the lifetime of the output comes from a capture or is `'static`
F: Fn(&[u8]) -> Option<&'entity str>,
{
let mut unescaped = None;
let mut last_end = 0;
let mut iter = memchr::memchr2_iter(b'&', b';', raw);
Expand All @@ -171,12 +145,14 @@ pub fn do_unescape<'a>(

// search for character correctness
let pat = &raw[start + 1..end];
if let Some(s) = named_entity(pat) {
unescaped.extend_from_slice(s.as_bytes());
} else if pat.starts_with(b"#") {
push_utf8(unescaped, parse_number(&pat[1..], start..end)?);
} else if let Some(value) = custom_entities.and_then(|hm| hm.get(pat)) {
unescaped.extend_from_slice(&value);
if pat.starts_with(b"#") {
let entity = &pat[1..]; // starts after the #
let codepoint = parse_number(entity, start..end)?;
push_utf8(unescaped, codepoint);
} else if let Some(value) = named_entity(pat) {
unescaped.extend_from_slice(value.as_bytes());
} else if let Some(value) = resolve_entity(pat) {
unescaped.extend_from_slice(value.as_bytes());
} else {
return Err(EscapeError::UnrecognizedSymbol(
start + 1..end,
Expand Down Expand Up @@ -1740,18 +1716,20 @@ fn test_unescape() {

#[test]
fn test_unescape_with() {
let custom_entities = vec![(b"foo".to_vec(), b"BAR".to_vec())]
.into_iter()
.collect();
assert_eq!(&*unescape_with(b"test", &custom_entities).unwrap(), b"test");
let custom_entities = |ent: &[u8]| match ent {
b"foo" => Some("BAR"),
_ => None,
};

assert_eq!(&*unescape_with(b"test", custom_entities).unwrap(), b"test");
assert_eq!(
&*unescape_with(b"&lt;test&gt;", &custom_entities).unwrap(),
&*unescape_with(b"&lt;test&gt;", custom_entities).unwrap(),
b"<test>"
);
assert_eq!(&*unescape_with(b"&#x30;", &custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&#48;", &custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&foo;", &custom_entities).unwrap(), b"BAR");
assert!(unescape_with(b"&fop;", &custom_entities).is_err());
assert_eq!(&*unescape_with(b"&#x30;", custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&#48;", custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&foo;", custom_entities).unwrap(), b"BAR");
assert!(unescape_with(b"&fop;", custom_entities).is_err());
}

#[test]
Expand Down
66 changes: 25 additions & 41 deletions src/events/attributes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,23 @@
//!
//! Provides an iterator over attributes key/value pairs
use crate::errors::{Error, Result as XmlResult};
use crate::escape::{do_unescape, escape};
use crate::errors::Result as XmlResult;
use crate::escape::{escape, unescape_with};
use crate::name::QName;
use crate::reader::{is_whitespace, Reader};
use crate::utils::{write_byte_string, write_cow_string, Bytes};
use std::fmt::{self, Debug, Display, Formatter};
use std::iter::FusedIterator;
use std::{borrow::Cow, collections::HashMap, ops::Range};
use std::{borrow::Cow, ops::Range};

/// A struct representing a key/value XML attribute.
///
/// Field `value` stores raw bytes, possibly containing escape-sequences. Most users will likely
/// want to access the value using one of the [`unescaped_value`] and [`unescape_and_decode_value`]
/// want to access the value using one of the [`unescaped_value`] and [`decode_and_unescape_value`]
/// functions.
///
/// [`unescaped_value`]: Self::unescaped_value
/// [`unescape_and_decode_value`]: Self::unescape_and_decode_value
/// [`decode_and_unescape_value`]: Self::decode_and_unescape_value
#[derive(Clone, PartialEq)]
pub struct Attribute<'a> {
/// The key to uniquely define the attribute.
Expand All @@ -37,39 +37,33 @@ impl<'a> Attribute<'a> {
///
/// This will allocate if the value contains any escape sequences.
///
/// See also [`unescaped_value_with_custom_entities()`](Self::unescaped_value_with_custom_entities)
/// See also [`unescaped_value_with()`](Self::unescaped_value_with)
pub fn unescaped_value(&self) -> XmlResult<Cow<[u8]>> {
self.make_unescaped_value(None)
self.unescaped_value_with(|_| None)
}

/// Returns the unescaped value, using custom entities.
///
/// This is normally the value you are interested in. Escape sequences such as `&gt;` are
/// replaced with their unescaped equivalents such as `>`.
/// Additional entities can be provided in `custom_entities`.
/// A fallback resolver for additional custom entities can be provided via
/// `resolve_entity`.
///
/// This will allocate if the value contains any escape sequences.
///
/// See also [`unescaped_value()`](Self::unescaped_value)
///
/// # Pre-condition
///
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
pub fn unescaped_value_with_custom_entities(
&self,
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
) -> XmlResult<Cow<[u8]>> {
self.make_unescaped_value(Some(custom_entities))
/// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs.
pub fn unescaped_value_with<'s, 'entity>(
&'s self,
resolve_entity: impl Fn(&[u8]) -> Option<&'entity str>,
) -> XmlResult<Cow<'s, [u8]>> {
Ok(unescape_with(&*self.value, resolve_entity)?)
}

fn make_unescaped_value(
&self,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> XmlResult<Cow<[u8]>> {
do_unescape(&*self.value, custom_entities).map_err(Error::EscapeError)
}

/// Decode then unescapes the value
/// Decodes then unescapes the value
///
/// This allocates a `String` in all cases. For performance reasons it might be a better idea to
/// instead use one of:
Expand All @@ -79,41 +73,31 @@ impl<'a> Attribute<'a> {
///
/// [`unescaped_value()`]: Self::unescaped_value
/// [`Reader::decoder().decode()`]: crate::reader::Decoder::decode
pub fn unescape_and_decode_value<B>(&self, reader: &Reader<B>) -> XmlResult<String> {
self.do_unescape_and_decode_value(reader, None)
pub fn decode_and_unescape_value<B>(&self, reader: &Reader<B>) -> XmlResult<String> {
self.decode_and_unescape_value_with(reader, |_| None)
}

/// Decode then unescapes the value with custom entities
/// Decodes then unescapes the value with custom entities
///
/// This allocates a `String` in all cases. For performance reasons it might be a better idea to
/// instead use one of:
///
/// * [`Reader::decoder().decode()`], as it only allocates when the decoding can't be performed otherwise.
/// * [`unescaped_value_with_custom_entities()`], as it doesn't allocate when no escape sequences are used.
/// * [`unescaped_value_with()`], as it doesn't allocate when no escape sequences are used.
///
/// [`unescaped_value_with_custom_entities()`]: Self::unescaped_value_with_custom_entities
/// [`unescaped_value_with()`]: Self::unescaped_value_with
/// [`Reader::decoder().decode()`]: crate::reader::Decoder::decode
///
/// # Pre-condition
///
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
pub fn unescape_and_decode_value_with_custom_entities<B>(
&self,
reader: &Reader<B>,
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
) -> XmlResult<String> {
self.do_unescape_and_decode_value(reader, Some(custom_entities))
}

/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
fn do_unescape_and_decode_value<B>(
/// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs.
pub fn decode_and_unescape_value_with<'entity, B>(
&self,
reader: &Reader<B>,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
resolve_entity: impl Fn(&[u8]) -> Option<&'entity str>,
) -> XmlResult<String> {
let decoded = reader.decoder().decode(&*self.value)?;

let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
let unescaped = unescape_with(decoded.as_bytes(), resolve_entity)?;
Ok(String::from_utf8(unescaped.into_owned())?)
}
}
Expand Down
Loading

0 comments on commit a4a0f8d

Please sign in to comment.