Skip to content

Commit

Permalink
Change API for providing custom entities
Browse files Browse the repository at this point in the history
Instead of passing the unescaping functions a data structure that maps
entities to replacement text, pass a closure that maps each entity to
its replacement text.
  • Loading branch information
dralley committed Jul 10, 2022
1 parent 57cd104 commit 81d63a4
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 127 deletions.
3 changes: 3 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@
|`read_event_unbuffered` |`read_event`
|`read_to_end_unbuffered` |`read_to_end`
- [#412]: Change `read_to_end*` and `read_text_into` to accept `QName` instead of `AsRef<[u8]>`
- [#415]: Changed custom entity unescaping API to accept closures rather than a mapping of entity to
replacement text. This avoids needing to allocate a map and provides the user with more flexibility.

### New Tests

Expand All @@ -131,6 +133,7 @@
[#403]: https://github.com/tafia/quick-xml/pull/403
[#407]: https://github.com/tafia/quick-xml/pull/407
[#412]: https://github.com/tafia/quick-xml/pull/412
[#415]: https://github.com/tafia/quick-xml/pull/415

## 0.23.0 -- 2022-05-08

Expand Down
37 changes: 21 additions & 16 deletions examples/custom_entities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
//! * the regex in this example is simple but brittle;
//! * it does not support the use of entities in entity declaration.
use std::collections::HashMap;

use quick_xml::events::Event;
use quick_xml::Reader;
use regex::bytes::Regex;
use std::collections::HashMap;

const DATA: &str = r#"
Expand All @@ -27,35 +28,39 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
reader.trim_text(true);

let mut buf = Vec::new();
let mut custom_entities = HashMap::new();
let mut custom_entities: HashMap<Vec<u8>, String> = HashMap::new();
let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::DocType(ref e)) => {
for cap in entity_re.captures_iter(&e) {
custom_entities.insert(cap[1].to_vec(), cap[2].to_vec());
custom_entities.insert(cap[1].to_vec(), String::from_utf8(cap[2].to_vec())?);
}
}
Ok(Event::Start(ref e)) => match e.name().as_ref() {
b"test" => println!(
"attributes values: {:?}",
e.attributes()
.map(|a| a
.unwrap()
.unescape_and_decode_value_with_custom_entities(
&reader,
&custom_entities
)
.unwrap())
.collect::<Vec<_>>()
),
b"test" => {
let lookup_custom_entity = |ent| custom_entities.get(ent).map(|s| s.as_str());
let attributes = e
.attributes()
.map(|a| {
a.unwrap()
.unescape_and_decode_value_with_custom_entities(
&reader,
lookup_custom_entity,
)
.unwrap()
})
.collect::<Vec<_>>();
println!("attributes values: {:?}", attributes);
}
_ => (),
},
Ok(Event::Text(ref e)) => {
let lookup_custom_entity = |ent| custom_entities.get(ent).map(|s| s.as_str());
println!(
"text value: {}",
e.unescape_and_decode_with_custom_entities(&reader, &custom_entities)
e.unescape_and_decode_with_custom_entities(&reader, lookup_custom_entity)
.unwrap()
);
}
Expand Down
83 changes: 30 additions & 53 deletions src/escapei.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
use memchr;
use std::borrow::Cow;
use std::collections::HashMap;
use std::ops::Range;

#[cfg(test)]
Expand Down Expand Up @@ -66,31 +65,15 @@ impl std::error::Error for EscapeError {}
/// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their
/// corresponding xml escaped value.
pub fn escape(raw: &[u8]) -> Cow<[u8]> {
#[inline]
fn to_escape(b: u8) -> bool {
match b {
b'<' | b'>' | b'\'' | b'&' | b'"' => true,
_ => false,
}
}

_escape(raw, to_escape)
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
}

/// Should only be used for escaping text content. In xml text content, it is allowed
/// (though not recommended) to leave the quote special characters " and ' unescaped.
/// This function escapes a `&[u8]` and replaces xml special characters (<, >, &) with
/// their corresponding xml escaped value, but does not escape quote characters.
pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> {
#[inline]
fn to_escape(b: u8) -> bool {
match b {
b'<' | b'>' | b'&' => true,
_ => false,
}
}

_escape(raw, to_escape)
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
}

/// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their
Expand Down Expand Up @@ -130,32 +113,22 @@ fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value
pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
do_unescape(raw, None)
unescape_with(raw, |_| None)
}

/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value, using a dictionnary of custom entities.
/// value, using a closure to look up custom entities.
///
/// # Pre-condition
///
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
pub fn unescape_with<'a>(
/// The implementation of `lookup_custom_entity` is expected to operate over UTF-8 inputs.
pub fn unescape_with<'a, 'b>(
raw: &'a [u8],
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
) -> Result<Cow<'a, [u8]>, EscapeError> {
do_unescape(raw, Some(custom_entities))
}

/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value, using an optional dictionary of custom entities.
///
/// # Pre-condition
///
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
pub fn do_unescape<'a>(
raw: &'a [u8],
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> Result<Cow<'a, [u8]>, EscapeError> {
lookup_custom_entity: impl Fn(&'b [u8]) -> Option<&'b str>,
) -> Result<Cow<'a, [u8]>, EscapeError>
where
'a: 'b,
{
let mut unescaped = None;
let mut last_end = 0;
let mut iter = memchr::memchr2_iter(b'&', b';', raw);
Expand All @@ -171,12 +144,14 @@ pub fn do_unescape<'a>(

// search for character correctness
let pat = &raw[start + 1..end];
if let Some(s) = named_entity(pat) {
unescaped.extend_from_slice(s.as_bytes());
} else if pat.starts_with(b"#") {
push_utf8(unescaped, parse_number(&pat[1..], start..end)?);
} else if let Some(value) = custom_entities.and_then(|hm| hm.get(pat)) {
unescaped.extend_from_slice(&value);
if pat.starts_with(b"#") {
let entity = &pat[1..]; // starts after the #
let codepoint = parse_number(entity, start..end)?;
push_utf8(unescaped, codepoint);
} else if let Some(value) = named_entity(pat) {
unescaped.extend_from_slice(value.as_bytes());
} else if let Some(value) = lookup_custom_entity(pat) {
unescaped.extend_from_slice(value.as_bytes());
} else {
return Err(EscapeError::UnrecognizedSymbol(
start + 1..end,
Expand Down Expand Up @@ -1740,18 +1715,20 @@ fn test_unescape() {

#[test]
fn test_unescape_with() {
let custom_entities = vec![(b"foo".to_vec(), b"BAR".to_vec())]
.into_iter()
.collect();
assert_eq!(&*unescape_with(b"test", &custom_entities).unwrap(), b"test");
let custom_entities = |ent: &[u8]| match ent {
b"foo" => Some("BAR"),
_ => None,
};

assert_eq!(&*unescape_with(b"test", custom_entities).unwrap(), b"test");
assert_eq!(
&*unescape_with(b"&lt;test&gt;", &custom_entities).unwrap(),
&*unescape_with(b"&lt;test&gt;", custom_entities).unwrap(),
b"<test>"
);
assert_eq!(&*unescape_with(b"&#x30;", &custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&#48;", &custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&foo;", &custom_entities).unwrap(), b"BAR");
assert!(unescape_with(b"&fop;", &custom_entities).is_err());
assert_eq!(&*unescape_with(b"&#x30;", custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&#48;", custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&foo;", custom_entities).unwrap(), b"BAR");
assert!(unescape_with(b"&fop;", custom_entities).is_err());
}

#[test]
Expand Down
39 changes: 11 additions & 28 deletions src/events/attributes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,13 @@
//! Provides an iterator over attributes key/value pairs
use crate::errors::{Error, Result as XmlResult};
use crate::escape::{do_unescape, escape};
use crate::escape::{escape, unescape_with};
use crate::name::QName;
use crate::reader::{is_whitespace, Reader};
use crate::utils::{write_byte_string, write_cow_string, Bytes};
use std::fmt::{self, Debug, Display, Formatter};
use std::iter::FusedIterator;
use std::{borrow::Cow, collections::HashMap, ops::Range};
use std::{borrow::Cow, ops::Range};

/// A struct representing a key/value XML attribute.
///
Expand Down Expand Up @@ -41,7 +41,7 @@ impl<'a> Attribute<'a> {
///
/// See also [`unescaped_value_with_custom_entities()`](#method.unescaped_value_with_custom_entities)
pub fn unescaped_value(&self) -> XmlResult<Cow<[u8]>> {
self.make_unescaped_value(None)
self.unescaped_value_with_custom_entities(|_| None)
}

/// Returns the unescaped value, using custom entities.
Expand All @@ -57,18 +57,11 @@ impl<'a> Attribute<'a> {
/// # Pre-condition
///
/// The implementation of `lookup_custom_entity` is expected to operate over UTF-8 inputs.
pub fn unescaped_value_with_custom_entities(
&self,
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
) -> XmlResult<Cow<[u8]>> {
self.make_unescaped_value(Some(custom_entities))
}

fn make_unescaped_value(
&self,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> XmlResult<Cow<[u8]>> {
do_unescape(&*self.value, custom_entities).map_err(Error::EscapeError)
pub fn unescaped_value_with_custom_entities<'s>(
&'s self,
lookup_custom_entity: impl Fn(&[u8]) -> Option<&str>,
) -> XmlResult<Cow<'s, [u8]>> {
unescape_with(&*self.value, lookup_custom_entity).map_err(Error::EscapeError)
}

/// Decode then unescapes the value
Expand All @@ -82,7 +75,7 @@ impl<'a> Attribute<'a> {
/// [`unescaped_value()`]: #method.unescaped_value
/// [`Reader::decode()`]: ../../reader/struct.Reader.html#method.decode
pub fn unescape_and_decode_value<B>(&self, reader: &Reader<B>) -> XmlResult<String> {
self.do_unescape_and_decode_value(reader, None)
self.unescape_and_decode_value_with_custom_entities(reader, |_| None)
}

/// Decode then unescapes the value with custom entities
Expand All @@ -102,20 +95,10 @@ impl<'a> Attribute<'a> {
pub fn unescape_and_decode_value_with_custom_entities<B>(
&self,
reader: &Reader<B>,
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
) -> XmlResult<String> {
self.do_unescape_and_decode_value(reader, Some(custom_entities))
}

/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
fn do_unescape_and_decode_value<B>(
&self,
reader: &Reader<B>,
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
lookup_custom_entity: impl Fn(&[u8]) -> Option<&str>,
) -> XmlResult<String> {
let decoded = reader.decoder().decode(&*self.value)?;

let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?;
let unescaped = unescape_with(decoded.as_bytes(), lookup_custom_entity)?;
Ok(String::from_utf8(unescaped.into_owned())?)
}
}
Expand Down
Loading

0 comments on commit 81d63a4

Please sign in to comment.