From 657a8f6e178a163084018adc265e618d213e5ae2 Mon Sep 17 00:00:00 2001 From: Daniel Alley Date: Sun, 10 Jul 2022 14:12:51 -0400 Subject: [PATCH] Change API for providing custom entities Instead of providing unescaping functions with an entity mapping via a data structure, provide a closure which maps each entity to its replacement text. --- Changelog.md | 3 ++ examples/custom_entities.rs | 37 ++++++++++------- src/escapei.rs | 83 ++++++++++++++----------------------- src/events/attributes.rs | 39 +++++------------ src/events/mod.rs | 36 +++++----------- src/lib.rs | 2 +- 6 files changed, 76 insertions(+), 124 deletions(-) diff --git a/Changelog.md b/Changelog.md index 91dee4f6..35c79f5e 100644 --- a/Changelog.md +++ b/Changelog.md @@ -107,6 +107,8 @@ |`read_event_unbuffered` |`read_event` |`read_to_end_unbuffered` |`read_to_end` - [#412]: Change `read_to_end*` and `read_text_into` to accept `QName` instead of `AsRef<[u8]>` +- [#415]: Changed custom entity unescaping API to accept closures rather than a mapping of entity to + replacement text. This avoids needing to allocate a map and provides the user with more flexibility. ### New Tests @@ -131,6 +133,7 @@ [#403]: https://github.com/tafia/quick-xml/pull/403 [#407]: https://github.com/tafia/quick-xml/pull/407 [#412]: https://github.com/tafia/quick-xml/pull/412 +[#415]: https://github.com/tafia/quick-xml/pull/415 ## 0.23.0 -- 2022-05-08 diff --git a/examples/custom_entities.rs b/examples/custom_entities.rs index 02165faf..0f0db680 100644 --- a/examples/custom_entities.rs +++ b/examples/custom_entities.rs @@ -7,10 +7,11 @@ //! * the regex in this example is simple but brittle; //! * it does not support the use of entities in entity declaration. 
+use std::collections::HashMap; + use quick_xml::events::Event; use quick_xml::Reader; use regex::bytes::Regex; -use std::collections::HashMap; const DATA: &str = r#" @@ -27,35 +28,39 @@ fn main() -> Result<(), Box> { reader.trim_text(true); let mut buf = Vec::new(); - let mut custom_entities = HashMap::new(); + let mut custom_entities: HashMap, String> = HashMap::new(); let entity_re = Regex::new(r#""#)?; loop { match reader.read_event_into(&mut buf) { Ok(Event::DocType(ref e)) => { for cap in entity_re.captures_iter(&e) { - custom_entities.insert(cap[1].to_vec(), cap[2].to_vec()); + custom_entities.insert(cap[1].to_vec(), String::from_utf8(cap[2].to_vec())?); } } Ok(Event::Start(ref e)) => match e.name().as_ref() { - b"test" => println!( - "attributes values: {:?}", - e.attributes() - .map(|a| a - .unwrap() - .unescape_and_decode_value_with_custom_entities( - &reader, - &custom_entities - ) - .unwrap()) - .collect::>() - ), + b"test" => { + let lookup_custom_entity = |ent| custom_entities.get(ent).map(|s| s.as_str()); + let attributes = e + .attributes() + .map(|a| { + a.unwrap() + .unescape_and_decode_value_with_custom_entities( + &reader, + lookup_custom_entity, + ) + .unwrap() + }) + .collect::>(); + println!("attributes values: {:?}", attributes); + } _ => (), }, Ok(Event::Text(ref e)) => { + let lookup_custom_entity = |ent| custom_entities.get(ent).map(|s| s.as_str()); println!( "text value: {}", - e.unescape_and_decode_with_custom_entities(&reader, &custom_entities) + e.unescape_and_decode_with_custom_entities(&reader, lookup_custom_entity) .unwrap() ); } diff --git a/src/escapei.rs b/src/escapei.rs index 64749c27..daa1e87d 100644 --- a/src/escapei.rs +++ b/src/escapei.rs @@ -2,7 +2,6 @@ use memchr; use std::borrow::Cow; -use std::collections::HashMap; use std::ops::Range; #[cfg(test)] @@ -66,15 +65,7 @@ impl std::error::Error for EscapeError {} /// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their /// corresponding xml 
escaped value. pub fn escape(raw: &[u8]) -> Cow<[u8]> { - #[inline] - fn to_escape(b: u8) -> bool { - match b { - b'<' | b'>' | b'\'' | b'&' | b'"' => true, - _ => false, - } - } - - _escape(raw, to_escape) + _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"')) } /// Should only be used for escaping text content. In xml text content, it is allowed @@ -82,15 +73,7 @@ pub fn escape(raw: &[u8]) -> Cow<[u8]> { /// This function escapes a `&[u8]` and replaces xml special characters (<, >, &) with /// their corresponding xml escaped value, but does not escape quote characters. pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> { - #[inline] - fn to_escape(b: u8) -> bool { - match b { - b'<' | b'>' | b'&' => true, - _ => false, - } - } - - _escape(raw, to_escape) + _escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&')) } /// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their @@ -130,32 +113,22 @@ fn _escape bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> { /// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding /// value pub fn unescape(raw: &[u8]) -> Result, EscapeError> { - do_unescape(raw, None) + unescape_with(raw, |_| None) } /// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding -/// value, using a dictionnary of custom entities. +/// value, resolving custom entities through the `lookup_custom_entity` closure. /// /// # Pre-condition /// -/// The keys and values of `custom_entities`, if any, must be valid UTF-8. -pub fn unescape_with<'a>( +/// The implementation of `lookup_custom_entity` is expected to operate over UTF-8 inputs. +pub fn unescape_with<'a, 'b>( raw: &'a [u8], - custom_entities: &HashMap, Vec>, -) -> Result, EscapeError> { - do_unescape(raw, Some(custom_entities)) -} - -/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding -/// value, using an optional dictionary of custom entities. 
-/// -/// # Pre-condition -/// -/// The keys and values of `custom_entities`, if any, must be valid UTF-8. -pub fn do_unescape<'a>( - raw: &'a [u8], - custom_entities: Option<&HashMap, Vec>>, -) -> Result, EscapeError> { + lookup_custom_entity: impl Fn(&'b [u8]) -> Option<&'b str>, +) -> Result, EscapeError> +where + 'a: 'b, +{ let mut unescaped = None; let mut last_end = 0; let mut iter = memchr::memchr2_iter(b'&', b';', raw); @@ -171,12 +144,14 @@ pub fn do_unescape<'a>( // search for character correctness let pat = &raw[start + 1..end]; - if let Some(s) = named_entity(pat) { - unescaped.extend_from_slice(s.as_bytes()); - } else if pat.starts_with(b"#") { - push_utf8(unescaped, parse_number(&pat[1..], start..end)?); - } else if let Some(value) = custom_entities.and_then(|hm| hm.get(pat)) { - unescaped.extend_from_slice(&value); + if pat.starts_with(b"#") { + let entity = &pat[1..]; // starts after the # + let codepoint = parse_number(entity, start..end)?; + push_utf8(unescaped, codepoint); + } else if let Some(value) = named_entity(pat) { + unescaped.extend_from_slice(value.as_bytes()); + } else if let Some(value) = lookup_custom_entity(pat) { + unescaped.extend_from_slice(value.as_bytes()); } else { return Err(EscapeError::UnrecognizedSymbol( start + 1..end, @@ -1740,18 +1715,20 @@ fn test_unescape() { #[test] fn test_unescape_with() { - let custom_entities = vec![(b"foo".to_vec(), b"BAR".to_vec())] - .into_iter() - .collect(); - assert_eq!(&*unescape_with(b"test", &custom_entities).unwrap(), b"test"); + let custom_entities = |ent: &[u8]| match ent { + b"foo" => Some("BAR"), + _ => None, + }; + + assert_eq!(&*unescape_with(b"test", custom_entities).unwrap(), b"test"); assert_eq!( - &*unescape_with(b"<test>", &custom_entities).unwrap(), + &*unescape_with(b"<test>", custom_entities).unwrap(), b"" ); - assert_eq!(&*unescape_with(b"0", &custom_entities).unwrap(), b"0"); - assert_eq!(&*unescape_with(b"0", &custom_entities).unwrap(), b"0"); - 
assert_eq!(&*unescape_with(b"&foo;", &custom_entities).unwrap(), b"BAR"); - assert!(unescape_with(b"&fop;", &custom_entities).is_err()); + assert_eq!(&*unescape_with(b"0", custom_entities).unwrap(), b"0"); + assert_eq!(&*unescape_with(b"0", custom_entities).unwrap(), b"0"); + assert_eq!(&*unescape_with(b"&foo;", custom_entities).unwrap(), b"BAR"); + assert!(unescape_with(b"&fop;", custom_entities).is_err()); } #[test] diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 51f1455c..c0336709 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -3,13 +3,13 @@ //! Provides an iterator over attributes key/value pairs use crate::errors::{Error, Result as XmlResult}; -use crate::escape::{do_unescape, escape}; +use crate::escape::{unescape_with, escape}; use crate::name::QName; use crate::reader::{is_whitespace, Reader}; use crate::utils::{write_byte_string, write_cow_string, Bytes}; use std::fmt::{self, Debug, Display, Formatter}; use std::iter::FusedIterator; -use std::{borrow::Cow, collections::HashMap, ops::Range}; +use std::{borrow::Cow, ops::Range}; /// A struct representing a key/value XML attribute. /// @@ -41,7 +41,7 @@ impl<'a> Attribute<'a> { /// /// See also [`unescaped_value_with_custom_entities()`](#method.unescaped_value_with_custom_entities) pub fn unescaped_value(&self) -> XmlResult> { - self.make_unescaped_value(None) + self.unescaped_value_with_custom_entities(|_| None) } /// Returns the unescaped value, using custom entities. @@ -57,18 +57,11 @@ impl<'a> Attribute<'a> { /// # Pre-condition /// /// The keys and values of `custom_entities`, if any, must be valid UTF-8. 
- pub fn unescaped_value_with_custom_entities( - &self, - custom_entities: &HashMap, Vec>, - ) -> XmlResult> { - self.make_unescaped_value(Some(custom_entities)) - } - - fn make_unescaped_value( - &self, - custom_entities: Option<&HashMap, Vec>>, - ) -> XmlResult> { - do_unescape(&*self.value, custom_entities).map_err(Error::EscapeError) + pub fn unescaped_value_with_custom_entities<'s>( + &'s self, + lookup_custom_entity: impl Fn(&[u8]) -> Option<&str>, + ) -> XmlResult> { + unescape_with(&*self.value, lookup_custom_entity).map_err(Error::EscapeError) } /// Decode then unescapes the value @@ -82,7 +75,7 @@ impl<'a> Attribute<'a> { /// [`unescaped_value()`]: #method.unescaped_value /// [`Reader::decode()`]: ../../reader/struct.Reader.html#method.decode pub fn unescape_and_decode_value(&self, reader: &Reader) -> XmlResult { - self.do_unescape_and_decode_value(reader, None) + self.unescape_and_decode_value_with_custom_entities(reader, |_| None) } /// Decode then unescapes the value with custom entities @@ -102,20 +95,10 @@ impl<'a> Attribute<'a> { pub fn unescape_and_decode_value_with_custom_entities( &self, reader: &Reader, - custom_entities: &HashMap, Vec>, - ) -> XmlResult { - self.do_unescape_and_decode_value(reader, Some(custom_entities)) - } - - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. - fn do_unescape_and_decode_value( - &self, - reader: &Reader, - custom_entities: Option<&HashMap, Vec>>, + lookup_custom_entity: impl Fn(&[u8]) -> Option<&str>, ) -> XmlResult { let decoded = reader.decoder().decode(&*self.value)?; - - let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; + let unescaped = unescape_with(decoded.as_bytes(), lookup_custom_entity)?; Ok(String::from_utf8(unescaped.into_owned())?) 
} } diff --git a/src/events/mod.rs b/src/events/mod.rs index 51c66ec7..c7aadd59 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -37,13 +37,12 @@ pub mod attributes; #[cfg(feature = "encoding_rs")] use encoding_rs::Encoding; use std::borrow::Cow; -use std::collections::HashMap; use std::fmt::{self, Debug, Formatter}; use std::ops::Deref; use std::str::from_utf8; use crate::errors::{Error, Result}; -use crate::escape::{do_unescape, escape, partial_escape}; +use crate::escape::{escape, partial_escape, unescape_with}; use crate::name::{LocalName, QName}; use crate::reader::{Decoder, Reader}; use crate::utils::write_cow_string; @@ -707,7 +706,7 @@ impl<'a> BytesText<'a> { //TODO: need to think about better API instead of dozens similar functions // Maybe use builder pattern. After that expose function as public API //FIXME: need to take into account entities defined in the document - Ok(BytesCData::new(match do_unescape(&self.content, None)? { + Ok(BytesCData::new(match unescape_with(&self.content, |_| None)? { Cow::Borrowed(_) => self.content, Cow::Owned(unescaped) => Cow::Owned(unescaped), })) @@ -720,7 +719,7 @@ impl<'a> BytesText<'a> { /// /// See also [`unescaped_with_custom_entities()`](#method.unescaped_with_custom_entities) pub fn unescaped(&self) -> Result> { - self.make_unescaped(None) + self.unescaped_with_custom_entities(|_| None) } /// gets escaped content with custom entities @@ -731,21 +730,14 @@ impl<'a> BytesText<'a> { /// /// # Pre-condition /// - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. + /// The implementation of `lookup_custom_entity` is expected to operate over UTF-8 inputs. 
/// /// See also [`unescaped()`](#method.unescaped) pub fn unescaped_with_custom_entities<'s>( &'s self, - custom_entities: &HashMap, Vec>, + lookup_custom_entities: impl Fn(&[u8]) -> Option<&str>, ) -> Result> { - self.make_unescaped(Some(custom_entities)) - } - - fn make_unescaped<'s>( - &'s self, - custom_entities: Option<&HashMap, Vec>>, - ) -> Result> { - do_unescape(self, custom_entities).map_err(Error::EscapeError) + unescape_with(self, lookup_custom_entities).map_err(Error::EscapeError) } /// helper method to unescape then decode self using the reader encoding @@ -755,7 +747,7 @@ impl<'a> BytesText<'a> { /// 1. BytesText::unescaped() /// 2. Reader::decode(...) pub fn unescape_and_decode(&self, reader: &Reader) -> Result { - self.do_unescape_and_decode_with_custom_entities(reader, None) + self.unescape_and_decode_with_custom_entities(reader, |_| None) } /// helper method to unescape then decode self using the reader encoding with custom entities @@ -767,23 +759,15 @@ impl<'a> BytesText<'a> { /// /// # Pre-condition /// - /// The keys and values of `custom_entities`, if any, must be valid UTF-8. + /// The implementation of `lookup_custom_entity` is expected to operate over UTF-8 inputs. pub fn unescape_and_decode_with_custom_entities( &self, reader: &Reader, - custom_entities: &HashMap, Vec>, - ) -> Result { - self.do_unescape_and_decode_with_custom_entities(reader, Some(custom_entities)) - } - - fn do_unescape_and_decode_with_custom_entities( - &self, - reader: &Reader, - custom_entities: Option<&HashMap, Vec>>, + lookup_custom_entities: impl Fn(&[u8]) -> Option<&str>, ) -> Result { let decoded = reader.decoder().decode(&*self)?; - let unescaped = do_unescape(decoded.as_bytes(), custom_entities)?; + let unescaped = unescape_with(decoded.as_bytes(), lookup_custom_entities)?; Ok(String::from_utf8(unescaped.into_owned())?) 
} diff --git a/src/lib.rs b/src/lib.rs index cebc401d..383d4a96 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -139,7 +139,7 @@ mod errors; mod escapei; pub mod escape { //! Manage xml character escapes - pub(crate) use crate::escapei::{do_unescape, EscapeError}; + pub(crate) use crate::escapei::EscapeError; pub use crate::escapei::{escape, partial_escape, unescape, unescape_with}; } pub mod events;