Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closure-based unescaping with custom entities #415

Merged
merged 3 commits into from
Jul 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,15 @@
|`read_event_unbuffered` |`read_event`
|`read_to_end_unbuffered` |`read_to_end`
- [#412]: Change `read_to_end*` and `read_text_into` to accept `QName` instead of `AsRef<[u8]>`

- [#415]: Changed custom entity unescaping API to accept closures rather than a mapping of entity to
replacement text. This avoids needing to allocate a map and provides the user with more flexibility.
- [#415]: Renamed many functions following the pattern `unescape_and_decode*` to `decode_and_unescape*`
to better communicate their function. Renamed functions following the pattern `*_with_custom_entities`
to `decode_and_unescape_with` to be more consistent across the API.
- [#415]: `BytesText::escaped()` renamed to `BytesText::escape()`, `BytesText::unescaped()` renamed to
`BytesText::unescape()`, `BytesText::unescaped_with()` renamed to `BytesText::unescape_with()`,
`Attribute::escaped_value()` renamed to `Attribute::escape_value()`, and `Attribute::escaped_value_with()`
renamed to `Attribute::escape_value_with()` for consistency across the API.
- [#416]: `BytesStart::to_borrowed` renamed to `BytesStart::borrow`, the same method
added to all events

Expand Down Expand Up @@ -137,6 +145,7 @@
[#403]: https://github.com/tafia/quick-xml/pull/403
[#407]: https://github.com/tafia/quick-xml/pull/407
[#412]: https://github.com/tafia/quick-xml/pull/412
[#415]: https://github.com/tafia/quick-xml/pull/415
[#416]: https://github.com/tafia/quick-xml/pull/416
[#418]: https://github.com/tafia/quick-xml/pull/418

Expand Down
4 changes: 2 additions & 2 deletions benches/macrobenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ fn parse_document(doc: &[u8]) -> XmlResult<()> {
match r.read_event()? {
Event::Start(e) | Event::Empty(e) => {
for attr in e.attributes() {
criterion::black_box(attr?.unescaped_value()?);
criterion::black_box(attr?.unescape_value()?);
}
}
Event::Text(e) => {
criterion::black_box(e.unescaped()?);
criterion::black_box(e.unescape()?);
}
Event::CData(e) => {
criterion::black_box(e.into_inner());
Expand Down
83 changes: 1 addition & 82 deletions benches/microbenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,86 +125,6 @@ fn read_namespaced_event(c: &mut Criterion) {
group.finish();
}

/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
/// benchmark)
fn bytes_text_unescaped(c: &mut Criterion) {
let mut group = c.benchmark_group("BytesText::unescaped");
group.bench_function("trim_text = false", |b| {
b.iter(|| {
let mut buf = Vec::new();
let mut r = Reader::from_reader(SAMPLE);
r.check_end_names(false).check_comments(false);
let mut count = criterion::black_box(0);
let mut nbtxt = criterion::black_box(0);
loop {
match r.read_event_into(&mut buf) {
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
assert_eq!(
count, 1550,
"Overall tag count in ./tests/documents/sample_rss.xml"
);

// Windows has \r\n instead of \n
#[cfg(windows)]
assert_eq!(
nbtxt, 67661,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);

#[cfg(not(windows))]
assert_eq!(
nbtxt, 66277,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);
});
});

group.bench_function("trim_text = true", |b| {
b.iter(|| {
let mut buf = Vec::new();
let mut r = Reader::from_reader(SAMPLE);
r.check_end_names(false)
.check_comments(false)
.trim_text(true);
let mut count = criterion::black_box(0);
let mut nbtxt = criterion::black_box(0);
loop {
match r.read_event_into(&mut buf) {
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
assert_eq!(
count, 1550,
"Overall tag count in ./tests/documents/sample_rss.xml"
);

// Windows has \r\n instead of \n
#[cfg(windows)]
assert_eq!(
nbtxt, 50334,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);

#[cfg(not(windows))]
assert_eq!(
nbtxt, 50261,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);
});
});
group.finish();
}

/// Benchmarks, how fast individual event parsed
fn one_event(c: &mut Criterion) {
let mut group = c.benchmark_group("One event");
Expand Down Expand Up @@ -256,7 +176,7 @@ fn one_event(c: &mut Criterion) {
.check_comments(false)
.trim_text(true);
match r.read_event_into(&mut buf) {
Ok(Event::Comment(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::Comment(ref e)) => nbtxt += e.unescape().unwrap().len(),
something_else => panic!("Did not expect {:?}", something_else),
};

Expand Down Expand Up @@ -473,7 +393,6 @@ purus. Consequat id porta nibh venenatis cras sed felis.";
criterion_group!(
benches,
read_event,
bytes_text_unescaped,
read_namespaced_event,
one_event,
attributes,
Expand Down
39 changes: 23 additions & 16 deletions examples/custom_entities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,11 @@
//! * the regex in this example is simple but brittle;
//! * it does not support the use of entities in entity declaration.
use std::collections::HashMap;

use quick_xml::events::Event;
use quick_xml::Reader;
use regex::bytes::Regex;
use std::collections::HashMap;

const DATA: &str = r#"
Expand All @@ -27,35 +28,41 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
reader.trim_text(true);

let mut buf = Vec::new();
let mut custom_entities = HashMap::new();
let mut custom_entities: HashMap<Vec<u8>, String> = HashMap::new();
let entity_re = Regex::new(r#"<!ENTITY\s+([^ \t\r\n]+)\s+"([^"]*)"\s*>"#)?;

loop {
match reader.read_event_into(&mut buf) {
Ok(Event::DocType(ref e)) => {
for cap in entity_re.captures_iter(&e) {
custom_entities.insert(cap[1].to_vec(), cap[2].to_vec());
custom_entities.insert(
cap[1].to_vec(),
reader.decoder().decode(&cap[2])?.into_owned(),
);
}
}
Ok(Event::Start(ref e)) => match e.name().as_ref() {
b"test" => println!(
"attributes values: {:?}",
e.attributes()
.map(|a| a
.unwrap()
.unescape_and_decode_value_with_custom_entities(
&reader,
&custom_entities
)
.unwrap())
.collect::<Vec<_>>()
),
b"test" => {
let attributes = e
.attributes()
.map(|a| {
a.unwrap()
.decode_and_unescape_value_with(&reader, |ent| {
custom_entities.get(ent).map(|s| s.as_str())
})
.unwrap()
})
.collect::<Vec<_>>();
println!("attributes values: {:?}", attributes);
}
_ => (),
},
Ok(Event::Text(ref e)) => {
println!(
"text value: {}",
e.unescape_and_decode_with_custom_entities(&reader, &custom_entities)
e.decode_and_unescape_with(&reader, |ent| custom_entities
.get(ent)
.map(|s| s.as_str()))
Mingun marked this conversation as resolved.
Show resolved Hide resolved
.unwrap()
);
}
Expand Down
4 changes: 2 additions & 2 deletions src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -618,14 +618,14 @@ where
allow_start: bool,
) -> Result<BytesCData<'de>, DeError> {
match self.next()? {
DeEvent::Text(e) if unescape => e.unescape().map_err(Into::into),
DeEvent::Text(e) if unescape => e.unescape_as_cdata().map_err(Into::into),
DeEvent::Text(e) => Ok(BytesCData::new(e.into_inner())),
DeEvent::CData(e) => Ok(e),
DeEvent::Start(e) if allow_start => {
// allow one nested level
let inner = self.next()?;
let t = match inner {
DeEvent::Text(t) if unescape => t.unescape()?,
DeEvent::Text(t) if unescape => t.unescape_as_cdata()?,
DeEvent::Text(t) => BytesCData::new(t.into_inner()),
DeEvent::CData(t) => t,
DeEvent::Start(s) => {
Expand Down
86 changes: 32 additions & 54 deletions src/escapei.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
use memchr;
use std::borrow::Cow;
use std::collections::HashMap;
use std::ops::Range;

#[cfg(test)]
Expand Down Expand Up @@ -66,31 +65,15 @@ impl std::error::Error for EscapeError {}
/// Escapes a `&[u8]` and replaces all xml special characters (<, >, &, ', ") with their
/// corresponding xml escaped value.
pub fn escape(raw: &[u8]) -> Cow<[u8]> {
#[inline]
fn to_escape(b: u8) -> bool {
match b {
b'<' | b'>' | b'\'' | b'&' | b'"' => true,
_ => false,
}
}

_escape(raw, to_escape)
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&' | b'\'' | b'\"'))
}

/// Should only be used for escaping text content. In xml text content, it is allowed
/// (though not recommended) to leave the quote special characters " and ' unescaped.
/// This function escapes a `&[u8]` and replaces xml special characters (<, >, &) with
/// their corresponding xml escaped value, but does not escape quote characters.
pub fn partial_escape(raw: &[u8]) -> Cow<[u8]> {
#[inline]
fn to_escape(b: u8) -> bool {
match b {
b'<' | b'>' | b'&' => true,
_ => false,
}
}

_escape(raw, to_escape)
_escape(raw, |ch| matches!(ch, b'<' | b'>' | b'&'))
}

/// Escapes a `&[u8]` and replaces a subset of xml special characters (<, >, &, ', ") with their
Expand Down Expand Up @@ -130,32 +113,23 @@ fn _escape<F: Fn(u8) -> bool>(raw: &[u8], escape_chars: F) -> Cow<[u8]> {
/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value
pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
do_unescape(raw, None)
}

/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value, using a dictionnary of custom entities.
///
/// # Pre-condition
///
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
pub fn unescape_with<'a>(
raw: &'a [u8],
custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
) -> Result<Cow<'a, [u8]>, EscapeError> {
do_unescape(raw, Some(custom_entities))
unescape_with(raw, |_| None)
}

/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value, using an optional dictionary of custom entities.
/// value, using a resolver function for custom entities.
///
/// # Pre-condition
///
/// The keys and values of `custom_entities`, if any, must be valid UTF-8.
pub fn do_unescape<'a>(
raw: &'a [u8],
custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
) -> Result<Cow<'a, [u8]>, EscapeError> {
/// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs.
pub fn unescape_with<'input, 'entity, F>(
raw: &'input [u8],
resolve_entity: F,
) -> Result<Cow<'input, [u8]>, EscapeError>
where
// the lifetime of the output comes from a capture or is `'static`
F: Fn(&[u8]) -> Option<&'entity str>,
{
let mut unescaped = None;
let mut last_end = 0;
let mut iter = memchr::memchr2_iter(b'&', b';', raw);
Expand All @@ -171,12 +145,14 @@ pub fn do_unescape<'a>(

// search for character correctness
let pat = &raw[start + 1..end];
if let Some(s) = named_entity(pat) {
unescaped.extend_from_slice(s.as_bytes());
} else if pat.starts_with(b"#") {
push_utf8(unescaped, parse_number(&pat[1..], start..end)?);
} else if let Some(value) = custom_entities.and_then(|hm| hm.get(pat)) {
unescaped.extend_from_slice(&value);
if pat.starts_with(b"#") {
let entity = &pat[1..]; // starts after the #
let codepoint = parse_number(entity, start..end)?;
push_utf8(unescaped, codepoint);
} else if let Some(value) = named_entity(pat) {
unescaped.extend_from_slice(value.as_bytes());
} else if let Some(value) = resolve_entity(pat) {
unescaped.extend_from_slice(value.as_bytes());
} else {
return Err(EscapeError::UnrecognizedSymbol(
start + 1..end,
Expand Down Expand Up @@ -1740,18 +1716,20 @@ fn test_unescape() {

#[test]
fn test_unescape_with() {
let custom_entities = vec![(b"foo".to_vec(), b"BAR".to_vec())]
.into_iter()
.collect();
assert_eq!(&*unescape_with(b"test", &custom_entities).unwrap(), b"test");
let custom_entities = |ent: &[u8]| match ent {
b"foo" => Some("BAR"),
_ => None,
};

assert_eq!(&*unescape_with(b"test", custom_entities).unwrap(), b"test");
assert_eq!(
&*unescape_with(b"&lt;test&gt;", &custom_entities).unwrap(),
&*unescape_with(b"&lt;test&gt;", custom_entities).unwrap(),
b"<test>"
);
assert_eq!(&*unescape_with(b"&#x30;", &custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&#48;", &custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&foo;", &custom_entities).unwrap(), b"BAR");
assert!(unescape_with(b"&fop;", &custom_entities).is_err());
assert_eq!(&*unescape_with(b"&#x30;", custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&#48;", custom_entities).unwrap(), b"0");
assert_eq!(&*unescape_with(b"&foo;", custom_entities).unwrap(), b"BAR");
assert!(unescape_with(b"&fop;", custom_entities).is_err());
}

#[test]
Expand Down
Loading