Skip to content

Commit

Permalink
Properly normalize attribute values
Browse files Browse the repository at this point in the history
closes tafia#371
  • Loading branch information
dralley committed Apr 3, 2022
1 parent d872771 commit ae266a1
Show file tree
Hide file tree
Showing 2 changed files with 180 additions and 41 deletions.
126 changes: 125 additions & 1 deletion src/events/attributes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,96 @@ impl<'a> From<(&'a str, &'a str)> for Attribute<'a> {
}
}

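// Attribute-value normalization algorithm, quoted from the XML 1.0 specification, section 3.3.3 ("Attribute-Value Normalization"):
//
// 1) All line breaks MUST have been normalized on input to #xA as described in 2.11 End-of-Line Handling, so the rest of this algorithm operates on text normalized in this way.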
// 2) Begin with a normalized value consisting of the empty string.
// 3) For each character, entity reference, or character reference in the unnormalized attribute value, beginning with the first and continuing to the last, do the following:
// * For a character reference, append the referenced character to the normalized value.
// * For an entity reference, recursively apply step 3 of this algorithm to the replacement text of the entity.
// * For a white space character (#x20, #xD, #xA, #x9), append a space character (#x20) to the normalized value.
// * For another character, append the character to the normalized value.
//
// If the attribute type is not CDATA, then the XML processor MUST further process the normalized attribute value by discarding any leading and trailing space (#x20) characters,
// and by replacing sequences of space (#x20) characters by a single space (#x20) character.
//
// Note that if the unnormalized attribute value contains a character reference to a white space character other than space (#x20), the normalized value contains the referenced
// character itself (#xD, #xA or #x9). This contrasts with the case where the unnormalized value contains a white space character (not a reference), which is replaced with a
// space character (#x20) in the normalized value and also contrasts with the case where the unnormalized value contains an entity reference whose replacement text contains a
// white space character; being recursively processed, the white space character is replaced with a space character (#x20) in the normalized value.
/// Normalizes the whitespace of a raw attribute value.
///
/// The bytes `#x20`, `#x9`, `#xA` and `#xD` are all treated as white space.
/// Leading and trailing white space is discarded and every inner run of white
/// space is replaced by a single space (`#x20`).
///
/// When the input is already in that form (including the empty value) it is
/// returned as-is, so a borrowed `Cow` stays borrowed and nothing is allocated.
/// Otherwise a new owned buffer is returned.
///
/// TODO: character references and entity references are not expanded yet.
///
/// NOTE(review): trimming and collapsing is applied unconditionally here,
/// i.e. every attribute is handled like a non-CDATA attribute type. Confirm
/// that this is intended for attributes without a declared type.
fn normalize_attribute_value(attr: Cow<[u8]>) -> Cow<[u8]> {
    /// Returns `true` for the four bytes this function treats as white space.
    fn is_xml_space(byte: u8) -> bool {
        match byte {
            b' ' | b'\t' | b'\n' | b'\r' => true,
            _ => false,
        }
    }

    /// Returns `true` when `bytes` would come out of normalization unchanged.
    fn is_normalized(bytes: &[u8]) -> bool {
        // Starting as if a space was just seen makes a leading space fail the
        // same check that rejects a doubled space.
        let mut previous_was_space = true;
        for &byte in bytes {
            match byte {
                // These always have to be rewritten to a plain space.
                b'\t' | b'\n' | b'\r' => return false,
                // Leading space or the second space of a run.
                b' ' if previous_was_space => return false,
                _ => previous_was_space = byte == b' ',
            }
        }
        // A non-empty value ending in a space still needs trimming.
        bytes.is_empty() || !previous_was_space
    }

    // Fast path: hand the input back untouched instead of copying it.
    if is_normalized(&attr) {
        return attr;
    }

    // The normalized value can never be longer than the input.
    let mut normalized = Vec::with_capacity(attr.len());
    let tokens = attr
        .split(|&byte| is_xml_space(byte))
        .filter(|token| !token.is_empty());
    for token in tokens {
        // Separate tokens with exactly one space; none before the first and
        // none after the last, which trims both ends implicitly.
        if !normalized.is_empty() {
            normalized.push(b' ');
        }
        normalized.extend_from_slice(token);
    }

    Cow::Owned(normalized)
}

impl<'a> Iterator for Attributes<'a> {
type Item = Result<Attribute<'a>>;
fn next(&mut self) -> Option<Self::Item> {
Expand All @@ -355,7 +445,7 @@ impl<'a> Iterator for Attributes<'a> {
($key:expr, $val:expr) => {
Some(Ok(Attribute {
key: &self.bytes[$key],
value: Cow::Borrowed(&self.bytes[$val]),
value: normalize_attribute_value(Cow::Borrowed(&self.bytes[$val])),
}))
};
}
Expand Down Expand Up @@ -513,4 +603,38 @@ mod tests {
assert_eq!(&*a.value, b"ee");
assert!(attributes.next().is_none());
}

#[test]
fn attribute_value_normalization() {
    /// Asserts that normalizing `input` yields exactly `expected`.
    fn check(input: &[u8], expected: &[u8]) {
        assert_eq!(
            &*normalize_attribute_value(Cow::Borrowed(input)),
            expected,
            "unexpected normalization of {:?}",
            input
        );
    }

    // return, tab, and newline characters (0xD, 0x9, 0xA) must be replaced with a space character
    check(b"\rfoo\rbar\tbaz\ndelta\n", b"foo bar baz delta");
    // leading and trailing spaces must be stripped
    check(b" foo ", b"foo");
    // leading space
    check(b" bar", b"bar");
    // trailing space
    check(b"baz ", b"baz");
    // sequences of spaces must be replaced with a single space
    // (the input deliberately contains runs of two and three spaces)
    check(b"  foo   bar  baz  ", b"foo bar baz");
    // sequence replacement mixed with characters treated as whitespace (\t \r \n)
    check(
        b" \tfoo\tbar \rbaz \n\ndelta\n\t\r echo  foxtrot\r",
        b"foo bar baz delta echo foxtrot",
    );
    // a value that is already normalized must come back unchanged
    check(b"foo bar", b"foo bar");
    // empty and whitespace-only values normalize to the empty value
    check(b"", b"");
    check(b" \t\r\n ", b"");
}
}
95 changes: 55 additions & 40 deletions tests/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,19 @@ fn test_attributes_empty() {
match r.read_event(&mut buf) {
Ok(Empty(e)) => {
let mut atts = e.attributes();

match atts.next() {
Some(Ok(Attribute {
key: b"att1",
value: Cow::Borrowed(b"a"),
})) => (),
Some(Ok(a)) => {
assert_eq!(b"att1", a.key);
assert_eq!(b"a", a.value.as_ref());
}
e => panic!("Expecting att1='a' attribute, found {:?}", e),
}
match atts.next() {
Some(Ok(Attribute {
key: b"att2",
value: Cow::Borrowed(b"b"),
})) => (),
Some(Ok(a)) => {
assert_eq!(b"att2", a.key);
assert_eq!(b"b", a.value.as_ref());
}
e => panic!("Expecting att2='b' attribute, found {:?}", e),
}
match atts.next() {
Expand All @@ -68,10 +69,10 @@ fn test_attribute_equal() {
Ok(Empty(e)) => {
let mut atts = e.attributes();
match atts.next() {
Some(Ok(Attribute {
key: b"att1",
value: Cow::Borrowed(b"a=b"),
})) => (),
Some(Ok(a)) => {
assert_eq!(a.key, b"att1");
assert_eq!(a.value.as_ref(), b"a=b");
}
e => panic!("Expecting att1=\"a=b\" attribute, found {:?}", e),
}
match atts.next() {
Expand Down Expand Up @@ -119,18 +120,23 @@ fn test_attributes_empty_ns() {
.attributes()
.map(|ar| ar.expect("Expecting attribute parsing to succeed."))
// we don't care about xmlns attributes for this test
.filter(|kv| !kv.key.starts_with(b"xmlns"))
.map(|Attribute { key: name, value }| {
let (opt_ns, local_name) = r.attribute_namespace(name, &ns_buf);
(opt_ns, local_name, value)
});
.filter(|kv| !kv.key.starts_with(b"xmlns"));

match atts.next() {
Some((None, b"att1", Cow::Borrowed(b"a"))) => (),
Some(a) => {
let (opt_ns, local_name) = r.attribute_namespace(a.key, &ns_buf);
assert_eq!(opt_ns, None);
assert_eq!(local_name, b"att1");
assert_eq!(a.value.as_ref(), b"a");
}
e => panic!("Expecting att1='a' attribute, found {:?}", e),
}
match atts.next() {
Some((Some(ns), b"att2", Cow::Borrowed(b"b"))) => {
assert_eq!(&ns[..], b"urn:example:r");
Some(a) => {
let (opt_ns, local_name) = r.attribute_namespace(a.key, &ns_buf);
assert_eq!(opt_ns.as_ref(), Some(&b"urn:example:r".as_ref()));
assert_eq!(local_name, b"att2");
assert_eq!(a.value.as_ref(), b"b");
}
e => panic!(
"Expecting {{urn:example:r}}att2='b' attribute, found {:?}",
Expand Down Expand Up @@ -164,18 +170,23 @@ fn test_attributes_empty_ns_expanded() {
.attributes()
.map(|ar| ar.expect("Expecting attribute parsing to succeed."))
// we don't care about xmlns attributes for this test
.filter(|kv| !kv.key.starts_with(b"xmlns"))
.map(|Attribute { key: name, value }| {
let (opt_ns, local_name) = r.attribute_namespace(name, &ns_buf);
(opt_ns, local_name, value)
});
.filter(|kv| !kv.key.starts_with(b"xmlns"));

match atts.next() {
Some((None, b"att1", Cow::Borrowed(b"a"))) => (),
Some(a) => {
let (opt_ns, local_name) = r.attribute_namespace(a.key, &ns_buf);
assert_eq!(opt_ns, None);
assert_eq!(local_name, b"att1");
assert_eq!(a.value.as_ref(), b"a");
}
e => panic!("Expecting att1='a' attribute, found {:?}", e),
}
match atts.next() {
Some((Some(ns), b"att2", Cow::Borrowed(b"b"))) => {
assert_eq!(&ns[..], b"urn:example:r");
Some(a) => {
let (opt_ns, local_name) = r.attribute_namespace(a.key, &ns_buf);
assert_eq!(opt_ns.as_ref(), Some(&b"urn:example:r".as_ref()));
assert_eq!(local_name, b"att2");
assert_eq!(a.value.as_ref(), b"b");
}
e => panic!(
"Expecting {{urn:example:r}}att2='b' attribute, found {:?}",
Expand Down Expand Up @@ -229,15 +240,17 @@ fn test_default_ns_shadowing_empty() {
.attributes()
.map(|ar| ar.expect("Expecting attribute parsing to succeed."))
// we don't care about xmlns attributes for this test
.filter(|kv| !kv.key.starts_with(b"xmlns"))
.map(|Attribute { key: name, value }| {
let (opt_ns, local_name) = r.attribute_namespace(name, &ns_buf);
(opt_ns, local_name, value)
});
.filter(|kv| !kv.key.starts_with(b"xmlns"));

// the attribute should _not_ have a namespace name. The default namespace does not
// apply to attributes.
match atts.next() {
Some((None, b"att1", Cow::Borrowed(b"a"))) => (),
Some(a) => {
let (opt_ns, local_name) = r.attribute_namespace(a.key, &ns_buf);
assert_eq!(opt_ns, None);
assert_eq!(local_name, b"att1");
assert_eq!(a.value.as_ref(), b"a")
}
e => panic!("Expecting att1='a' attribute, found {:?}", e),
}
match atts.next() {
Expand Down Expand Up @@ -291,15 +304,17 @@ fn test_default_ns_shadowing_expanded() {
.attributes()
.map(|ar| ar.expect("Expecting attribute parsing to succeed."))
// we don't care about xmlns attributes for this test
.filter(|kv| !kv.key.starts_with(b"xmlns"))
.map(|Attribute { key: name, value }| {
let (opt_ns, local_name) = r.attribute_namespace(name, &ns_buf);
(opt_ns, local_name, value)
});
.filter(|kv| !kv.key.starts_with(b"xmlns"));

// the attribute should _not_ have a namespace name. The default namespace does not
// apply to attributes.
match atts.next() {
Some((None, b"att1", Cow::Borrowed(b"a"))) => (),
Some(a) => {
let (opt_ns, local_name) = r.attribute_namespace(a.key, &ns_buf);
assert_eq!(opt_ns, None);
assert_eq!(local_name, b"att1");
assert_eq!(a.value.as_ref(), b"a");
}
e => panic!("Expecting att1='a' attribute, found {:?}", e),
}
match atts.next() {
Expand Down

0 comments on commit ae266a1

Please sign in to comment.