tafia · yorkz1994 · Sep 25, 2024 · Sep 25, 2024 · Sep 25, 2024 · Sep 25, 2024
diff --git a/Changelog.md b/Changelog.md
@@ -15,6 +15,10 @@
 
 ### New Features
 
+- [#806]: Perform normalization of line end during unescape process.
+
+[#806]: https://github.com/tafia/quick-xml/issues/806
+
 ### Bug Fixes
 
 ### Misc Changes

diff --git a/src/escape.rs b/src/escape.rs
@@ -1,6 +1,6 @@
 //! Manage xml character escapes
 
-use memchr::memchr2_iter;
+use memchr::{memchr2_iter, memchr_iter};
 use std::borrow::Cow;
 use std::num::ParseIntError;
 use std::ops::Range;
@@ -266,7 +266,7 @@ where
                     unescaped = Some(String::with_capacity(raw.len()));
                 }
                 let unescaped = unescaped.as_mut().expect("initialized");
-                unescaped.push_str(&raw[last_end..start]);
+                unescaped.push_str(&normalize_line_end(&raw[last_end..start]));
 
                 // search for character correctness
                 let pat = &raw[start + 1..end];
@@ -290,11 +290,39 @@ where
 
     if let Some(mut unescaped) = unescaped {
         if let Some(raw) = raw.get(last_end..) {
-            unescaped.push_str(raw);
+            unescaped.push_str(&normalize_line_end(raw));
         }
         Ok(Cow::Owned(unescaped))
     } else {
-        Ok(Cow::Borrowed(raw))
+        Ok(normalize_line_end(raw))
+    }
+}
+
+/// Normalize the line end, replace \r or \r\n with \n.
+#[inline]
+fn normalize_line_end(input: &str) -> Cow<str> {
+    let bytes = input.as_bytes();
+    let mut normalized = None;
+    let mut start = 0;
+    let iter = memchr_iter(b'\r', bytes);
+    for i in iter {
+        if normalized.is_none() {
+            normalized = Some(String::with_capacity(input.len()))
+        }
+        let normalized = normalized.as_mut().expect("initialized");
+        normalized.push_str(&input[start..i]);
+        normalized.push('\n');
+        start = i + 1;
+        if matches!(bytes.get(start), Some(&c) if c == b'\n') {
+            // \n right after \r, \r\n case, skip \n because we have already replaced \r with \n
+            start += 1;
+        }
+    }
+    if let Some(mut normalized) = normalized {
+        normalized.push_str(&input[start..]);
+        Cow::Owned(normalized)
+    } else {
+        input.into()
     }
 }
 

diff --git a/tests/escape.rs b/tests/escape.rs
@@ -75,6 +75,27 @@ fn unescape() {
     );
 }
 
+#[test]
+fn unescape_line_end() {
+    let unchanged = escape::unescape("test\n");
+    // assert_eq does not check that Cow is borrowed, but we explicitly use Cow
+    // because it influences diff
+    // TODO: use assert_matches! when stabilized and other features will bump MSRV
+    assert_eq!(unchanged, Ok(Cow::Borrowed("test\n")));
+    assert!(matches!(unchanged, Ok(Cow::Borrowed(_))));
+
+    assert_eq!(
+        escape::unescape("&lt;&amp;test&apos;\r&quot;\r\n&gt;\r\n\r"),
+        Ok("<&test'\n\"\n>\n\n".into())
+    );
+    assert_eq!(escape::unescape("&#x30;\r\r\n"), Ok("0\n\n".into()));
+    assert_eq!(escape::unescape("\r&#48;\n\r\r"), Ok("\n0\n\n\n".into()));
+    assert_eq!(
+        escape::unescape("\r\n&foo;\n"),
+        Err(EscapeError::UnrecognizedEntity(3..6, "foo".into()))
+    );
+}
+
 /// XML allows any number of leading zeroes. That is not explicitly mentioned
 /// in the specification, but enforced by the conformance test suite
 /// (https://www.w3.org/XML/Test/)

diff --git a/tests/reader-attributes.rs b/tests/reader-attributes.rs
@@ -1,7 +1,10 @@
 use std::borrow::Cow;
 
 use quick_xml::events::attributes::Attribute;
-use quick_xml::events::{BytesEnd, Event::*};
+use quick_xml::events::{
+    BytesEnd,
+    Event::{self, *},
+};
 use quick_xml::name::QName;
 use quick_xml::reader::Reader;
 
@@ -159,3 +162,31 @@ fn equal_sign_in_value() {
         e => panic!("Expecting Empty event, got {:?}", e),
     }
 }
+
+#[test]
+fn line_ends() {
+    const XML: &str = "<root attribute=\"\r\r\n\nvalue1\r\r\n\nvalue2\r\r\n\n\">\r\r\n\nvalue3\r\r\n\nvalue4\r\r\n\n</root>";
+    let mut reader = Reader::from_str(XML);
+    match reader.read_event().unwrap() {
+        Event::Start(event) => {
+            let mut iter = event.attributes();
+            let a = iter.next().unwrap().unwrap();
+            #[cfg(not(feature = "encoding"))]
+            assert_eq!(
+                a.unescape_value().unwrap(),
+                "\n\n\nvalue1\n\n\nvalue2\n\n\n"
+            );
+            assert_eq!(
+                a.decode_and_unescape_value(reader.decoder()).unwrap(),
+                "\n\n\nvalue1\n\n\nvalue2\n\n\n"
+            );
+        }
+        event => panic!("Expected Start, found {:?}", event),
+    }
+    match reader.read_event().unwrap() {
+        Event::Text(event) => {
+            assert_eq!(event.unescape().unwrap(), "\n\n\nvalue3\n\n\nvalue4\n\n\n")
+        }
+        event => panic!("Expected Text, found {:?}", event),
+    }
+}
diff --git a/tests/serde-se.rs b/tests/serde-se.rs
@@ -1955,9 +1955,9 @@ mod with_root {
             <root>3</root>");
     serialize_as!(tuple:
         // Use to_string() to get owned type that is required for deserialization
-        ("<\"&'>".to_string(), "with\t\r\n spaces", 3usize)
+        ("<\"&'>".to_string(), "with\t\n spaces", 3usize)
         => "<root>&lt;\"&amp;'&gt;</root>\
-            <root>with\t\r\n spaces</root>\
+            <root>with\t\n spaces</root>\
             <root>3</root>");
     serialize_as!(tuple_struct:
         Tuple(42.0, "answer")