@@ -326,13 +326,13 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
326326        // we are sure that index within string 
327327        normalized. push_str ( & text[ 0 ..i] ) ; 
328328
329-         let  mut  pos = normalize_xml_eol_step ( & mut  normalized,  bytes ,  i,  '\n' ) ; 
329+         let  mut  pos = normalize_xml_eol_step ( & mut  normalized,  text ,  i,  '\n' ) ; 
330330        while  let  Some ( i)  = memchr3 ( b'\r' ,  0xC2 ,  0xE2 ,  & bytes[ pos..] )  { 
331331            let  index = pos + i; 
332332            // NOTE: unsafe { text.get_unchecked(pos..index) } could be used because 
333333            // we are sure that index within string 
334334            normalized. push_str ( & text[ pos..index] ) ; 
335-             pos = normalize_xml_eol_step ( & mut  normalized,  bytes ,  index,  '\n' ) ; 
335+             pos = normalize_xml_eol_step ( & mut  normalized,  text ,  index,  '\n' ) ; 
336336        } 
337337        if  let  Some ( rest)  = text. get ( pos..)  { 
338338            normalized. push_str ( rest) ; 
@@ -378,21 +378,30 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
378378/// 
379379/// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends 
380380/// [only for]: https://html.spec.whatwg.org/#normalize-newlines 
381- fn  normalize_xml_eol_step ( normalized :  & mut  String ,  input :  & [ u8 ] ,  index :  usize ,  ch :  char )  -> usize  { 
381+ fn  normalize_xml_eol_step ( normalized :  & mut  String ,  text :  & str ,  index :  usize ,  ch :  char )  -> usize  { 
382+     let  input = text. as_bytes ( ) ; 
382383    match  input[ index]  { 
383384        b'\r'  => { 
384-             normalized. push ( ch) ; 
385385            if  index + 1  < input. len ( )  { 
386386                let  next = input[ index + 1 ] ; 
387387                if  next == b'\n'  { 
388+                     normalized. push ( ch) ; 
388389                    return  index + 2 ;  // skip \r\n 
389390                } 
390391                // Because input is correct UTF-8, byte C2 can only start a two-byte
391392                // character (c2 80..=c2 bf); only c2 85 is #x85, so check the next byte
392393                if  next == 0xC2  { 
393-                     return  index + 3 ;  // skip UTF-8 encoding of #xD #x85 characters (0d c2 85) 
394+                     if  index + 2  < input. len ( )  && input[ index + 2 ]  == 0x85  { 
395+                         normalized. push ( ch) ; 
396+                     }  else  { 
397+                         // NOTE: unsafe { text.get_unchecked(index..index + 3) } could be used because 
398+                         // we are sure that index within string 
399+                         normalized. push_str ( & text[ index..index + 3 ] ) ; 
400+                     } 
401+                     return  index + 3 ;  // skip \r + UTF-8 encoding of character (c2 xx) 
394402                } 
395403            } 
404+             normalized. push ( ch) ; 
396405            index + 1  // skip \r 
397406        } 
398407        b'\n'  => { 
@@ -401,13 +410,25 @@ fn normalize_xml_eol_step(normalized: &mut String, input: &[u8], index: usize, c
401410        } 
402411        // Start of UTF-8 encoding of #x85 character (c2 85) 
403412        0xC2  => { 
404-             normalized. push ( ch) ; 
405-             index + 2  // skip UTF-8 encoding of #x85 character (c2 85) 
413+             if  index + 1  < input. len ( )  && input[ index + 1 ]  == 0x85  { 
414+                 normalized. push ( ch) ; 
415+             }  else  { 
416+                 // NOTE: unsafe { text.get_unchecked(index..index + 2) } could be used because 
417+                 // we are sure that index within string 
418+                 normalized. push_str ( & text[ index..index + 2 ] ) ; 
419+             } 
420+             index + 2  // skip UTF-8 encoding of character (c2 xx) 
406421        } 
407422        // Start of UTF-8 encoding of #x2028 character (e2 80 a8) 
408423        0xE2  => { 
409-             normalized. push ( ch) ; 
410-             index + 3  // skip UTF-8 encoding of #x2028 character (e2 80 a8) 
424+             if  index + 2  < input. len ( )  && input[ index + 1 ]  == 0x80  && input[ index + 2 ]  == 0xA8  { 
425+                 normalized. push ( ch) ; 
426+             }  else  { 
427+                 // NOTE: unsafe { text.get_unchecked(index..index + 3) } could be used because 
428+                 // we are sure that index within string 
429+                 normalized. push_str ( & text[ index..index + 3 ] ) ; 
430+             } 
431+             index + 3  // skip UTF-8 encoding of character (e2 xx xx) 
411432        } 
412433
413434        x => unreachable ! ( 
@@ -2094,6 +2115,102 @@ mod normalization {
20942115                    "\n \n \n \n \n \n some\n \n \n text" , 
20952116                ) ; 
20962117            } 
2118+ 
/// Checks every character whose UTF-8 encoding starts with byte 0xC2.
///
/// Lead byte 0xC2 (0b11000010) followed by a continuation byte `10xxxxxx`
/// encodes exactly the 64 code points U+0080..=U+00BF (`c2 80`..=`c2 bf`).
/// Only U+0085 is an end-of-line character and must become `\n`;
/// every other character must pass through unchanged.
#[test]
fn utf8_0xc2() {
    let chars = '\u{0080}'..='\u{00BF}';
    assert_eq!(chars.clone().count(), 64);

    let mut buf = [0; 2];
    for ch in chars {
        let input = &*ch.encode_utf8(&mut buf);
        let bytes = input.as_bytes();
        let description = format!("UTF-8 [{:02x} {:02x}] = `{}`", bytes[0], bytes[1], ch);
        // Guard the premise of the test: all characters share the 0xC2 lead byte
        assert_eq!(bytes[0], 0xC2, "{}", description);

        if ch == '\u{0085}' {
            assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
        } else {
            assert_eq!(normalize_xml_eols(input), input, "{}", description);
        }
    }
}
2148+ 
/// Checks `\r` followed by every character whose UTF-8 encoding starts with
/// byte 0xC2 (U+0080..=U+00BF, i.e. bytes `0d c2 80`..=`0d c2 bf`).
///
/// The pair `\r` + U+0085 is a single line end and must become one `\n`.
/// For every other character the input is expected to be returned as is.
#[test]
fn utf8_0x0d_0xc2() {
    let chars = '\u{0080}'..='\u{00BF}';
    assert_eq!(chars.clone().count(), 64);

    for ch in chars {
        let input = format!("\r{}", ch);
        let bytes = input.as_bytes();
        let description = format!(
            "UTF-8 [{:02x} {:02x} {:02x}] = `{}`",
            bytes[0], bytes[1], bytes[2], ch
        );
        // Guard the premise of the test: \r is followed by the 0xC2 lead byte
        assert_eq!(bytes[1], 0xC2, "{}", description);

        if ch == '\u{0085}' {
            assert_eq!(normalize_xml_eols(&input), "\n", "{}", description);
        } else {
            // NOTE(review): this pins the current behaviour, where `\r` is copied
            // to the output together with the following `c2 xx` character.
            // The XML 1.1 line-end rules translate any #xD that is not followed
            // by #xA or #x85 to #xA, so `\n` + character may be the intended
            // result here - confirm against the specification.
            assert_eq!(normalize_xml_eols(&input), input.as_str(), "{}", description);
        }
    }
}
2181+ 
/// Checks every character whose UTF-8 encoding starts with byte 0xE2.
///
/// Lead byte 0xE2 (0b11100010) followed by two continuation bytes `10xxxxxx`
/// encodes exactly the 4096 code points U+2000..=U+2FFF
/// (`e2 80 80`..=`e2 bf bf`). Only U+2028 is an end-of-line character and
/// must become `\n`; every other character must pass through unchanged.
#[test]
fn utf8_0xe2() {
    let chars = '\u{2000}'..='\u{2FFF}';
    assert_eq!(chars.clone().count(), 4096);

    let mut buf = [0; 3];
    for ch in chars {
        let input = &*ch.encode_utf8(&mut buf);
        let bytes = input.as_bytes();
        let description = format!(
            "UTF-8 [{:02x} {:02x} {:02x}] = `{}`",
            bytes[0], bytes[1], bytes[2], ch
        );
        // Guard the premise of the test: all characters share the 0xE2 lead byte
        assert_eq!(bytes[0], 0xE2, "{}", description);

        if ch == '\u{2028}' {
            assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
        } else {
            assert_eq!(normalize_xml_eols(input), input, "{}", description);
        }
    }
}
20972214        } 
20982215
20992216        mod  html { 
0 commit comments