Skip to content

Commit b26e2f4

Browse files
committed
Fix incorrect assumption that normalize_xml_eol_step is always called only on characters that should be normalized
For example, the Euro sign (U+20AC), encoded as [e2 82 ac], was previously replaced by \n
1 parent 6cf8c9f commit b26e2f4

File tree

1 file changed

+126
-9
lines changed

1 file changed

+126
-9
lines changed

src/escape.rs

Lines changed: 126 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -326,13 +326,13 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
326326
// we are sure that index is within the string
327327
normalized.push_str(&text[0..i]);
328328

329-
let mut pos = normalize_xml_eol_step(&mut normalized, bytes, i, '\n');
329+
let mut pos = normalize_xml_eol_step(&mut normalized, text, i, '\n');
330330
while let Some(i) = memchr3(b'\r', 0xC2, 0xE2, &bytes[pos..]) {
331331
let index = pos + i;
332332
// NOTE: unsafe { text.get_unchecked(pos..index) } could be used because
333333
// we are sure that index is within the string
334334
normalized.push_str(&text[pos..index]);
335-
pos = normalize_xml_eol_step(&mut normalized, bytes, index, '\n');
335+
pos = normalize_xml_eol_step(&mut normalized, text, index, '\n');
336336
}
337337
if let Some(rest) = text.get(pos..) {
338338
normalized.push_str(rest);
@@ -378,21 +378,30 @@ pub(crate) fn normalize_xml_eols<'input>(text: &'input str) -> Cow<'input, str>
378378
///
379379
/// [eof]: https://www.w3.org/TR/xml11/#sec-line-ends
380380
/// [only for]: https://html.spec.whatwg.org/#normalize-newlines
381-
fn normalize_xml_eol_step(normalized: &mut String, input: &[u8], index: usize, ch: char) -> usize {
381+
fn normalize_xml_eol_step(normalized: &mut String, text: &str, index: usize, ch: char) -> usize {
382+
let input = text.as_bytes();
382383
match input[index] {
383384
b'\r' => {
384-
normalized.push(ch);
385385
if index + 1 < input.len() {
386386
let next = input[index + 1];
387387
if next == b'\n' {
388+
normalized.push(ch);
388389
return index + 2; // skip \r\n
389390
}
390391
// Because input is correct UTF-8, byte C2 always starts a two-byte character
391392
// in the range #x80..=#xBF, so the next byte tells whether it is #x85
392393
if next == 0xC2 {
393-
return index + 3; // skip UTF-8 encoding of #xD #x85 characters (0d c2 85)
394+
if index + 2 < input.len() && input[index + 2] == 0x85 {
395+
normalized.push(ch);
396+
} else {
397+
// NOTE: unsafe { text.get_unchecked(index..index + 3) } could be used because
398+
// we are sure that index is within the string
399+
normalized.push_str(&text[index..index + 3]);
400+
}
401+
return index + 3; // skip \r + UTF-8 encoding of character (c2 xx)
394402
}
395403
}
404+
normalized.push(ch);
396405
index + 1 // skip \r
397406
}
398407
b'\n' => {
@@ -401,13 +410,25 @@ fn normalize_xml_eol_step(normalized: &mut String, input: &[u8], index: usize, c
401410
}
402411
// Start of UTF-8 encoding of #x85 character (c2 85)
403412
0xC2 => {
404-
normalized.push(ch);
405-
index + 2 // skip UTF-8 encoding of #x85 character (c2 85)
413+
if index + 1 < input.len() && input[index + 1] == 0x85 {
414+
normalized.push(ch);
415+
} else {
416+
// NOTE: unsafe { text.get_unchecked(index..index + 2) } could be used because
417+
// we are sure that index is within the string
418+
normalized.push_str(&text[index..index + 2]);
419+
}
420+
index + 2 // skip UTF-8 encoding of character (c2 xx)
406421
}
407422
// Start of UTF-8 encoding of #x2028 character (e2 80 a8)
408423
0xE2 => {
409-
normalized.push(ch);
410-
index + 3 // skip UTF-8 encoding of #x2028 character (e2 80 a8)
424+
if index + 2 < input.len() && input[index + 1] == 0x80 && input[index + 2] == 0xA8 {
425+
normalized.push(ch);
426+
} else {
427+
// NOTE: unsafe { text.get_unchecked(index..index + 3) } could be used because
428+
// we are sure that index is within the string
429+
normalized.push_str(&text[index..index + 3]);
430+
}
431+
index + 3 // skip UTF-8 encoding of character (e2 xx xx)
411432
}
412433

413434
x => unreachable!(
@@ -2094,6 +2115,102 @@ mod normalization {
20942115
"\n\n\n\n\n\nsome\n\n\ntext",
20952116
);
20962117
}
2118+
2119+
#[test]
2120+
fn utf8_0xc2() {
2121+
// All possible characters encoded in 2 bytes in UTF-8 whose first byte is 0xC2 (0b11000010)
2122+
// Second byte follows the pattern 10xxxxxx
2123+
let first = str::from_utf8(&[0b11000010, 0b10000000])
2124+
.unwrap()
2125+
.chars()
2126+
.next()
2127+
.unwrap();
2128+
let last = str::from_utf8(&[0b11000010, 0b10111111])
2129+
.unwrap()
2130+
.chars()
2131+
.next()
2132+
.unwrap();
2133+
let mut utf8 = [0; 2];
2134+
for ch in first..=last {
2135+
ch.encode_utf8(&mut utf8);
2136+
let description = format!("UTF-8 [{:02x} {:02x}] = `{}`", utf8[0], utf8[1], ch);
2137+
let input = str::from_utf8(&utf8).expect(&description);
2138+
2139+
dbg!((input, &description));
2140+
if ch == '\u{0085}' {
2141+
assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
2142+
} else {
2143+
assert_eq!(normalize_xml_eols(input), input, "{}", description);
2144+
}
2145+
}
2146+
assert_eq!((first..=last).count(), 64);
2147+
}
2148+
2149+
#[test]
2150+
fn utf8_0x0d_0xc2() {
2151+
// All possible characters encoded in 2 bytes in UTF-8 whose first byte is 0xC2 (0b11000010)
2152+
// Second byte follows the pattern 10xxxxxx
2153+
let first = str::from_utf8(&[0b11000010, 0b10000000])
2154+
.unwrap()
2155+
.chars()
2156+
.next()
2157+
.unwrap();
2158+
let last = str::from_utf8(&[0b11000010, 0b10111111])
2159+
.unwrap()
2160+
.chars()
2161+
.next()
2162+
.unwrap();
2163+
let mut utf8 = [b'\r', 0, 0];
2164+
for ch in first..=last {
2165+
ch.encode_utf8(&mut utf8[1..]);
2166+
let description = format!(
2167+
"UTF-8 [{:02x} {:02x} {:02x}] = `{}`",
2168+
utf8[0], utf8[1], utf8[2], ch
2169+
);
2170+
let input = str::from_utf8(&utf8).expect(&description);
2171+
2172+
dbg!((input, &description));
2173+
if ch == '\u{0085}' {
2174+
assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
2175+
} else {
2176+
assert_eq!(normalize_xml_eols(input), input, "{}", description);
2177+
}
2178+
}
2179+
assert_eq!((first..=last).count(), 64);
2180+
}
2181+
2182+
#[test]
2183+
fn utf8_0xe2() {
2184+
// All possible characters encoded in 3 bytes in UTF-8 whose first byte is 0xE2 (0b11100010)
2185+
// Second and third bytes follow the pattern 10xxxxxx
2186+
let first = str::from_utf8(&[0b11100010, 0b10000000, 0b10000000])
2187+
.unwrap()
2188+
.chars()
2189+
.next()
2190+
.unwrap();
2191+
let last = str::from_utf8(&[0b11100010, 0b10111111, 0b10111111])
2192+
.unwrap()
2193+
.chars()
2194+
.next()
2195+
.unwrap();
2196+
let mut buf = [0; 3];
2197+
for ch in first..=last {
2198+
let input = &*ch.encode_utf8(&mut buf);
2199+
let buf = input.as_bytes();
2200+
let description = format!(
2201+
"UTF-8 [{:02x} {:02x} {:02x}] = `{}`",
2202+
buf[0], buf[1], buf[2], ch
2203+
);
2204+
2205+
dbg!((input, &description));
2206+
if ch == '\u{2028}' {
2207+
assert_eq!(normalize_xml_eols(input), "\n", "{}", description);
2208+
} else {
2209+
assert_eq!(normalize_xml_eols(input), input, "{}", description);
2210+
}
2211+
}
2212+
assert_eq!((first..=last).count(), 4096);
2213+
}
20972214
}
20982215

20992216
mod html {

0 commit comments

Comments
 (0)