From 0334c45bb525f7eb9078f3c5b5074320a9ac7a3d Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Wed, 1 May 2024 10:32:26 +0200 Subject: [PATCH 1/5] Write `char::EscapeDebug` sequences using `write_str` Instead of writing each `char` of an escape sequence one by one, this delegates to `Display`, which uses `write_str` internally in order to write the whole escape sequence at once. --- library/core/benches/str/debug.rs | 4 ++-- library/core/src/fmt/mod.rs | 11 ++++------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/library/core/benches/str/debug.rs b/library/core/benches/str/debug.rs index 7c72228f0fb5b..cb91169eed8eb 100644 --- a/library/core/benches/str/debug.rs +++ b/library/core/benches/str/debug.rs @@ -44,7 +44,7 @@ fn ascii_escapes(b: &mut Bencher) { assert_fmt( s, r#""some\tmore\tascii\ttext\nthis time with some \"escapes\", also 64 byte""#, - 21, + 15, ); b.iter(|| { black_box(format!("{:?}", black_box(s))); @@ -72,7 +72,7 @@ fn mostly_unicode(b: &mut Bencher) { #[bench] fn mixed(b: &mut Bencher) { let s = "\"❤️\"\n\"hűha ez betű\"\n\"кириллических букв\"."; - assert_fmt(s, r#""\"❤\u{fe0f}\"\n\"hűha ez betű\"\n\"кириллических букв\".""#, 36); + assert_fmt(s, r#""\"❤\u{fe0f}\"\n\"hűha ez betű\"\n\"кириллических букв\".""#, 21); b.iter(|| { black_box(format!("{:?}", black_box(s))); }); diff --git a/library/core/src/fmt/mod.rs b/library/core/src/fmt/mod.rs index 9b372eac52455..10e1d27c88a92 100644 --- a/library/core/src/fmt/mod.rs +++ b/library/core/src/fmt/mod.rs @@ -2409,9 +2409,7 @@ impl Debug for str { // If char needs escaping, flush backlog so far and write, else skip if esc.len() != 1 { f.write_str(&self[from..i])?; - for c in esc { - f.write_char(c)?; - } + Display::fmt(&esc, f)?; from = i + c.len_utf8(); } } @@ -2431,13 +2429,12 @@ impl Display for str { impl Debug for char { fn fmt(&self, f: &mut Formatter<'_>) -> Result { f.write_char('\'')?; - for c in self.escape_debug_ext(EscapeDebugExtArgs { + let esc = 
self.escape_debug_ext(EscapeDebugExtArgs { escape_grapheme_extended: true, escape_single_quote: true, escape_double_quote: false, - }) { - f.write_char(c)? - } + }); + Display::fmt(&esc, f)?; f.write_char('\'') } } From 3fda931afe98ccfbc2da7307731b268a38b153e9 Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Thu, 15 Feb 2024 17:36:21 +0100 Subject: [PATCH 2/5] Add a fast-path to `Debug` ASCII `&str` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of going through the `EscapeDebug` machinery, we can just skip over ASCII chars that don’t need any escaping. --- library/core/src/fmt/mod.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/library/core/src/fmt/mod.rs b/library/core/src/fmt/mod.rs index 10e1d27c88a92..4cc2a9cf96d82 100644 --- a/library/core/src/fmt/mod.rs +++ b/library/core/src/fmt/mod.rs @@ -2401,6 +2401,11 @@ impl Debug for str { f.write_char('"')?; let mut from = 0; for (i, c) in self.char_indices() { + // a fast path for ASCII chars that do not need escapes: + if matches!(c, ' '..='~') && !matches!(c, '\\' | '\"') { + continue; + } + let esc = c.escape_debug_ext(EscapeDebugExtArgs { escape_grapheme_extended: true, escape_single_quote: false, From 42d870ec8815b860b31f03f938ad911a49a5cff3 Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Fri, 3 May 2024 13:16:26 +0200 Subject: [PATCH 3/5] Introduce printable-ASCII fast-path for `impl Debug for str` Instead of having a single loop that works on utf-8 `char`s, this splits the implementation into a loop that quickly skips over printable ASCII, falling back to per-char iteration for other chunks. 
--- library/core/src/fmt/mod.rs | 59 ++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/library/core/src/fmt/mod.rs b/library/core/src/fmt/mod.rs index 4cc2a9cf96d82..7fbbbb67f82e6 100644 --- a/library/core/src/fmt/mod.rs +++ b/library/core/src/fmt/mod.rs @@ -2399,26 +2399,51 @@ impl Display for bool { impl Debug for str { fn fmt(&self, f: &mut Formatter<'_>) -> Result { f.write_char('"')?; - let mut from = 0; - for (i, c) in self.char_indices() { - // a fast path for ASCII chars that do not need escapes: - if matches!(c, ' '..='~') && !matches!(c, '\\' | '\"') { - continue; - } - let esc = c.escape_debug_ext(EscapeDebugExtArgs { - escape_grapheme_extended: true, - escape_single_quote: false, - escape_double_quote: true, - }); - // If char needs escaping, flush backlog so far and write, else skip - if esc.len() != 1 { - f.write_str(&self[from..i])?; - Display::fmt(&esc, f)?; - from = i + c.len_utf8(); + // substring we know is printable + let mut printable_range = 0..0; + + fn needs_escape(b: u8) -> bool { + b > 0x7E || b < 0x20 || b == b'\\' || b == b'"' + } + + // the outer loop here splits the string into chunks of printable ASCII, which is just skipped over, + // and chunks of other chars (unicode, or ASCII that needs escaping), which is handler per-`char`. + let mut rest = self.as_bytes(); + while rest.len() > 0 { + let Some(non_printable_start) = rest.iter().position(|&b| needs_escape(b)) else { + printable_range.end += rest.len(); + break; + }; + + printable_range.end += non_printable_start; + // SAFETY: the position was derived from an iterator, so is known to be within bounds, and at a char boundary + rest = unsafe { rest.get_unchecked(non_printable_start..) 
}; + + let printable_start = rest.iter().position(|&b| !needs_escape(b)).unwrap_or(rest.len()); + let prefix; + // SAFETY: the position was derived from an iterator, so is known to be within bounds, and at a char boundary + (prefix, rest) = unsafe { rest.split_at_unchecked(printable_start) }; + // SAFETY: prefix is a valid utf8 sequence, and at a char boundary + let prefix = unsafe { crate::str::from_utf8_unchecked(prefix) }; + + for c in prefix.chars() { + let esc = c.escape_debug_ext(EscapeDebugExtArgs { + escape_grapheme_extended: true, + escape_single_quote: false, + escape_double_quote: true, + }); + if esc.len() != 1 { + f.write_str(&self[printable_range.clone()])?; + Display::fmt(&esc, f)?; + printable_range.start = printable_range.end + c.len_utf8(); + } + printable_range.end += c.len_utf8(); } } - f.write_str(&self[from..])?; + + f.write_str(&self[printable_range])?; + f.write_char('"') } } From aaba972e06a35ca6988f41f31ca56d747eac4dc9 Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Mon, 20 May 2024 11:31:02 +0200 Subject: [PATCH 4/5] Switch to primarily using `&str` Surprisingly, benchmarks have shown that using `&str` instead of `&[u8]` with some `unsafe` code is actually faster. --- library/core/src/fmt/mod.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/library/core/src/fmt/mod.rs b/library/core/src/fmt/mod.rs index 7fbbbb67f82e6..b9f6b2d35c9f2 100644 --- a/library/core/src/fmt/mod.rs +++ b/library/core/src/fmt/mod.rs @@ -2409,9 +2409,10 @@ impl Debug for str { // the outer loop here splits the string into chunks of printable ASCII, which is just skipped over, // and chunks of other chars (unicode, or ASCII that needs escaping), which is handler per-`char`. 
- let mut rest = self.as_bytes(); + let mut rest = self; while rest.len() > 0 { - let Some(non_printable_start) = rest.iter().position(|&b| needs_escape(b)) else { + let Some(non_printable_start) = rest.as_bytes().iter().position(|&b| needs_escape(b)) + else { printable_range.end += rest.len(); break; }; @@ -2420,12 +2421,10 @@ impl Debug for str { // SAFETY: the position was derived from an iterator, so is known to be within bounds, and at a char boundary rest = unsafe { rest.get_unchecked(non_printable_start..) }; - let printable_start = rest.iter().position(|&b| !needs_escape(b)).unwrap_or(rest.len()); + let printable_start = + rest.as_bytes().iter().position(|&b| !needs_escape(b)).unwrap_or(rest.len()); let prefix; - // SAFETY: the position was derived from an iterator, so is known to be within bounds, and at a char boundary - (prefix, rest) = unsafe { rest.split_at_unchecked(printable_start) }; - // SAFETY: prefix is a valid utf8 sequence, and at a char boundary - let prefix = unsafe { crate::str::from_utf8_unchecked(prefix) }; + (prefix, rest) = rest.split_at(printable_start); for c in prefix.chars() { let esc = c.escape_debug_ext(EscapeDebugExtArgs { From 004100c222638c980b6509aba0ed4990181fa5dc Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Mon, 20 May 2024 12:04:21 +0200 Subject: [PATCH 5/5] Process a single not-ASCII-printable `char` per iteration This avoids having to collect a non-ASCII-printable run before processing it. 
--- library/core/src/fmt/mod.rs | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/library/core/src/fmt/mod.rs b/library/core/src/fmt/mod.rs index b9f6b2d35c9f2..7f115d5cb20ba 100644 --- a/library/core/src/fmt/mod.rs +++ b/library/core/src/fmt/mod.rs @@ -2407,8 +2407,8 @@ impl Debug for str { b > 0x7E || b < 0x20 || b == b'\\' || b == b'"' } - // the outer loop here splits the string into chunks of printable ASCII, which is just skipped over, - // and chunks of other chars (unicode, or ASCII that needs escaping), which is handler per-`char`. + // the loop here first skips over runs of printable ASCII as a fast path. + // other chars (unicode, or ASCII that needs escaping) are then handled per-`char`. let mut rest = self; while rest.len() > 0 { let Some(non_printable_start) = rest.as_bytes().iter().position(|&b| needs_escape(b)) @@ -2421,12 +2421,8 @@ impl Debug for str { // SAFETY: the position was derived from an iterator, so is known to be within bounds, and at a char boundary rest = unsafe { rest.get_unchecked(non_printable_start..) }; - let printable_start = - rest.as_bytes().iter().position(|&b| !needs_escape(b)).unwrap_or(rest.len()); - let prefix; - (prefix, rest) = rest.split_at(printable_start); - - for c in prefix.chars() { + let mut chars = rest.chars(); + if let Some(c) = chars.next() { let esc = c.escape_debug_ext(EscapeDebugExtArgs { escape_grapheme_extended: true, escape_single_quote: false, @@ -2439,6 +2435,7 @@ impl Debug for str { } printable_range.end += c.len_utf8(); } + rest = chars.as_str(); } f.write_str(&self[printable_range])?;