diff --git a/crates/oxc_estree/src/serialize/strings.rs b/crates/oxc_estree/src/serialize/strings.rs index 665829ff23abd..8c11df54c0fe9 100644 --- a/crates/oxc_estree/src/serialize/strings.rs +++ b/crates/oxc_estree/src/serialize/strings.rs @@ -1,16 +1,18 @@ +use std::slice; + use oxc_data_structures::code_buffer::CodeBuffer; use super::{ESTree, Serializer}; /// Convert `char` to UTF-8 bytes array. -const fn to_bytes(ch: char) -> [u8; N] { +const fn char_to_bytes(ch: char) -> [u8; N] { let mut bytes = [0u8; N]; ch.encode_utf8(&mut bytes); bytes } /// Lossy replacement character (U+FFFD) as UTF-8 bytes. -const LOSSY_REPLACEMENT_CHAR_BYTES: [u8; 3] = to_bytes('\u{FFFD}'); +const LOSSY_REPLACEMENT_CHAR_BYTES: [u8; 3] = char_to_bytes('\u{FFFD}'); const LOSSY_REPLACEMENT_CHAR_FIRST_BYTE: u8 = LOSSY_REPLACEMENT_CHAR_BYTES[0]; // 0xEF /// A string which does not need any escaping in JSON. @@ -117,91 +119,191 @@ const fn create_table(lo: Escape) -> [Escape; 256] { // `#[inline(always)]` because this is a hot path, and to make compiler remove the code // for handling lone surrogates when outputting a normal string (the common case). #[inline(always)] +#[expect(clippy::cast_sign_loss)] fn write_str(s: &str, table: &[Escape; 256], buffer: &mut CodeBuffer) { buffer.print_ascii_byte(b'"'); let bytes = s.as_bytes(); + let mut chunk_start_ptr = bytes.as_ptr(); + let mut iter = bytes.iter(); + + 'outer: loop { + // Consume bytes which require no unescaping. + // Search in batches of 16 bytes for speed with longer strings. + const BATCH_SIZE: usize = 16; + + let mut byte; + let mut escape; + 'inner: loop { + if let Some(batch) = iter.as_slice().get(..BATCH_SIZE) { + // Enough bytes remaining to process as a batch. Compiler unrolls this loop. + for (i, &next_byte) in batch.iter().enumerate() { + byte = next_byte; + escape = table[byte as usize]; + if escape != Escape::__ { + // Consume bytes before this one. + // SAFETY: `i < BATCH_SIZE` and there are at least `BATCH_SIZE` bytes remaining in `iter` + unsafe { advance_unchecked(&mut iter, i) }; + break 'inner; + } + } - let mut start = 0; - let mut iter = bytes.iter().enumerate(); - while let Some((index, &byte)) = iter.next() { - let escape = table[byte as usize]; - if escape == Escape::__ { - continue; + // Consume the whole batch. + // SAFETY: There are at least `BATCH_SIZE` bytes remaining in `iter`. + unsafe { advance_unchecked(&mut iter, BATCH_SIZE) }; + + // Go round `'inner` loop again to continue searching + } else { + // Not enough bytes remaining for a batch. Search byte-by-byte. + for (i, &next_byte) in iter.clone().enumerate() { + byte = next_byte; + escape = table[byte as usize]; + if escape != Escape::__ { + // Consume bytes before this one. + // SAFETY: `i` is an index of `iter`, so cannot be out of bounds. + unsafe { advance_unchecked(&mut iter, i) }; + break 'inner; + } + } + + // Got to end of string with no further characters requiring escaping found. + // No need to advance `iter`, as we don't use it's current pointer again. + break 'outer; + } } - // Handle lone surrogates + // Found a character that needs escaping + + // Handle lone surrogates. + // `table == &ESCAPE_LONE_SURROGATES` is statically knowable when this function is inlined. + // That condition is included to remove this whole block when not converting lone surrogates + // in `impl ESTree for str`. if table == &ESCAPE_LONE_SURROGATES && escape == Escape::LO { - let (_, &next1) = iter.next().unwrap(); - let (_, &next2) = iter.next().unwrap(); - if [next1, next2] == [LOSSY_REPLACEMENT_CHAR_BYTES[1], LOSSY_REPLACEMENT_CHAR_BYTES[2]] - { + // SAFETY: `0xEF` is always 1st byte in a 3-byte UTF-8 character, + // so reading next 2 bytes cannot be out of bounds + let next_2_bytes = unsafe { iter.as_slice().get_unchecked(1..3) }; + if next_2_bytes == &LOSSY_REPLACEMENT_CHAR_BYTES[1..] { // Lossy replacement character (U+FFFD) is used as an escape before lone surrogates, - // with the code point as 4 x hex characters after it. - let (_, &hex1) = iter.next().unwrap(); - let (_, &hex2) = iter.next().unwrap(); - let (_, &hex3) = iter.next().unwrap(); - let (_, &hex4) = iter.next().unwrap(); - let hex = [hex1, hex2, hex3, hex4]; - - // Print the chunk upto before the lossy replacement character. + // with the code point as 4 x hex characters after it e.g. `\u{FFFD}d800`. + + // Print the chunk up to before the lossy replacement character. // SAFETY: 0xEF is always the start of a 3-byte unicode character. - // Therefore `index` must be on a UTF-8 character boundary. - unsafe { buffer.print_bytes_unchecked(&bytes[start..index]) }; + // Therefore `current_ptr` must be on a UTF-8 character boundary. + // `chunk_start_ptr` is start of string originally, and is only updated to be after + // an ASCII character, so must also be on a UTF-8 character boundary, and in bounds. + // `chunk_start_ptr` is after a previous byte so must be `<= current_ptr`. + unsafe { + let current_ptr = iter.as_slice().as_ptr(); + let len = current_ptr.offset_from(chunk_start_ptr) as usize; + let chunk = slice::from_raw_parts(chunk_start_ptr, len); + buffer.print_bytes_unchecked(chunk); + } + + // Consume the lossy replacement character. + // SAFETY: Lossy replacement character is 3 bytes. + unsafe { advance_unchecked(&mut iter, 3) }; - if hex == *b"fffd" { + let hex = iter.as_slice().get(..4).unwrap(); + if hex == b"fffd" { // This is an actual lossy replacement character (not an escaped lone surrogate) buffer.print_str("\u{FFFD}"); + + // Consume `fffd`. + // SAFETY: We know next 4 bytes are `fffd`. + unsafe { advance_unchecked(&mut iter, 4) }; + + // Set `chunk_start_ptr` to after `\u{FFFD}fffd`. + // That's a complete UTF-8 sequence, so `chunk_start_ptr` is definitely + // left on a UTF-8 character boundary. + chunk_start_ptr = iter.as_slice().as_ptr(); } else { // This is an escaped lone surrogate. - // Print `\uXXXX` where `XXXX` is hex characters. e.g. `\ud800`. + // Next 4 bytes should be code point encoded as 4 x hex bytes. + #[cfg(debug_assertions)] + for &b in hex { + assert!(matches!(b, b'0'..=b'9' | b'a'..=b'f')); + } + + // Print `\u`. Leave the hex bytes to be printed in next batch. + // After lossy replacement character is definitely a UTF-8 boundary. buffer.print_str("\\u"); - // Check all 4 hex bytes are ASCII - assert_eq!(u32::from_ne_bytes(hex) & 0x8080_8080, 0); - // SAFETY: Just checked all 4 bytes are ASCII - unsafe { buffer.print_bytes_unchecked(&hex) }; - } + chunk_start_ptr = iter.as_slice().as_ptr(); - // Skip the 3 bytes of the lossy replacement character + 4 hex bytes. - // We checked that all 4 hex bytes are ASCII, so `start` is definitely left on - // a UTF-8 character boundary. - start = index + 7; + // SAFETY: `iter.as_slice().get(..4).unwrap()` above would have panicked + // if there weren't at least 4 bytes remaining in `iter`. + // We haven't checked that the 4 following bytes are ASCII, but it doesn't matter + // whether `iter` is left on a UTF-8 char boundary or not. + unsafe { advance_unchecked(&mut iter, 4) } + } } else { - // Some other unicode character starting with 0xEF. Just continue the loop. + // Some other unicode character starting with 0xEF. + // Consume it and continue the loop. + // SAFETY: `0xEF` is always 1st byte in a 3-byte UTF-8 character. + unsafe { advance_unchecked(&mut iter, 3) }; } continue; } - if start < index { - // SAFETY: `bytes` is derived from a `&str`. - // `escape` is only non-zero for ASCII bytes, except `Escape::LO` which is handled above. - // Therefore current `index` must mark the end of a valid UTF8 character sequence. - // `start` is either the start of string, or after an ASCII character, - // therefore always the start of a valid UTF8 character sequence. - unsafe { buffer.print_bytes_unchecked(&bytes[start..index]) }; + // Print the chunk up to before the character which requires escaping. + let current_ptr = iter.as_slice().as_ptr(); + // SAFETY: `escape` is only non-zero for ASCII bytes, except `Escape::LO` which is handled above. + // Therefore `current_ptr` must be on an ASCII byte. + // `chunk_start_ptr` is start of string originally, and is only updated to be after + // an ASCII character, so must also be on a UTF-8 character boundary, and in bounds. + // `chunk_start_ptr` is after a previous byte so must be `<= current_ptr`. + unsafe { + let len = current_ptr.offset_from(chunk_start_ptr) as usize; + let chunk = slice::from_raw_parts(chunk_start_ptr, len); + buffer.print_bytes_unchecked(chunk); } write_char_escape(escape, byte, buffer); - start = index + 1; + // SAFETY: `'inner` loop above ensures `iter` is not at end of string + unsafe { advance_unchecked(&mut iter, 1) }; + + // Set `chunk_start_ptr` to be after this character. + // `escape` is only non-zero for ASCII bytes, except `Escape::LO` which is handled above. + // We just consumed that ASCII byte, so `chunk_start_ptr` must be on a UTF-8 char boundary. + chunk_start_ptr = iter.as_slice().as_ptr(); } - if start < bytes.len() { - // SAFETY: `bytes` is derived from a `&str`. - // `start` is either the start of string, or after an ASCII character, - // therefore always the start of a valid UTF8 character sequence. - unsafe { buffer.print_bytes_unchecked(&bytes[start..]) }; + // Print last chunk. + // SAFETY: Adding `len` to `ptr` cannot be out of bounds. + let end_ptr = unsafe { iter.as_slice().as_ptr().add(iter.as_slice().len()) }; + // SAFETY: `chunk_start_ptr` is start of string originally, and is only updated to be after + // an ASCII character, so must be on a UTF-8 character boundary, and in bounds. + // `chunk_start_ptr` is after a previous byte so must be `<= end_ptr`. + unsafe { + let len = end_ptr.offset_from(chunk_start_ptr) as usize; + let chunk = slice::from_raw_parts(chunk_start_ptr, len); + buffer.print_bytes_unchecked(chunk); } buffer.print_ascii_byte(b'"'); } +/// Advance bytes iterator by `count` bytes. +/// +/// # SAFETY +/// Caller must ensure there are at least `count` bytes remaining in `iter`. +#[inline] +unsafe fn advance_unchecked(iter: &mut slice::Iter, count: usize) { + // Compiler boils this down to just adding `count` to `iter`'s pointer. + // SAFETY: Caller guarantees there are at least `count` bytes remaining in `iter`. + unsafe { + let new_ptr = iter.as_slice().as_ptr().add(count); + let new_len = iter.as_slice().len() - count; + let slice = slice::from_raw_parts(new_ptr, new_len); + *iter = slice.iter(); + }; +} + fn write_char_escape(escape: Escape, byte: u8, buffer: &mut CodeBuffer) { #[expect(clippy::if_not_else)] if escape != Escape::UU { - buffer.print_ascii_byte(b'\\'); // SAFETY: All values of `Escape` are ASCII - unsafe { buffer.print_byte_unchecked(escape as u8) }; + unsafe { buffer.print_bytes_unchecked(&[b'\\', escape as u8]) }; } else { static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef"; let bytes = [