Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
202 changes: 152 additions & 50 deletions crates/oxc_estree/src/serialize/strings.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
use std::slice;

use oxc_data_structures::code_buffer::CodeBuffer;

use super::{ESTree, Serializer};

/// Convert `char` to UTF-8 bytes array.
const fn to_bytes<const N: usize>(ch: char) -> [u8; N] {
const fn char_to_bytes<const N: usize>(ch: char) -> [u8; N] {
let mut bytes = [0u8; N];
ch.encode_utf8(&mut bytes);
bytes
}

/// Lossy replacement character (U+FFFD) as UTF-8 bytes.
const LOSSY_REPLACEMENT_CHAR_BYTES: [u8; 3] = to_bytes('\u{FFFD}');
const LOSSY_REPLACEMENT_CHAR_BYTES: [u8; 3] = char_to_bytes('\u{FFFD}');
const LOSSY_REPLACEMENT_CHAR_FIRST_BYTE: u8 = LOSSY_REPLACEMENT_CHAR_BYTES[0]; // 0xEF

/// A string which does not need any escaping in JSON.
Expand Down Expand Up @@ -117,91 +119,191 @@ const fn create_table(lo: Escape) -> [Escape; 256] {
// `#[inline(always)]` because this is a hot path, and to make compiler remove the code
// for handling lone surrogates when outputting a normal string (the common case).
#[inline(always)]
#[expect(clippy::cast_sign_loss)]
fn write_str(s: &str, table: &[Escape; 256], buffer: &mut CodeBuffer) {
buffer.print_ascii_byte(b'"');

let bytes = s.as_bytes();
let mut chunk_start_ptr = bytes.as_ptr();
let mut iter = bytes.iter();

'outer: loop {
// Consume bytes which require no unescaping.
// Search in batches of 16 bytes for speed with longer strings.
const BATCH_SIZE: usize = 16;

let mut byte;
let mut escape;
'inner: loop {
if let Some(batch) = iter.as_slice().get(..BATCH_SIZE) {
// Enough bytes remaining to process as a batch. Compiler unrolls this loop.
for (i, &next_byte) in batch.iter().enumerate() {
byte = next_byte;
escape = table[byte as usize];
if escape != Escape::__ {
// Consume bytes before this one.
// SAFETY: `i < BATCH_SIZE` and there are at least `BATCH_SIZE` bytes remaining in `iter`
unsafe { advance_unchecked(&mut iter, i) };
break 'inner;
}
}

let mut start = 0;
let mut iter = bytes.iter().enumerate();
while let Some((index, &byte)) = iter.next() {
let escape = table[byte as usize];
if escape == Escape::__ {
continue;
// Consume the whole batch.
// SAFETY: There are at least `BATCH_SIZE` bytes remaining in `iter`.
unsafe { advance_unchecked(&mut iter, BATCH_SIZE) };

// Go round `'inner` loop again to continue searching
} else {
// Not enough bytes remaining for a batch. Search byte-by-byte.
for (i, &next_byte) in iter.clone().enumerate() {
byte = next_byte;
escape = table[byte as usize];
if escape != Escape::__ {
// Consume bytes before this one.
// SAFETY: `i` is an index of `iter`, so cannot be out of bounds.
unsafe { advance_unchecked(&mut iter, i) };
break 'inner;
}
}

// Got to end of string with no further characters requiring escaping found.
// No need to advance `iter`, as we don't use it's current pointer again.
break 'outer;
}
}

// Handle lone surrogates
// Found a character that needs escaping

// Handle lone surrogates.
// `table == &ESCAPE_LONE_SURROGATES` is statically knowable when this function is inlined.
// That condition is included to remove this whole block when not converting lone surrogates
// in `impl ESTree for str`.
if table == &ESCAPE_LONE_SURROGATES && escape == Escape::LO {
let (_, &next1) = iter.next().unwrap();
let (_, &next2) = iter.next().unwrap();
if [next1, next2] == [LOSSY_REPLACEMENT_CHAR_BYTES[1], LOSSY_REPLACEMENT_CHAR_BYTES[2]]
{
// SAFETY: `0xEF` is always 1st byte in a 3-byte UTF-8 character,
// so reading next 2 bytes cannot be out of bounds
let next_2_bytes = unsafe { iter.as_slice().get_unchecked(1..3) };
if next_2_bytes == &LOSSY_REPLACEMENT_CHAR_BYTES[1..] {
// Lossy replacement character (U+FFFD) is used as an escape before lone surrogates,
// with the code point as 4 x hex characters after it.
let (_, &hex1) = iter.next().unwrap();
let (_, &hex2) = iter.next().unwrap();
let (_, &hex3) = iter.next().unwrap();
let (_, &hex4) = iter.next().unwrap();
let hex = [hex1, hex2, hex3, hex4];

// Print the chunk upto before the lossy replacement character.
// with the code point as 4 x hex characters after it e.g. `\u{FFFD}d800`.

// Print the chunk up to before the lossy replacement character.
// SAFETY: 0xEF is always the start of a 3-byte unicode character.
// Therefore `index` must be on a UTF-8 character boundary.
unsafe { buffer.print_bytes_unchecked(&bytes[start..index]) };
// Therefore `current_ptr` must be on a UTF-8 character boundary.
// `chunk_start_ptr` is start of string originally, and is only updated to be after
// an ASCII character, so must also be on a UTF-8 character boundary, and in bounds.
// `chunk_start_ptr` is after a previous byte so must be `<= current_ptr`.
unsafe {
let current_ptr = iter.as_slice().as_ptr();
let len = current_ptr.offset_from(chunk_start_ptr) as usize;
let chunk = slice::from_raw_parts(chunk_start_ptr, len);
buffer.print_bytes_unchecked(chunk);
}

// Consume the lossy replacement character.
// SAFETY: Lossy replacement character is 3 bytes.
unsafe { advance_unchecked(&mut iter, 3) };

if hex == *b"fffd" {
let hex = iter.as_slice().get(..4).unwrap();
if hex == b"fffd" {
// This is an actual lossy replacement character (not an escaped lone surrogate)
buffer.print_str("\u{FFFD}");

// Consume `fffd`.
// SAFETY: We know next 4 bytes are `fffd`.
unsafe { advance_unchecked(&mut iter, 4) };

// Set `chunk_start_ptr` to after `\u{FFFD}fffd`.
// That's a complete UTF-8 sequence, so `chunk_start_ptr` is definitely
// left on a UTF-8 character boundary.
chunk_start_ptr = iter.as_slice().as_ptr();
} else {
// This is an escaped lone surrogate.
// Print `\uXXXX` where `XXXX` is hex characters. e.g. `\ud800`.
// Next 4 bytes should be code point encoded as 4 x hex bytes.
#[cfg(debug_assertions)]
for &b in hex {
assert!(matches!(b, b'0'..=b'9' | b'a'..=b'f'));
}

// Print `\u`. Leave the hex bytes to be printed in next batch.
// After lossy replacement character is definitely a UTF-8 boundary.
buffer.print_str("\\u");
// Check all 4 hex bytes are ASCII
assert_eq!(u32::from_ne_bytes(hex) & 0x8080_8080, 0);
// SAFETY: Just checked all 4 bytes are ASCII
unsafe { buffer.print_bytes_unchecked(&hex) };
}
chunk_start_ptr = iter.as_slice().as_ptr();

// Skip the 3 bytes of the lossy replacement character + 4 hex bytes.
// We checked that all 4 hex bytes are ASCII, so `start` is definitely left on
// a UTF-8 character boundary.
start = index + 7;
// SAFETY: `iter.as_slice().get(..4).unwrap()` above would have panicked
// if there weren't at least 4 bytes remaining in `iter`.
// We haven't checked that the 4 following bytes are ASCII, but it doesn't matter
// whether `iter` is left on a UTF-8 char boundary or not.
unsafe { advance_unchecked(&mut iter, 4) }
}
} else {
// Some other unicode character starting with 0xEF. Just continue the loop.
// Some other unicode character starting with 0xEF.
// Consume it and continue the loop.
// SAFETY: `0xEF` is always 1st byte in a 3-byte UTF-8 character.
unsafe { advance_unchecked(&mut iter, 3) };
}
continue;
}

if start < index {
// SAFETY: `bytes` is derived from a `&str`.
// `escape` is only non-zero for ASCII bytes, except `Escape::LO` which is handled above.
// Therefore current `index` must mark the end of a valid UTF8 character sequence.
// `start` is either the start of string, or after an ASCII character,
// therefore always the start of a valid UTF8 character sequence.
unsafe { buffer.print_bytes_unchecked(&bytes[start..index]) };
// Print the chunk up to before the character which requires escaping.
let current_ptr = iter.as_slice().as_ptr();
// SAFETY: `escape` is only non-zero for ASCII bytes, except `Escape::LO` which is handled above.
// Therefore `current_ptr` must be on an ASCII byte.
// `chunk_start_ptr` is start of string originally, and is only updated to be after
// an ASCII character, so must also be on a UTF-8 character boundary, and in bounds.
// `chunk_start_ptr` is after a previous byte so must be `<= current_ptr`.
unsafe {
let len = current_ptr.offset_from(chunk_start_ptr) as usize;
let chunk = slice::from_raw_parts(chunk_start_ptr, len);
buffer.print_bytes_unchecked(chunk);
}

write_char_escape(escape, byte, buffer);

start = index + 1;
// SAFETY: `'inner` loop above ensures `iter` is not at end of string
unsafe { advance_unchecked(&mut iter, 1) };

// Set `chunk_start_ptr` to be after this character.
// `escape` is only non-zero for ASCII bytes, except `Escape::LO` which is handled above.
// We just consumed that ASCII byte, so `chunk_start_ptr` must be on a UTF-8 char boundary.
chunk_start_ptr = iter.as_slice().as_ptr();
}

if start < bytes.len() {
// SAFETY: `bytes` is derived from a `&str`.
// `start` is either the start of string, or after an ASCII character,
// therefore always the start of a valid UTF8 character sequence.
unsafe { buffer.print_bytes_unchecked(&bytes[start..]) };
// Print last chunk.
// SAFETY: Adding `len` to `ptr` cannot be out of bounds.
let end_ptr = unsafe { iter.as_slice().as_ptr().add(iter.as_slice().len()) };
// SAFETY: `chunk_start_ptr` is start of string originally, and is only updated to be after
// an ASCII character, so must be on a UTF-8 character boundary, and in bounds.
// `chunk_start_ptr` is after a previous byte so must be `<= end_ptr`.
unsafe {
let len = end_ptr.offset_from(chunk_start_ptr) as usize;
let chunk = slice::from_raw_parts(chunk_start_ptr, len);
buffer.print_bytes_unchecked(chunk);
}

buffer.print_ascii_byte(b'"');
}

/// Advance bytes iterator by `count` bytes.
///
/// # SAFETY
/// Caller must ensure there are at least `count` bytes remaining in `iter`.
#[inline]
unsafe fn advance_unchecked(iter: &mut slice::Iter<u8>, count: usize) {
// Compiler boils this down to just adding `count` to `iter`'s pointer.
// SAFETY: Caller guarantees there are at least `count` bytes remaining in `iter`.
unsafe {
let new_ptr = iter.as_slice().as_ptr().add(count);
let new_len = iter.as_slice().len() - count;
let slice = slice::from_raw_parts(new_ptr, new_len);
*iter = slice.iter();
};
}

fn write_char_escape(escape: Escape, byte: u8, buffer: &mut CodeBuffer) {
#[expect(clippy::if_not_else)]
if escape != Escape::UU {
buffer.print_ascii_byte(b'\\');
// SAFETY: All values of `Escape` are ASCII
unsafe { buffer.print_byte_unchecked(escape as u8) };
unsafe { buffer.print_bytes_unchecked(&[b'\\', escape as u8]) };
} else {
static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef";
let bytes = [
Expand Down
Loading