diff --git a/crates/oxc_codegen/src/lib.rs b/crates/oxc_codegen/src/lib.rs index 4208ce48cd001..127a3ed3a4a0c 100644 --- a/crates/oxc_codegen/src/lib.rs +++ b/crates/oxc_codegen/src/lib.rs @@ -11,25 +11,26 @@ mod r#gen; mod operator; mod options; mod sourcemap_builder; +mod str; use std::borrow::Cow; use oxc_ast::ast::{ Argument, BindingIdentifier, BlockStatement, Comment, Expression, IdentifierReference, Program, - Statement, StringLiteral, + Statement, }; use oxc_data_structures::{code_buffer::CodeBuffer, stack::Stack}; use oxc_semantic::Scoping; use oxc_span::{GetSpan, SPAN, Span}; use oxc_syntax::{ - identifier::{LS, PS, is_identifier_part, is_identifier_part_ascii}, + identifier::{is_identifier_part, is_identifier_part_ascii}, operator::{BinaryOperator, UnaryOperator, UpdateOperator}, precedence::Precedence, }; use crate::{ binary_expr_visitor::BinaryExpressionVisitor, comment::CommentsMap, operator::Operator, - sourcemap_builder::SourcemapBuilder, + sourcemap_builder::SourcemapBuilder, str::Quote, }; pub use crate::{ context::Context, @@ -242,12 +243,6 @@ impl<'a> Codegen<'a> { self.code.print_str(s); } - /// Push `char` into the buffer. - #[inline] - pub fn print_char(&mut self, ch: char) { - self.code.print_char(ch); - } - /// Print a single [`Expression`], adding it to the code generator's /// internal buffer. Unlike [`Codegen::build`], this does not consume `self`. #[inline] @@ -582,129 +577,6 @@ impl<'a> Codegen<'a> { } } - fn print_string_literal(&mut self, s: &StringLiteral<'_>, allow_backtick: bool) { - self.add_source_mapping(s.span); - - let quote = if self.options.minify { - // String length is max `u32::MAX`, so use `i64` to make overflow impossible - let mut single_cost: i64 = 0; - let mut double_cost: i64 = 0; - let mut backtick_cost: i64 = 0; - let mut bytes = s.value.as_bytes().iter().peekable(); - while let Some(b) = bytes.next() { - match b { - b'\n' => backtick_cost -= 1, - b'\'' => single_cost += 1, - b'"' => double_cost += 1, - b'`' => backtick_cost += 1, - b'$' => { - if bytes.peek() == Some(&&b'{') { - backtick_cost += 1; - } - } - _ => {} - } - } - let mut quote = Quote::Double; - if allow_backtick && double_cost >= backtick_cost { - quote = Quote::Backtick; - if backtick_cost > single_cost { - quote = Quote::Single; - } - } else if double_cost > single_cost { - quote = Quote::Single; - } - quote - } else { - self.quote - }; - - quote.print(self); - self.print_unquoted_utf16(s, quote); - quote.print(self); - } - - fn print_unquoted_utf16(&mut self, s: &StringLiteral<'_>, quote: Quote) { - let mut chars = s.value.chars().peekable(); - - while let Some(c) = chars.next() { - match c { - '\x00' => { - if chars.peek().is_some_and(|&next| next.is_ascii_digit()) { - self.print_str("\\x00"); - } else { - self.print_str("\\0"); - } - } - '\x07' => self.print_str("\\x07"), - '\u{8}' => self.print_str("\\b"), // \b - '\u{b}' => self.print_str("\\v"), // \v - '\u{c}' => self.print_str("\\f"), // \f - '\n' => { - if quote == Quote::Backtick { - self.print_ascii_byte(b'\n'); - } else { - self.print_str("\\n"); - } - } - '\r' => self.print_str("\\r"), - '\x1B' => self.print_str("\\x1B"), - '\\' => self.print_str("\\\\"), - // Allow `U+2028` and `U+2029` in string literals - // - // - LS => self.print_str("\\u2028"), - PS => self.print_str("\\u2029"), - '\u{a0}' => self.print_str("\\xA0"), - '\'' => { - if quote == Quote::Single { - self.print_ascii_byte(b'\\'); - } - self.print_ascii_byte(b'\''); - } - '\"' => { - if quote == Quote::Double { - self.print_ascii_byte(b'\\'); - } - self.print_ascii_byte(b'"'); - } - '`' => { - if quote == Quote::Backtick { - self.print_ascii_byte(b'\\'); - } - self.print_ascii_byte(b'`'); - } - '$' => { - if quote == Quote::Backtick && chars.peek() == Some(&'{') { - self.print_ascii_byte(b'\\'); - } - self.print_ascii_byte(b'$'); - } - '\u{FFFD}' if s.lone_surrogates => { - // If `lone_surrogates` is set, string contains lone surrogates which are escaped - // using the lossy replacement character (U+FFFD) as an escape marker. - // The lone surrogate is encoded as `\u{FFFD}XXXX` where `XXXX` is the code point as hex. - let hex1 = chars.next().unwrap(); - let hex2 = chars.next().unwrap(); - let hex3 = chars.next().unwrap(); - let hex4 = chars.next().unwrap(); - if [hex1, hex2, hex3, hex4] == ['f', 'f', 'f', 'd'] { - // Actual lossy replacement character - self.print_char('\u{FFFD}'); - } else { - // Lossy replacement character representing a lone surrogate - self.print_str("\\u"); - self.print_char(hex1); - self.print_char(hex2); - self.print_char(hex3); - self.print_char(hex4); - } - } - _ => self.print_str(c.encode_utf8([0; 4].as_mut())), - } - } - } - // `get_minified_number` from terser // https://github.com/terser/terser/blob/c5315c3fd6321d6b2e076af35a70ef532f498505/lib/output.js#L2418 #[expect(clippy::cast_possible_truncation, clippy::cast_sign_loss, clippy::cast_possible_wrap)] @@ -786,20 +658,3 @@ impl<'a> Codegen<'a> { } } } - -/// Quote character. -#[derive(Clone, Copy, PartialEq, Eq)] -#[repr(u8)] -enum Quote { - Single = b'\'', - Double = b'"', - Backtick = b'`', -} - -impl Quote { - #[inline] - fn print(self, codegen: &mut Codegen<'_>) { - // SAFETY: All variants of `Quote` are ASCII bytes - unsafe { codegen.code.print_byte_unchecked(self as u8) }; - } -} diff --git a/crates/oxc_codegen/src/str.rs b/crates/oxc_codegen/src/str.rs new file mode 100644 index 0000000000000..af68886a2fbb8 --- /dev/null +++ b/crates/oxc_codegen/src/str.rs @@ -0,0 +1,693 @@ +use std::slice; + +use oxc_ast::ast::StringLiteral; +use oxc_data_structures::assert_unchecked; +use oxc_syntax::identifier::{LS, NBSP, PS}; + +use crate::Codegen; + +/// Quote character. +#[derive(Clone, Copy, PartialEq, Eq)] +#[repr(u8)] +pub enum Quote { + Single = b'\'', + Double = b'"', + Backtick = b'`', +} + +impl Quote { + #[inline] + pub fn print(self, codegen: &mut Codegen<'_>) { + // SAFETY: All variants of `Quote` are ASCII bytes + unsafe { codegen.code.print_byte_unchecked(self as u8) }; + } +} + +impl Codegen<'_> { + /// Print a [`StringLiteral`]. + pub(crate) fn print_string_literal(&mut self, s: &StringLiteral<'_>, allow_backtick: bool) { + self.add_source_mapping(s.span); + + // If `minify` option enabled, quote will be chosen depending on what produces shortest output. + // What is the best quote to use will be determined when first character needing escape is found. + // This avoids iterating through the string twice if it contains no quotes (common case). + // Don't print opening quote now, because we don't know what it is yet. + // + // If not in `minify` mode, print the quote requested in options. + let quote = if self.options.minify { + None + } else { + let quote = self.quote; + quote.print(self); + Some(quote) + }; + + // Loop through bytes, looking for any which need to be escaped. + // String is written to buffer in chunks. + let bytes = s.value.as_bytes().iter(); + let mut state = PrintStringState { + chunk_start: bytes.as_slice().as_ptr(), + bytes, + quote, + lone_surrogates: s.lone_surrogates, + allow_backtick, + }; + + // Loop through bytes. + while let Some(b) = state.peek() { + // Look up whether byte needs escaping + let escape = ESCAPES.0[b as usize]; + if escape == Escape::__ { + // No escape required. + // SAFETY: We just checked there's a byte to consume. + // If byte is not ASCII, this will temporarily leave `bytes` iterator not on a UTF-8 + // character boundary, but if so next turns of the loop will consume the rest of + // the Unicode character. + // All bytes which produce an `Escape` which isn't `Escape::__` are 1st byte + // of a UTF-8 character sequence. + unsafe { state.consume_byte_unchecked() }; + } else { + // Escape may be required. Execute byte handler. + // Characters requiring escape are relatively rare, so cold branch. + cold_branch(|| { + // SAFETY: We just peeked a byte, so there is a byte ready to consume. + // `escape` corresponds to that byte. + // Checked that `escape != Escape::__` above. + unsafe { handle_escape(escape, self, &mut state) }; + }); + } + } + + // Flush any remaining bytes + state.flush(self); + + // Print closing quote. + // SAFETY: `flush` calls `calculate_quote` which ensures `state.quote` is `Some`. + let quote = unsafe { state.quote.unwrap_unchecked() }; + quote.print(self); + } +} + +/// String printer state. +/// +/// Main purpose is to contain `bytes` iterator. +/// This iterator must always be positioned on a UTF-8 character boundary. +struct PrintStringState<'s> { + chunk_start: *const u8, + bytes: slice::Iter<'s, u8>, + quote: Option, + lone_surrogates: bool, + allow_backtick: bool, +} + +impl PrintStringState<'_> { + /// Peek next byte in `bytes` iterator. + #[inline] + fn peek(&self) -> Option { + self.bytes.clone().next().copied() + } + + /// Advance the `bytes` iterator by 1 byte. + /// + /// # SAFETY + /// + /// * There must be at least 1 more byte in the `bytes` iterator. + /// * After this call, `bytes` iterator must be left on a UTF-8 character boundary + /// (i.e. the current byte is ASCII), or if byte is not ASCII, then further calls to + /// `consume_byte_unchecked` / `consume_bytes_unchecked` consume the rest of the Unicode character + /// before calling other methods e.g. `flush`. + #[inline] + unsafe fn consume_byte_unchecked(&mut self) { + debug_assert!(self.bytes.clone().next().is_some()); + // SAFETY: Caller guarantees there is a byte to consume in `bytes` iterator, + // and that consuming it leaves the iterator on a UTF-8 char boundary. + unsafe { self.bytes.next().unwrap_unchecked() }; + } + + /// Advance the `bytes` iterator by `N` bytes. + /// + /// # SAFETY + /// + /// * There must be at least `N` more bytes in the `bytes` iterator. + /// * After this call, `bytes` iterator must be left on a UTF-8 character boundary. + #[inline] + unsafe fn consume_bytes_unchecked(&mut self) { + debug_assert!(self.bytes.as_slice().len() >= N); + // SAFETY: Caller guarantees there are `N` bytes to consume in `bytes` iterator, + // and that consuming them leaves the iterator on a UTF-8 char boundary. + unsafe { + for _i in 0..N { + self.bytes.next().unwrap_unchecked(); + } + } + } + + /// Set the start of next chunk to be current position of `bytes` iterator. + #[inline] + fn start_chunk(&mut self) { + self.chunk_start = self.bytes.as_slice().as_ptr(); + } + + /// Flush current chunk to buffer, consume 1 byte, and start next chunk after that byte. + /// + /// # SAFETY + /// + /// * There must be at least 1 more byte in the `bytes` iterator. + /// * After this call, `bytes` iterator must be left on a UTF-8 character boundary. + /// i.e. the current byte is ASCII. + #[inline] + unsafe fn flush_and_consume_byte(&mut self, codegen: &mut Codegen) { + // SAFETY: Caller guarantees `flush_and_consume_bytes`'s requirements are met + unsafe { self.flush_and_consume_bytes::<1>(codegen) }; + } + + /// Flush current chunk to buffer, consume `N` bytes, and start next chunk after those bytes. + /// + /// # SAFETY + /// + /// * There must be at least `N` more bytes in the `bytes` iterator. + /// * After this call, `bytes` iterator must be left on a UTF-8 character boundary. + #[inline] + unsafe fn flush_and_consume_bytes(&mut self, codegen: &mut Codegen) { + self.flush(codegen); + + debug_assert!(self.bytes.as_slice().len() >= N); + // SAFETY: Caller guarantees there are `N` bytes to consume in `bytes` iterator, + // and that consuming them leaves the iterator on a UTF-8 char boundary + unsafe { self.consume_bytes_unchecked::() }; + + self.start_chunk(); + } + + /// Flush all bytes from `chunk_start` up to current position of `bytes` iterator into buffer. + /// + /// If what quote character to use has not been decided yet, calculate the best quote character, + /// and print it before flushing. + fn flush(&mut self, codegen: &mut Codegen) { + // If which quote character to use is not already known, calculate it and print opening quote + self.calculate_quote(codegen); + + // SAFETY: `chunk_start` is pointer to current position of `bytes` iterator at some point, + // and the iterator only advances, so current position of `bytes` must be on or after `chunk_start` + let len = unsafe { + let bytes_ptr = self.bytes.as_slice().as_ptr(); + let offset = bytes_ptr.offset_from(self.chunk_start); + usize::try_from(offset).unwrap_unchecked() + }; + + // SAFETY: `chunk_start` is within bounds of original `&str`. + // `bytes` iter cannot go past end of `&str` either. + // So a slice of `len` bytes starting at `chunk_start` must be within bounds of the `&str`. + // `bytes` iterator is always positioned on a UTF-8 character boundary, as is `chunk_start`. + // Therefore the slice between these two must be a valid UTF-8 string. + unsafe { + let slice = slice::from_raw_parts(self.chunk_start, len); + codegen.code.print_bytes_unchecked(slice); + } + } + + /// Calculate optimum quote character to use, and print that quote (as opening quote). + /// + /// Actual logic in separate `calculate_quote_impl` method, so that `calculate_quote` itself + /// is inlined, to create a fast path for when `quote` is `Some`. + #[inline] + fn calculate_quote(&mut self, codegen: &mut Codegen) -> Quote { + if let Some(quote) = self.quote { quote } else { self.calculate_quote_impl(codegen) } + } + + fn calculate_quote_impl(&mut self, codegen: &mut Codegen) -> Quote { + let quote = if self.allow_backtick { + self.calculate_quote_maybe_backtick() + } else { + self.calculate_quote_no_backtick() + }; + + quote.print(codegen); + + self.quote = Some(quote); + + quote + } + + /// Calculate optimum quote character to use, when backtick (`) is an option. + fn calculate_quote_maybe_backtick(&self) -> Quote { + // String length is max `u32::MAX`, so use `i64` to make overflow impossible + let mut single_cost: i64 = 0; + let mut double_cost: i64 = 0; + let mut backtick_cost: i64 = 0; + let mut bytes = self.bytes.clone(); + while let Some(b) = bytes.next() { + match b { + b'\n' => backtick_cost -= 1, + b'\'' => single_cost += 1, + b'"' => double_cost += 1, + b'`' => backtick_cost += 1, + b'$' => { + if bytes.clone().next() == Some(&b'{') { + backtick_cost += 1; + } + } + _ => {} + } + } + + // If equal cost for different quotes prefer, in order: + // 1. Backtick + // 2. Double quote + // 3. Single quote + #[rustfmt::skip] + let quote = if double_cost >= backtick_cost { + if backtick_cost > single_cost { + Quote::Single + } else { + Quote::Backtick + } + } else if double_cost > single_cost { + Quote::Single + } else { + Quote::Double + }; + quote + } + + /// Calculate optimum quote character to use, when backtick (`) is not an option. + fn calculate_quote_no_backtick(&self) -> Quote { + // String length is max `u32::MAX`, so `i64` cannot overflow + let mut single_cost: i64 = 0; + for &b in self.bytes.clone() { + match b { + b'\'' => single_cost += 1, + b'"' => single_cost -= 1, + _ => {} + } + } + + // Prefer double quote over single quote if cost is the same + if single_cost < 0 { Quote::Single } else { Quote::Double } + } +} + +/// Convert `char` to UTF-8 bytes array. +const fn to_bytes(ch: char) -> [u8; N] { + let mut bytes = [0u8; N]; + ch.encode_utf8(&mut bytes); + bytes +} + +/// `LS` character as UTF-8 bytes. +const LS_BYTES: [u8; 3] = to_bytes(LS); +/// `PS` character as UTF-8 bytes. +const PS_BYTES: [u8; 3] = to_bytes(PS); + +const _: () = assert!(LS_BYTES[0] == 0xE2); +const _: () = assert!(PS_BYTES[0] == 0xE2); +const LS_LAST_2_BYTES: [u8; 2] = [LS_BYTES[1], LS_BYTES[2]]; +const PS_LAST_2_BYTES: [u8; 2] = [PS_BYTES[1], PS_BYTES[2]]; + +/// `NBSP` character as UTF-8 bytes. +const NBSP_BYTES: [u8; 2] = to_bytes(NBSP); +const _: () = assert!(NBSP_BYTES[0] == 0xC2); +const NBSP_LAST_BYTE: u8 = NBSP_BYTES[1]; + +/// Lossy replacement character (U+FFFD) as UTF-8 bytes. +const LOSSY_REPLACEMENT_CHAR_BYTES: [u8; 3] = to_bytes('\u{FFFD}'); +const _: () = assert!(LOSSY_REPLACEMENT_CHAR_BYTES[0] == 0xEF); +const LOSSY_REPLACEMENT_CHAR_LAST_2_BYTES: [u8; 2] = + [LOSSY_REPLACEMENT_CHAR_BYTES[1], LOSSY_REPLACEMENT_CHAR_BYTES[2]]; + +/// Escape codes. +/// +/// Discriminant - 1 is used as index into `BYTE_HANDLERS` (except for `__` variant). +#[derive(Clone, Copy, PartialEq, Eq)] +#[repr(u8)] +enum Escape { + __ = 0, // No escape required + NU = 1, // \x00 - Null byte + BE = 2, // \x07 - Bell + BK = 3, // \b - Backspace + VT = 4, // \v - Vertical tab + FF = 5, // \f - Form feed + NL = 6, // \n - New line + CR = 7, // \r - Carriage return + ES = 8, // \x1B - Escape + BS = 9, // \\ - Backslash + SQ = 10, // ' - Single quote + DQ = 11, // " - Double quote + BQ = 12, // ` - Backtick quote + DO = 13, // $ - Dollar sign + LS = 14, // LS/PS - U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR (first byte) + NB = 15, // NBSP - Non-breaking space (first byte) + LO = 16, // � - U+FFFD lossy replacement character (first byte) +} + +/// Struct which ensures content is aligned on 128. +#[repr(C, align(128))] +struct Aligned128(T); + +/// Table mapping bytes to `Escape`s. +/// +/// Aligned on 128, so top half (ASCII chars) occupies a pair of L1 cache lines. +/// Bottom half (non-ASCII chars) also occupies a pair of L1 cache lines, +/// but will not be accessed for strings which only contain ASCII (common case). +static ESCAPES: Aligned128<[Escape; 256]> = { + #[allow(clippy::enum_glob_use, clippy::allow_attributes)] + use Escape::*; + Aligned128([ + // 1 2 3 4 5 6 7 8 9 A B C D E F + NU, __, __, __, __, __, __, BE, BK, __, NL, VT, FF, CR, __, __, // 0 + __, __, __, __, __, __, __, __, __, __, __, ES, __, __, __, __, // 1 + __, __, DQ, __, DO, __, __, SQ, __, __, __, __, __, __, __, __, // 2 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4 + __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5 + BQ, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9 + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B + __, __, NB, __, __, __, __, __, __, __, __, __, __, __, __, __, // C + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D + __, __, LS, __, __, __, __, __, __, __, __, __, __, __, __, LO, // E + __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F + ]) +}; + +type ByteHandler = unsafe fn(&mut Codegen, &mut PrintStringState); + +/// Byte handlers. +/// +/// Indexed by `escape as usize - 1` (where `escape` is not `Escape::__`). +/// Must be in same order as discriminants in `Escape`. +/// +/// Function pointers are 8 bytes each, so `BYTE_HANDLERS` is 128 bytes in total. +/// Aligned on 128, so occupies a pair of L1 cache lines. +static BYTE_HANDLERS: Aligned128<[ByteHandler; 16]> = Aligned128([ + print_null, + print_bell, + print_backspace, + print_vertical_tab, + print_form_field, + print_new_line, + print_carriage_return, + print_escape, + print_backslash, + print_single_quote, + print_double_quote, + print_backtick, + print_dollar, + print_ls_or_ps, + print_non_breaking_space, + print_lossy_replacement, +]); + +/// Call byte handler for byte which needs escaping. +/// +/// # SAFETY +/// +/// * There must be another byte to consume in `bytes` iterator. +/// * `escape` must correspond to the next byte. +/// * `escape` must not be `Escape::__`. +unsafe fn handle_escape(escape: Escape, codegen: &mut Codegen, state: &mut PrintStringState) { + // Inform compiler that `escape` is not `Escape::__`. + // This removes the bounds check from `BYTE_HANDLERS.0[escape as usize - 1]`. + // SAFETY: Caller guarantees `escape` is not `Escape::__`. + unsafe { assert_unchecked!(escape != Escape::__) }; + + let byte_handler = BYTE_HANDLERS.0[escape as usize - 1]; + // SAFETY: Caller guarantees there is a byte to consume in `bytes` iterator, + // and that `escape` corresponds to the next byte, so we are calling the correct byte handler + unsafe { byte_handler(codegen, state) }; +} + +// Byte handlers for bytes which need escaping. +// +// # SAFETY +// +// All byte handlers have safety invariants: +// * There must be at least 1 byte remaining in `bytes` iterator. +// * The byte handler is only called for the byte it expects. + +// \x00 +unsafe fn print_null(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(0x00)); + + // SAFETY: Next byte is `\x00`, which is ASCII + unsafe { state.flush_and_consume_byte(codegen) }; + if state.peek().is_some_and(|b| b.is_ascii_digit()) { + codegen.print_str("\\x00"); + } else { + codegen.print_str("\\0"); + } +} + +// \x07 +unsafe fn print_bell(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(0x07)); + // SAFETY: Next byte is `\x07`, which is ASCII + unsafe { state.flush_and_consume_byte(codegen) }; + codegen.print_str("\\x07"); +} + +// \b +unsafe fn print_backspace(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(0x08)); + // SAFETY: Next byte is `\b`, which is ASCII + unsafe { state.flush_and_consume_byte(codegen) }; + codegen.print_str("\\b"); +} + +// \v +unsafe fn print_vertical_tab(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(0x0B)); + // SAFETY: Next byte is `\v`, which is ASCII + unsafe { state.flush_and_consume_byte(codegen) }; + codegen.print_str("\\v"); +} + +// \f +unsafe fn print_form_field(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(0x0C)); + // SAFETY: Next byte is `\f`, which is ASCII + unsafe { state.flush_and_consume_byte(codegen) }; + codegen.print_str("\\f"); +} + +// \n +unsafe fn print_new_line(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(b'\n')); + + if state.calculate_quote(codegen) == Quote::Backtick { + // No need to escape. + // SAFETY: Next byte is `\n`, which is ASCII. + unsafe { state.consume_byte_unchecked() }; + } else { + // SAFETY: Next byte is `\n`, which is ASCII + unsafe { state.flush_and_consume_byte(codegen) }; + codegen.print_str("\\n"); + } +} + +// \r +unsafe fn print_carriage_return(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(b'\r')); + // SAFETY: Next byte is `\r`, which is ASCII + unsafe { state.flush_and_consume_byte(codegen) }; + codegen.print_str("\\r"); +} + +// \x1B +unsafe fn print_escape(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(0x1B)); + // SAFETY: Next byte is `\x1B`, which is ASCII + unsafe { state.flush_and_consume_byte(codegen) }; + codegen.print_str("\\x1B"); +} + +// \ +unsafe fn print_backslash(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(b'\\')); + // SAFETY: Next byte is `\`, which is ASCII + unsafe { state.flush_and_consume_byte(codegen) }; + codegen.print_str("\\\\"); +} + +// ' +unsafe fn print_single_quote(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(b'\'')); + + if state.calculate_quote(codegen) == Quote::Single { + // SAFETY: Next byte is `'`, which is ASCII + unsafe { state.flush_and_consume_byte(codegen) }; + codegen.print_str("\\'"); + } else { + // No need to escape. + // SAFETY: Next byte is `'`, which is ASCII. + unsafe { state.consume_byte_unchecked() }; + } +} + +// " +unsafe fn print_double_quote(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(b'"')); + + if state.calculate_quote(codegen) == Quote::Double { + // SAFETY: Next byte is `"`, which is ASCII + unsafe { state.flush_and_consume_byte(codegen) }; + codegen.print_str("\\\""); + } else { + // No need to escape. + // SAFETY: Next byte is `"`, which is ASCII. + unsafe { state.consume_byte_unchecked() }; + } +} + +// ` +unsafe fn print_backtick(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(b'`')); + + if state.calculate_quote(codegen) == Quote::Backtick { + // SAFETY: Next byte is `, which is ASCII + unsafe { state.flush_and_consume_byte(codegen) }; + codegen.print_str("\\`"); + } else { + // No need to escape. + // SAFETY: Next byte is `, which is ASCII. + unsafe { state.consume_byte_unchecked() }; + } +} + +// $ +unsafe fn print_dollar(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(b'$')); + + // Note: Check next byte is `{` first, to avoid calculating quote unless have to + let next = state.bytes.as_slice().get(1); + if next == Some(&b'{') && state.calculate_quote(codegen) == Quote::Backtick { + // SAFETY: Next byte is `$`, which is ASCII + unsafe { state.flush_and_consume_byte(codegen) }; + codegen.print_str("\\$"); + } else { + // No need to escape. + // SAFETY: Next byte is `$`, which is ASCII. + unsafe { state.consume_byte_unchecked() }; + } +} + +// 0xE2 - first byte of or +unsafe fn print_ls_or_ps(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(0xE2)); + + let next2: [u8; 2] = { + // SAFETY: 0xE2 is always the start of a 3-byte Unicode character, + // so there must be 2 more bytes available to consume + let next2 = unsafe { state.bytes.as_slice().get_unchecked(1..3) }; + next2.try_into().unwrap() + }; + + let replacement = match next2 { + LS_LAST_2_BYTES => "\\u2028", + PS_LAST_2_BYTES => "\\u2029", + _ => { + // Some other character starting with 0xE2. Advance past it. + // SAFETY: 0xE2 is always the start of a 3-byte Unicode character + unsafe { state.consume_bytes_unchecked::<3>() }; + return; + } + }; + + // SAFETY: 0xE2 is always the start of a 3-byte Unicode character + unsafe { state.flush_and_consume_bytes::<3>(codegen) }; + codegen.print_str(replacement); +} + +// 0xC2 - first byte of +unsafe fn print_non_breaking_space(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(0xC2)); + + // SAFETY: 0xC2 is always the start of a 2-byte Unicode character, + // so there must be 1 more byte available to consume + let next = unsafe { *state.bytes.as_slice().get_unchecked(1) }; + if next == NBSP_LAST_BYTE { + // Character is NBSP. + // SAFETY: 0xC2 is always the start of a 2-byte Unicode character. + unsafe { state.flush_and_consume_bytes::<2>(codegen) }; + codegen.print_str("\\xA0"); + } else { + // Some other character starting with 0xC2. Advance past it. + // SAFETY: 0xC2 is always the start of a 2-byte Unicode character. + unsafe { state.consume_bytes_unchecked::<2>() }; + } +} + +// 0xEF - first byte of lossy replacement character (U+FFFD) +unsafe fn print_lossy_replacement(codegen: &mut Codegen, state: &mut PrintStringState) { + debug_assert_eq!(state.peek(), Some(0xEF)); + + if state.lone_surrogates { + // String contains lone surrogates which use the lossy replacement character (U+FFFD) + // as an escape marker. + // The lone surrogate is encoded as `\u{FFFD}XXXX` where `XXXX` is the code point as hex. + let next2: [u8; 2] = { + // SAFETY: 0xEF is always the start of a 3-byte Unicode character, + // so there must be 2 more bytes available to consume + let next2 = unsafe { state.bytes.as_slice().get_unchecked(1..3) }; + next2.try_into().unwrap() + }; + + if next2 == LOSSY_REPLACEMENT_CHAR_LAST_2_BYTES { + // Get the 4 hex bytes + let bytes = &mut state.bytes; + let hex: [u8; 4] = bytes.as_slice()[3..7].try_into().unwrap(); + + if hex == *b"fffd" { + // Actual lossy replacement character. + // Flush up to and including the lossy replacement character, then skip the 4 hex bytes. + // SAFETY: 0xEF is always the start of a 3-byte Unicode character + unsafe { state.consume_bytes_unchecked::<3>() }; + state.flush(codegen); + // SAFETY: 0xEF is always the start of a 3-byte Unicode character. + // `bytes.as_slice()[3..7]` would have panicked if there weren't 4 more bytes after it. + // All those bytes are ASCII, so this leaves `bytes` on a UTF-8 char boundary. + unsafe { state.consume_bytes_unchecked::<4>() }; + // Start next chunk after the 4 hex bytes + state.start_chunk(); + return; + } + + // Flush text before the lossy replacement character + state.flush(codegen); + + // Check all 4 hex bytes are ASCII + assert_eq!(u32::from_ne_bytes(hex) & 0x8080_8080, 0); + + // SAFETY: `bytes.as_slice()[3..7]` would have panicked if there weren't at least 7 bytes + // remaining. First 3 bytes are lossy replacement character, and we just checked that + // next 4 bytes are ASCII, so this leaves `bytes` on a UTF-8 char boundary. + unsafe { state.consume_bytes_unchecked::<7>() }; + + // Start next chunk after the 4 hex bytes + state.start_chunk(); + + codegen.print_str("\\u"); + // SAFETY: Just checked all 4 hex bytes are ASCII + unsafe { codegen.code.print_bytes_unchecked(&hex) }; + + return; + } + } + + // `lone_surrogates` is `false` or character is some other character starting with 0xEF. + // Advance past the character. + // SAFETY: 0xEF is always the start of a 3-byte Unicode character + unsafe { state.consume_bytes_unchecked::<3>() }; +} + +/// Call a closure while hinting to compiler that this branch is rarely taken. +/// +/// "Cold trampoline function", suggested in: +/// +#[cold] +pub fn cold_branch T, T>(f: F) -> T { + f() +}