diff --git a/.gitignore b/.gitignore index c20668ecc8203..eaeb72b32c2d5 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,4 @@ tasks/prettier_conformance/prettier/ # See also # * https://stackoverflow.com/a/7335487 # * https://docs.github.com/en/get-started/getting-started-with-git/ignoring-files#configuring-ignored-files-for-all-repositories-on-your-computer +tmp/ diff --git a/crates/oxc_codegen/src/comment.rs b/crates/oxc_codegen/src/comment.rs index e25ee8d403fa5..5052fdcc46ae4 100644 --- a/crates/oxc_codegen/src/comment.rs +++ b/crates/oxc_codegen/src/comment.rs @@ -22,12 +22,12 @@ pub type CommentsMap = FxHashMap>; /// Standard split would turn `"line1\r\nline2"` into `["line1", "", "line2"]` because /// it treats `\r` and `\n` as separate terminators. This iterator correctly produces /// `["line1", "line2"]` by treating `\r\n` as a single terminator. -struct LineTerminatorSplitter<'a> { +pub(crate) struct LineTerminatorSplitter<'a> { text: &'a str, } impl<'a> LineTerminatorSplitter<'a> { - fn new(text: &'a str) -> Self { + pub(crate) fn new(text: &'a str) -> Self { Self { text } } } @@ -40,54 +40,155 @@ impl<'a> Iterator for LineTerminatorSplitter<'a> { return None; } - for (index, &byte) in self.text.as_bytes().iter().enumerate() { - match byte { - b'\n' => { - // SAFETY: Byte at `index` is `\n`, so `index` and `index + 1` are both UTF-8 char boundaries. - // Therefore, slices up to `index` and from `index + 1` are both valid `&str`s. - unsafe { - let line = self.text.get_unchecked(..index); - self.text = self.text.get_unchecked(index + 1..); - return Some(line); + // Line terminators will be very rare in most text. So we try to make the search as quick as possible by: + // 1. Searching for line terminator bytes (`\r`, `\n`, `0xE2`) first, and only checking details once found. + // 2. Searching longer strings in chunks of 16 bytes using SIMD, and only doing the + // more expensive byte-by-byte search once a line terminator is found. + + let bytes = self.text.as_bytes(); + let mut consumed = 0; + + // Search range of bytes for line terminators, byte by byte. + // + // Bytes between `ptr` and `last_ptr` (inclusive) are searched for `\r`, `\n`, or `0xE2`. + // If found, process the line terminator and return the line. + // + // SAFETY: + // * `ptr` and `last_ptr` must be within bounds of `bytes`. + // * `last_ptr` must be greater or equal to `ptr`. + // * For `0xE2` (LS/PS), `last_ptr` must be no later than 3 bytes before end of string. + // i.e. safe to read 3 bytes at `last_ptr`. + let mut search_bytes = |mut ptr: *const u8, last_ptr| -> Option<&'a str> { + loop { + // SAFETY: `ptr` is always less than or equal to `last_ptr`. + // `last_ptr` is within bounds of `bytes`, so safe to read a byte at `ptr`. + let byte = unsafe { *ptr }; + match byte { + b'\n' => { + // SAFETY: `consumed` is initially 0, and only updated to valid UTF-8 boundaries. + // `index` is on `\n`, so `index` and `index + 1` are UTF-8 char boundaries. + unsafe { + let index = ptr.offset_from(bytes.as_ptr()) as usize; + let line = self.text.get_unchecked(consumed..index); + // Set `consumed` to after `\n` + consumed = index + 1; + self.text = self.text.get_unchecked(consumed..); + return Some(line); + } } - } - b'\r' => { - // SAFETY: Byte at `index` is `\r`, so `index` is on a UTF-8 char boundary - let line = unsafe { self.text.get_unchecked(..index) }; - // If the next byte is `\n`, consume it as well - let skip_bytes = - if self.text.as_bytes().get(index + 1) == Some(&b'\n') { 2 } else { 1 }; - // SAFETY: `index + skip_bytes` is after `\r` or `\n`, so on a UTF-8 char boundary. - // Therefore slice from `index + skip_bytes` is a valid `&str`. - self.text = unsafe { self.text.get_unchecked(index + skip_bytes..) }; - return Some(line); - } - LS_OR_PS_FIRST_BYTE => { - let next2: [u8; 2] = { - // SAFETY: 0xE2 is always the start of a 3-byte Unicode character, - // so there must be 2 more bytes available to consume - let next2 = - unsafe { self.text.as_bytes().get_unchecked(index + 1..index + 3) }; - next2.try_into().unwrap() - }; - // If this is LS or PS, treat it as a line terminator - if matches!(next2, LS_LAST_2_BYTES | PS_LAST_2_BYTES) { - // SAFETY: `index` is the start of a 3-byte Unicode character, - // so `index` and `index + 3` are both UTF-8 char boundaries. - // Therefore, slices up to `index` and from `index + 3` are both valid `&str`s. + b'\r' => { + // SAFETY: `consumed` is initially 0, and only updated to valid UTF-8 boundaries. + // `index` is on `\r`, so `index` is a UTF-8 char boundary. unsafe { - let line = self.text.get_unchecked(..index); - self.text = self.text.get_unchecked(index + 3..); + let index = ptr.offset_from(bytes.as_ptr()) as usize; + let line = self.text.get_unchecked(consumed..index); + // Check if next byte is `\n` and consume it as well + let skip_bytes = + if bytes.get(index + 1) == Some(&b'\n') { 2 } else { 1 }; + // Set `consumed` to after `\r` or `\r\n` + consumed = index + skip_bytes; + self.text = self.text.get_unchecked(consumed..); return Some(line); } } + LS_OR_PS_FIRST_BYTE => { + let next2: [u8; 2] = { + // SAFETY: We ensure `last_ptr` is at least 3 bytes before end, + // so safe to read 2 more bytes after `0xE2` + let next2 = unsafe { + let slice_ptr = ptr.add(1); + std::slice::from_raw_parts(slice_ptr, 2) + }; + [next2[0], next2[1]] + }; + // If this is LS or PS, treat it as a line terminator + if matches!(next2, LS_LAST_2_BYTES | PS_LAST_2_BYTES) { + // SAFETY: `consumed` is initially 0, and only updated to valid UTF-8 boundaries. + // `index` is start of 3-byte Unicode char, so `index` and `index + 3` are UTF-8 boundaries. + unsafe { + let index = ptr.offset_from(bytes.as_ptr()) as usize; + let line = self.text.get_unchecked(consumed..index); + // Set `consumed` to after the 3-byte LS/PS character + consumed = index + 3; + self.text = self.text.get_unchecked(consumed..); + return Some(line); + } + } + } + _ => {} + } + + if ptr == last_ptr { + break; + } + // SAFETY: `ptr` is less than `last_ptr`, which is in bounds, so safe to increment `ptr` + ptr = unsafe { ptr.add(1) }; + } + None + }; + + // Search string in chunks of 16 bytes + let mut chunks = bytes.chunks_exact(16); + for (chunk_index, chunk) in chunks.by_ref().enumerate() { + #[expect(clippy::missing_panics_doc, reason = "infallible")] + let chunk: &[u8; 16] = chunk.try_into().unwrap(); + + // Compiler vectorizes this loop to a few SIMD ops + let mut contains_line_terminator = false; + for &byte in chunk { + if matches!(byte, b'\r' | b'\n' | LS_OR_PS_FIRST_BYTE) { + contains_line_terminator = true; + break; } - _ => {} + } + + if contains_line_terminator { + // Chunk contains at least one line terminator. + // Find them and process. + // + // SAFETY: `index` is byte index of start of chunk. + // We search bytes starting with first byte of chunk, and ending with last byte of chunk. + // i.e. `index` to `index + 15` (inclusive). + // If this chunk is towards the end of the string, reduce the range of bytes searched + // so the last byte searched has at least 2 further bytes after it for LS/PS detection. + // i.e. safe to read 3 bytes at `last_ptr`. + return crate::str::cold_branch(|| unsafe { + let index = chunk_index * 16; + let remaining_bytes = bytes.len() - index; + let last_offset = if remaining_bytes >= 3 { + std::cmp::min(remaining_bytes - 3, 15) + } else { + // Not enough bytes for LS/PS, but still check for \r and \n + if remaining_bytes > 0 { remaining_bytes - 1 } else { 0 } + }; + let ptr = bytes.as_ptr().add(index); + let last_ptr = ptr.add(last_offset); + search_bytes(ptr, last_ptr) + }); + } + } + + // Search last chunk byte-by-byte. + // Skip LS/PS checks if less than 3 bytes remaining. + let last_chunk = chunks.remainder(); + if !last_chunk.is_empty() { + let ptr = last_chunk.as_ptr(); + let last_offset = if last_chunk.len() >= 3 { + last_chunk.len() - 3 + } else { + // Not enough bytes for LS/PS, but still check for \r and \n + if last_chunk.len() > 0 { last_chunk.len() - 1 } else { 0 } + }; + // SAFETY: `last_offset` is calculated to be in bounds of `last_chunk`. + let last_ptr = unsafe { ptr.add(last_offset) }; + if let Some(line) = search_bytes(ptr, last_ptr) { + return Some(line); } } // No line break found - return the remaining text. Next call will return `None`. - let line = self.text; + // SAFETY: `consumed` is either 0 or set to valid UTF-8 boundaries from previous processing. + let line = unsafe { self.text.get_unchecked(consumed..) }; self.text = ""; Some(line) }