-
-
Notifications
You must be signed in to change notification settings - Fork 861
codegen: Optimize multiline comments handling with SIMD processing #13593
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,12 +22,12 @@ pub type CommentsMap = FxHashMap</* attached_to */ u32, Vec<Comment>>; | |
| /// Standard split would turn `"line1\r\nline2"` into `["line1", "", "line2"]` because | ||
| /// it treats `\r` and `\n` as separate terminators. This iterator correctly produces | ||
| /// `["line1", "line2"]` by treating `\r\n` as a single terminator. | ||
| struct LineTerminatorSplitter<'a> { | ||
| pub(crate) struct LineTerminatorSplitter<'a> { | ||
| text: &'a str, | ||
| } | ||
|
|
||
| impl<'a> LineTerminatorSplitter<'a> { | ||
| fn new(text: &'a str) -> Self { | ||
| pub(crate) fn new(text: &'a str) -> Self { | ||
| Self { text } | ||
| } | ||
| } | ||
|
|
@@ -40,54 +40,155 @@ impl<'a> Iterator for LineTerminatorSplitter<'a> { | |
| return None; | ||
| } | ||
|
|
||
| for (index, &byte) in self.text.as_bytes().iter().enumerate() { | ||
| match byte { | ||
| b'\n' => { | ||
| // SAFETY: Byte at `index` is `\n`, so `index` and `index + 1` are both UTF-8 char boundaries. | ||
| // Therefore, slices up to `index` and from `index + 1` are both valid `&str`s. | ||
| unsafe { | ||
| let line = self.text.get_unchecked(..index); | ||
| self.text = self.text.get_unchecked(index + 1..); | ||
| return Some(line); | ||
| // Line terminators will be very rare in most text. So we try to make the search as quick as possible by: | ||
| // 1. Searching for line terminator bytes (`\r`, `\n`, `0xE2`) first, and only checking details once found. | ||
| // 2. Searching longer strings in chunks of 16 bytes using SIMD, and only doing the | ||
| // more expensive byte-by-byte search once a line terminator is found. | ||
|
|
||
| let bytes = self.text.as_bytes(); | ||
| let mut consumed = 0; | ||
|
|
||
| // Search range of bytes for line terminators, byte by byte. | ||
| // | ||
| // Bytes between `ptr` and `last_ptr` (inclusive) are searched for `\r`, `\n`, or `0xE2`. | ||
| // If found, process the line terminator and return the line. | ||
| // | ||
| // SAFETY: | ||
| // * `ptr` and `last_ptr` must be within bounds of `bytes`. | ||
| // * `last_ptr` must be greater or equal to `ptr`. | ||
| // * For `0xE2` (LS/PS), `last_ptr` must be no later than 3 bytes before end of string. | ||
| // i.e. safe to read 3 bytes at `last_ptr`. | ||
| let mut search_bytes = |mut ptr: *const u8, last_ptr| -> Option<&'a str> { | ||
| loop { | ||
| // SAFETY: `ptr` is always less than or equal to `last_ptr`. | ||
| // `last_ptr` is within bounds of `bytes`, so safe to read a byte at `ptr`. | ||
| let byte = unsafe { *ptr }; | ||
| match byte { | ||
| b'\n' => { | ||
| // SAFETY: `consumed` is initially 0, and only updated to valid UTF-8 boundaries. | ||
| // `index` is on `\n`, so `index` and `index + 1` are UTF-8 char boundaries. | ||
| unsafe { | ||
| let index = ptr.offset_from(bytes.as_ptr()) as usize; | ||
| let line = self.text.get_unchecked(consumed..index); | ||
| // Set `consumed` to after `\n` | ||
| consumed = index + 1; | ||
| self.text = self.text.get_unchecked(consumed..); | ||
| return Some(line); | ||
| } | ||
| } | ||
| } | ||
| b'\r' => { | ||
| // SAFETY: Byte at `index` is `\r`, so `index` is on a UTF-8 char boundary | ||
| let line = unsafe { self.text.get_unchecked(..index) }; | ||
| // If the next byte is `\n`, consume it as well | ||
| let skip_bytes = | ||
| if self.text.as_bytes().get(index + 1) == Some(&b'\n') { 2 } else { 1 }; | ||
| // SAFETY: `index + skip_bytes` is after `\r` or `\n`, so on a UTF-8 char boundary. | ||
| // Therefore slice from `index + skip_bytes` is a valid `&str`. | ||
| self.text = unsafe { self.text.get_unchecked(index + skip_bytes..) }; | ||
| return Some(line); | ||
| } | ||
| LS_OR_PS_FIRST_BYTE => { | ||
| let next2: [u8; 2] = { | ||
| // SAFETY: 0xE2 is always the start of a 3-byte Unicode character, | ||
| // so there must be 2 more bytes available to consume | ||
| let next2 = | ||
| unsafe { self.text.as_bytes().get_unchecked(index + 1..index + 3) }; | ||
| next2.try_into().unwrap() | ||
| }; | ||
| // If this is LS or PS, treat it as a line terminator | ||
| if matches!(next2, LS_LAST_2_BYTES | PS_LAST_2_BYTES) { | ||
| // SAFETY: `index` is the start of a 3-byte Unicode character, | ||
| // so `index` and `index + 3` are both UTF-8 char boundaries. | ||
| // Therefore, slices up to `index` and from `index + 3` are both valid `&str`s. | ||
| b'\r' => { | ||
| // SAFETY: `consumed` is initially 0, and only updated to valid UTF-8 boundaries. | ||
| // `index` is on `\r`, so `index` is a UTF-8 char boundary. | ||
| unsafe { | ||
| let line = self.text.get_unchecked(..index); | ||
| self.text = self.text.get_unchecked(index + 3..); | ||
| let index = ptr.offset_from(bytes.as_ptr()) as usize; | ||
| let line = self.text.get_unchecked(consumed..index); | ||
| // Check if next byte is `\n` and consume it as well | ||
| let skip_bytes = | ||
| if bytes.get(index + 1) == Some(&b'\n') { 2 } else { 1 }; | ||
| // Set `consumed` to after `\r` or `\r\n` | ||
| consumed = index + skip_bytes; | ||
| self.text = self.text.get_unchecked(consumed..); | ||
| return Some(line); | ||
| } | ||
| } | ||
| LS_OR_PS_FIRST_BYTE => { | ||
| let next2: [u8; 2] = { | ||
| // SAFETY: We ensure `last_ptr` is at least 3 bytes before end, | ||
| // so safe to read 2 more bytes after `0xE2` | ||
| let next2 = unsafe { | ||
| let slice_ptr = ptr.add(1); | ||
| std::slice::from_raw_parts(slice_ptr, 2) | ||
| }; | ||
| [next2[0], next2[1]] | ||
| }; | ||
| // If this is LS or PS, treat it as a line terminator | ||
| if matches!(next2, LS_LAST_2_BYTES | PS_LAST_2_BYTES) { | ||
| // SAFETY: `consumed` is initially 0, and only updated to valid UTF-8 boundaries. | ||
| // `index` is start of 3-byte Unicode char, so `index` and `index + 3` are UTF-8 boundaries. | ||
| unsafe { | ||
| let index = ptr.offset_from(bytes.as_ptr()) as usize; | ||
| let line = self.text.get_unchecked(consumed..index); | ||
| // Set `consumed` to after the 3-byte LS/PS character | ||
| consumed = index + 3; | ||
| self.text = self.text.get_unchecked(consumed..); | ||
| return Some(line); | ||
| } | ||
| } | ||
| } | ||
| _ => {} | ||
| } | ||
|
|
||
| if ptr == last_ptr { | ||
| break; | ||
| } | ||
| // SAFETY: `ptr` is less than `last_ptr`, which is in bounds, so safe to increment `ptr` | ||
| ptr = unsafe { ptr.add(1) }; | ||
| } | ||
| None | ||
| }; | ||
|
|
||
| // Search string in chunks of 16 bytes | ||
| let mut chunks = bytes.chunks_exact(16); | ||
| for (chunk_index, chunk) in chunks.by_ref().enumerate() { | ||
| #[expect(clippy::missing_panics_doc, reason = "infallible")] | ||
| let chunk: &[u8; 16] = chunk.try_into().unwrap(); | ||
|
|
||
| // Compiler vectorizes this loop to a few SIMD ops | ||
| let mut contains_line_terminator = false; | ||
| for &byte in chunk { | ||
| if matches!(byte, b'\r' | b'\n' | LS_OR_PS_FIRST_BYTE) { | ||
| contains_line_terminator = true; | ||
| break; | ||
| } | ||
| _ => {} | ||
| } | ||
|
Comment on lines
+136
to
+143
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Compiler does not vectorize this. The match is too complicated, and it uses byte-by-byte search. https://godbolt.org/z/o3hP4cEMv (for context, if it was using SIMD, it'd be about ~8 instructions, and you'd see instructions using It's really hard to get the compiler to auto-vectorize cases like this. The premise of this PR is that it's using SIMD for better perf. But as it's not actually using SIMD, unfortunately I doubt this PR has much value. It may even regress perf as it's searching bytes twice instead of once.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This works better: https://godbolt.org/z/r795Gsq3T (but still not quite ideal — it should be possible to use only 1 comparison instruction). |
||
|
|
||
| if contains_line_terminator { | ||
| // Chunk contains at least one line terminator. | ||
| // Find them and process. | ||
| // | ||
| // SAFETY: `index` is byte index of start of chunk. | ||
| // We search bytes starting with first byte of chunk, and ending with last byte of chunk. | ||
| // i.e. `index` to `index + 15` (inclusive). | ||
| // If this chunk is towards the end of the string, reduce the range of bytes searched | ||
| // so the last byte searched has at least 2 further bytes after it for LS/PS detection. | ||
| // i.e. safe to read 3 bytes at `last_ptr`. | ||
| return crate::str::cold_branch(|| unsafe { | ||
| let index = chunk_index * 16; | ||
| let remaining_bytes = bytes.len() - index; | ||
| let last_offset = if remaining_bytes >= 3 { | ||
| std::cmp::min(remaining_bytes - 3, 15) | ||
| } else { | ||
| // Not enough bytes for LS/PS, but still check for \r and \n | ||
| if remaining_bytes > 0 { remaining_bytes - 1 } else { 0 } | ||
| }; | ||
| let ptr = bytes.as_ptr().add(index); | ||
| let last_ptr = ptr.add(last_offset); | ||
| search_bytes(ptr, last_ptr) | ||
| }); | ||
| } | ||
| } | ||
|
|
||
| // Search last chunk byte-by-byte. | ||
| // Skip LS/PS checks if less than 3 bytes remaining. | ||
| let last_chunk = chunks.remainder(); | ||
| if !last_chunk.is_empty() { | ||
| let ptr = last_chunk.as_ptr(); | ||
| let last_offset = if last_chunk.len() >= 3 { | ||
| last_chunk.len() - 3 | ||
| } else { | ||
| // Not enough bytes for LS/PS, but still check for \r and \n | ||
| if last_chunk.len() > 0 { last_chunk.len() - 1 } else { 0 } | ||
| }; | ||
| // SAFETY: `last_offset` is calculated to be in bounds of `last_chunk`. | ||
| let last_ptr = unsafe { ptr.add(last_offset) }; | ||
| if let Some(line) = search_bytes(ptr, last_ptr) { | ||
| return Some(line); | ||
| } | ||
| } | ||
|
|
||
| // No line break found - return the remaining text. Next call will return `None`. | ||
| let line = self.text; | ||
| // SAFETY: `consumed` is either 0 or set to valid UTF-8 boundaries from previous processing. | ||
| let line = unsafe { self.text.get_unchecked(consumed..) }; | ||
| self.text = ""; | ||
| Some(line) | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure if this assumption is correct. Line breaks are fairly common in block comments.