Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,4 @@ tasks/prettier_conformance/prettier/
# See also
# * https://stackoverflow.com/a/7335487
# * https://docs.github.com/en/get-started/getting-started-with-git/ignoring-files#configuring-ignored-files-for-all-repositories-on-your-computer
tmp/
181 changes: 141 additions & 40 deletions crates/oxc_codegen/src/comment.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ pub type CommentsMap = FxHashMap</* attached_to */ u32, Vec<Comment>>;
/// Standard split would turn `"line1\r\nline2"` into `["line1", "", "line2"]` because
/// it treats `\r` and `\n` as separate terminators. This iterator correctly produces
/// `["line1", "line2"]` by treating `\r\n` as a single terminator.
struct LineTerminatorSplitter<'a> {
pub(crate) struct LineTerminatorSplitter<'a> {
text: &'a str,
}

impl<'a> LineTerminatorSplitter<'a> {
fn new(text: &'a str) -> Self {
pub(crate) fn new(text: &'a str) -> Self {
Self { text }
}
}
Expand All @@ -40,54 +40,155 @@ impl<'a> Iterator for LineTerminatorSplitter<'a> {
return None;
}

for (index, &byte) in self.text.as_bytes().iter().enumerate() {
match byte {
b'\n' => {
// SAFETY: Byte at `index` is `\n`, so `index` and `index + 1` are both UTF-8 char boundaries.
// Therefore, slices up to `index` and from `index + 1` are both valid `&str`s.
unsafe {
let line = self.text.get_unchecked(..index);
self.text = self.text.get_unchecked(index + 1..);
return Some(line);
// Line terminators will be very rare in most text. So we try to make the search as quick as possible by:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if this assumption is correct. Line breaks are fairly common in block comments.

// 1. Searching for line terminator bytes (`\r`, `\n`, `0xE2`) first, and only checking details once found.
// 2. Searching longer strings in chunks of 16 bytes using SIMD, and only doing the
// more expensive byte-by-byte search once a line terminator is found.

let bytes = self.text.as_bytes();
let mut consumed = 0;

// Search range of bytes for line terminators, byte by byte.
//
// Bytes between `ptr` and `last_ptr` (inclusive) are searched for `\r`, `\n`, or `0xE2`.
// If found, process the line terminator and return the line.
//
// SAFETY:
// * `ptr` and `last_ptr` must be within bounds of `bytes`.
// * `last_ptr` must be greater or equal to `ptr`.
// * For `0xE2` (LS/PS), `last_ptr` must be no later than 3 bytes before end of string.
// i.e. safe to read 3 bytes at `last_ptr`.
let mut search_bytes = |mut ptr: *const u8, last_ptr| -> Option<&'a str> {
loop {
// SAFETY: `ptr` is always less than or equal to `last_ptr`.
// `last_ptr` is within bounds of `bytes`, so safe to read a byte at `ptr`.
let byte = unsafe { *ptr };
match byte {
b'\n' => {
// SAFETY: `consumed` is initially 0, and only updated to valid UTF-8 boundaries.
// `index` is on `\n`, so `index` and `index + 1` are UTF-8 char boundaries.
unsafe {
let index = ptr.offset_from(bytes.as_ptr()) as usize;
let line = self.text.get_unchecked(consumed..index);
// Set `consumed` to after `\n`
consumed = index + 1;
self.text = self.text.get_unchecked(consumed..);
return Some(line);
}
}
}
b'\r' => {
// SAFETY: Byte at `index` is `\r`, so `index` is on a UTF-8 char boundary
let line = unsafe { self.text.get_unchecked(..index) };
// If the next byte is `\n`, consume it as well
let skip_bytes =
if self.text.as_bytes().get(index + 1) == Some(&b'\n') { 2 } else { 1 };
// SAFETY: `index + skip_bytes` is after `\r` or `\n`, so on a UTF-8 char boundary.
// Therefore slice from `index + skip_bytes` is a valid `&str`.
self.text = unsafe { self.text.get_unchecked(index + skip_bytes..) };
return Some(line);
}
LS_OR_PS_FIRST_BYTE => {
let next2: [u8; 2] = {
// SAFETY: 0xE2 is always the start of a 3-byte Unicode character,
// so there must be 2 more bytes available to consume
let next2 =
unsafe { self.text.as_bytes().get_unchecked(index + 1..index + 3) };
next2.try_into().unwrap()
};
// If this is LS or PS, treat it as a line terminator
if matches!(next2, LS_LAST_2_BYTES | PS_LAST_2_BYTES) {
// SAFETY: `index` is the start of a 3-byte Unicode character,
// so `index` and `index + 3` are both UTF-8 char boundaries.
// Therefore, slices up to `index` and from `index + 3` are both valid `&str`s.
b'\r' => {
// SAFETY: `consumed` is initially 0, and only updated to valid UTF-8 boundaries.
// `index` is on `\r`, so `index` is a UTF-8 char boundary.
unsafe {
let line = self.text.get_unchecked(..index);
self.text = self.text.get_unchecked(index + 3..);
let index = ptr.offset_from(bytes.as_ptr()) as usize;
let line = self.text.get_unchecked(consumed..index);
// Check if next byte is `\n` and consume it as well
let skip_bytes =
if bytes.get(index + 1) == Some(&b'\n') { 2 } else { 1 };
// Set `consumed` to after `\r` or `\r\n`
consumed = index + skip_bytes;
self.text = self.text.get_unchecked(consumed..);
return Some(line);
}
}
LS_OR_PS_FIRST_BYTE => {
let next2: [u8; 2] = {
// SAFETY: We ensure `last_ptr` is at least 3 bytes before end,
// so safe to read 2 more bytes after `0xE2`
let next2 = unsafe {
let slice_ptr = ptr.add(1);
std::slice::from_raw_parts(slice_ptr, 2)
};
[next2[0], next2[1]]
};
// If this is LS or PS, treat it as a line terminator
if matches!(next2, LS_LAST_2_BYTES | PS_LAST_2_BYTES) {
// SAFETY: `consumed` is initially 0, and only updated to valid UTF-8 boundaries.
// `index` is start of 3-byte Unicode char, so `index` and `index + 3` are UTF-8 boundaries.
unsafe {
let index = ptr.offset_from(bytes.as_ptr()) as usize;
let line = self.text.get_unchecked(consumed..index);
// Set `consumed` to after the 3-byte LS/PS character
consumed = index + 3;
self.text = self.text.get_unchecked(consumed..);
return Some(line);
}
}
}
_ => {}
}

if ptr == last_ptr {
break;
}
// SAFETY: `ptr` is less than `last_ptr`, which is in bounds, so safe to increment `ptr`
ptr = unsafe { ptr.add(1) };
}
None
};

// Search string in chunks of 16 bytes
let mut chunks = bytes.chunks_exact(16);
for (chunk_index, chunk) in chunks.by_ref().enumerate() {
#[expect(clippy::missing_panics_doc, reason = "infallible")]
let chunk: &[u8; 16] = chunk.try_into().unwrap();

// Compiler vectorizes this loop to a few SIMD ops
let mut contains_line_terminator = false;
for &byte in chunk {
if matches!(byte, b'\r' | b'\n' | LS_OR_PS_FIRST_BYTE) {
contains_line_terminator = true;
break;
}
_ => {}
}
Comment on lines +136 to +143
Copy link
Member

@overlookmotel overlookmotel Sep 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Compiler does not vectorize this. The match is too complicated, and it uses byte-by-byte search.

https://godbolt.org/z/o3hP4cEMv

(for context, if it was using SIMD, it'd be about ~8 instructions, and you'd see instructions using xmm registers)

It's really hard to get the compiler to auto-vectorize cases like this.

The premise of this PR is that it's using SIMD for better perf. But as it's not actually using SIMD, unfortunately I doubt this PR has much value. It may even regress perf as it's searching bytes twice instead of once.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This works better: https://godbolt.org/z/r795Gsq3T (but still not quite ideal - it should be possible to use only 1 pmovmskb instruction, rather than 3).


if contains_line_terminator {
// Chunk contains at least one line terminator.
// Find them and process.
//
// SAFETY: `index` is byte index of start of chunk.
// We search bytes starting with first byte of chunk, and ending with last byte of chunk.
// i.e. `index` to `index + 15` (inclusive).
// If this chunk is towards the end of the string, reduce the range of bytes searched
// so the last byte searched has at least 2 further bytes after it for LS/PS detection.
// i.e. safe to read 3 bytes at `last_ptr`.
return crate::str::cold_branch(|| unsafe {
let index = chunk_index * 16;
let remaining_bytes = bytes.len() - index;
let last_offset = if remaining_bytes >= 3 {
std::cmp::min(remaining_bytes - 3, 15)
} else {
// Not enough bytes for LS/PS, but still check for \r and \n
if remaining_bytes > 0 { remaining_bytes - 1 } else { 0 }
};
let ptr = bytes.as_ptr().add(index);
let last_ptr = ptr.add(last_offset);
search_bytes(ptr, last_ptr)
});
}
}

// Search last chunk byte-by-byte.
// Skip LS/PS checks if less than 3 bytes remaining.
let last_chunk = chunks.remainder();
if !last_chunk.is_empty() {
let ptr = last_chunk.as_ptr();
let last_offset = if last_chunk.len() >= 3 {
last_chunk.len() - 3
} else {
// Not enough bytes for LS/PS, but still check for \r and \n
if last_chunk.len() > 0 { last_chunk.len() - 1 } else { 0 }
};
// SAFETY: `last_offset` is calculated to be in bounds of `last_chunk`.
let last_ptr = unsafe { ptr.add(last_offset) };
if let Some(line) = search_bytes(ptr, last_ptr) {
return Some(line);
}
}

// No line break found - return the remaining text. Next call will return `None`.
let line = self.text;
// SAFETY: `consumed` is either 0 or set to valid UTF-8 boundaries from previous processing.
let line = unsafe { self.text.get_unchecked(consumed..) };
self.text = "";
Some(line)
}
Expand Down
Loading