-
-
Notifications
You must be signed in to change notification settings - Fork 861
codegen: Optimize multiline comments handling with SIMD processing #13593
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,12 +22,12 @@ pub type CommentsMap = FxHashMap</* attached_to */ u32, Vec<Comment>>; | |
| /// Standard split would turn `"line1\r\nline2"` into `["line1", "", "line2"]` because | ||
| /// it treats `\r` and `\n` as separate terminators. This iterator correctly produces | ||
| /// `["line1", "line2"]` by treating `\r\n` as a single terminator. | ||
| struct LineTerminatorSplitter<'a> { | ||
| pub(crate) struct LineTerminatorSplitter<'a> { | ||
| text: &'a str, | ||
| } | ||
|
|
||
| impl<'a> LineTerminatorSplitter<'a> { | ||
| fn new(text: &'a str) -> Self { | ||
| pub(crate) fn new(text: &'a str) -> Self { | ||
| Self { text } | ||
| } | ||
| } | ||
|
|
@@ -40,54 +40,155 @@ impl<'a> Iterator for LineTerminatorSplitter<'a> { | |
| return None; | ||
| } | ||
|
|
||
| for (index, &byte) in self.text.as_bytes().iter().enumerate() { | ||
| match byte { | ||
| b'\n' => { | ||
| // SAFETY: Byte at `index` is `\n`, so `index` and `index + 1` are both UTF-8 char boundaries. | ||
| // Therefore, slices up to `index` and from `index + 1` are both valid `&str`s. | ||
| unsafe { | ||
| let line = self.text.get_unchecked(..index); | ||
| self.text = self.text.get_unchecked(index + 1..); | ||
| return Some(line); | ||
| // Line terminators will be very rare in most text. So we try to make the search as quick as possible by: | ||
| // 1. Searching for line terminator bytes (`\r`, `\n`, `0xE2`) first, and only checking details once found. | ||
| // 2. Searching longer strings in chunks of 16 bytes using SIMD, and only doing the | ||
| // more expensive byte-by-byte search once a line terminator is found. | ||
|
|
||
| let bytes = self.text.as_bytes(); | ||
| let mut consumed = 0; | ||
|
|
||
| // Search range of bytes for line terminators, byte by byte. | ||
| // | ||
| // Bytes between `ptr` and `last_ptr` (inclusive) are searched for `\r`, `\n`, or `0xE2`. | ||
| // If found, process the line terminator and return the line. | ||
| // | ||
| // SAFETY: | ||
| // * `ptr` and `last_ptr` must be within bounds of `bytes`. | ||
| // * `last_ptr` must be greater or equal to `ptr`. | ||
| // * For `0xE2` (LS/PS), `last_ptr` must be no later than 3 bytes before end of string. | ||
| // i.e. safe to read 3 bytes at `last_ptr`. | ||
| let mut search_bytes = |mut ptr: *const u8, last_ptr| -> Option<&'a str> { | ||
| loop { | ||
| // SAFETY: `ptr` is always less than or equal to `last_ptr`. | ||
| // `last_ptr` is within bounds of `bytes`, so safe to read a byte at `ptr`. | ||
| let byte = unsafe { *ptr }; | ||
| match byte { | ||
| b'\n' => { | ||
| // SAFETY: `consumed` is initially 0, and only updated to valid UTF-8 boundaries. | ||
| // `index` is on `\n`, so `index` and `index + 1` are UTF-8 char boundaries. | ||
| unsafe { | ||
| let index = ptr.offset_from(bytes.as_ptr()) as usize; | ||
| let line = self.text.get_unchecked(consumed..index); | ||
| // Set `consumed` to after `\n` | ||
| consumed = index + 1; | ||
| self.text = self.text.get_unchecked(consumed..); | ||
| return Some(line); | ||
| } | ||
| } | ||
| } | ||
| b'\r' => { | ||
| // SAFETY: Byte at `index` is `\r`, so `index` is on a UTF-8 char boundary | ||
| let line = unsafe { self.text.get_unchecked(..index) }; | ||
| // If the next byte is `\n`, consume it as well | ||
| let skip_bytes = | ||
| if self.text.as_bytes().get(index + 1) == Some(&b'\n') { 2 } else { 1 }; | ||
| // SAFETY: `index + skip_bytes` is after `\r` or `\n`, so on a UTF-8 char boundary. | ||
| // Therefore slice from `index + skip_bytes` is a valid `&str`. | ||
| self.text = unsafe { self.text.get_unchecked(index + skip_bytes..) }; | ||
| return Some(line); | ||
| } | ||
| LS_OR_PS_FIRST_BYTE => { | ||
| let next2: [u8; 2] = { | ||
| // SAFETY: 0xE2 is always the start of a 3-byte Unicode character, | ||
| // so there must be 2 more bytes available to consume | ||
| let next2 = | ||
| unsafe { self.text.as_bytes().get_unchecked(index + 1..index + 3) }; | ||
| next2.try_into().unwrap() | ||
| }; | ||
| // If this is LS or PS, treat it as a line terminator | ||
| if matches!(next2, LS_LAST_2_BYTES | PS_LAST_2_BYTES) { | ||
| // SAFETY: `index` is the start of a 3-byte Unicode character, | ||
| // so `index` and `index + 3` are both UTF-8 char boundaries. | ||
| // Therefore, slices up to `index` and from `index + 3` are both valid `&str`s. | ||
| b'\r' => { | ||
| // SAFETY: `consumed` is initially 0, and only updated to valid UTF-8 boundaries. | ||
| // `index` is on `\r`, so `index` is a UTF-8 char boundary. | ||
| unsafe { | ||
| let line = self.text.get_unchecked(..index); | ||
| self.text = self.text.get_unchecked(index + 3..); | ||
| let index = ptr.offset_from(bytes.as_ptr()) as usize; | ||
| let line = self.text.get_unchecked(consumed..index); | ||
| // Check if next byte is `\n` and consume it as well | ||
| let skip_bytes = | ||
| if bytes.get(index + 1) == Some(&b'\n') { 2 } else { 1 }; | ||
| // Set `consumed` to after `\r` or `\r\n` | ||
| consumed = index + skip_bytes; | ||
| self.text = self.text.get_unchecked(consumed..); | ||
| return Some(line); | ||
| } | ||
| } | ||
| LS_OR_PS_FIRST_BYTE => { | ||
| let next2: [u8; 2] = { | ||
| // SAFETY: We ensure `last_ptr` is at least 3 bytes before end, | ||
| // so safe to read 2 more bytes after `0xE2` | ||
| let next2 = unsafe { | ||
| let slice_ptr = ptr.add(1); | ||
| std::slice::from_raw_parts(slice_ptr, 2) | ||
| }; | ||
| [next2[0], next2[1]] | ||
| }; | ||
| // If this is LS or PS, treat it as a line terminator | ||
| if matches!(next2, LS_LAST_2_BYTES | PS_LAST_2_BYTES) { | ||
| // SAFETY: `consumed` is initially 0, and only updated to valid UTF-8 boundaries. | ||
| // `index` is start of 3-byte Unicode char, so `index` and `index + 3` are UTF-8 boundaries. | ||
| unsafe { | ||
| let index = ptr.offset_from(bytes.as_ptr()) as usize; | ||
| let line = self.text.get_unchecked(consumed..index); | ||
| // Set `consumed` to after the 3-byte LS/PS character | ||
| consumed = index + 3; | ||
| self.text = self.text.get_unchecked(consumed..); | ||
| return Some(line); | ||
| } | ||
| } | ||
| } | ||
| _ => {} | ||
| } | ||
|
|
||
| if ptr == last_ptr { | ||
| break; | ||
| } | ||
| // SAFETY: `ptr` is less than `last_ptr`, which is in bounds, so safe to increment `ptr` | ||
| ptr = unsafe { ptr.add(1) }; | ||
| } | ||
| None | ||
| }; | ||
|
|
||
| // Search string in chunks of 16 bytes | ||
| let mut chunks = bytes.chunks_exact(16); | ||
| for (chunk_index, chunk) in chunks.by_ref().enumerate() { | ||
| #[expect(clippy::missing_panics_doc, reason = "infallible")] | ||
| let chunk: &[u8; 16] = chunk.try_into().unwrap(); | ||
|
|
||
| // Compiler vectorizes this loop to a few SIMD ops | ||
| let mut contains_line_terminator = false; | ||
| for &byte in chunk { | ||
| if matches!(byte, b'\r' | b'\n' | LS_OR_PS_FIRST_BYTE) { | ||
| contains_line_terminator = true; | ||
| break; | ||
| } | ||
| _ => {} | ||
| } | ||
|
Comment on lines
+136
to
+143
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Compiler does not vectorize this. The match is too complicated, and it uses byte-by-byte search. https://godbolt.org/z/o3hP4cEMv (for context, if it was using SIMD, it'd be about ~8 instructions, and you'd see instructions using It's really hard to get the compiler to auto-vectorize cases like this. The premise of this PR is that it's using SIMD for better perf. But as it's not actually using SIMD, unfortunately I doubt this PR has much value. It may even regress perf as it's searching bytes twice instead of once.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This works better: https://godbolt.org/z/r795Gsq3T (but still not quite ideal — it should be possible to use only 1 comparison instruction). |
||
|
|
||
| if contains_line_terminator { | ||
| // Chunk contains at least one line terminator. | ||
| // Find them and process. | ||
| // | ||
| // SAFETY: `index` is byte index of start of chunk. | ||
| // We search bytes starting with first byte of chunk, and ending with last byte of chunk. | ||
| // i.e. `index` to `index + 15` (inclusive). | ||
| // If this chunk is towards the end of the string, reduce the range of bytes searched | ||
| // so the last byte searched has at least 2 further bytes after it for LS/PS detection. | ||
| // i.e. safe to read 3 bytes at `last_ptr`. | ||
| return crate::str::cold_branch(|| unsafe { | ||
| let index = chunk_index * 16; | ||
| let remaining_bytes = bytes.len() - index; | ||
| let last_offset = if remaining_bytes >= 3 { | ||
| std::cmp::min(remaining_bytes - 3, 15) | ||
| } else { | ||
| // Not enough bytes for LS/PS, but still check for \r and \n | ||
| if remaining_bytes > 0 { remaining_bytes - 1 } else { 0 } | ||
| }; | ||
| let ptr = bytes.as_ptr().add(index); | ||
| let last_ptr = ptr.add(last_offset); | ||
| search_bytes(ptr, last_ptr) | ||
| }); | ||
| } | ||
| } | ||
|
|
||
| // Search last chunk byte-by-byte. | ||
| // Skip LS/PS checks if less than 3 bytes remaining. | ||
| let last_chunk = chunks.remainder(); | ||
| if !last_chunk.is_empty() { | ||
| let ptr = last_chunk.as_ptr(); | ||
| let last_offset = if last_chunk.len() >= 3 { | ||
| last_chunk.len() - 3 | ||
| } else { | ||
| // Not enough bytes for LS/PS, but still check for \r and \n | ||
| if last_chunk.len() > 0 { last_chunk.len() - 1 } else { 0 } | ||
| }; | ||
| // SAFETY: `last_offset` is calculated to be in bounds of `last_chunk`. | ||
| let last_ptr = unsafe { ptr.add(last_offset) }; | ||
| if let Some(line) = search_bytes(ptr, last_ptr) { | ||
| return Some(line); | ||
| } | ||
| } | ||
|
|
||
| // No line break found - return the remaining text. Next call will return `None`. | ||
| let line = self.text; | ||
| // SAFETY: `consumed` is either 0 or set to valid UTF-8 boundaries from previous processing. | ||
| let line = unsafe { self.text.get_unchecked(consumed..) }; | ||
| self.text = ""; | ||
| Some(line) | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure if this assumption is correct. Line breaks are fairly common in block comments.