Fix is_ascii performance regression on AVX-512 CPUs when compiling with -C target-cpu=native #151259
Changes from all commits
```diff
@@ -3,10 +3,7 @@
 use core::ascii::EscapeDefault;
 
 use crate::fmt::{self, Write};
-#[cfg(not(any(
-    all(target_arch = "x86_64", target_feature = "sse2"),
-    all(target_arch = "loongarch64", target_feature = "lsx")
-)))]
+#[cfg(not(all(target_arch = "loongarch64", target_feature = "lsx")))]
 use crate::intrinsics::const_eval_select;
 use crate::{ascii, iter, ops};
```
|
```diff
@@ -463,19 +460,101 @@ const fn is_ascii(s: &[u8]) -> bool {
     )
 }
 
-/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64` and the
-/// `vmskltz.b` instruction on `loongarch64`.
+/// Chunk size for vectorized ASCII checking (two 16-byte SSE registers).
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+const CHUNK_SIZE: usize = 32;
+
+/// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
+/// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
+///
+/// FIXME(llvm#176906): Remove this workaround once LLVM generates efficient code.
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+fn is_ascii_sse2(bytes: &[u8]) -> bool {
```
**Contributor:** Did you consider writing the loop using https://rust.godbolt.org/z/vfxK51fq8? Somehow LLVM seems to understand this less well, but it does remove a bunch of the unsafety from the loop.

**Author:** Pretty sure I tested it earlier, but that benchmark showed a regression.

**Author:** I tested with …

**Contributor:** Wild. Could you open a separate issue for that performance issue? I'd love to see what our LLVM and mir-opt people think about that. It just seems like it should be equivalent, so this might be one of those cases where LLVM overfits on what …

**Author:** I checked everything again.
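For context, the suggestion above amounts to replacing the manual pointer loop with a fully safe chunked loop. A minimal sketch of that shape (an assumption about what the godbolt link shows, not the PR's code; `is_ascii_safe` is a hypothetical name):

```rust
/// Hypothetical sketch of the safe loop shape discussed in the thread above.
/// It removes the `unsafe` blocks, but per the thread LLVM did not optimize
/// it as well as the explicit SSE2 intrinsics.
fn is_ascii_safe(bytes: &[u8]) -> bool {
    let mut chunks = bytes.chunks_exact(32);
    for chunk in &mut chunks {
        // OR all bytes together; any non-ASCII byte leaves the high bit set.
        let folded = chunk.iter().fold(0u8, |acc, &b| acc | b);
        if folded & 0x80 != 0 {
            return false;
        }
    }
    // Check the tail (fewer than 32 bytes) with a scalar loop.
    chunks.remainder().iter().all(|b| b.is_ascii())
}
```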
```diff
+    use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};
+
+    let mut i = 0;
+
+    while i + CHUNK_SIZE <= bytes.len() {
+        // SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`.
+        let ptr = unsafe { bytes.as_ptr().add(i) };
+
+        // Load two 16-byte chunks and combine them.
+        // SAFETY: We verified `i + 32 <= len`, so ptr is valid for 32 bytes.
+        // `_mm_loadu_si128` allows unaligned loads.
+        let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
+        // SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range.
+        let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };
+
+        // OR them together - if any byte has the high bit set, the result will too.
+        // SAFETY: SSE2 is guaranteed by the cfg predicate.
+        let combined = unsafe { _mm_or_si128(chunk1, chunk2) };
+
+        // Create a mask from the MSBs of each byte.
+        // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
+        // SAFETY: SSE2 is guaranteed by the cfg predicate.
+        let mask = unsafe { _mm_movemask_epi8(combined) };
+
+        if mask != 0 {
+            return false;
+        }
+
+        i += CHUNK_SIZE;
+    }
+
+    // Handle remaining bytes with a simple loop.
+    while i < bytes.len() {
+        if !bytes[i].is_ascii() {
+            return false;
+        }
+        i += 1;
+    }
+
+    true
+}
```
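For readers unfamiliar with the intrinsic: `_mm_movemask_epi8` packs the most significant bit of each of the 16 input bytes into the low 16 bits of an `i32`, so a single integer comparison tests all high bits at once. A standalone demonstration of that behavior (a hedged sketch using `std::arch` rather than the library-internal `crate::arch`; `high_bits_of` is a hypothetical helper):

```rust
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
fn high_bits_of(bytes: &[u8; 16]) -> i32 {
    use std::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8};
    // SAFETY: a 16-byte array is valid for an unaligned 128-bit load, and
    // SSE2 availability is guaranteed by the cfg predicate.
    unsafe { _mm_movemask_epi8(_mm_loadu_si128(bytes.as_ptr() as *const __m128i)) }
}

fn main() {
    #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
    {
        let ascii = [b'a'; 16];
        assert_eq!(high_bits_of(&ascii), 0); // no byte has its high bit set

        let mut non_ascii = ascii;
        non_ascii[3] = 0xC3; // a UTF-8 lead byte; its high bit is set
        assert_eq!(high_bits_of(&non_ascii), 1 << 3); // only mask bit 3 is set
    }
}
```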
```diff
+/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`.
+///
+/// Uses explicit SSE2 intrinsics to prevent LLVM from auto-vectorizing with
+/// broken AVX-512 code that extracts mask bits one-by-one.
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+#[inline]
+#[rustc_allow_const_fn_unstable(const_eval_select)]
+const fn is_ascii(bytes: &[u8]) -> bool {
+    const USIZE_SIZE: usize = size_of::<usize>();
+    const NONASCII_MASK: usize = usize::MAX / 255 * 0x80;
+
+    const_eval_select!(
+        @capture { bytes: &[u8] } -> bool:
+        if const {
+            is_ascii_simple(bytes)
+        } else {
+            // For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead.
+            if bytes.len() < CHUNK_SIZE {
+                let chunks = bytes.chunks_exact(USIZE_SIZE);
+                let remainder = chunks.remainder();
+                for chunk in chunks {
+                    let word = usize::from_ne_bytes(chunk.try_into().unwrap());
+                    if (word & NONASCII_MASK) != 0 {
+                        return false;
+                    }
+                }
+                return remainder.iter().all(|b| b.is_ascii());
+            }
+
+            is_ascii_sse2(bytes)
+        }
+    )
+}
```
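The `NONASCII_MASK` constant above is the usual SWAR trick: `usize::MAX / 255` puts `0x01` in every byte, and multiplying by `0x80` turns that into `0x80` in every byte, so `word & NONASCII_MASK != 0` exactly when some byte has its high bit set. A small worked example (assumes a 64-bit target):

```rust
// Worked example of the SWAR mask used above; assumes a 64-bit target.
const NONASCII_MASK: usize = usize::MAX / 255 * 0x80;

fn main() {
    // usize::MAX / 255 == 0x0101_0101_0101_0101; multiplying by 0x80
    // replicates 0x80 into every byte.
    assert_eq!(NONASCII_MASK, 0x8080_8080_8080_8080);

    // All-ASCII word: every byte is below 0x80, so the mask finds nothing.
    let ascii = usize::from_ne_bytes(*b"AAAAAAAA");
    assert_eq!(ascii & NONASCII_MASK, 0);

    // Word containing the UTF-8 bytes for 'é' (0xC3 0xA9): high bits are set.
    let mixed = usize::from_ne_bytes(*b"AAAA\xC3\xA9AA");
    assert_ne!(mixed & NONASCII_MASK, 0);
}
```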
```diff
+/// ASCII test optimized to use the `vmskltz.b` instruction on `loongarch64`.
 ///
 /// Other platforms are not likely to benefit from this code structure, so they
 /// use SWAR techniques to test for ASCII in `usize`-sized chunks.
-#[cfg(any(
-    all(target_arch = "x86_64", target_feature = "sse2"),
-    all(target_arch = "loongarch64", target_feature = "lsx")
-))]
+#[cfg(all(target_arch = "loongarch64", target_feature = "lsx"))]
 #[inline]
 const fn is_ascii(bytes: &[u8]) -> bool {
     // Process chunks of 32 bytes at a time in the fast path to enable
-    // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
+    // auto-vectorization and use of `vmskltz.b`. Two 128-bit vector registers
     // can be OR'd together and then the resulting vector can be tested for
     // non-ASCII bytes.
     const CHUNK_SIZE: usize = 32;
@@ -485,7 +564,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
     while i + CHUNK_SIZE <= bytes.len() {
         let chunk_end = i + CHUNK_SIZE;
 
-        // Get LLVM to produce a `pmovmskb` instruction on x86-64 which
+        // Get LLVM to produce a `vmskltz.b` instruction on loongarch64 which
         // creates a mask from the most significant bit of each byte.
         // ASCII bytes are less than 128 (0x80), so their most significant
         // bit is unset.
```
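The rest of the loongarch64 fast path is cut off in this hunk; the pattern it relies on is a branch-free reduction that LLVM can recognize and vectorize. A hedged sketch of that shape (not the verbatim library code; `chunk_contains_nonascii` is a hypothetical name):

```rust
// Hypothetical sketch: a branch-free reduction over one 32-byte chunk.
// LLVM recognizes this shape and lowers it to vector loads plus a
// byte-MSB mask instruction, e.g. `vmskltz.b` on loongarch64 with LSX.
fn chunk_contains_nonascii(chunk: &[u8; 32]) -> bool {
    let mut contains_nonascii = false;
    for &byte in chunk {
        // `|=` instead of an early return keeps the loop branch-free,
        // which is what makes it auto-vectorizable.
        contains_nonascii |= byte >= 0x80;
    }
    contains_nonascii
}
```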
A new assembly test (32 lines) guards against the regression:

```rust
//@ revisions: X86_64 LA64
//@ assembly-output: emit-asm
//@ compile-flags: -C opt-level=3
//
//@ [X86_64] only-x86_64
//@ [X86_64] compile-flags: -C target-cpu=znver4
//@ [X86_64] compile-flags: -C llvm-args=-x86-asm-syntax=intel
//
//@ [LA64] only-loongarch64

#![crate_type = "lib"]

/// Verify `is_ascii` generates efficient code on different architectures:
///
/// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization).
///   The fix uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
///   See: https://github.com/llvm/llvm-project/issues/176906
///
/// - loongarch64: Should use the `vmskltz.b` instruction for the fast path.
///   This architecture still relies on LLVM auto-vectorization.

// X86_64-LABEL: test_is_ascii
// X86_64-NOT: kshiftrd
// X86_64-NOT: kshiftrq

// LA64-LABEL: test_is_ascii
// LA64: vmskltz.b

#[no_mangle]
pub fn test_is_ascii(s: &[u8]) -> bool {
    s.is_ascii()
}
```
This file was deleted.