diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs
index 3e8c553f9f159..459c826f40646 100644
--- a/library/core/src/slice/ascii.rs
+++ b/library/core/src/slice/ascii.rs
@@ -3,10 +3,7 @@
 use core::ascii::EscapeDefault;
 
 use crate::fmt::{self, Write};
-#[cfg(not(any(
-    all(target_arch = "x86_64", target_feature = "sse2"),
-    all(target_arch = "loongarch64", target_feature = "lsx")
-)))]
+#[cfg(not(all(target_arch = "loongarch64", target_feature = "lsx")))]
 use crate::intrinsics::const_eval_select;
 use crate::{ascii, iter, ops};
 
@@ -463,19 +460,101 @@ const fn is_ascii(s: &[u8]) -> bool {
     )
 }
 
-/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64` and the
-/// `vmskltz.b` instruction on `loongarch64`.
+/// Chunk size for vectorized ASCII checking (two 16-byte SSE registers).
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+const CHUNK_SIZE: usize = 32;
+
+/// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
+/// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
+///
+/// FIXME(llvm#176906): Remove this workaround once LLVM generates efficient code.
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+fn is_ascii_sse2(bytes: &[u8]) -> bool {
+    use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};
+
+    let mut i = 0;
+
+    while i + CHUNK_SIZE <= bytes.len() {
+        // SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`.
+        let ptr = unsafe { bytes.as_ptr().add(i) };
+
+        // Load two 16-byte chunks and combine them.
+        // SAFETY: We verified `i + 32 <= len`, so ptr is valid for 32 bytes.
+        // `_mm_loadu_si128` allows unaligned loads.
+        let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
+        // SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range.
+        let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };
+
+        // OR them together - if any byte has the high bit set, the result will too.
+        // SAFETY: SSE2 is guaranteed by the cfg predicate.
+        let combined = unsafe { _mm_or_si128(chunk1, chunk2) };
+
+        // Create a mask from the MSBs of each byte.
+        // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
+        // SAFETY: SSE2 is guaranteed by the cfg predicate.
+        let mask = unsafe { _mm_movemask_epi8(combined) };
+
+        if mask != 0 {
+            return false;
+        }
+
+        i += CHUNK_SIZE;
+    }
+
+    // Handle remaining bytes with a simple loop.
+    while i < bytes.len() {
+        if !bytes[i].is_ascii() {
+            return false;
+        }
+        i += 1;
+    }
+
+    true
+}
+
+/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`.
+///
+/// Uses explicit SSE2 intrinsics to prevent LLVM from auto-vectorizing with
+/// broken AVX-512 code that extracts mask bits one-by-one.
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+#[inline]
+#[rustc_allow_const_fn_unstable(const_eval_select)]
+const fn is_ascii(bytes: &[u8]) -> bool {
+    const USIZE_SIZE: usize = size_of::<usize>();
+    const NONASCII_MASK: usize = usize::MAX / 255 * 0x80;
+
+    const_eval_select!(
+        @capture { bytes: &[u8] } -> bool:
+        if const {
+            is_ascii_simple(bytes)
+        } else {
+            // For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead.
+            if bytes.len() < CHUNK_SIZE {
+                let chunks = bytes.chunks_exact(USIZE_SIZE);
+                let remainder = chunks.remainder();
+                for chunk in chunks {
+                    let word = usize::from_ne_bytes(chunk.try_into().unwrap());
+                    if (word & NONASCII_MASK) != 0 {
+                        return false;
+                    }
+                }
+                return remainder.iter().all(|b| b.is_ascii());
+            }
+
+            is_ascii_sse2(bytes)
+        }
+    )
+}
+
+/// ASCII test optimized to use the `vmskltz.b` instruction on `loongarch64`.
 ///
 /// Other platforms are not likely to benefit from this code structure, so they
 /// use SWAR techniques to test for ASCII in `usize`-sized chunks.
-#[cfg(any(
-    all(target_arch = "x86_64", target_feature = "sse2"),
-    all(target_arch = "loongarch64", target_feature = "lsx")
-))]
+#[cfg(all(target_arch = "loongarch64", target_feature = "lsx"))]
 #[inline]
 const fn is_ascii(bytes: &[u8]) -> bool {
     // Process chunks of 32 bytes at a time in the fast path to enable
-    // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
+    // auto-vectorization and use of `vmskltz.b`. Two 128-bit vector registers
     // can be OR'd together and then the resulting vector can be tested for
     // non-ASCII bytes.
     const CHUNK_SIZE: usize = 32;
@@ -485,7 +564,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
     while i + CHUNK_SIZE <= bytes.len() {
         let chunk_end = i + CHUNK_SIZE;
 
-        // Get LLVM to produce a `pmovmskb` instruction on x86-64 which
+        // Get LLVM to produce a `vmskltz.b` instruction on loongarch64 which
         // creates a mask from the most significant bit of each byte.
         // ASCII bytes are less than 128 (0x80), so their most significant
         // bit is unset.
diff --git a/tests/assembly-llvm/slice-is-ascii.rs b/tests/assembly-llvm/slice-is-ascii.rs
new file mode 100644
index 0000000000000..d01b321bf460a
--- /dev/null
+++ b/tests/assembly-llvm/slice-is-ascii.rs
@@ -0,0 +1,32 @@
+//@ revisions: X86_64 LA64
+//@ assembly-output: emit-asm
+//@ compile-flags: -C opt-level=3
+//
+//@ [X86_64] only-x86_64
+//@ [X86_64] compile-flags: -C target-cpu=znver4
+//@ [X86_64] compile-flags: -C llvm-args=-x86-asm-syntax=intel
+//
+//@ [LA64] only-loongarch64
+
+#![crate_type = "lib"]
+
+/// Verify `is_ascii` generates efficient code on different architectures:
+///
+/// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization).
+///   The fix uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
+///   See: https://github.com/llvm/llvm-project/issues/176906
+///
+/// - loongarch64: Should use the `vmskltz.b` instruction for the fast path.
+///   This architecture still relies on LLVM auto-vectorization.
+
+// X86_64-LABEL: test_is_ascii
+// X86_64-NOT: kshiftrd
+// X86_64-NOT: kshiftrq
+
+// LA64-LABEL: test_is_ascii
+// LA64: vmskltz.b
+
+#[no_mangle]
+pub fn test_is_ascii(s: &[u8]) -> bool {
+    s.is_ascii()
+}
diff --git a/tests/codegen-llvm/slice-is-ascii.rs b/tests/codegen-llvm/slice-is-ascii.rs
deleted file mode 100644
index 67537c871a0a3..0000000000000
--- a/tests/codegen-llvm/slice-is-ascii.rs
+++ /dev/null
@@ -1,16 +0,0 @@
-//@ only-x86_64
-//@ compile-flags: -C opt-level=3 -C target-cpu=x86-64
-#![crate_type = "lib"]
-
-/// Check that the fast-path of `is_ascii` uses a `pmovmskb` instruction.
-/// Platforms lacking an equivalent instruction use other techniques for
-/// optimizing `is_ascii`.
-// CHECK-LABEL: @is_ascii_autovectorized
-#[no_mangle]
-pub fn is_ascii_autovectorized(s: &[u8]) -> bool {
-    // CHECK: load <32 x i8>
-    // CHECK-NEXT: icmp slt <32 x i8>
-    // CHECK-NEXT: bitcast <32 x i1>
-    // CHECK-NEXT: icmp eq i32
-    s.is_ascii()
-}
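
Standalone sketch (not part of the patch): the program below reimplements the two techniques the patch combines, namely the SWAR word test used for inputs shorter than 32 bytes and the SSE2 `_mm_movemask_epi8` fast path, as ordinary `std` code so their behavior can be checked against `<[u8]>::is_ascii` outside libcore. The names `is_ascii_swar` and `is_ascii_movemask` are illustrative, not from the patch.

use std::mem::size_of;

/// SWAR check: `usize::MAX / 255` is 0x0101...01, so multiplying by 0x80 puts
/// 0x80 in every byte lane; ANDing a word with this mask is non-zero exactly
/// when some byte has its high bit set, i.e. is non-ASCII.
fn is_ascii_swar(bytes: &[u8]) -> bool {
    const NONASCII_MASK: usize = usize::MAX / 255 * 0x80;
    let chunks = bytes.chunks_exact(size_of::<usize>());
    let remainder = chunks.remainder();
    for chunk in chunks {
        let word = usize::from_ne_bytes(chunk.try_into().unwrap());
        if word & NONASCII_MASK != 0 {
            return false;
        }
    }
    remainder.iter().all(|b| b.is_ascii())
}

/// Movemask check mirroring `is_ascii_sse2` above: OR two 16-byte loads, then
/// `_mm_movemask_epi8` collects the sign bit of each byte into an integer;
/// any set bit means some byte is >= 0x80. SSE2 is part of the x86_64
/// baseline, so no runtime feature detection is needed.
#[cfg(target_arch = "x86_64")]
fn is_ascii_movemask(bytes: &[u8]) -> bool {
    use std::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};

    let mut i = 0;
    while i + 32 <= bytes.len() {
        // SAFETY: `i + 32 <= bytes.len()`, and `_mm_loadu_si128` permits
        // unaligned loads; SSE2 is statically enabled on x86_64.
        let mask = unsafe {
            let ptr = bytes.as_ptr().add(i);
            let lo = _mm_loadu_si128(ptr as *const __m128i);
            let hi = _mm_loadu_si128(ptr.add(16) as *const __m128i);
            _mm_movemask_epi8(_mm_or_si128(lo, hi))
        };
        if mask != 0 {
            return false;
        }
        i += 32;
    }
    // Tail smaller than one 32-byte chunk: fall back to the scalar check.
    bytes[i..].iter().all(|b| b.is_ascii())
}

fn main() {
    let samples: [&[u8]; 3] = [b"hello, world", b"caf\xc3\xa9", &[b'a'; 100]];
    for s in samples {
        assert_eq!(is_ascii_swar(s), s.is_ascii());
        #[cfg(target_arch = "x86_64")]
        assert_eq!(is_ascii_movemask(s), s.is_ascii());
    }
    println!("both sketches agree with <[u8]>::is_ascii");
}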