103 changes: 91 additions & 12 deletions library/core/src/slice/ascii.rs
@@ -3,10 +3,7 @@
use core::ascii::EscapeDefault;

use crate::fmt::{self, Write};
-#[cfg(not(any(
-    all(target_arch = "x86_64", target_feature = "sse2"),
-    all(target_arch = "loongarch64", target_feature = "lsx")
-)))]
+#[cfg(not(all(target_arch = "loongarch64", target_feature = "lsx")))]
use crate::intrinsics::const_eval_select;
use crate::{ascii, iter, ops};

@@ -463,19 +460,101 @@ const fn is_ascii(s: &[u8]) -> bool {
)
}

-/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64` and the
-/// `vmskltz.b` instruction on `loongarch64`.
+/// Chunk size for vectorized ASCII checking (two 16-byte SSE registers).
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
const CHUNK_SIZE: usize = 32;

/// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
/// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
///
/// FIXME(llvm#176906): Remove this workaround once LLVM generates efficient code.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
fn is_ascii_sse2(bytes: &[u8]) -> bool {
Review thread:

Contributor: Did you consider writing the loop using `as_chunks`?

https://rust.godbolt.org/z/vfxK51fq8

Somehow LLVM seems to understand this less well, but it does remove a bunch of the unsafety from the loop.

Author: Pretty sure I tested it earlier, but that benchmark showed a regression. Will check again when I get home 👍

Author: I tested with `as_chunks`. I'm seeing roughly a 10% slowdown for larger inputs. The results are consistent over several runs.

Contributor: Wild. Could you open a separate issue for that performance issue? I'd love to see what our LLVM and mir-opt people think about it. It just seems like it should be equivalent, so this might be one of those cases where LLVM overfits on what clang gives it (and we should fix that).

Author: I checked everything again. I had `#[target_feature(enable = "sse2")]` enabled for the benchmark, which actually produces different asm code. Your godbolt link is good and produces better performance. I will do another PR for this.

(A sketch of the `as_chunks` variant the reviewer proposed appears after the function body below.)

    use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};

    let mut i = 0;

    while i + CHUNK_SIZE <= bytes.len() {
        // SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`.
        let ptr = unsafe { bytes.as_ptr().add(i) };

        // Load two 16-byte chunks and combine them.
        // SAFETY: We verified `i + 32 <= len`, so `ptr` is valid for 32 bytes.
        // `_mm_loadu_si128` allows unaligned loads.
        let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
        // SAFETY: Same as above; `ptr.add(16)` is within the valid 32-byte range.
        let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };

        // OR them together: if any byte has the high bit set, the result will too.
        // SAFETY: SSE2 is guaranteed by the cfg predicate.
        let combined = unsafe { _mm_or_si128(chunk1, chunk2) };

        // Create a mask from the MSB of each byte.
        // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
        // SAFETY: SSE2 is guaranteed by the cfg predicate.
        let mask = unsafe { _mm_movemask_epi8(combined) };

        if mask != 0 {
            return false;
        }

        i += CHUNK_SIZE;
    }

    // Handle remaining bytes with a simple scalar loop.
    while i < bytes.len() {
        if !bytes[i].is_ascii() {
            return false;
        }
        i += 1;
    }

    true
}
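
For reference, here is a minimal sketch of the `as_chunks` shape discussed in the review thread above. The function name is hypothetical, this is not necessarily the exact code behind the godbolt link, and it assumes a toolchain where `slice::as_chunks` is available; it reuses the same 32-byte chunking and SSE2 intrinsics as `is_ascii_sse2`:

```rust
/// Hypothetical sketch of the `as_chunks`-based variant from the review thread.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
fn is_ascii_sse2_as_chunks(bytes: &[u8]) -> bool {
    use core::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};

    // Split into complete 32-byte chunks plus a tail; this removes the
    // manual index bookkeeping (and the pointer-offset `unsafe`) from the loop.
    let (chunks, remainder) = bytes.as_chunks::<32>();
    for chunk in chunks {
        // SAFETY: `chunk` is exactly 32 bytes, so both unaligned 16-byte
        // loads are in bounds; SSE2 is guaranteed by the cfg predicate.
        let mask = unsafe {
            let lo = _mm_loadu_si128(chunk.as_ptr() as *const __m128i);
            let hi = _mm_loadu_si128(chunk.as_ptr().add(16) as *const __m128i);
            _mm_movemask_epi8(_mm_or_si128(lo, hi))
        };
        if mask != 0 {
            return false;
        }
    }
    remainder.iter().all(|b| b.is_ascii())
}
```

Per the thread, the initially observed slowdown was an artifact of a stray `#[target_feature(enable = "sse2")]` on the benchmark; with that removed, this shape performed as well or better, and a follow-up PR was planned.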

/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`.
///
/// Uses explicit SSE2 intrinsics to prevent LLVM from auto-vectorizing with
/// broken AVX-512 code that extracts mask bits one-by-one.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
#[inline]
#[rustc_allow_const_fn_unstable(const_eval_select)]
const fn is_ascii(bytes: &[u8]) -> bool {
    const USIZE_SIZE: usize = size_of::<usize>();
    const NONASCII_MASK: usize = usize::MAX / 255 * 0x80;

    const_eval_select!(
        @capture { bytes: &[u8] } -> bool:
        if const {
            is_ascii_simple(bytes)
        } else {
            // For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead.
            if bytes.len() < CHUNK_SIZE {
                let chunks = bytes.chunks_exact(USIZE_SIZE);
                let remainder = chunks.remainder();
                for chunk in chunks {
                    let word = usize::from_ne_bytes(chunk.try_into().unwrap());
                    if (word & NONASCII_MASK) != 0 {
                        return false;
                    }
                }
                return remainder.iter().all(|b| b.is_ascii());
            }

            is_ascii_sse2(bytes)
        }
    )
}
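
A side note on the scalar small-input path above: `NONASCII_MASK` is the usual SWAR constant. On a 64-bit target, `usize::MAX / 255` is `0x0101_0101_0101_0101` (a `0x01` in every byte lane), and multiplying by `0x80` turns that into `0x8080_8080_8080_8080`, the high bit of every byte, so a word AND-ed with this mask is non-zero exactly when some byte is >= 0x80. A self-contained check (a sketch assuming a 64-bit target):

```rust
const NONASCII_MASK: usize = usize::MAX / 255 * 0x80;

fn main() {
    // 0x0101_0101_0101_0101 * 0x80 == 0x8080_8080_8080_8080 on 64-bit.
    assert_eq!(NONASCII_MASK, 0x8080_8080_8080_8080);

    // A word is all-ASCII exactly when no byte has its high bit set.
    let ascii = usize::from_ne_bytes(*b"hello!!!");
    let non_ascii = usize::from_ne_bytes(*b"hell\xC3\xA9!!"); // "hellé!!" in UTF-8
    assert_eq!(ascii & NONASCII_MASK, 0);
    assert_ne!(non_ascii & NONASCII_MASK, 0);
}
```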

/// ASCII test optimized to use the `vmskltz.b` instruction on `loongarch64`.
///
/// Other platforms are not likely to benefit from this code structure, so they
/// use SWAR techniques to test for ASCII in `usize`-sized chunks.
-#[cfg(any(
-    all(target_arch = "x86_64", target_feature = "sse2"),
-    all(target_arch = "loongarch64", target_feature = "lsx")
-))]
+#[cfg(all(target_arch = "loongarch64", target_feature = "lsx"))]
#[inline]
const fn is_ascii(bytes: &[u8]) -> bool {
    // Process chunks of 32 bytes at a time in the fast path to enable
-    // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
+    // auto-vectorization and use of `vmskltz.b`. Two 128-bit vector registers
    // can be OR'd together and then the resulting vector can be tested for
    // non-ASCII bytes.
    const CHUNK_SIZE: usize = 32;
@@ -485,7 +564,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
    while i + CHUNK_SIZE <= bytes.len() {
        let chunk_end = i + CHUNK_SIZE;

-        // Get LLVM to produce a `pmovmskb` instruction on x86-64 which
+        // Get LLVM to produce a `vmskltz.b` instruction on loongarch64 which
        // creates a mask from the most significant bit of each byte.
        // ASCII bytes are less than 128 (0x80), so their most significant
        // bit is unset.
32 changes: 32 additions & 0 deletions tests/assembly-llvm/slice-is-ascii.rs
@@ -0,0 +1,32 @@
//@ revisions: X86_64 LA64
//@ assembly-output: emit-asm
//@ compile-flags: -C opt-level=3
//
//@ [X86_64] only-x86_64
//@ [X86_64] compile-flags: -C target-cpu=znver4
//@ [X86_64] compile-flags: -C llvm-args=-x86-asm-syntax=intel
//
//@ [LA64] only-loongarch64

#![crate_type = "lib"]

/// Verify `is_ascii` generates efficient code on different architectures:
///
/// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization).
/// The fix uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
/// See: https://github.com/llvm/llvm-project/issues/176906
///
/// - loongarch64: Should use `vmskltz.b` instruction for the fast-path.
/// This architecture still relies on LLVM auto-vectorization.

// X86_64-LABEL: test_is_ascii
// X86_64-NOT: kshiftrd
// X86_64-NOT: kshiftrq

// LA64-LABEL: test_is_ascii
// LA64: vmskltz.b

#[no_mangle]
pub fn test_is_ascii(s: &[u8]) -> bool {
    s.is_ascii()
}
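
If you want to run this test locally, a standard rust-lang/rust checkout should be able to drive it through the usual wrapper, presumably `./x test tests/assembly-llvm/slice-is-ascii.rs`; the `only-x86_64`/`only-loongarch64` directives make each revision run only on its matching target.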
16 changes: 0 additions & 16 deletions tests/codegen-llvm/slice-is-ascii.rs

This file was deleted.
