From a72f68e80154a208b85a3b80cea744b84b7b5d18 Mon Sep 17 00:00:00 2001
From: Andreas Liljeqvist <bonega@gmail.com>
Date: Sat, 24 Jan 2026 20:05:01 +0100
Subject: [PATCH 1/3] Fix is_ascii performance on x86_64 with explicit SSE2
 intrinsics

Use explicit SSE2 intrinsics to avoid LLVM's broken AVX-512
auto-vectorization which generates ~31 kshiftrd instructions.

Performance
- AVX-512: 34-48x faster
- SSE2: 1.5-2x faster

Improves on earlier pr
---
 library/core/src/slice/ascii.rs       | 61 +++++++++------------------
 tests/assembly-llvm/slice-is-ascii.rs |  5 +++
 2 files changed, 26 insertions(+), 40 deletions(-)

diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs
index 459c826f40646..de89d77e5e2ce 100644
--- a/library/core/src/slice/ascii.rs
+++ b/library/core/src/slice/ascii.rs
@@ -460,56 +460,37 @@ const fn is_ascii(s: &[u8]) -> bool {
     )
 }
 
-/// Chunk size for vectorized ASCII checking (two 16-byte SSE registers).
+/// Chunk size for SSE2 vectorized ASCII checking (4x 16-byte loads).
 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
-const CHUNK_SIZE: usize = 32;
+const SSE2_CHUNK_SIZE: usize = 64;
 
-/// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
-/// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
-///
-/// FIXME(llvm#176906): Remove this workaround once LLVM generates efficient code.
 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
 fn is_ascii_sse2(bytes: &[u8]) -> bool {
     use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};
 
-    let mut i = 0;
-
-    while i + CHUNK_SIZE <= bytes.len() {
-        // SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`.
-        let ptr = unsafe { bytes.as_ptr().add(i) };
-
-        // Load two 16-byte chunks and combine them.
-        // SAFETY: We verified `i + 32 <= len`, so ptr is valid for 32 bytes.
-        // `_mm_loadu_si128` allows unaligned loads.
-        let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
-        // SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range.
-        let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };
-
-        // OR them together - if any byte has the high bit set, the result will too.
-        // SAFETY: SSE2 is guaranteed by the cfg predicate.
-        let combined = unsafe { _mm_or_si128(chunk1, chunk2) };
-
-        // Create a mask from the MSBs of each byte.
-        // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
-        // SAFETY: SSE2 is guaranteed by the cfg predicate.
-        let mask = unsafe { _mm_movemask_epi8(combined) };
-
+    let (chunks, rest) = bytes.as_chunks::<SSE2_CHUNK_SIZE>();
+
+    for chunk in chunks {
+        let ptr = chunk.as_ptr();
+        // SAFETY: chunk is 64 bytes. SSE2 is baseline on x86_64.
+        let mask = unsafe {
+            let a1 = _mm_loadu_si128(ptr as *const __m128i);
+            let a2 = _mm_loadu_si128(ptr.add(16) as *const __m128i);
+            let b1 = _mm_loadu_si128(ptr.add(32) as *const __m128i);
+            let b2 = _mm_loadu_si128(ptr.add(48) as *const __m128i);
+            // OR all chunks - if any byte has high bit set, combined will too.
+            let combined = _mm_or_si128(_mm_or_si128(a1, a2), _mm_or_si128(b1, b2));
+            // Create a mask from the MSBs of each byte.
+            // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
+            _mm_movemask_epi8(combined)
+        };
         if mask != 0 {
             return false;
         }
-
-        i += CHUNK_SIZE;
-    }
-
-    // Handle remaining bytes with simple loop
-    while i < bytes.len() {
-        if !bytes[i].is_ascii() {
-            return false;
-        }
-        i += 1;
     }
 
-    true
+    // Handle remaining bytes
+    rest.iter().all(|b| b.is_ascii())
 }
 
 /// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`.
@@ -529,7 +510,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
             is_ascii_simple(bytes)
         } else {
             // For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead.
-            if bytes.len() < CHUNK_SIZE {
+            if bytes.len() < SSE2_CHUNK_SIZE {
                 let chunks = bytes.chunks_exact(USIZE_SIZE);
                 let remainder = chunks.remainder();
                 for chunk in chunks {
diff --git a/tests/assembly-llvm/slice-is-ascii.rs b/tests/assembly-llvm/slice-is-ascii.rs
index d01b321bf460a..3b782ab2cd827 100644
--- a/tests/assembly-llvm/slice-is-ascii.rs
+++ b/tests/assembly-llvm/slice-is-ascii.rs
@@ -22,6 +22,11 @@
 // X86_64-LABEL: test_is_ascii
 // X86_64-NOT: kshiftrd
 // X86_64-NOT: kshiftrq
+// Verify explicit SSE2/AVX intrinsics are used:
+// - pmovmskb/vpmovmskb: efficient mask extraction from the MSBs
+// - vpor/por: OR-combining of 4x 16-byte loads (2x unrolled, 64-byte chunks)
+// X86_64: {{vpmovmskb|pmovmskb}}
+// X86_64: {{vpor|por}}
 
 // LA64-LABEL: test_is_ascii
 // LA64: vmskltz.b

From cbcd8694c6e549c658901f010644fddcb7ffbce8 Mon Sep 17 00:00:00 2001
From: Andreas Liljeqvist <bonega@gmail.com>
Date: Sun, 25 Jan 2026 09:44:04 +0100
Subject: [PATCH 2/3] Remove x86_64 assembly test for is_ascii

The SSE2 helper function is not inlined across crate boundaries,
so we cannot verify the codegen in an assembly test. The fix is
still verified by the absence of performance regression.
---
 tests/assembly-llvm/slice-is-ascii.rs | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/tests/assembly-llvm/slice-is-ascii.rs b/tests/assembly-llvm/slice-is-ascii.rs
index 3b782ab2cd827..00deb23e9a6cd 100644
--- a/tests/assembly-llvm/slice-is-ascii.rs
+++ b/tests/assembly-llvm/slice-is-ascii.rs
@@ -1,32 +1,12 @@
-//@ revisions: X86_64 LA64
+//@ revisions: LA64
 //@ assembly-output: emit-asm
 //@ compile-flags: -C opt-level=3
 //
-//@ [X86_64] only-x86_64
-//@ [X86_64] compile-flags: -C target-cpu=znver4
-//@ [X86_64] compile-flags: -C llvm-args=-x86-asm-syntax=intel
-//
 //@ [LA64] only-loongarch64
 
 #![crate_type = "lib"]
 
-/// Verify `is_ascii` generates efficient code on different architectures:
-///
-/// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization).
-///   The fix uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
-///   See: https://github.com/llvm/llvm-project/issues/176906
-///
 /// - loongarch64: Should use `vmskltz.b` instruction for the fast-path.
-///   This architecture still relies on LLVM auto-vectorization.
-
-// X86_64-LABEL: test_is_ascii
-// X86_64-NOT: kshiftrd
-// X86_64-NOT: kshiftrq
-// Verify explicit SSE2/AVX intrinsics are used:
-// - pmovmskb/vpmovmskb: efficient mask extraction from the MSBs
-// - vpor/por: OR-combining of 4x 16-byte loads (2x unrolled, 64-byte chunks)
-// X86_64: {{vpmovmskb|pmovmskb}}
-// X86_64: {{vpor|por}}
 
 // LA64-LABEL: test_is_ascii
 // LA64: vmskltz.b

From dbc870afec91308b2e6a6c6ba16e8f3bb085e338 Mon Sep 17 00:00:00 2001
From: Andreas Liljeqvist <bonega@gmail.com>
Date: Sun, 25 Jan 2026 20:03:32 +0100
Subject: [PATCH 3/3] Mark is_ascii_sse2 as #[inline]

---
 library/core/src/slice/ascii.rs       |  1 +
 tests/assembly-llvm/slice-is-ascii.rs | 17 ++++++++++++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs
index de89d77e5e2ce..ae641871279b6 100644
--- a/library/core/src/slice/ascii.rs
+++ b/library/core/src/slice/ascii.rs
@@ -465,6 +465,7 @@ const fn is_ascii(s: &[u8]) -> bool {
 const SSE2_CHUNK_SIZE: usize = 64;
 
 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+#[inline]
 fn is_ascii_sse2(bytes: &[u8]) -> bool {
     use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};
 
diff --git a/tests/assembly-llvm/slice-is-ascii.rs b/tests/assembly-llvm/slice-is-ascii.rs
index 00deb23e9a6cd..b9a5205054986 100644
--- a/tests/assembly-llvm/slice-is-ascii.rs
+++ b/tests/assembly-llvm/slice-is-ascii.rs
@@ -1,13 +1,28 @@
-//@ revisions: LA64
+//@ revisions: X86_64 LA64
 //@ assembly-output: emit-asm
 //@ compile-flags: -C opt-level=3
 //
+//@ [X86_64] only-x86_64
+//@ [X86_64] compile-flags: -C target-cpu=znver4
+//@ [X86_64] compile-flags: -C llvm-args=-x86-asm-syntax=intel
+//
 //@ [LA64] only-loongarch64
 
 #![crate_type = "lib"]
 
+/// Verify `is_ascii` generates efficient code on different architectures:
+///
+/// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization).
+///   Good version uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
+///
 /// - loongarch64: Should use `vmskltz.b` instruction for the fast-path.
 
+// X86_64-LABEL: test_is_ascii
+// X86_64-NOT: kshiftrd
+// X86_64-NOT: kshiftrq
+// X86_64: {{vpor|por}}
+// X86_64: {{vpmovmskb|pmovmskb}}
+
 // LA64-LABEL: test_is_ascii
 // LA64: vmskltz.b