library/core/src/slice/ascii.rs: 22 additions & 40 deletions
@@ -460,56 +460,38 @@ const fn is_ascii(s: &[u8]) -> bool {
     )
 }
 
-/// Chunk size for vectorized ASCII checking (two 16-byte SSE registers).
+/// Chunk size for SSE2 vectorized ASCII checking (4x 16-byte loads).
 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
-const CHUNK_SIZE: usize = 32;
+const SSE2_CHUNK_SIZE: usize = 64;
 
 /// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
 /// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
 ///
 /// FIXME(llvm#176906): Remove this workaround once LLVM generates efficient code.
 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
 #[inline]
 fn is_ascii_sse2(bytes: &[u8]) -> bool {
     use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};
 
-    let mut i = 0;
-
-    while i + CHUNK_SIZE <= bytes.len() {
-        // SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`.
-        let ptr = unsafe { bytes.as_ptr().add(i) };
-
-        // Load two 16-byte chunks and combine them.
-        // SAFETY: We verified `i + 32 <= len`, so ptr is valid for 32 bytes.
-        // `_mm_loadu_si128` allows unaligned loads.
-        let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
-        // SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range.
-        let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };
-
-        // OR them together - if any byte has the high bit set, the result will too.
-        // SAFETY: SSE2 is guaranteed by the cfg predicate.
-        let combined = unsafe { _mm_or_si128(chunk1, chunk2) };
-
-        // Create a mask from the MSBs of each byte.
-        // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
-        // SAFETY: SSE2 is guaranteed by the cfg predicate.
-        let mask = unsafe { _mm_movemask_epi8(combined) };
-
+    let (chunks, rest) = bytes.as_chunks::<SSE2_CHUNK_SIZE>();
+
+    for chunk in chunks {
+        let ptr = chunk.as_ptr();
+        // SAFETY: chunk is 64 bytes. SSE2 is baseline on x86_64.
+        let mask = unsafe {
+            let a1 = _mm_loadu_si128(ptr as *const __m128i);
+            let a2 = _mm_loadu_si128(ptr.add(16) as *const __m128i);
+            let b1 = _mm_loadu_si128(ptr.add(32) as *const __m128i);
+            let b2 = _mm_loadu_si128(ptr.add(48) as *const __m128i);
+            // OR all chunks - if any byte has high bit set, combined will too.
+            let combined = _mm_or_si128(_mm_or_si128(a1, a2), _mm_or_si128(b1, b2));
+            // Create a mask from the MSBs of each byte.
+            // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
+            _mm_movemask_epi8(combined)
+        };
         if mask != 0 {
             return false;
         }
-
-        i += CHUNK_SIZE;
     }
 
-    // Handle remaining bytes with simple loop
-    while i < bytes.len() {
-        if !bytes[i].is_ascii() {
-            return false;
-        }
-        i += 1;
-    }
-
-    true
+    // Handle remaining bytes
+    rest.iter().all(|b| b.is_ascii())
 }
 
 /// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`.
@@ -529,7 +511,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
         is_ascii_simple(bytes)
     } else {
         // For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead.
-        if bytes.len() < CHUNK_SIZE {
+        if bytes.len() < SSE2_CHUNK_SIZE {
             let chunks = bytes.chunks_exact(USIZE_SIZE);
             let remainder = chunks.remainder();
             for chunk in chunks {
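To make the change easier to evaluate outside the standard library, here is a standalone sketch of the same movemask technique. It mirrors the new `is_ascii_sse2` above but is not the library code: the function name `is_ascii_sse2_demo` and the `main` harness are illustrative, and it assumes an x86_64 target (where SSE2 is baseline) and a toolchain with `slice::as_chunks` stabilized.

use std::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};

/// Returns true iff every byte is ASCII, checking 64 bytes per iteration.
fn is_ascii_sse2_demo(bytes: &[u8]) -> bool {
    // Split into exact 64-byte chunks plus a scalar tail.
    let (chunks, rest) = bytes.as_chunks::<64>();
    for chunk in chunks {
        let ptr = chunk.as_ptr();
        // SAFETY: `chunk` is exactly 64 bytes, so all four 16-byte unaligned
        // loads stay in bounds; SSE2 is baseline on x86_64.
        let mask = unsafe {
            let a = _mm_loadu_si128(ptr as *const __m128i);
            let b = _mm_loadu_si128(ptr.add(16) as *const __m128i);
            let c = _mm_loadu_si128(ptr.add(32) as *const __m128i);
            let d = _mm_loadu_si128(ptr.add(48) as *const __m128i);
            // A non-ASCII byte is exactly one with its high bit (0x80) set,
            // and OR preserves any set high bit across all 64 bytes.
            let combined = _mm_or_si128(_mm_or_si128(a, b), _mm_or_si128(c, d));
            // pmovmskb packs the most significant bit of each of the 16 bytes
            // into a 16-bit integer mask; non-zero means some byte >= 0x80.
            _mm_movemask_epi8(combined)
        };
        if mask != 0 {
            return false;
        }
    }
    rest.iter().all(|b| b.is_ascii())
}

fn main() {
    assert!(is_ascii_sse2_demo(b"hello, world"));
    assert!(!is_ascii_sse2_demo("caf\u{e9}".as_bytes()));
    // More than 64 bytes, so the vector loop runs, not just the scalar tail.
    assert!(is_ascii_sse2_demo(&[b'x'; 100]));
}

The payoff relative to the old two-register version is fewer branches per byte: one `pmovmskb` test now covers 64 bytes instead of 32.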
tests/assembly-llvm/slice-is-ascii.rs: 3 additions & 3 deletions
@@ -13,15 +13,15 @@
 /// Verify `is_ascii` generates efficient code on different architectures:
 ///
 /// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization).
-///   The fix uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
 ///   See: https://github.com/llvm/llvm-project/issues/176906
+///   Good version uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
 ///
 /// - loongarch64: Should use `vmskltz.b` instruction for the fast-path.
 ///   This architecture still relies on LLVM auto-vectorization.
 
 // X86_64-LABEL: test_is_ascii
 // X86_64-NOT: kshiftrd
 // X86_64-NOT: kshiftrq
+// X86_64: {{vpor|por}}
 // X86_64: {{vpmovmskb|pmovmskb}}
 
 // LA64-LABEL: test_is_ascii
 // LA64: vmskltz.b
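A side note on the small-input path touched by the second hunk in `library/core/src/slice/ascii.rs`: inputs shorter than `SSE2_CHUNK_SIZE` are checked a `usize` word at a time rather than with SSE2. The diff only shows the edge of that branch, so here is a hedged sketch of the word-at-a-time idea it refers to; the function name `is_ascii_word_at_a_time` and the `NONASCII_MASK` constant are assumptions for illustration, not the library's code.

/// Word-at-a-time ASCII check: a byte is non-ASCII iff its 0x80 bit is set,
/// so the check reduces to AND-ing every byte lane against 0x80 at once.
fn is_ascii_word_at_a_time(bytes: &[u8]) -> bool {
    const USIZE_SIZE: usize = std::mem::size_of::<usize>();
    // 0x80 repeated in every byte lane of a usize (endianness-independent).
    const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; USIZE_SIZE]);

    let chunks = bytes.chunks_exact(USIZE_SIZE);
    let remainder = chunks.remainder();
    for chunk in chunks {
        // The unwrap cannot fail: `chunks_exact` yields slices of exactly
        // USIZE_SIZE bytes.
        let word = usize::from_ne_bytes(chunk.try_into().unwrap());
        if word & NONASCII_MASK != 0 {
            return false;
        }
    }
    remainder.iter().all(|b| b.is_ascii())
}

fn main() {
    assert!(is_ascii_word_at_a_time(b"short ascii"));
    assert!(!is_ascii_word_at_a_time(&[0x41, 0x80, 0x42]));
}

This is the same trick the `USIZE_SIZE` / `chunks_exact` lines in that hunk are built around, presumably with a similar mask: on a 64-bit target it tests 8 bytes per iteration with one AND and one branch.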