library/core/src/slice/ascii.rs: 22 additions & 40 deletions
@@ -460,56 +460,38 @@ const fn is_ascii(s: &[u8]) -> bool {
     )
 }
 
-/// Chunk size for vectorized ASCII checking (two 16-byte SSE registers).
+/// Chunk size for SSE2 vectorized ASCII checking (4x 16-byte loads).
 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
-const CHUNK_SIZE: usize = 32;
+const SSE2_CHUNK_SIZE: usize = 64;
 
 /// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
 /// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
 ///
 /// FIXME(llvm#176906): Remove this workaround once LLVM generates efficient code.
 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
 #[inline]
 fn is_ascii_sse2(bytes: &[u8]) -> bool {
     use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};
 
-    let mut i = 0;
-
-    while i + CHUNK_SIZE <= bytes.len() {
-        // SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`.
-        let ptr = unsafe { bytes.as_ptr().add(i) };
-
-        // Load two 16-byte chunks and combine them.
-        // SAFETY: We verified `i + 32 <= len`, so ptr is valid for 32 bytes.
-        // `_mm_loadu_si128` allows unaligned loads.
-        let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
-        // SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range.
-        let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };
-
-        // OR them together - if any byte has the high bit set, the result will too.
-        // SAFETY: SSE2 is guaranteed by the cfg predicate.
-        let combined = unsafe { _mm_or_si128(chunk1, chunk2) };
-
-        // Create a mask from the MSBs of each byte.
-        // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
-        // SAFETY: SSE2 is guaranteed by the cfg predicate.
-        let mask = unsafe { _mm_movemask_epi8(combined) };
-
+    let (chunks, rest) = bytes.as_chunks::<SSE2_CHUNK_SIZE>();
+
+    for chunk in chunks {
+        let ptr = chunk.as_ptr();
+        // SAFETY: chunk is 64 bytes. SSE2 is baseline on x86_64.
+        let mask = unsafe {
+            let a1 = _mm_loadu_si128(ptr as *const __m128i);
+            let a2 = _mm_loadu_si128(ptr.add(16) as *const __m128i);
+            let b1 = _mm_loadu_si128(ptr.add(32) as *const __m128i);
+            let b2 = _mm_loadu_si128(ptr.add(48) as *const __m128i);
+            // OR all chunks - if any byte has high bit set, combined will too.
+            let combined = _mm_or_si128(_mm_or_si128(a1, a2), _mm_or_si128(b1, b2));
+            // Create a mask from the MSBs of each byte.
+            // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
+            _mm_movemask_epi8(combined)
+        };
         if mask != 0 {
             return false;
         }
-
-        i += CHUNK_SIZE;
     }
 
-    // Handle remaining bytes with simple loop
-    while i < bytes.len() {
-        if !bytes[i].is_ascii() {
-            return false;
-        }
-        i += 1;
-    }
-
-    true
+    // Handle remaining bytes
+    rest.iter().all(|b| b.is_ascii())
 }
 
 /// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`.
@@ -529,7 +511,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
         is_ascii_simple(bytes)
     } else {
         // For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead.
-        if bytes.len() < CHUNK_SIZE {
+        if bytes.len() < SSE2_CHUNK_SIZE {
             let chunks = bytes.chunks_exact(USIZE_SIZE);
             let remainder = chunks.remainder();
             for chunk in chunks {
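To make the change easier to evaluate outside the standard library, here is a standalone sketch of the same movemask technique. It mirrors the new `is_ascii_sse2` above but is not the library code: the function name `is_ascii_sse2_demo` and the `main` harness are illustrative, and it assumes an x86_64 target (where SSE2 is baseline) and a toolchain with `slice::as_chunks` stabilized.

use std::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};

/// Returns true iff every byte is ASCII, checking 64 bytes per iteration.
fn is_ascii_sse2_demo(bytes: &[u8]) -> bool {
    // Split into exact 64-byte chunks plus a scalar tail.
    let (chunks, rest) = bytes.as_chunks::<64>();
    for chunk in chunks {
        let ptr = chunk.as_ptr();
        // SAFETY: `chunk` is exactly 64 bytes, so all four 16-byte unaligned
        // loads stay in bounds; SSE2 is baseline on x86_64.
        let mask = unsafe {
            let a = _mm_loadu_si128(ptr as *const __m128i);
            let b = _mm_loadu_si128(ptr.add(16) as *const __m128i);
            let c = _mm_loadu_si128(ptr.add(32) as *const __m128i);
            let d = _mm_loadu_si128(ptr.add(48) as *const __m128i);
            // A non-ASCII byte is exactly one with its high bit (0x80) set,
            // and OR preserves any set high bit across all 64 bytes.
            let combined = _mm_or_si128(_mm_or_si128(a, b), _mm_or_si128(c, d));
            // pmovmskb packs the most significant bit of each of the 16 bytes
            // into a 16-bit integer mask; non-zero means some byte >= 0x80.
            _mm_movemask_epi8(combined)
        };
        if mask != 0 {
            return false;
        }
    }
    rest.iter().all(|b| b.is_ascii())
}

fn main() {
    assert!(is_ascii_sse2_demo(b"hello, world"));
    assert!(!is_ascii_sse2_demo("caf\u{e9}".as_bytes()));
    // More than 64 bytes, so the vector loop runs, not just the scalar tail.
    assert!(is_ascii_sse2_demo(&[b'x'; 100]));
}

The payoff relative to the old two-register version is fewer branches per byte: one `pmovmskb` test now covers 64 bytes instead of 32.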
tests/assembly-llvm/slice-is-ascii.rs: 3 additions & 3 deletions
@@ -13,15 +13,15 @@
 /// Verify `is_ascii` generates efficient code on different architectures:
 ///
 /// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization).
-///   The fix uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
 ///   See: https://github.com/llvm/llvm-project/issues/176906
+///   Good version uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
 ///
 /// - loongarch64: Should use `vmskltz.b` instruction for the fast-path.
 ///   This architecture still relies on LLVM auto-vectorization.
 
 // X86_64-LABEL: test_is_ascii
 // X86_64-NOT: kshiftrd
 // X86_64-NOT: kshiftrq
+// X86_64: {{vpor|por}}
 // X86_64: {{vpmovmskb|pmovmskb}}
 
 // LA64-LABEL: test_is_ascii
 // LA64: vmskltz.b
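A side note on the small-input path touched by the second hunk in `library/core/src/slice/ascii.rs`: inputs shorter than `SSE2_CHUNK_SIZE` are checked a `usize` word at a time rather than with SSE2. The diff only shows the edge of that branch, so here is a hedged sketch of the word-at-a-time idea it refers to; the function name `is_ascii_word_at_a_time` and the `NONASCII_MASK` constant are assumptions for illustration, not the library's code.

/// Word-at-a-time ASCII check: a byte is non-ASCII iff its 0x80 bit is set,
/// so the check reduces to AND-ing every byte lane against 0x80 at once.
fn is_ascii_word_at_a_time(bytes: &[u8]) -> bool {
    const USIZE_SIZE: usize = std::mem::size_of::<usize>();
    // 0x80 repeated in every byte lane of a usize (endianness-independent).
    const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; USIZE_SIZE]);

    let chunks = bytes.chunks_exact(USIZE_SIZE);
    let remainder = chunks.remainder();
    for chunk in chunks {
        // The unwrap cannot fail: `chunks_exact` yields slices of exactly
        // USIZE_SIZE bytes.
        let word = usize::from_ne_bytes(chunk.try_into().unwrap());
        if word & NONASCII_MASK != 0 {
            return false;
        }
    }
    remainder.iter().all(|b| b.is_ascii())
}

fn main() {
    assert!(is_ascii_word_at_a_time(b"short ascii"));
    assert!(!is_ascii_word_at_a_time(&[0x41, 0x80, 0x42]));
}

This is the same trick the `USIZE_SIZE` / `chunks_exact` lines in that hunk are built around, presumably with a similar mask: on a 64-bit target it tests 8 bytes per iteration with one AND and one branch.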