103 changes: 91 additions & 12 deletions library/core/src/slice/ascii.rs
@@ -3,10 +3,7 @@
use core::ascii::EscapeDefault;

use crate::fmt::{self, Write};
-#[cfg(not(any(
-    all(target_arch = "x86_64", target_feature = "sse2"),
-    all(target_arch = "loongarch64", target_feature = "lsx")
-)))]
+#[cfg(not(all(target_arch = "loongarch64", target_feature = "lsx")))]
use crate::intrinsics::const_eval_select;
use crate::{ascii, iter, ops};

@@ -463,19 +460,101 @@ const fn is_ascii(s: &[u8]) -> bool {
)
}

-/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64` and the
-/// `vmskltz.b` instruction on `loongarch64`.
+/// Chunk size for vectorized ASCII checking (two 16-byte SSE registers).
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
const CHUNK_SIZE: usize = 32;

/// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
/// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
///
/// FIXME(llvm#176906): Remove this workaround once LLVM generates efficient code.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
fn is_ascii_sse2(bytes: &[u8]) -> bool {
Review thread:

Contributor: Did you consider writing the loop using `as_chunks`?

https://rust.godbolt.org/z/vfxK51fq8

Somehow LLVM seems to understand this less well, but it does remove a bunch of the unsafety from the loop.

Author: Pretty sure I tested it earlier, but that benchmark showed a regression. Will check again when I get home 👍

Author: I tested with `as_chunks`. I'm seeing roughly a 10% slowdown for larger inputs. The results are consistent over several runs.

Contributor: Wild. Could you open a separate issue for that performance issue? I'd love to see what our LLVM and mir-opt people think about it. It just seems like it should be equivalent, so this might be one of those cases where LLVM overfits on what clang gives it (and we should fix that).

Author: I checked everything again. I had `#[target_feature(enable = "sse2")]` enabled for the benchmark, which actually produces different asm code. Your godbolt link is good and produces better performance. I will do another PR for this.

(A sketch of the `as_chunks` variant the reviewer proposed appears after the function body below.)

    use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};

    let mut i = 0;

    while i + CHUNK_SIZE <= bytes.len() {
        // SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`.
        let ptr = unsafe { bytes.as_ptr().add(i) };

        // Load two 16-byte chunks and combine them.
        // SAFETY: We verified `i + 32 <= len`, so `ptr` is valid for 32 bytes.
        // `_mm_loadu_si128` allows unaligned loads.
        let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
        // SAFETY: Same as above; `ptr.add(16)` is within the valid 32-byte range.
        let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };

        // OR them together: if any byte has the high bit set, the result will too.
        // SAFETY: SSE2 is guaranteed by the cfg predicate.
        let combined = unsafe { _mm_or_si128(chunk1, chunk2) };

        // Create a mask from the MSB of each byte.
        // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
        // SAFETY: SSE2 is guaranteed by the cfg predicate.
        let mask = unsafe { _mm_movemask_epi8(combined) };

        if mask != 0 {
            return false;
        }

        i += CHUNK_SIZE;
    }

    // Handle remaining bytes with a simple scalar loop.
    while i < bytes.len() {
        if !bytes[i].is_ascii() {
            return false;
        }
        i += 1;
    }

    true
}
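
For reference, here is a minimal sketch of the `as_chunks` shape discussed in the review thread above. The function name is hypothetical, this is not necessarily the exact code behind the godbolt link, and it assumes a toolchain where `slice::as_chunks` is available; it reuses the same 32-byte chunking and SSE2 intrinsics as `is_ascii_sse2`:

```rust
/// Hypothetical sketch of the `as_chunks`-based variant from the review thread.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
fn is_ascii_sse2_as_chunks(bytes: &[u8]) -> bool {
    use core::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};

    // Split into complete 32-byte chunks plus a tail; this removes the
    // manual index bookkeeping (and the pointer-offset `unsafe`) from the loop.
    let (chunks, remainder) = bytes.as_chunks::<32>();
    for chunk in chunks {
        // SAFETY: `chunk` is exactly 32 bytes, so both unaligned 16-byte
        // loads are in bounds; SSE2 is guaranteed by the cfg predicate.
        let mask = unsafe {
            let lo = _mm_loadu_si128(chunk.as_ptr() as *const __m128i);
            let hi = _mm_loadu_si128(chunk.as_ptr().add(16) as *const __m128i);
            _mm_movemask_epi8(_mm_or_si128(lo, hi))
        };
        if mask != 0 {
            return false;
        }
    }
    remainder.iter().all(|b| b.is_ascii())
}
```

Per the thread, the initially observed slowdown was an artifact of a stray `#[target_feature(enable = "sse2")]` on the benchmark; with that removed, this shape performed as well or better, and a follow-up PR was planned.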

/// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`.
///
/// Uses explicit SSE2 intrinsics to prevent LLVM from auto-vectorizing with
/// broken AVX-512 code that extracts mask bits one-by-one.
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
#[inline]
#[rustc_allow_const_fn_unstable(const_eval_select)]
const fn is_ascii(bytes: &[u8]) -> bool {
    const USIZE_SIZE: usize = size_of::<usize>();
    const NONASCII_MASK: usize = usize::MAX / 255 * 0x80;

    const_eval_select!(
        @capture { bytes: &[u8] } -> bool:
        if const {
            is_ascii_simple(bytes)
        } else {
            // For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead.
            if bytes.len() < CHUNK_SIZE {
                let chunks = bytes.chunks_exact(USIZE_SIZE);
                let remainder = chunks.remainder();
                for chunk in chunks {
                    let word = usize::from_ne_bytes(chunk.try_into().unwrap());
                    if (word & NONASCII_MASK) != 0 {
                        return false;
                    }
                }
                return remainder.iter().all(|b| b.is_ascii());
            }

            is_ascii_sse2(bytes)
        }
    )
}
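
A side note on the scalar small-input path above: `NONASCII_MASK` is the usual SWAR constant. On a 64-bit target, `usize::MAX / 255` is `0x0101_0101_0101_0101` (a `0x01` in every byte lane), and multiplying by `0x80` turns that into `0x8080_8080_8080_8080`, the high bit of every byte, so a word AND-ed with this mask is non-zero exactly when some byte is >= 0x80. A self-contained check (a sketch assuming a 64-bit target):

```rust
const NONASCII_MASK: usize = usize::MAX / 255 * 0x80;

fn main() {
    // 0x0101_0101_0101_0101 * 0x80 == 0x8080_8080_8080_8080 on 64-bit.
    assert_eq!(NONASCII_MASK, 0x8080_8080_8080_8080);

    // A word is all-ASCII exactly when no byte has its high bit set.
    let ascii = usize::from_ne_bytes(*b"hello!!!");
    let non_ascii = usize::from_ne_bytes(*b"hell\xC3\xA9!!"); // "hellé!!" in UTF-8
    assert_eq!(ascii & NONASCII_MASK, 0);
    assert_ne!(non_ascii & NONASCII_MASK, 0);
}
```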

/// ASCII test optimized to use the `vmskltz.b` instruction on `loongarch64`.
///
/// Other platforms are not likely to benefit from this code structure, so they
/// use SWAR techniques to test for ASCII in `usize`-sized chunks.
-#[cfg(any(
-    all(target_arch = "x86_64", target_feature = "sse2"),
-    all(target_arch = "loongarch64", target_feature = "lsx")
-))]
+#[cfg(all(target_arch = "loongarch64", target_feature = "lsx"))]
#[inline]
const fn is_ascii(bytes: &[u8]) -> bool {
    // Process chunks of 32 bytes at a time in the fast path to enable
-    // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
+    // auto-vectorization and use of `vmskltz.b`. Two 128-bit vector registers
    // can be OR'd together and then the resulting vector can be tested for
    // non-ASCII bytes.
    const CHUNK_SIZE: usize = 32;
@@ -485,7 +564,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
    while i + CHUNK_SIZE <= bytes.len() {
        let chunk_end = i + CHUNK_SIZE;

-        // Get LLVM to produce a `pmovmskb` instruction on x86-64 which
+        // Get LLVM to produce a `vmskltz.b` instruction on loongarch64 which
        // creates a mask from the most significant bit of each byte.
        // ASCII bytes are less than 128 (0x80), so their most significant
        // bit is unset.
32 changes: 32 additions & 0 deletions tests/assembly-llvm/slice-is-ascii.rs
@@ -0,0 +1,32 @@
//@ revisions: X86_64 LA64
//@ assembly-output: emit-asm
//@ compile-flags: -C opt-level=3
//
//@ [X86_64] only-x86_64
//@ [X86_64] compile-flags: -C target-cpu=znver4
//@ [X86_64] compile-flags: -C llvm-args=-x86-asm-syntax=intel
//
//@ [LA64] only-loongarch64

#![crate_type = "lib"]

/// Verify `is_ascii` generates efficient code on different architectures:
///
/// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization).
/// The fix uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
/// See: https://github.com/llvm/llvm-project/issues/176906
///
/// - loongarch64: Should use `vmskltz.b` instruction for the fast-path.
/// This architecture still relies on LLVM auto-vectorization.

// X86_64-LABEL: test_is_ascii
// X86_64-NOT: kshiftrd
// X86_64-NOT: kshiftrq

// LA64-LABEL: test_is_ascii
// LA64: vmskltz.b

#[no_mangle]
pub fn test_is_ascii(s: &[u8]) -> bool {
    s.is_ascii()
}
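
If you want to run this test locally, a standard rust-lang/rust checkout should be able to drive it through the usual wrapper, presumably `./x test tests/assembly-llvm/slice-is-ascii.rs`; the `only-x86_64`/`only-loongarch64` directives make each revision run only on its matching target.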
16 changes: 0 additions & 16 deletions tests/codegen-llvm/slice-is-ascii.rs

This file was deleted.
