diff --git a/compiler/rustc_data_structures/src/fx.rs b/compiler/rustc_data_structures/src/fx.rs
index 026ec5c230ec6..cad775cc98641 100644
--- a/compiler/rustc_data_structures/src/fx.rs
+++ b/compiler/rustc_data_structures/src/fx.rs
@@ -1,11 +1,9 @@
-use std::hash::BuildHasherDefault;
-
 pub use rustc_hash::{FxBuildHasher, FxHashMap, FxHashSet, FxHasher};

 pub type StdEntry<'a, K, V> = std::collections::hash_map::Entry<'a, K, V>;

-pub type FxIndexMap<K, V> = indexmap::IndexMap<K, V, BuildHasherDefault<FxHasher>>;
-pub type FxIndexSet<V> = indexmap::IndexSet<V, BuildHasherDefault<FxHasher>>;
+pub type FxIndexMap<K, V> = indexmap::IndexMap<K, V, FxBuildHasher>;
+pub type FxIndexSet<V> = indexmap::IndexSet<V, FxBuildHasher>;
 pub type IndexEntry<'a, K, V> = indexmap::map::Entry<'a, K, V>;
 pub type IndexOccupiedEntry<'a, K, V> = indexmap::map::OccupiedEntry<'a, K, V>;

diff --git a/compiler/rustc_data_structures/src/unord.rs b/compiler/rustc_data_structures/src/unord.rs
index 0a9a86d7a43b8..eb29ef3b4d0a5 100644
--- a/compiler/rustc_data_structures/src/unord.rs
+++ b/compiler/rustc_data_structures/src/unord.rs
@@ -8,10 +8,10 @@ use std::hash::Hash;
 use std::iter::{Product, Sum};
 use std::ops::Index;

-use rustc_hash::{FxBuildHasher, FxHashMap, FxHashSet};
 use rustc_macros::{Decodable_NoContext, Encodable_NoContext};

 use crate::fingerprint::Fingerprint;
+use crate::fx::{FxBuildHasher, FxHashMap, FxHashSet};
 use crate::stable_hasher::{HashStable, StableCompare, StableHasher, ToStableHashKey};

 /// `UnordItems` is the order-less version of `Iterator`. It only contains methods
diff --git a/compiler/rustc_type_ir/src/data_structures/mod.rs b/compiler/rustc_type_ir/src/data_structures/mod.rs
index a72669cbd189b..c2b629f1d11c4 100644
--- a/compiler/rustc_type_ir/src/data_structures/mod.rs
+++ b/compiler/rustc_type_ir/src/data_structures/mod.rs
@@ -1,11 +1,9 @@
-use std::hash::BuildHasherDefault;
-
 pub use ena::unify::{NoError, UnifyKey, UnifyValue};
-use rustc_hash::FxHasher;
+use rustc_hash::FxBuildHasher;
 pub use rustc_hash::{FxHashMap as HashMap, FxHashSet as HashSet};

-pub type IndexMap<K, V> = indexmap::IndexMap<K, V, BuildHasherDefault<FxHasher>>;
-pub type IndexSet<V> = indexmap::IndexSet<V, BuildHasherDefault<FxHasher>>;
+pub type IndexMap<K, V> = indexmap::IndexMap<K, V, FxBuildHasher>;
+pub type IndexSet<V> = indexmap::IndexSet<V, FxBuildHasher>;

 mod delayed_map;

diff --git a/library/core/src/pin.rs b/library/core/src/pin.rs
index 4791f5612bfa2..c4981facc04c3 100644
--- a/library/core/src/pin.rs
+++ b/library/core/src/pin.rs
@@ -831,15 +831,13 @@
 //! <code>fn get_pin_mut(self: [Pin]<[`&mut Self`]>) -> [Pin]<[`&mut T`]></code>.
 //! Then we could do the following:
 //! ```compile_fail
-//! # use std::cell::RefCell;
-//! # use std::pin::Pin;
-//! fn exploit_ref_cell<T>(rc: Pin<&mut RefCell<T>>) {
+//! fn exploit_ref_cell<T>(mut rc: Pin<&mut RefCell<T>>) {
 //!     // Here we get pinned access to the `T`.
 //!     let _: Pin<&mut T> = rc.as_mut().get_pin_mut();
 //!
 //!     // And here we have `&mut T` to the same data.
 //!     let shared: &RefCell<T> = rc.into_ref().get_ref();
-//!     let borrow = shared.borrow_mut();
+//!     let mut borrow = shared.borrow_mut();
 //!     let content = &mut *borrow;
 //! }
 //! ```
diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs
index 459c826f40646..ae641871279b6 100644
--- a/library/core/src/slice/ascii.rs
+++ b/library/core/src/slice/ascii.rs
@@ -460,56 +460,38 @@ const fn is_ascii(s: &[u8]) -> bool {
     )
 }

-/// Chunk size for vectorized ASCII checking (two 16-byte SSE registers).
+/// Chunk size for SSE2 vectorized ASCII checking (4x 16-byte loads).
 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
-const CHUNK_SIZE: usize = 32;
+const SSE2_CHUNK_SIZE: usize = 64;

-/// SSE2 implementation using `_mm_movemask_epi8` (compiles to `pmovmskb`) to
-/// avoid LLVM's broken AVX-512 auto-vectorization of counting loops.
-///
-/// FIXME(llvm#176906): Remove this workaround once LLVM generates efficient code.
 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+#[inline]
 fn is_ascii_sse2(bytes: &[u8]) -> bool {
     use crate::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};

-    let mut i = 0;
-
-    while i + CHUNK_SIZE <= bytes.len() {
-        // SAFETY: We have verified that `i + CHUNK_SIZE <= bytes.len()`.
-        let ptr = unsafe { bytes.as_ptr().add(i) };
-
-        // Load two 16-byte chunks and combine them.
-        // SAFETY: We verified `i + 32 <= len`, so ptr is valid for 32 bytes.
-        // `_mm_loadu_si128` allows unaligned loads.
-        let chunk1 = unsafe { _mm_loadu_si128(ptr as *const __m128i) };
-        // SAFETY: Same as above - ptr.add(16) is within the valid 32-byte range.
-        let chunk2 = unsafe { _mm_loadu_si128(ptr.add(16) as *const __m128i) };
-
-        // OR them together - if any byte has the high bit set, the result will too.
-        // SAFETY: SSE2 is guaranteed by the cfg predicate.
-        let combined = unsafe { _mm_or_si128(chunk1, chunk2) };
-
-        // Create a mask from the MSBs of each byte.
-        // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
-        // SAFETY: SSE2 is guaranteed by the cfg predicate.
-        let mask = unsafe { _mm_movemask_epi8(combined) };
-
+    let (chunks, rest) = bytes.as_chunks::<SSE2_CHUNK_SIZE>();
+
+    for chunk in chunks {
+        let ptr = chunk.as_ptr();
+        // SAFETY: `chunk` is 64 bytes and `_mm_loadu_si128` allows unaligned loads.
+        let mask = unsafe {
+            let a1 = _mm_loadu_si128(ptr as *const __m128i);
+            let a2 = _mm_loadu_si128(ptr.add(16) as *const __m128i);
+            let b1 = _mm_loadu_si128(ptr.add(32) as *const __m128i);
+            let b2 = _mm_loadu_si128(ptr.add(48) as *const __m128i);
+            // OR all chunks - if any byte has its high bit set, `combined` will too.
+            let combined = _mm_or_si128(_mm_or_si128(a1, a2), _mm_or_si128(b1, b2));
+            // Create a mask from the MSBs of each byte.
+            // If any byte is >= 128, its MSB is 1, so the mask will be non-zero.
+            _mm_movemask_epi8(combined)
+        };
         if mask != 0 {
             return false;
         }
-
-        i += CHUNK_SIZE;
-    }
-
-    // Handle remaining bytes with simple loop
-    while i < bytes.len() {
-        if !bytes[i].is_ascii() {
-            return false;
-        }
-        i += 1;
     }

-    true
+    // Handle remaining bytes
+    rest.iter().all(|b| b.is_ascii())
 }

 /// ASCII test optimized to use the `pmovmskb` instruction on `x86-64`.
@@ -529,7 +511,7 @@ const fn is_ascii(bytes: &[u8]) -> bool {
         is_ascii_simple(bytes)
     } else {
         // For small inputs, use usize-at-a-time processing to avoid SSE2 call overhead.
-        if bytes.len() < CHUNK_SIZE {
+        if bytes.len() < SSE2_CHUNK_SIZE {
             let chunks = bytes.chunks_exact(USIZE_SIZE);
             let remainder = chunks.remainder();
             for chunk in chunks {
diff --git a/src/doc/rustc/src/platform-support/wasm32-wasip1.md b/src/doc/rustc/src/platform-support/wasm32-wasip1.md
index 958a34a86928c..eb74edda22de8 100644
--- a/src/doc/rustc/src/platform-support/wasm32-wasip1.md
+++ b/src/doc/rustc/src/platform-support/wasm32-wasip1.md
@@ -20,7 +20,7 @@ focused on the Component Model-based definition of WASI. At this point the
 `wasm32-wasip1` Rust target is intended for historical compatibility with
 [WASIp1] set of syscalls.

-[WASIp1]: https://github.com/WebAssembly/WASI/tree/main/legacy/preview1
+[WASIp1]: https://github.com/WebAssembly/WASI/tree/wasi-0.1/preview1
 [Component Model]: https://github.com/webassembly/component-model

 Today the `wasm32-wasip1` target will generate core WebAssembly modules
diff --git a/tests/assembly-llvm/slice-is-ascii.rs b/tests/assembly-llvm/slice-is-ascii.rs
index d01b321bf460a..b9a5205054986 100644
--- a/tests/assembly-llvm/slice-is-ascii.rs
+++ b/tests/assembly-llvm/slice-is-ascii.rs
@@ -13,15 +13,15 @@
 /// Verify `is_ascii` generates efficient code on different architectures:
 ///
 /// - x86_64: Must NOT use `kshiftrd`/`kshiftrq` (broken AVX-512 auto-vectorization).
-///   The fix uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
-///   See: https://github.com/llvm/llvm-project/issues/176906
+///   The expected codegen uses explicit SSE2 intrinsics (`pmovmskb`/`vpmovmskb`).
 ///
 /// - loongarch64: Should use `vmskltz.b` instruction for the fast-path.
-///   This architecture still relies on LLVM auto-vectorization.

 // X86_64-LABEL: test_is_ascii
 // X86_64-NOT: kshiftrd
 // X86_64-NOT: kshiftrq
+// X86_64: {{vpor|por}}
+// X86_64: {{vpmovmskb|pmovmskb}}

 // LA64-LABEL: test_is_ascii
 // LA64: vmskltz.b
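The `FxIndexMap`/`IndexMap` alias changes are purely type-level: `rustc_hash::FxBuildHasher` is a zero-sized `BuildHasher` that produces `FxHasher`s, a named stand-in for `BuildHasherDefault<FxHasher>`, so no call sites change. A minimal sketch of the equivalence (not part of the patch), written against the `indexmap` and `rustc-hash` crates directly; the variable names are illustrative:

```rust
use std::hash::BuildHasherDefault;

use indexmap::IndexMap;
use rustc_hash::{FxBuildHasher, FxHasher};

fn main() {
    // Same hash algorithm either way; only the type spelling differs.
    let mut old_style: IndexMap<&str, u32, BuildHasherDefault<FxHasher>> = IndexMap::default();
    let mut new_style: IndexMap<&str, u32, FxBuildHasher> = IndexMap::default();
    old_style.insert("fx", 1);
    new_style.insert("fx", 1);
    assert_eq!(old_style["fx"], new_style["fx"]);
}
```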
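The core of the `ascii.rs` change: OR four 16-byte vectors together, then use a single `_mm_movemask_epi8` (`pmovmskb`) to collect the high bit of every byte. A non-zero mask means some byte is >= 0x80, i.e. non-ASCII, and the loop pays only one branch per 64 bytes. Below is a self-contained sketch of the same idea (not part of the patch), written against `std::arch` and `chunks_exact` rather than core internals and `as_chunks`; the function name and the `main` harness are assumptions for illustration:

```rust
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
fn is_ascii_movemask(bytes: &[u8]) -> bool {
    use std::arch::x86_64::{__m128i, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128};

    let mut chunks = bytes.chunks_exact(64);
    for chunk in chunks.by_ref() {
        let ptr = chunk.as_ptr();
        // SAFETY: `chunk` is exactly 64 bytes, `_mm_loadu_si128` accepts
        // unaligned pointers, and SSE2 is baseline on x86_64.
        let mask = unsafe {
            let a = _mm_or_si128(
                _mm_loadu_si128(ptr.cast::<__m128i>()),
                _mm_loadu_si128(ptr.add(16).cast::<__m128i>()),
            );
            let b = _mm_or_si128(
                _mm_loadu_si128(ptr.add(32).cast::<__m128i>()),
                _mm_loadu_si128(ptr.add(48).cast::<__m128i>()),
            );
            // OR preserves each byte's high bit, so one movemask covers all 64 bytes.
            _mm_movemask_epi8(_mm_or_si128(a, b))
        };
        if mask != 0 {
            return false;
        }
    }
    chunks.remainder().iter().all(|b| b.is_ascii())
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
fn main() {
    assert!(is_ascii_movemask(b"an ascii-only line".repeat(8).as_slice()));
    assert!(!is_ascii_movemask("caf\u{e9}".repeat(32).as_bytes()));
}

#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
fn main() {}
```

This shape is exactly what the new FileCheck lines pin down: a `por`/`vpor` to combine the loads, a `pmovmskb`/`vpmovmskb` to test them, and no AVX-512 `kshift*` mask shuffling.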
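For inputs shorter than `SSE2_CHUNK_SIZE`, the second `is_ascii` hunk falls back to reading the buffer one `usize` at a time. The word test itself sits outside the hunk; it boils down to ANDing each word against a repeated-`0x80` mask, roughly as in this sketch (not part of the patch; `NONASCII_MASK` and `is_ascii_swar` are illustrative names, not necessarily the ones in `ascii.rs`):

```rust
use std::mem::size_of;

const USIZE_SIZE: usize = size_of::<usize>();
// 0x80 in every byte lane: ANDing a word with this is non-zero exactly
// when some byte has its high (non-ASCII) bit set.
const NONASCII_MASK: usize = usize::from_ne_bytes([0x80; USIZE_SIZE]);

fn is_ascii_swar(bytes: &[u8]) -> bool {
    let mut words = bytes.chunks_exact(USIZE_SIZE);
    for word in words.by_ref() {
        if usize::from_ne_bytes(word.try_into().unwrap()) & NONASCII_MASK != 0 {
            return false;
        }
    }
    words.remainder().iter().all(|b| b.is_ascii())
}

fn main() {
    assert!(is_ascii_swar(b"plain old ASCII"));
    assert!(!is_ascii_swar(&[b'a', 0x80, b'b', b'c', b'd', b'e', b'f', b'g']));
}
```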