diff --git a/Cargo.toml b/Cargo.toml index 98ba373c68f..32c92fe9cdc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,8 +41,8 @@ alloc = ["rand_core/alloc"] # Option: use getrandom package for seeding getrandom = ["rand_core/getrandom"] -# Option (requires nightly): experimental SIMD support -simd_support = ["packed_simd"] +# Option (requires nightly Rust): experimental SIMD support +simd_support = [] # Option (enabled by default): enable StdRng std_rng = ["rand_chacha"] @@ -68,13 +68,6 @@ log = { version = "0.4.4", optional = true } serde = { version = "1.0.103", features = ["derive"], optional = true } rand_chacha = { path = "rand_chacha", version = "0.3.0", default-features = false, optional = true } -[dependencies.packed_simd] -# NOTE: so far no version works reliably due to dependence on unstable features -package = "packed_simd_2" -version = "0.3.7" -optional = true -features = ["into_bits"] - [target.'cfg(unix)'.dependencies] # Used for fork protection (reseeding.rs) libc = { version = "0.2.22", optional = true, default-features = false } diff --git a/src/distributions/bernoulli.rs b/src/distributions/bernoulli.rs index 226db79fa9c..676b79a5c10 100644 --- a/src/distributions/bernoulli.rs +++ b/src/distributions/bernoulli.rs @@ -14,6 +14,7 @@ use core::{fmt, u64}; #[cfg(feature = "serde1")] use serde::{Serialize, Deserialize}; + /// The Bernoulli distribution. /// /// This is a special case of the Binomial distribution where `n = 1`. @@ -147,10 +148,10 @@ mod test { use crate::Rng; #[test] - #[cfg(feature="serde1")] + #[cfg(feature = "serde1")] fn test_serializing_deserializing_bernoulli() { let coin_flip = Bernoulli::new(0.5).unwrap(); - let de_coin_flip : Bernoulli = bincode::deserialize(&bincode::serialize(&coin_flip).unwrap()).unwrap(); + let de_coin_flip: Bernoulli = bincode::deserialize(&bincode::serialize(&coin_flip).unwrap()).unwrap(); assert_eq!(coin_flip.p_int, de_coin_flip.p_int); } diff --git a/src/distributions/float.rs b/src/distributions/float.rs index ce5946f7f01..54aebad4dc5 100644 --- a/src/distributions/float.rs +++ b/src/distributions/float.rs @@ -8,11 +8,11 @@ //! Basic floating-point number distributions -use crate::distributions::utils::FloatSIMDUtils; +use crate::distributions::utils::{IntAsSIMD, FloatAsSIMD, FloatSIMDUtils}; use crate::distributions::{Distribution, Standard}; use crate::Rng; use core::mem; -#[cfg(feature = "simd_support")] use packed_simd::*; +#[cfg(feature = "simd_support")] use core::simd::*; #[cfg(feature = "serde1")] use serde::{Serialize, Deserialize}; @@ -99,7 +99,7 @@ macro_rules! float_impls { // The exponent is encoded using an offset-binary representation let exponent_bits: $u_scalar = (($exponent_bias + exponent) as $u_scalar) << $fraction_bits; - $ty::from_bits(self | exponent_bits) + $ty::from_bits(self | $uty::splat(exponent_bits)) } } @@ -108,13 +108,13 @@ macro_rules! float_impls { // Multiply-based method; 24/53 random bits; [0, 1) interval. // We use the most significant bits because for simple RNGs // those are usually more random. - let float_size = mem::size_of::<$f_scalar>() as u32 * 8; + let float_size = mem::size_of::<$f_scalar>() as $u_scalar * 8; let precision = $fraction_bits + 1; let scale = 1.0 / ((1 as $u_scalar << precision) as $f_scalar); let value: $uty = rng.gen(); - let value = value >> (float_size - precision); - scale * $ty::cast_from_int(value) + let value = value >> $uty::splat(float_size - precision); + $ty::splat(scale) * $ty::cast_from_int(value) } } @@ -123,14 +123,14 @@ macro_rules! float_impls { // Multiply-based method; 24/53 random bits; (0, 1] interval. // We use the most significant bits because for simple RNGs // those are usually more random. - let float_size = mem::size_of::<$f_scalar>() as u32 * 8; + let float_size = mem::size_of::<$f_scalar>() as $u_scalar * 8; let precision = $fraction_bits + 1; let scale = 1.0 / ((1 as $u_scalar << precision) as $f_scalar); let value: $uty = rng.gen(); - let value = value >> (float_size - precision); + let value = value >> $uty::splat(float_size - precision); // Add 1 to shift up; will not overflow because of right-shift: - scale * $ty::cast_from_int(value + 1) + $ty::splat(scale) * $ty::cast_from_int(value + $uty::splat(1)) } } @@ -140,11 +140,11 @@ macro_rules! float_impls { // We use the most significant bits because for simple RNGs // those are usually more random. use core::$f_scalar::EPSILON; - let float_size = mem::size_of::<$f_scalar>() as u32 * 8; + let float_size = mem::size_of::<$f_scalar>() as $u_scalar * 8; let value: $uty = rng.gen(); - let fraction = value >> (float_size - $fraction_bits); - fraction.into_float_with_exponent(0) - (1.0 - EPSILON / 2.0) + let fraction = value >> $uty::splat(float_size - $fraction_bits); + fraction.into_float_with_exponent(0) - $ty::splat(1.0 - EPSILON / 2.0) } } } @@ -169,10 +169,10 @@ float_impls! { f64x4, u64x4, f64, u64, 52, 1023 } #[cfg(feature = "simd_support")] float_impls! { f64x8, u64x8, f64, u64, 52, 1023 } - #[cfg(test)] mod tests { use super::*; + use crate::distributions::utils::FloatAsSIMD; use crate::rngs::mock::StepRng; const EPSILON32: f32 = ::core::f32::EPSILON; @@ -182,29 +182,31 @@ mod tests { ($fnn:ident, $ty:ident, $ZERO:expr, $EPSILON:expr) => { #[test] fn $fnn() { + let two = $ty::splat(2.0); + // Standard let mut zeros = StepRng::new(0, 0); assert_eq!(zeros.gen::<$ty>(), $ZERO); let mut one = StepRng::new(1 << 8 | 1 << (8 + 32), 0); - assert_eq!(one.gen::<$ty>(), $EPSILON / 2.0); + assert_eq!(one.gen::<$ty>(), $EPSILON / two); let mut max = StepRng::new(!0, 0); - assert_eq!(max.gen::<$ty>(), 1.0 - $EPSILON / 2.0); + assert_eq!(max.gen::<$ty>(), $ty::splat(1.0) - $EPSILON / two); // OpenClosed01 let mut zeros = StepRng::new(0, 0); - assert_eq!(zeros.sample::<$ty, _>(OpenClosed01), 0.0 + $EPSILON / 2.0); + assert_eq!(zeros.sample::<$ty, _>(OpenClosed01), $ZERO + $EPSILON / two); let mut one = StepRng::new(1 << 8 | 1 << (8 + 32), 0); assert_eq!(one.sample::<$ty, _>(OpenClosed01), $EPSILON); let mut max = StepRng::new(!0, 0); - assert_eq!(max.sample::<$ty, _>(OpenClosed01), $ZERO + 1.0); + assert_eq!(max.sample::<$ty, _>(OpenClosed01), $ZERO + $ty::splat(1.0)); // Open01 let mut zeros = StepRng::new(0, 0); - assert_eq!(zeros.sample::<$ty, _>(Open01), 0.0 + $EPSILON / 2.0); + assert_eq!(zeros.sample::<$ty, _>(Open01), $ZERO + $EPSILON / two); let mut one = StepRng::new(1 << 9 | 1 << (9 + 32), 0); - assert_eq!(one.sample::<$ty, _>(Open01), $EPSILON / 2.0 * 3.0); + assert_eq!(one.sample::<$ty, _>(Open01), $EPSILON / two * $ty::splat(3.0)); let mut max = StepRng::new(!0, 0); - assert_eq!(max.sample::<$ty, _>(Open01), 1.0 - $EPSILON / 2.0); + assert_eq!(max.sample::<$ty, _>(Open01), $ty::splat(1.0) - $EPSILON / two); } }; } @@ -222,29 +224,31 @@ mod tests { ($fnn:ident, $ty:ident, $ZERO:expr, $EPSILON:expr) => { #[test] fn $fnn() { + let two = $ty::splat(2.0); + // Standard let mut zeros = StepRng::new(0, 0); assert_eq!(zeros.gen::<$ty>(), $ZERO); let mut one = StepRng::new(1 << 11, 0); - assert_eq!(one.gen::<$ty>(), $EPSILON / 2.0); + assert_eq!(one.gen::<$ty>(), $EPSILON / two); let mut max = StepRng::new(!0, 0); - assert_eq!(max.gen::<$ty>(), 1.0 - $EPSILON / 2.0); + assert_eq!(max.gen::<$ty>(), $ty::splat(1.0) - $EPSILON / two); // OpenClosed01 let mut zeros = StepRng::new(0, 0); - assert_eq!(zeros.sample::<$ty, _>(OpenClosed01), 0.0 + $EPSILON / 2.0); + assert_eq!(zeros.sample::<$ty, _>(OpenClosed01), $ZERO + $EPSILON / two); let mut one = StepRng::new(1 << 11, 0); assert_eq!(one.sample::<$ty, _>(OpenClosed01), $EPSILON); let mut max = StepRng::new(!0, 0); - assert_eq!(max.sample::<$ty, _>(OpenClosed01), $ZERO + 1.0); + assert_eq!(max.sample::<$ty, _>(OpenClosed01), $ZERO + $ty::splat(1.0)); // Open01 let mut zeros = StepRng::new(0, 0); - assert_eq!(zeros.sample::<$ty, _>(Open01), 0.0 + $EPSILON / 2.0); + assert_eq!(zeros.sample::<$ty, _>(Open01), $ZERO + $EPSILON / two); let mut one = StepRng::new(1 << 12, 0); - assert_eq!(one.sample::<$ty, _>(Open01), $EPSILON / 2.0 * 3.0); + assert_eq!(one.sample::<$ty, _>(Open01), $EPSILON / two * $ty::splat(3.0)); let mut max = StepRng::new(!0, 0); - assert_eq!(max.sample::<$ty, _>(Open01), 1.0 - $EPSILON / 2.0); + assert_eq!(max.sample::<$ty, _>(Open01), $ty::splat(1.0) - $EPSILON / two); } }; } @@ -296,16 +300,16 @@ mod tests { // non-SIMD types; we assume this pattern continues across all // SIMD types. - test_samples(&Standard, f32x2::new(0.0, 0.0), &[ - f32x2::new(0.0035963655, 0.7346052), - f32x2::new(0.09778172, 0.20298547), - f32x2::new(0.34296435, 0.81664366), + test_samples(&Standard, f32x2::from([0.0, 0.0]), &[ + f32x2::from([0.0035963655, 0.7346052]), + f32x2::from([0.09778172, 0.20298547]), + f32x2::from([0.34296435, 0.81664366]), ]); - test_samples(&Standard, f64x2::new(0.0, 0.0), &[ - f64x2::new(0.7346051961657583, 0.20298547462974248), - f64x2::new(0.8166436635290655, 0.7423708925400552), - f64x2::new(0.16387782224016323, 0.9087068770169618), + test_samples(&Standard, f64x2::from([0.0, 0.0]), &[ + f64x2::from([0.7346051961657583, 0.20298547462974248]), + f64x2::from([0.8166436635290655, 0.7423708925400552]), + f64x2::from([0.16387782224016323, 0.9087068770169618]), ]); } } diff --git a/src/distributions/integer.rs b/src/distributions/integer.rs index 19ce71599cb..d905044ed1e 100644 --- a/src/distributions/integer.rs +++ b/src/distributions/integer.rs @@ -11,12 +11,17 @@ use crate::distributions::{Distribution, Standard}; use crate::Rng; #[cfg(all(target_arch = "x86", feature = "simd_support"))] +use core::arch::x86::__m512i; +#[cfg(target_arch = "x86")] use core::arch::x86::{__m128i, __m256i}; #[cfg(all(target_arch = "x86_64", feature = "simd_support"))] +use core::arch::x86_64::__m512i; +#[cfg(target_arch = "x86_64")] use core::arch::x86_64::{__m128i, __m256i}; use core::num::{NonZeroU16, NonZeroU32, NonZeroU64, NonZeroU8, NonZeroUsize, NonZeroU128}; -#[cfg(feature = "simd_support")] use packed_simd::*; +#[cfg(feature = "simd_support")] use core::simd::*; +use core::mem; impl Distribution for Standard { #[inline] @@ -109,53 +114,63 @@ impl_nzint!(NonZeroU64, NonZeroU64::new); impl_nzint!(NonZeroU128, NonZeroU128::new); impl_nzint!(NonZeroUsize, NonZeroUsize::new); -#[cfg(feature = "simd_support")] -macro_rules! simd_impl { - ($(($intrinsic:ident, $vec:ty),)+) => {$( +macro_rules! intrinsic_impl { + ($($intrinsic:ident),+) => {$( + /// Available only on x86/64 platforms impl Distribution<$intrinsic> for Standard { #[inline] fn sample(&self, rng: &mut R) -> $intrinsic { - $intrinsic::from_bits(rng.gen::<$vec>()) + // On proper hardware, this should compile to SIMD instructions + // Verified on x86 Haswell with __m128i, __m256i + let mut buf = [0_u8; mem::size_of::<$intrinsic>()]; + rng.fill_bytes(&mut buf); + // x86 is little endian so no need for conversion + // SAFETY: we know [u8; N] and $intrinsic have the same size + unsafe { mem::transmute_copy(&buf) } } } )+}; +} - ($bits:expr,) => {}; - ($bits:expr, $ty:ty, $($ty_more:ty,)*) => { - simd_impl!($bits, $($ty_more,)*); - +#[cfg(feature = "simd_support")] +macro_rules! simd_impl { + ($($ty:ty),+) => {$( + /// Requires nightly Rust and the [`simd_support`] feature + /// + /// [`simd_support`]: https://github.com/rust-random/rand#crate-features impl Distribution<$ty> for Standard { #[inline] fn sample(&self, rng: &mut R) -> $ty { - let mut vec: $ty = Default::default(); - unsafe { - let ptr = &mut vec; - let b_ptr = &mut *(ptr as *mut $ty as *mut [u8; $bits/8]); - rng.fill_bytes(b_ptr); + // TODO: impl this generically once const generics are robust enough + let mut vec: Simd() }> = Default::default(); + rng.fill_bytes(vec.as_mut_array()); + // NOTE: replace with `to_le` if added to core::simd + #[cfg(not(target_endian = "little"))] + { + vec = vec.reverse(); } - vec.to_le() + // SAFETY: we know u8xN and $ty have the same size + unsafe { mem::transmute_copy(&vec) } } } - }; + )+}; } #[cfg(feature = "simd_support")] -simd_impl!(16, u8x2, i8x2,); -#[cfg(feature = "simd_support")] -simd_impl!(32, u8x4, i8x4, u16x2, i16x2,); -#[cfg(feature = "simd_support")] -simd_impl!(64, u8x8, i8x8, u16x4, i16x4, u32x2, i32x2,); -#[cfg(feature = "simd_support")] -simd_impl!(128, u8x16, i8x16, u16x8, i16x8, u32x4, i32x4, u64x2, i64x2,); -#[cfg(feature = "simd_support")] -simd_impl!(256, u8x32, i8x32, u16x16, i16x16, u32x8, i32x8, u64x4, i64x4,); -#[cfg(feature = "simd_support")] -simd_impl!(512, u8x64, i8x64, u16x32, i16x32, u32x16, i32x16, u64x8, i64x8,); +simd_impl!( + i8x4, i8x8, i8x16, i8x32, i8x64, i16x2, i16x4, i16x8, i16x16, i16x32, i32x2, i32x4, i32x8, + i32x16, i64x2, i64x4, i64x8, isizex2, isizex4, isizex8, u8x4, u8x8, u8x16, u8x32, u8x64, u16x2, + u16x4, u16x8, u16x16, u16x32, u32x2, u32x4, u32x8, u32x16, u64x2, u64x4, u64x8, usizex2, + usizex4, usizex8 +); + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +intrinsic_impl!(__m128i, __m256i); #[cfg(all( - feature = "simd_support", - any(target_arch = "x86", target_arch = "x86_64") + any(target_arch = "x86", target_arch = "x86_64"), + feature = "simd_support" ))] -simd_impl!((__m128i, u8x16), (__m256i, u8x32),); +intrinsic_impl!(__m512i); #[cfg(test)] mod tests { @@ -221,24 +236,19 @@ mod tests { { // We only test a sub-set of types here and make assumptions about the rest. - test_samples(u8x2::default(), &[ - u8x2::new(9, 126), - u8x2::new(247, 167), - u8x2::new(111, 149), - ]); test_samples(u8x4::default(), &[ - u8x4::new(9, 126, 87, 132), - u8x4::new(247, 167, 123, 153), - u8x4::new(111, 149, 73, 120), + u8x4::from([9, 126, 87, 132]), + u8x4::from([247, 167, 123, 153]), + u8x4::from([111, 149, 73, 120]), ]); test_samples(u8x8::default(), &[ - u8x8::new(9, 126, 87, 132, 247, 167, 123, 153), - u8x8::new(111, 149, 73, 120, 68, 171, 98, 223), - u8x8::new(24, 121, 1, 50, 13, 46, 164, 20), + u8x8::from([9, 126, 87, 132, 247, 167, 123, 153]), + u8x8::from([111, 149, 73, 120, 68, 171, 98, 223]), + u8x8::from([24, 121, 1, 50, 13, 46, 164, 20]), ]); test_samples(i64x8::default(), &[ - i64x8::new( + i64x8::from([ -7387126082252079607, -2350127744969763473, 1487364411147516184, @@ -247,8 +257,8 @@ mod tests { 6022086574635100741, -5080089175222015595, -4066367846667249123, - ), - i64x8::new( + ]), + i64x8::from([ 9180885022207963908, 3095981199532211089, 6586075293021332726, @@ -257,8 +267,8 @@ mod tests { 5287129228749947252, 444726432079249540, -1587028029513790706, - ), - i64x8::new( + ]), + i64x8::from([ 6075236523189346388, 1351763722368165432, -6192309979959753740, @@ -267,7 +277,7 @@ mod tests { 7522501477800909500, -1837258847956201231, -586926753024886735, - ), + ]), ]); } } diff --git a/src/distributions/mod.rs b/src/distributions/mod.rs index 05ca80606b0..9b0a8867000 100644 --- a/src/distributions/mod.rs +++ b/src/distributions/mod.rs @@ -149,8 +149,14 @@ use crate::Rng; /// * `bool`: Generates `false` or `true`, each with probability 0.5. /// * Floating point types (`f32` and `f64`): Uniformly distributed in the /// half-open range `[0, 1)`. See notes below. -/// * Wrapping integers (`Wrapping`), besides the type identical to their +/// * Wrapping integers ([`Wrapping`]), besides the type identical to their /// normal integer variants. +/// * Non-zero integers ([`NonZeroU8`]), which are like their normal integer +/// variants but cannot produce zero. +/// * SIMD types like x86's [`__m128i`], `std::simd`'s [`u32x4`]/[`f32x4`]/ +/// [`mask32x4`] (requires [`simd_support`]), where each lane is distributed +/// like their scalar `Standard` variants. See the list of `Standard` +/// implementations for more. /// /// The `Standard` distribution also supports generation of the following /// compound types where all component types are supported: @@ -213,6 +219,13 @@ use crate::Rng; /// CPUs all methods have approximately equal performance). /// /// [`Uniform`]: uniform::Uniform +/// [`Wrapping`]: std::num::Wrapping +/// [`NonZeroU8`]: std::num::NonZeroU8 +/// [`__m128i`]: https://doc.rust-lang.org/nightly/core/arch/x86/struct.__m128i.html +/// [`u32x4`]: std::simd::u32x4 +/// [`f32x4`]: std::simd::f32x4 +/// [`mask32x4`]: std::simd::mask32x4 +/// [`simd_support`]: https://github.com/rust-random/rand#crate-features #[derive(Clone, Copy, Debug)] #[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] pub struct Standard; diff --git a/src/distributions/other.rs b/src/distributions/other.rs index 03802a76d5f..b3b5e4e0201 100644 --- a/src/distributions/other.rs +++ b/src/distributions/other.rs @@ -22,6 +22,8 @@ use crate::Rng; use serde::{Serialize, Deserialize}; #[cfg(feature = "min_const_gen")] use core::mem::{self, MaybeUninit}; +#[cfg(feature = "simd_support")] +use core::simd::{Mask, Simd, LaneCount, SupportedLaneCount, MaskElement, SimdElement}; // ----- Sampling distributions ----- @@ -145,6 +147,37 @@ impl Distribution for Standard { } } +/// Requires nightly Rust and the [`simd_support`] feature +/// +/// Note that on some hardware like x86/64 mask operations like [`_mm_blendv_epi8`] +/// only care about a single bit. This means that you could use uniform random bits +/// directly: +/// +/// ```ignore +/// // this may be faster... +/// let x = unsafe { _mm_blendv_epi8(a.into(), b.into(), rng.gen::<__m128i>()) }; +/// +/// // ...than this +/// let x = rng.gen::().select(b, a); +/// ``` +/// +/// Since most bits are unused you could also generate only as many bits as you need. +/// +/// [`_mm_blendv_epi8`]: https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8&ig_expand=514/ +/// [`simd_support`]: https://github.com/rust-random/rand#crate-features +#[cfg(feature = "simd_support")] +impl Distribution> for Standard +where + T: MaskElement + PartialOrd + SimdElement + Default, + LaneCount: SupportedLaneCount, + Standard: Distribution>, +{ + #[inline] + fn sample(&self, rng: &mut R) -> Mask { + rng.gen().lanes_lt(Simd::default()) + } +} + macro_rules! tuple_impl { // use variables to indicate the arity of the tuple ($($tyvar:ident),* ) => { diff --git a/src/distributions/uniform.rs b/src/distributions/uniform.rs index 261357b2456..d9197c68045 100644 --- a/src/distributions/uniform.rs +++ b/src/distributions/uniform.rs @@ -110,15 +110,17 @@ use core::time::Duration; use core::ops::{Range, RangeInclusive}; use crate::distributions::float::IntoFloat; -use crate::distributions::utils::{BoolAsSIMD, FloatAsSIMD, FloatSIMDUtils, WideningMultiply}; +use crate::distributions::utils::{BoolAsSIMD, FloatAsSIMD, FloatSIMDUtils, IntAsSIMD, WideningMultiply}; use crate::distributions::Distribution; +#[cfg(feature = "simd_support")] +use crate::distributions::Standard; use crate::{Rng, RngCore}; #[cfg(not(feature = "std"))] #[allow(unused_imports)] // rustc doesn't detect that this is actually used use crate::distributions::utils::Float; -#[cfg(feature = "simd_support")] use packed_simd::*; +#[cfg(feature = "simd_support")] use core::simd::*; #[cfg(feature = "serde1")] use serde::{Serialize, Deserialize}; @@ -571,21 +573,30 @@ uniform_int_impl! { u128, u128, u128 } #[cfg(feature = "simd_support")] macro_rules! uniform_simd_int_impl { - ($ty:ident, $unsigned:ident, $u_scalar:ident) => { + ($ty:ident, $unsigned:ident) => { // The "pick the largest zone that can fit in an `u32`" optimization // is less useful here. Multiple lanes complicate things, we don't // know the PRNG's minimal output size, and casting to a larger vector // is generally a bad idea for SIMD performance. The user can still // implement it manually. - - // TODO: look into `Uniform::::new(0u32, 100)` functionality - // perhaps `impl SampleUniform for $u_scalar`? - impl SampleUniform for $ty { - type Sampler = UniformInt<$ty>; + impl SampleUniform for Simd<$ty, LANES> + where + LaneCount: SupportedLaneCount, + Simd<$unsigned, LANES>: + WideningMultiply, Simd<$unsigned, LANES>)>, + Standard: Distribution>, + { + type Sampler = UniformInt>; } - impl UniformSampler for UniformInt<$ty> { - type X = $ty; + impl UniformSampler for UniformInt> + where + LaneCount: SupportedLaneCount, + Simd<$unsigned, LANES>: + WideningMultiply, Simd<$unsigned, LANES>)>, + Standard: Distribution>, + { + type X = Simd<$ty, LANES>; #[inline] // if the range is constant, this helps LLVM to do the // calculations at compile-time. @@ -595,8 +606,8 @@ macro_rules! uniform_simd_int_impl { { let low = *low_b.borrow(); let high = *high_b.borrow(); - assert!(low.lt(high).all(), "Uniform::new called with `low >= high`"); - UniformSampler::new_inclusive(low, high - 1) + assert!(low.lanes_lt(high).all(), "Uniform::new called with `low >= high`"); + UniformSampler::new_inclusive(low, high - Simd::splat(1)) } #[inline] // if the range is constant, this helps LLVM to do the @@ -607,20 +618,20 @@ macro_rules! uniform_simd_int_impl { { let low = *low_b.borrow(); let high = *high_b.borrow(); - assert!(low.le(high).all(), + assert!(low.lanes_le(high).all(), "Uniform::new_inclusive called with `low > high`"); - let unsigned_max = ::core::$u_scalar::MAX; + let unsigned_max = Simd::splat(::core::$unsigned::MAX); - // NOTE: these may need to be replaced with explicitly - // wrapping operations if `packed_simd` changes - let range: $unsigned = ((high - low) + 1).cast(); + // NOTE: all `Simd` operations are inherently wrapping, + // see https://doc.rust-lang.org/std/simd/struct.Simd.html + let range: Simd<$unsigned, LANES> = ((high - low) + Simd::splat(1)).cast(); // `% 0` will panic at runtime. - let not_full_range = range.gt($unsigned::splat(0)); + let not_full_range = range.lanes_gt(Simd::splat(0)); // replacing 0 with `unsigned_max` allows a faster `select` // with bitwise OR - let modulo = not_full_range.select(range, $unsigned::splat(unsigned_max)); + let modulo = not_full_range.select(range, unsigned_max); // wrapping addition - let ints_to_reject = (unsigned_max - range + 1) % modulo; + let ints_to_reject = (unsigned_max - range + Simd::splat(1)) % modulo; // When `range` is 0, `lo` of `v.wmul(range)` will always be // zero which means only one sample is needed. let zone = unsigned_max - ints_to_reject; @@ -634,8 +645,8 @@ macro_rules! uniform_simd_int_impl { } fn sample(&self, rng: &mut R) -> Self::X { - let range: $unsigned = self.range.cast(); - let zone: $unsigned = self.z.cast(); + let range: Simd<$unsigned, LANES> = self.range.cast(); + let zone: Simd<$unsigned, LANES> = self.z.cast(); // This might seem very slow, generating a whole new // SIMD vector for every sample rejection. For most uses @@ -646,19 +657,19 @@ macro_rules! uniform_simd_int_impl { // rejection. The replacement method does however add a little // overhead. Benchmarking or calculating probabilities might // reveal contexts where this replacement method is slower. - let mut v: $unsigned = rng.gen(); + let mut v: Simd<$unsigned, LANES> = rng.gen(); loop { let (hi, lo) = v.wmul(range); - let mask = lo.le(zone); + let mask = lo.lanes_le(zone); if mask.all() { - let hi: $ty = hi.cast(); + let hi: Simd<$ty, LANES> = hi.cast(); // wrapping addition let result = self.low + hi; // `select` here compiles to a blend operation // When `range.eq(0).none()` the compare and blend // operations are avoided. - let v: $ty = v.cast(); - return range.gt($unsigned::splat(0)).select(result, v); + let v: Simd<$ty, LANES> = v.cast(); + return range.lanes_gt(Simd::splat(0)).select(result, v); } // Replace only the failing lanes v = mask.select(v, rng.gen()); @@ -668,51 +679,16 @@ macro_rules! uniform_simd_int_impl { }; // bulk implementation - ($(($unsigned:ident, $signed:ident),)+ $u_scalar:ident) => { + ($(($unsigned:ident, $signed:ident)),+) => { $( - uniform_simd_int_impl!($unsigned, $unsigned, $u_scalar); - uniform_simd_int_impl!($signed, $unsigned, $u_scalar); + uniform_simd_int_impl!($unsigned, $unsigned); + uniform_simd_int_impl!($signed, $unsigned); )+ }; } #[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u64x2, i64x2), - (u64x4, i64x4), - (u64x8, i64x8), - u64 -} - -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u32x2, i32x2), - (u32x4, i32x4), - (u32x8, i32x8), - (u32x16, i32x16), - u32 -} - -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u16x2, i16x2), - (u16x4, i16x4), - (u16x8, i16x8), - (u16x16, i16x16), - (u16x32, i16x32), - u16 -} - -#[cfg(feature = "simd_support")] -uniform_simd_int_impl! { - (u8x2, i8x2), - (u8x4, i8x4), - (u8x8, i8x8), - (u8x16, i8x16), - (u8x32, i8x32), - (u8x64, i8x64), - u8 -} +uniform_simd_int_impl! { (u8, i8), (u16, i16), (u32, i32), (u64, i64) } impl SampleUniform for char { type Sampler = UniformChar; @@ -847,7 +823,7 @@ macro_rules! uniform_float_impl { loop { let mask = (scale * max_rand + low).ge_mask(high); - if mask.none() { + if !mask.any() { break; } scale = scale.decrease_masked(mask); @@ -886,7 +862,7 @@ macro_rules! uniform_float_impl { loop { let mask = (scale * max_rand + low).gt_mask(high); - if mask.none() { + if !mask.any() { break; } scale = scale.decrease_masked(mask); @@ -899,11 +875,11 @@ macro_rules! uniform_float_impl { fn sample(&self, rng: &mut R) -> Self::X { // Generate a value in the range [1, 2) - let value1_2 = (rng.gen::<$uty>() >> $bits_to_discard).into_float_with_exponent(0); + let value1_2 = (rng.gen::<$uty>() >> $uty::splat($bits_to_discard)).into_float_with_exponent(0); // Get a value in the range [0, 1) in order to avoid // overflowing into infinity when multiplying with scale - let value0_1 = value1_2 - 1.0; + let value0_1 = value1_2 - <$ty>::splat(1.0); // We don't use `f64::mul_add`, because it is not available with // `no_std`. Furthermore, it is slower for some targets (but @@ -939,11 +915,11 @@ macro_rules! uniform_float_impl { loop { // Generate a value in the range [1, 2) let value1_2 = - (rng.gen::<$uty>() >> $bits_to_discard).into_float_with_exponent(0); + (rng.gen::<$uty>() >> $uty::splat($bits_to_discard)).into_float_with_exponent(0); // Get a value in the range [0, 1) in order to avoid // overflowing into infinity when multiplying with scale - let value0_1 = value1_2 - 1.0; + let value0_1 = value1_2 - <$ty>::splat(1.0); // Doing multiply before addition allows some architectures // to use a single instruction. @@ -1184,7 +1160,7 @@ mod tests { _ => panic!("`UniformDurationMode` was not serialized/deserialized correctly") } } - + #[test] #[cfg(feature = "serde1")] fn test_uniform_serialization() { @@ -1289,8 +1265,8 @@ mod tests { ($ty::splat(10), $ty::splat(127)), ($ty::splat($scalar::MIN), $ty::splat($scalar::MAX)), ], - |x: $ty, y| x.le(y).all(), - |x: $ty, y| x.lt(y).all() + |x: $ty, y| x.lanes_le(y).all(), + |x: $ty, y| x.lanes_lt(y).all() );)* }}; } @@ -1298,8 +1274,8 @@ mod tests { #[cfg(feature = "simd_support")] { - t!(u8x2, u8x4, u8x8, u8x16, u8x32, u8x64 => u8); - t!(i8x2, i8x4, i8x8, i8x16, i8x32, i8x64 => i8); + t!(u8x4, u8x8, u8x16, u8x32, u8x64 => u8); + t!(i8x4, i8x8, i8x16, i8x32, i8x64 => i8); t!(u16x2, u16x4, u16x8, u16x16, u16x32 => u16); t!(i16x2, i16x4, i16x8, i16x16, i16x32 => i16); t!(u32x2, u32x4, u32x8, u32x16 => u32); @@ -1351,7 +1327,7 @@ mod tests { (-::core::$f_scalar::MAX * 0.2, ::core::$f_scalar::MAX * 0.7), ]; for &(low_scalar, high_scalar) in v.iter() { - for lane in 0..<$ty>::lanes() { + for lane in 0..<$ty>::LANES { let low = <$ty>::splat(0.0 as $f_scalar).replace(lane, low_scalar); let high = <$ty>::splat(1.0 as $f_scalar).replace(lane, high_scalar); let my_uniform = Uniform::new(low, high); @@ -1474,7 +1450,7 @@ mod tests { (::std::$f_scalar::NEG_INFINITY, ::std::$f_scalar::INFINITY), ]; for &(low_scalar, high_scalar) in v.iter() { - for lane in 0..<$ty>::lanes() { + for lane in 0..<$ty>::LANES { let low = <$ty>::splat(0.0 as $f_scalar).replace(lane, low_scalar); let high = <$ty>::splat(1.0 as $f_scalar).replace(lane, high_scalar); assert!(catch_unwind(|| range(low, high)).is_err()); diff --git a/src/distributions/utils.rs b/src/distributions/utils.rs index 89da5fd7aad..689a4a1d8ea 100644 --- a/src/distributions/utils.rs +++ b/src/distributions/utils.rs @@ -8,7 +8,7 @@ //! Math helper functions -#[cfg(feature = "simd_support")] use packed_simd::*; +#[cfg(feature = "simd_support")] use core::simd::*; pub(crate) trait WideningMultiply { @@ -45,7 +45,7 @@ macro_rules! wmul_impl { let y: $wide = self.cast(); let x: $wide = x.cast(); let tmp = y * x; - let hi: $ty = (tmp >> $shift).cast(); + let hi: $ty = (tmp >> Simd::splat($shift)).cast(); let lo: $ty = tmp.cast(); (hi, lo) } @@ -99,19 +99,20 @@ macro_rules! wmul_impl_large { #[inline(always)] fn wmul(self, b: $ty) -> Self::Output { // needs wrapping multiplication - const LOWER_MASK: $scalar = !0 >> $half; + const LOWER_MASK: $ty = <$ty>::splat(!0 >> $half); + const HALF: $ty = <$ty>::splat($half); let mut low = (self & LOWER_MASK) * (b & LOWER_MASK); - let mut t = low >> $half; + let mut t = low >> HALF; low &= LOWER_MASK; - t += (self >> $half) * (b & LOWER_MASK); - low += (t & LOWER_MASK) << $half; - let mut high = t >> $half; - t = low >> $half; + t += (self >> HALF) * (b & LOWER_MASK); + low += (t & LOWER_MASK) << HALF; + let mut high = t >> HALF; + t = low >> HALF; low &= LOWER_MASK; - t += (b >> $half) * (self & LOWER_MASK); - low += (t & LOWER_MASK) << $half; - high += t >> $half; - high += (self >> $half) * (b >> $half); + t += (b >> HALF) * (self & LOWER_MASK); + low += (t & LOWER_MASK) << HALF; + high += t >> HALF; + high += (self >> HALF) * (b >> HALF); (high, low) } @@ -148,7 +149,6 @@ mod simd_wmul { #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; wmul_impl! { - (u8x2, u16x2), (u8x4, u16x4), (u8x8, u16x8), (u8x16, u16x16), @@ -167,16 +167,14 @@ mod simd_wmul { // means `wmul` can be implemented with only two instructions. #[allow(unused_macros)] macro_rules! wmul_impl_16 { - ($ty:ident, $intrinsic:ident, $mulhi:ident, $mullo:ident) => { + ($ty:ident, $mulhi:ident, $mullo:ident) => { impl WideningMultiply for $ty { type Output = ($ty, $ty); #[inline(always)] fn wmul(self, x: $ty) -> Self::Output { - let b = $intrinsic::from_bits(x); - let a = $intrinsic::from_bits(self); - let hi = $ty::from_bits(unsafe { $mulhi(a, b) }); - let lo = $ty::from_bits(unsafe { $mullo(a, b) }); + let hi = unsafe { $mulhi(self.into(), x.into()) }.into(); + let lo = unsafe { $mullo(self.into(), x.into()) }.into(); (hi, lo) } } @@ -184,11 +182,11 @@ mod simd_wmul { } #[cfg(target_feature = "sse2")] - wmul_impl_16! { u16x8, __m128i, _mm_mulhi_epu16, _mm_mullo_epi16 } + wmul_impl_16! { u16x8, _mm_mulhi_epu16, _mm_mullo_epi16 } #[cfg(target_feature = "avx2")] - wmul_impl_16! { u16x16, __m256i, _mm256_mulhi_epu16, _mm256_mullo_epi16 } - // FIXME: there are no `__m512i` types in stdsimd yet, so `wmul::` - // cannot use the same implementation. + wmul_impl_16! { u16x16, _mm256_mulhi_epu16, _mm256_mullo_epi16 } + #[cfg(target_feature = "avx512bw")] + wmul_impl_16! { u16x32, _mm512_mulhi_epu16, _mm512_mullo_epi16 } wmul_impl! { (u32x2, u64x2), @@ -199,6 +197,7 @@ mod simd_wmul { // TODO: optimize, this seems to seriously slow things down wmul_impl_large! { (u8x64,) u8, 4 } + #[cfg(not(target_feature = "avx512bw"))] wmul_impl_large! { (u16x32,) u16, 8 } wmul_impl_large! { (u32x16,) u32, 16 } wmul_impl_large! { (u64x2, u64x4, u64x8,) u64, 32 } @@ -229,6 +228,10 @@ pub(crate) trait FloatSIMDUtils { // value, not by retaining the binary representation. type UInt; fn cast_from_int(i: Self::UInt) -> Self; + + type Scalar; + fn replace(self, index: usize, new_value: Self::Scalar) -> Self; + fn extract(self, index: usize) -> Self::Scalar; } /// Implement functions available in std builds but missing from core primitives @@ -243,26 +246,23 @@ pub(crate) trait Float: Sized { /// Implement functions on f32/f64 to give them APIs similar to SIMD types pub(crate) trait FloatAsSIMD: Sized { - #[inline(always)] - fn lanes() -> usize { - 1 - } + const LANES: usize = 1; #[inline(always)] fn splat(scalar: Self) -> Self { scalar } +} + +pub(crate) trait IntAsSIMD: Sized { #[inline(always)] - fn extract(self, index: usize) -> Self { - debug_assert_eq!(index, 0); - self - } - #[inline(always)] - fn replace(self, index: usize, new_value: Self) -> Self { - debug_assert_eq!(index, 0); - new_value + fn splat(scalar: Self) -> Self { + scalar } } +impl IntAsSIMD for u32 {} +impl IntAsSIMD for u64 {} + pub(crate) trait BoolAsSIMD: Sized { fn any(self) -> bool; fn all(self) -> bool; @@ -308,6 +308,7 @@ macro_rules! scalar_float_impl { impl FloatSIMDUtils for $ty { type Mask = bool; + type Scalar = $ty; type UInt = $uty; #[inline(always)] @@ -350,6 +351,18 @@ macro_rules! scalar_float_impl { fn cast_from_int(i: Self::UInt) -> Self { i as $ty } + + #[inline] + fn replace(self, index: usize, new_value: Self::Scalar) -> Self { + debug_assert_eq!(index, 0); + new_value + } + + #[inline] + fn extract(self, index: usize) -> Self::Scalar { + debug_assert_eq!(index, 0); + self + } } impl FloatAsSIMD for $ty {} @@ -362,42 +375,42 @@ scalar_float_impl!(f64, u64); #[cfg(feature = "simd_support")] macro_rules! simd_impl { - ($ty:ident, $f_scalar:ident, $mty:ident, $uty:ident) => { - impl FloatSIMDUtils for $ty { - type Mask = $mty; - type UInt = $uty; + ($fty:ident, $uty:ident) => { + impl FloatSIMDUtils for Simd<$fty, LANES> + where LaneCount: SupportedLaneCount + { + type Mask = Mask<<$fty as SimdElement>::Mask, LANES>; + type Scalar = $fty; + type UInt = Simd<$uty, LANES>; #[inline(always)] fn all_lt(self, other: Self) -> bool { - self.lt(other).all() + self.lanes_lt(other).all() } #[inline(always)] fn all_le(self, other: Self) -> bool { - self.le(other).all() + self.lanes_le(other).all() } #[inline(always)] fn all_finite(self) -> bool { - self.finite_mask().all() + self.is_finite().all() } #[inline(always)] fn finite_mask(self) -> Self::Mask { - // This can possibly be done faster by checking bit patterns - let neg_inf = $ty::splat(::core::$f_scalar::NEG_INFINITY); - let pos_inf = $ty::splat(::core::$f_scalar::INFINITY); - self.gt(neg_inf) & self.lt(pos_inf) + self.is_finite() } #[inline(always)] fn gt_mask(self, other: Self) -> Self::Mask { - self.gt(other) + self.lanes_gt(other) } #[inline(always)] fn ge_mask(self, other: Self) -> Self::Mask { - self.ge(other) + self.lanes_ge(other) } #[inline(always)] @@ -406,24 +419,32 @@ macro_rules! simd_impl { // true, and 0 for false. Adding that to the binary // representation of a float means subtracting one from // the binary representation, resulting in the next lower - // value representable by $ty. This works even when the + // value representable by $fty. This works even when the // current value is infinity. debug_assert!(mask.any(), "At least one lane must be set"); - <$ty>::from_bits(<$uty>::from_bits(self) + <$uty>::from_bits(mask)) + Self::from_bits(self.to_bits() + mask.to_int().cast()) } #[inline] fn cast_from_int(i: Self::UInt) -> Self { i.cast() } + + #[inline] + fn replace(mut self, index: usize, new_value: Self::Scalar) -> Self { + self.as_mut_array()[index] = new_value; + self + } + + #[inline] + fn extract(self, index: usize) -> Self::Scalar { + self.as_array()[index] + } } }; } -#[cfg(feature="simd_support")] simd_impl! { f32x2, f32, m32x2, u32x2 } -#[cfg(feature="simd_support")] simd_impl! { f32x4, f32, m32x4, u32x4 } -#[cfg(feature="simd_support")] simd_impl! { f32x8, f32, m32x8, u32x8 } -#[cfg(feature="simd_support")] simd_impl! { f32x16, f32, m32x16, u32x16 } -#[cfg(feature="simd_support")] simd_impl! { f64x2, f64, m64x2, u64x2 } -#[cfg(feature="simd_support")] simd_impl! { f64x4, f64, m64x4, u64x4 } -#[cfg(feature="simd_support")] simd_impl! { f64x8, f64, m64x8, u64x8 } +#[cfg(feature = "simd_support")] +simd_impl!(f32, u32); +#[cfg(feature = "simd_support")] +simd_impl!(f64, u64); diff --git a/src/lib.rs b/src/lib.rs index 6d847180111..ef5c8a5a56c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -49,7 +49,7 @@ #![deny(missing_debug_implementations)] #![doc(test(attr(allow(unused_variables), deny(warnings))))] #![no_std] -#![cfg_attr(feature = "simd_support", feature(stdsimd))] +#![cfg_attr(feature = "simd_support", feature(stdsimd, portable_simd))] #![cfg_attr(doc_cfg, feature(doc_cfg))] #![allow( clippy::float_cmp,