Commit 50aced8

pythoneer authored and alexcrichton committed

* added missing doc _mm_cvtps_pd
* added missing doc & test _mm_load_pd
* added missing doc & test _mm_store_pd
* added _mm_store1_pd
* added _mm_store_pd1
* added _mm_storer_pd
* added _mm_load_pd1
* added _mm_loadr_pd
* added _mm_loadu_pd
* correct alignments

1 parent 59de334 commit 50aced8

File tree

1 file changed (+201, −1 lines)

src/x86/sse2.rs

Lines changed: 201 additions & 1 deletion
@@ -1726,6 +1726,9 @@ pub unsafe fn _mm_cvtpd_ps(a: f64x2) -> f32x4 {
     cvtpd2ps(a)
 }
 
+
+/// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
+/// double-precision (64-bit) floating-point elements.
 #[inline(always)]
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(cvtps2pd))]
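As a usage note for the newly documented intrinsic: `_mm_cvtps_pd` widens only the lower two lanes of its `f32x4` argument; the upper two are ignored. A minimal test-style sketch, assuming the crate's `f32x4`/`f64x2` types and the `sse2` module are in scope as in the test module further down (`_mm_cvtps_pd_example` is a hypothetical name, not part of this commit):

#[simd_test = "sse2"]
unsafe fn _mm_cvtps_pd_example() {
    let a = f32x4::new(1.5, 2.5, 3.5, 4.5);
    let r = sse2::_mm_cvtps_pd(a);
    // Only the low two lanes are widened to f64; 3.5 and 4.5 are dropped.
    assert_eq!(r, f64x2::new(1.5, 2.5));
}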
@@ -1873,21 +1876,55 @@ pub unsafe fn _mm_movemask_pd(a: f64x2) -> i32 {
 
 
 
-
+/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
+/// from memory into the returned vector. `mem_addr` must be aligned on a 16-byte boundary or
+/// a general-protection exception may be generated.
 #[inline(always)]
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(movaps))]
 pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> f64x2 {
     *(mem_addr as *const f64x2)
 }
 
+/// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a`
+/// into memory. `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception
+/// may be generated.
 #[inline(always)]
 #[target_feature = "+sse2"]
 #[cfg_attr(test, assert_instr(movaps))]
 pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: f64x2) {
     *(mem_addr as *mut f64x2) = a;
 }
 
+/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous
+/// elements in memory. `mem_addr` must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+#[inline(always)]
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: f64x2) {
+    let b: f64x2 = simd_shuffle2(a, a, [0, 0]);
+    *(mem_addr as *mut f64x2) = b;
+}
+
+/// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous
+/// elements in memory. `mem_addr` must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+#[inline(always)]
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: f64x2) {
+    let b: f64x2 = simd_shuffle2(a, a, [0, 0]);
+    *(mem_addr as *mut f64x2) = b;
+}
+
+/// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order.
+/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
+#[inline(always)]
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: f64x2) {
+    let b: f64x2 = simd_shuffle2(a, a, [1, 0]);
+    *(mem_addr as *mut f64x2) = b;
+}
+
 /// Load a double-precision (64-bit) floating-point element from memory
 /// into both elements of returned vector.
 #[inline(always)]
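To make the three store flavors concrete: `_mm_store_pd` writes both lanes as-is, `_mm_store1_pd`/`_mm_store_pd1` duplicate the low lane, and `_mm_storer_pd` swaps the lanes. A sketch using `#[repr(C, align(16))]` for the required alignment (an assumption for illustration; the in-tree tests below instead align a raw pointer by hand):

#[repr(C, align(16))]
struct Aligned([f64; 2]); // 16-byte-aligned backing store for the sketch

unsafe {
    let a = f64x2::new(1.0, 2.0);
    let mut buf = Aligned([0.0; 2]);
    sse2::_mm_store_pd(buf.0.as_mut_ptr(), a);  // buf.0 == [1.0, 2.0]
    sse2::_mm_store1_pd(buf.0.as_mut_ptr(), a); // buf.0 == [1.0, 1.0]
    sse2::_mm_storer_pd(buf.0.as_mut_ptr(), a); // buf.0 == [2.0, 1.0]
}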
@@ -1897,6 +1934,41 @@ pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> f64x2 {
     f64x2::new(d, d)
 }
 
+/// Load a double-precision (64-bit) floating-point element from memory
+/// into both elements of returned vector.
+#[inline(always)]
+#[target_feature = "+sse2"]
+pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> f64x2 {
+    let d = *mem_addr;
+    f64x2::new(d, d)
+}
+
+/// Load 2 double-precision (64-bit) floating-point elements from memory into the returned vector
+/// in reverse order. `mem_addr` must be aligned on a 16-byte boundary or a general-protection
+/// exception may be generated.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(movapd))]
+pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> f64x2 {
+    let a = _mm_load_pd(mem_addr);
+    simd_shuffle2(a, a, [1, 0])
+}
+
+/// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
+/// from memory into the returned vector. `mem_addr` does not need to be aligned on any particular
+/// boundary.
+#[inline(always)]
+#[target_feature = "+sse2"]
+#[cfg_attr(test, assert_instr(movups))]
+pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> f64x2 {
+    let mut dst = f64x2::splat(mem::uninitialized());
+    ptr::copy_nonoverlapping(
+        mem_addr as *const u8,
+        &mut dst as *mut f64x2 as *mut u8,
+        mem::size_of::<f64x2>());
+    dst
+}
+
 /// Return vector of type __m128d with undefined elements.
 #[inline(always)]
 #[target_feature = "+sse2"]
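And the load counterparts: `_mm_loadu_pd` places no alignment requirement on its pointer (the implementation goes through a byte-wise `copy_nonoverlapping`), while `_mm_loadr_pd` performs an aligned load and then swaps the lanes. A small sketch of the unaligned case:

unsafe {
    let vals = [1.0f64, 2.0, 3.0];
    // vals.as_ptr().offset(1) is only guaranteed 8-byte aligned,
    // which is fine for the unaligned load.
    let r = sse2::_mm_loadu_pd(vals.as_ptr().offset(1));
    assert_eq!(r, f64x2::new(2.0, 3.0));
}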
@@ -2068,6 +2140,7 @@ extern {
 mod tests {
     use std::os::raw::c_void;
     use stdsimd_test::simd_test;
+    use test::black_box; // Used to inhibit constant-folding.
 
     use v128::*;
     use x86::{__m128i, sse2};
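On the new `black_box` import: it behaves as an identity function the optimizer cannot see through, so values routed through it are treated as runtime-unknown. The store tests below use it to keep the compiler from constant-folding the store-then-assert sequence, along these lines:

let a = f64x2::new(1.0, 2.0);
let mut out = [0.0f64; 2]; // assume a 16-byte-aligned destination for this sketch
// `*black_box(&a)` has the same value as `a`, but the optimizer must treat
// it as unknown, so the store instruction under test is actually emitted.
unsafe { sse2::_mm_store_pd(out.as_mut_ptr(), *black_box(&a)) };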
@@ -3587,6 +3660,126 @@ mod tests {
         assert_eq!(r, 0b11);
     }
 
+    #[simd_test = "sse2"]
+    unsafe fn _mm_load_pd() {
+        let vals = &[1.0f64, 2.0, 3.0, 4.0];
+        let mut d = vals.as_ptr();
+
+        // Align d to a 16-byte boundary.
+        let mut offset = 0;
+        while (d as usize) & 0xf != 0 {
+            d = d.offset(1);
+            offset += 1;
+        }
+
+        let r = sse2::_mm_load_pd(d);
+        assert_eq!(r, f64x2::new(1.0, 2.0) + f64x2::splat(offset as f64));
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_store_pd() {
+        let mut vals = [0.0f64; 4];
+        let a = f64x2::new(1.0, 2.0);
+        let mut d = vals.as_mut_ptr();
+
+        // Align d to a 16-byte boundary.
+        let mut offset = 0;
+        while (d as usize) & 0xf != 0 {
+            d = d.offset(1);
+            offset += 1;
+        }
+
+        sse2::_mm_store_pd(d, *black_box(&a));
+        assert_eq!(vals[offset + 0], 1.0);
+        assert_eq!(vals[offset + 1], 2.0);
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_store1_pd() {
+        let mut vals = [0.0f64; 4];
+        let a = f64x2::new(1.0, 2.0);
+        let mut d = vals.as_mut_ptr();
+
+        // Align d to a 16-byte boundary.
+        let mut offset = 0;
+        while (d as usize) & 0xf != 0 {
+            d = d.offset(1);
+            offset += 1;
+        }
+
+        sse2::_mm_store1_pd(d, *black_box(&a));
+        assert_eq!(vals[offset + 0], 1.0);
+        assert_eq!(vals[offset + 1], 1.0);
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_store_pd1() {
+        let mut vals = [0.0f64; 4];
+        let a = f64x2::new(1.0, 2.0);
+        let mut d = vals.as_mut_ptr();
+
+        // Align d to a 16-byte boundary.
+        let mut offset = 0;
+        while (d as usize) & 0xf != 0 {
+            d = d.offset(1);
+            offset += 1;
+        }
+
+        sse2::_mm_store_pd1(d, *black_box(&a));
+        assert_eq!(vals[offset + 0], 1.0);
+        assert_eq!(vals[offset + 1], 1.0);
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_storer_pd() {
+        let mut vals = [0.0f64; 4];
+        let a = f64x2::new(1.0, 2.0);
+        let mut d = vals.as_mut_ptr();
+
+        // Align d to a 16-byte boundary.
+        let mut offset = 0;
+        while (d as usize) & 0xf != 0 {
+            d = d.offset(1);
+            offset += 1;
+        }
+
+        sse2::_mm_storer_pd(d, *black_box(&a));
+        assert_eq!(vals[offset + 0], 2.0);
+        assert_eq!(vals[offset + 1], 1.0);
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_loadr_pd() {
+        let vals = &[1.0f64, 2.0, 3.0, 4.0];
+        let mut d = vals.as_ptr();
+
+        // Align d to a 16-byte boundary.
+        let mut offset = 0;
+        while (d as usize) & 0xf != 0 {
+            d = d.offset(1);
+            offset += 1;
+        }
+
+        let r = sse2::_mm_loadr_pd(d);
+        assert_eq!(r, f64x2::new(2.0, 1.0) + f64x2::splat(offset as f64));
+    }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_loadu_pd() {
+        let vals = &[1.0f64, 2.0, 3.0, 4.0];
+        let mut d = vals.as_ptr();
+
+        // Make sure d is *not* aligned to a 16-byte boundary.
+        let mut offset = 0;
+        if (d as usize) & 0xf == 0 {
+            offset = 1;
+            d = d.offset(offset as isize);
+        }
+
+        let r = sse2::_mm_loadu_pd(d);
+        assert_eq!(r, f64x2::new(1.0, 2.0) + f64x2::splat(offset as f64));
+    }
+
     #[simd_test = "sse2"]
     unsafe fn _mm_cvtpd_ps() {
         use std::{f64,f32};
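One detail worth spelling out in the tests above: because a pointer into an `[f64]` array is always 8-byte aligned, the alignment loop advances at most one element, so `offset` is either 0 or 1, and the `+ f64x2::splat(offset as f64)` term compensates for the fact that starting one element in reads `[2.0, 3.0]` instead of `[1.0, 2.0]`. A compact equivalent of the loop, as it would appear inside one of the unsafe test fns (a sketch, not part of the commit):

// An f64 pointer is either already on a 16-byte boundary or exactly one
// 8-byte element short of one, so a single branch replaces the while loop.
let offset = if (d as usize) & 0xf != 0 { 1 } else { 0 };
let d = d.offset(offset as isize); // offset is 0 or 1, never more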
@@ -3795,4 +3988,11 @@ mod tests {
         let r = sse2::_mm_load1_pd(&d);
         assert_eq!(r, f64x2::new(d, d));
     }
+
+    #[simd_test = "sse2"]
+    unsafe fn _mm_load_pd1() {
+        let d = -5.0;
+        let r = sse2::_mm_load_pd1(&d);
+        assert_eq!(r, f64x2::new(d, d));
+    }
 }
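Finally, the `_mm_load_pd1` added here is just the classic alternate spelling of the existing `_mm_load1_pd`: the two bodies in this commit are identical, so the following sketch holds for any readable `d`:

unsafe {
    let d = -5.0f64;
    // Both intrinsics splat the single element into both lanes.
    assert_eq!(sse2::_mm_load_pd1(&d), sse2::_mm_load1_pd(&d));
}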
