@@ -1726,6 +1726,9 @@ pub unsafe fn _mm_cvtpd_ps(a: f64x2) -> f32x4 {
17261726 cvtpd2ps ( a)
17271727}
17281728
1729+
1730+ /// Convert packed single-precision (32-bit) floating-point elements in `a` to packed
1731+ /// double-precision (64-bit) floating-point elements.
17291732#[ inline( always) ]
17301733#[ target_feature = "+sse2" ]
17311734#[ cfg_attr( test, assert_instr( cvtps2pd) ) ]
@@ -1873,21 +1876,55 @@ pub unsafe fn _mm_movemask_pd(a: f64x2) -> i32 {
18731876
18741877
18751878
1876-
1879+ /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
1880+ /// from memory into the returned vector. mem_addr must be aligned on a 16-byte boundary or
1881+ /// a general-protection exception may be generated.
18771882#[ inline( always) ]
18781883#[ target_feature = "+sse2" ]
18791884#[ cfg_attr( test, assert_instr( movaps) ) ]
18801885pub unsafe fn _mm_load_pd ( mem_addr : * const f64 ) -> f64x2 {
18811886 * ( mem_addr as * const f64x2 )
18821887}
18831888
1889+ /// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements) from `a`
1890+ /// into memory. mem_addr must be aligned on a 16-byte boundary or a general-protection exception
1891+ /// may be generated.
18841892#[ inline( always) ]
18851893#[ target_feature = "+sse2" ]
18861894#[ cfg_attr( test, assert_instr( movaps) ) ]
18871895pub unsafe fn _mm_store_pd ( mem_addr : * mut f64 , a : f64x2 ) {
18881896 * ( mem_addr as * mut f64x2 ) = a;
18891897}
18901898
1899+ /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous
1900+ /// elements in memory. `mem_addr` must be aligned on a 16-byte boundary or a general-protection
1901+ /// exception may be generated.
1902+ #[ inline( always) ]
1903+ #[ target_feature = "+sse2" ]
1904+ pub unsafe fn _mm_store1_pd ( mem_addr : * mut f64 , a : f64x2 ) {
1905+ let b: f64x2 = simd_shuffle2 ( a, a, [ 0 , 0 ] ) ;
1906+ * ( mem_addr as * mut f64x2 ) = b;
1907+ }
1908+
1909+ /// Store the lower double-precision (64-bit) floating-point element from `a` into 2 contiguous
1910+ /// elements in memory. `mem_addr` must be aligned on a 16-byte boundary or a general-protection
1911+ /// exception may be generated.
1912+ #[ inline( always) ]
1913+ #[ target_feature = "+sse2" ]
1914+ pub unsafe fn _mm_store_pd1 ( mem_addr : * mut f64 , a : f64x2 ) {
1915+ let b: f64x2 = simd_shuffle2 ( a, a, [ 0 , 0 ] ) ;
1916+ * ( mem_addr as * mut f64x2 ) = b;
1917+ }
1918+
1919+ /// Store 2 double-precision (64-bit) floating-point elements from `a` into memory in reverse order.
1920+ /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection exception may be generated.
1921+ #[ inline( always) ]
1922+ #[ target_feature = "+sse2" ]
1923+ pub unsafe fn _mm_storer_pd ( mem_addr : * mut f64 , a : f64x2 ) {
1924+ let b: f64x2 = simd_shuffle2 ( a, a, [ 1 , 0 ] ) ;
1925+ * ( mem_addr as * mut f64x2 ) = b;
1926+ }
1927+
18911928/// Load a double-precision (64-bit) floating-point element from memory
18921929/// into both elements of returned vector.
18931930#[ inline( always) ]
@@ -1897,6 +1934,41 @@ pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> f64x2 {
18971934 f64x2:: new ( d, d)
18981935}
18991936
1937+ /// Load a double-precision (64-bit) floating-point element from memory
1938+ /// into both elements of returned vector.
1939+ #[ inline( always) ]
1940+ #[ target_feature = "+sse2" ]
1941+ pub unsafe fn _mm_load_pd1 ( mem_addr : * const f64 ) -> f64x2 {
1942+ let d = * mem_addr;
1943+ f64x2:: new ( d, d)
1944+ }
1945+
1946+ /// Load 2 double-precision (64-bit) floating-point elements from memory into the returned vector
1947+ /// in reverse order. mem_addr must be aligned on a 16-byte boundary or a general-protection
1948+ /// exception may be generated.
1949+ #[ inline( always) ]
1950+ #[ target_feature = "+sse2" ]
1951+ #[ cfg_attr( test, assert_instr( movapd) ) ]
1952+ pub unsafe fn _mm_loadr_pd ( mem_addr : * const f64 ) -> f64x2 {
1953+ let a = _mm_load_pd ( mem_addr) ;
1954+ simd_shuffle2 ( a, a, [ 1 , 0 ] )
1955+ }
1956+
1957+ /// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point elements)
1958+ /// from memory into the returned vector. mem_addr does not need to be aligned on any particular
1959+ /// oundary.
1960+ #[ inline( always) ]
1961+ #[ target_feature = "+sse2" ]
1962+ #[ cfg_attr( test, assert_instr( movups) ) ]
1963+ pub unsafe fn _mm_loadu_pd ( mem_addr : * const f64 ) -> f64x2 {
1964+ let mut dst = f64x2:: splat ( mem:: uninitialized ( ) ) ;
1965+ ptr:: copy_nonoverlapping (
1966+ mem_addr as * const u8 ,
1967+ & mut dst as * mut f64x2 as * mut u8 ,
1968+ mem:: size_of :: < f64x2 > ( ) ) ;
1969+ dst
1970+ }
1971+
19001972/// Return vector of type __m128d with undefined elements.
19011973#[ inline( always) ]
19021974#[ target_feature = "+sse2" ]
@@ -2068,6 +2140,7 @@ extern {
20682140mod tests {
20692141 use std:: os:: raw:: c_void;
20702142 use stdsimd_test:: simd_test;
2143+ use test:: black_box; // Used to inhibit constant-folding.
20712144
20722145 use v128:: * ;
20732146 use x86:: { __m128i, sse2} ;
@@ -3587,6 +3660,126 @@ mod tests {
35873660 assert_eq ! ( r, 0b11 ) ;
35883661 }
35893662
3663+ #[ simd_test = "sse2" ]
3664+ unsafe fn _mm_load_pd ( ) {
3665+ let vals = & [ 1.0f64 , 2.0 , 3.0 , 4.0 ] ;
3666+ let mut d = vals. as_ptr ( ) ;
3667+
3668+ // Align d to 16-byte boundary
3669+ let mut offset = 0 ;
3670+ while ( d as usize ) & 0xf != 0 {
3671+ d = d. offset ( 1 as isize ) ;
3672+ offset += 1 ;
3673+ }
3674+
3675+ let r = sse2:: _mm_load_pd ( d) ;
3676+ assert_eq ! ( r, f64x2:: new( 1.0 , 2.0 ) + f64x2:: splat( offset as f64 ) ) ;
3677+ }
3678+
3679+ #[ simd_test = "sse2" ]
3680+ unsafe fn _mm_store_pd ( ) {
3681+ let mut vals = [ 0.0f64 ; 4 ] ;
3682+ let a = f64x2:: new ( 1.0 , 2.0 ) ;
3683+ let mut d = vals. as_mut_ptr ( ) ;
3684+
3685+ // Align d to 16-byte boundary
3686+ let mut offset = 0 ;
3687+ while ( d as usize ) & 0xf != 0 {
3688+ d = d. offset ( 1 as isize ) ;
3689+ offset += 1 ;
3690+ }
3691+
3692+ sse2:: _mm_store_pd ( d, * black_box ( & a) ) ;
3693+ assert_eq ! ( vals[ offset + 0 ] , 1.0 ) ;
3694+ assert_eq ! ( vals[ offset + 1 ] , 2.0 ) ;
3695+ }
3696+
3697+ #[ simd_test = "sse2" ]
3698+ unsafe fn _mm_store1_pd ( ) {
3699+ let mut vals = [ 0.0f64 ; 4 ] ;
3700+ let a = f64x2:: new ( 1.0 , 2.0 ) ;
3701+ let mut d = vals. as_mut_ptr ( ) ;
3702+
3703+ // Align d to 16-byte boundary
3704+ let mut offset = 0 ;
3705+ while ( d as usize ) & 0xf != 0 {
3706+ d = d. offset ( 1 as isize ) ;
3707+ offset += 1 ;
3708+ }
3709+
3710+ sse2:: _mm_store1_pd ( d, * black_box ( & a) ) ;
3711+ assert_eq ! ( vals[ offset + 0 ] , 1.0 ) ;
3712+ assert_eq ! ( vals[ offset + 1 ] , 1.0 ) ;
3713+ }
3714+
3715+ #[ simd_test = "sse2" ]
3716+ unsafe fn _mm_store_pd1 ( ) {
3717+ let mut vals = [ 0.0f64 ; 4 ] ;
3718+ let a = f64x2:: new ( 1.0 , 2.0 ) ;
3719+ let mut d = vals. as_mut_ptr ( ) ;
3720+
3721+ // Align d to 16-byte boundary
3722+ let mut offset = 0 ;
3723+ while ( d as usize ) & 0xf != 0 {
3724+ d = d. offset ( 1 as isize ) ;
3725+ offset += 1 ;
3726+ }
3727+
3728+ sse2:: _mm_store_pd1 ( d, * black_box ( & a) ) ;
3729+ assert_eq ! ( vals[ offset + 0 ] , 1.0 ) ;
3730+ assert_eq ! ( vals[ offset + 1 ] , 1.0 ) ;
3731+ }
3732+
3733+ #[ simd_test = "sse2" ]
3734+ unsafe fn _mm_storer_pd ( ) {
3735+ let mut vals = [ 0.0f64 ; 4 ] ;
3736+ let a = f64x2:: new ( 1.0 , 2.0 ) ;
3737+ let mut d = vals. as_mut_ptr ( ) ;
3738+
3739+ // Align d to 16-byte boundary
3740+ let mut offset = 0 ;
3741+ while ( d as usize ) & 0xf != 0 {
3742+ d = d. offset ( 1 as isize ) ;
3743+ offset += 1 ;
3744+ }
3745+
3746+ sse2:: _mm_storer_pd ( d, * black_box ( & a) ) ;
3747+ assert_eq ! ( vals[ offset + 0 ] , 2.0 ) ;
3748+ assert_eq ! ( vals[ offset + 1 ] , 1.0 ) ;
3749+ }
3750+
3751+ #[ simd_test = "sse2" ]
3752+ unsafe fn _mm_loadr_pd ( ) {
3753+ let vals = & [ 1.0f64 , 2.0 , 3.0 , 4.0 ] ;
3754+ let mut d = vals. as_ptr ( ) ;
3755+
3756+ // Align d to 16-byte boundary
3757+ let mut offset = 0 ;
3758+ while ( d as usize ) & 0xf != 0 {
3759+ d = d. offset ( 1 as isize ) ;
3760+ offset += 1 ;
3761+ }
3762+
3763+ let r = sse2:: _mm_loadr_pd ( d) ;
3764+ assert_eq ! ( r, f64x2:: new( 2.0 , 1.0 ) + f64x2:: splat( offset as f64 ) ) ;
3765+ }
3766+
3767+ #[ simd_test = "sse2" ]
3768+ unsafe fn _mm_loadu_pd ( ) {
3769+ let vals = & [ 1.0f64 , 2.0 , 3.0 , 4.0 ] ;
3770+ let mut d = vals. as_ptr ( ) ;
3771+
3772+ // make sure d is not aligned to 16-byte boundary
3773+ let mut offset = 0 ;
3774+ if ( d as usize ) & 0xf == 0 {
3775+ offset = 1 ;
3776+ d = d. offset ( offset as isize ) ;
3777+ }
3778+
3779+ let r = sse2:: _mm_loadu_pd ( d) ;
3780+ assert_eq ! ( r, f64x2:: new( 1.0 , 2.0 ) + f64x2:: splat( offset as f64 ) ) ;
3781+ }
3782+
35903783 #[ simd_test = "sse2" ]
35913784 unsafe fn _mm_cvtpd_ps ( ) {
35923785 use std:: { f64, f32} ;
@@ -3795,4 +3988,11 @@ mod tests {
37953988 let r = sse2:: _mm_load1_pd ( & d) ;
37963989 assert_eq ! ( r, f64x2:: new( d, d) ) ;
37973990 }
3991+
3992+ #[ simd_test = "sse2" ]
3993+ unsafe fn _mm_load_pd1 ( ) {
3994+ let d = -5.0 ;
3995+ let r = sse2:: _mm_load_pd1 ( & d) ;
3996+ assert_eq ! ( r, f64x2:: new( d, d) ) ;
3997+ }
37983998}
0 commit comments