From b81b3e57ee6e6d38febb68d330e2d09466f914fa Mon Sep 17 00:00:00 2001 From: Jeff Muizelaar Date: Tue, 8 Sep 2020 13:08:07 -0400 Subject: [PATCH] Add vld1q_s32 and vld1q_u32 --- crates/core_arch/src/aarch64/neon/mod.rs | 46 ++++++++++++++++++++++++ crates/core_arch/src/arm/neon/mod.rs | 44 +++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/crates/core_arch/src/aarch64/neon/mod.rs b/crates/core_arch/src/aarch64/neon/mod.rs index 3712c99ba3..cbf3d3638a 100644 --- a/crates/core_arch/src/aarch64/neon/mod.rs +++ b/crates/core_arch/src/aarch64/neon/mod.rs @@ -1812,6 +1812,32 @@ pub unsafe fn vld1q_f32(addr: *const f32) -> float32x4_t { )) } +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_s32(addr: *const i32) -> int32x4_t { + use crate::core_arch::simd::i32x4; + transmute(i32x4::new( + *addr, + *addr.offset(1), + *addr.offset(2), + *addr.offset(3), + )) +} + +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ldr))] +pub unsafe fn vld1q_u32(addr: *const u32) -> uint32x4_t { + use crate::core_arch::simd::u32x4; + transmute(u32x4::new( + *addr, + *addr.offset(1), + *addr.offset(2), + *addr.offset(3), + )) +} + #[cfg(test)] mod tests { use crate::core_arch::aarch64::test_support::*; @@ -1830,6 +1856,26 @@ mod tests { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_s32() { + let e = i32x4::new(1, 2, 3, 4); + let f = [0, 1, 2, 3, 4]; + // do a load that has 4 byte alignment to make sure we're not + // over aligning it + let r: i32x4 = transmute(vld1q_s32(f[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_u32() { + let e = u32x4::new(1, 2, 3, 4); + let f = [0, 1, 2, 3, 4]; + // do a load that has 4 byte alignment to make sure we're not + // over aligning it + let r: u32x4 = transmute(vld1q_u32(f[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vpaddq_s16() { let a = i16x8::new(1, 2, 3, 4, 5, 6, 7, 8); diff --git a/crates/core_arch/src/arm/neon/mod.rs b/crates/core_arch/src/arm/neon/mod.rs index 03dce6dd1b..c733f01f76 100644 --- a/crates/core_arch/src/arm/neon/mod.rs +++ b/crates/core_arch/src/arm/neon/mod.rs @@ -219,6 +219,8 @@ extern "C" { ) -> int8x8_t; #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v4f32.p0i8")] fn vld1q_v4f32(addr: *const u8, align: u32) -> float32x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld1.v4i32.p0i8")] + fn vld1q_v4i32(addr: *const u8, align: u32) -> int32x4_t; } /// Absolute value (wrapping). @@ -1769,6 +1771,26 @@ pub unsafe fn vld1q_u8(addr: *const u8) -> uint8x16_t { ptr::read(addr as *const uint8x16_t) } +/// Load multiple single-element structures to one, two, three, or four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon")] +#[target_feature(enable = "v7")] +#[cfg_attr(test, assert_instr("vld1.32"))] +pub unsafe fn vld1q_s32(addr: *const i32) -> int32x4_t { + vld1q_v4i32(addr as *const u8, 4) +} + +/// Load multiple single-element structures to one, two, three, or four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon")] +#[target_feature(enable = "v7")] +#[cfg_attr(test, assert_instr("vld1.32"))] +pub unsafe fn vld1q_u32(addr: *const u32) -> uint32x4_t { + transmute(vld1q_v4i32(addr as *const u8, 4)) +} + /// Load multiple single-element structures to one, two, three, or four registers #[inline] #[cfg(target_arch = "arm")] @@ -1826,6 +1848,28 @@ mod tests { assert_eq!(r, e); } + #[cfg(target_arch = "arm")] + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_s32() { + let e = i32x4::new(1, 2, 3, 4); + let f = [0, 1, 2, 3, 4]; + // do a load that has 4 byte alignment to make sure we're not + // over aligning it + let r: i32x4 = transmute(vld1q_s32(f[1..].as_ptr())); + assert_eq!(r, e); + } + + #[cfg(target_arch = "arm")] + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_u32() { + let e = u32x4::new(1, 2, 3, 4); + let f = [0, 1, 2, 3, 4]; + // do a load that has 4 byte alignment to make sure we're not + // over aligning it + let r: u32x4 = transmute(vld1q_u32(f[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld1q_dup_f32() { let e = f32x4::new(1., 1., 1., 1.);