diff --git a/include/eve/arch/cpu/wide.hpp b/include/eve/arch/cpu/wide.hpp index f6d6238efe..4adf9cae80 100644 --- a/include/eve/arch/cpu/wide.hpp +++ b/include/eve/arch/cpu/wide.hpp @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -890,32 +891,32 @@ namespace eve } //! @brief Element-wise greater-than comparison between eve::wide - friend EVE_FORCEINLINE auto operator>(wide v, wide w) noexcept + friend EVE_FORCEINLINE auto operator>(wide a, wide b) noexcept #if !defined(EVE_DOXYGEN_INVOKED) requires(supports_ordering_v) #endif { - return detail::self_greater(v, w); + return is_greater(a, b); } //! @brief Element-wise greater-than comparison between a eve::wide and a scalar template - friend EVE_FORCEINLINE auto operator>(wide v, S w) noexcept + friend EVE_FORCEINLINE auto operator>(wide w, S s) noexcept #if !defined(EVE_DOXYGEN_INVOKED) requires(supports_ordering_v) #endif { - return v > wide {w}; + return is_greater(w, s); } //! @brief Element-wise greater-than comparison between a scalar and a eve::wide template - friend EVE_FORCEINLINE auto operator>(S v, wide w) noexcept + friend EVE_FORCEINLINE auto operator>(S s, wide w) noexcept #if !defined(EVE_DOXYGEN_INVOKED) requires(supports_ordering_v) #endif { - return wide {v} > w; + return is_greater(s, w); } //! @brief Element-wise greater-or-equal comparison between eve::wide diff --git a/include/eve/detail/function/simd/arm/neon/friends.hpp b/include/eve/detail/function/simd/arm/neon/friends.hpp index b0226d408c..f4ed7ee2a7 100644 --- a/include/eve/detail/function/simd/arm/neon/friends.hpp +++ b/include/eve/detail/function/simd/arm/neon/friends.hpp @@ -62,41 +62,6 @@ namespace eve::detail return !(v == w); } - template - EVE_FORCEINLINE logical> self_greater( wide v - , wide w - ) noexcept - requires arm_abi> - { - constexpr auto cat = categorize>(); - - if constexpr( cat == category::int32x4 ) return vcgtq_s32(v, w); - else if constexpr( cat == category::int16x8 ) return vcgtq_s16(v, w); - else if constexpr( cat == category::int8x16 ) return vcgtq_s8(v, w); - else if constexpr( cat == category::uint32x4 ) return vcgtq_u32(v, w); - else if constexpr( cat == category::uint16x8 ) return vcgtq_u16(v, w); - else if constexpr( cat == category::uint8x16 ) return vcgtq_u8(v, w); - else if constexpr( cat == category::float32x4) return vcgtq_f32(v, w); - else if constexpr( cat == category::int32x2 ) return vcgt_s32(v, w); - else if constexpr( cat == category::int16x4 ) return vcgt_s16(v, w); - else if constexpr( cat == category::int8x8 ) return vcgt_s8(v, w); - else if constexpr( cat == category::uint32x2 ) return vcgt_u32(v, w); - else if constexpr( cat == category::uint16x4 ) return vcgt_u16(v, w); - else if constexpr( cat == category::uint8x8 ) return vcgt_u8(v, w); - else if constexpr( cat == category::float32x2) return vcgt_f32(v, w); - else if constexpr( current_api >= asimd) - { - if constexpr( cat == category::float64x1) return vcgt_f64(v, w); - else if constexpr( cat == category::int64x1) return vcgt_s64(v, w); - else if constexpr( cat == category::uint64x1) return vcgt_u64(v, w); - else if constexpr( cat == category::float64x2) return vcgtq_f64(v, w); - else if constexpr( cat == category::int64x2) return vcgtq_s64(v, w); - else if constexpr( cat == category::uint64x2) return vcgtq_u64(v, w); - } - else if constexpr( sizeof(T) == 8 ) - return map([](E const& e, E const& f){ return as_logical_t(e > f); }, v, w); - } - template EVE_FORCEINLINE logical> self_geq(wide v, wide w) noexcept requires arm_abi> diff --git a/include/eve/detail/function/simd/arm/sve/friends.hpp b/include/eve/detail/function/simd/arm/sve/friends.hpp index a9b0029a37..19d2c2ed28 100644 --- a/include/eve/detail/function/simd/arm/sve/friends.hpp +++ b/include/eve/detail/function/simd/arm/sve/friends.hpp @@ -22,11 +22,6 @@ EVE_FORCEINLINE auto self_neq(wide v, wide w) noexcept -> as_logical_t> requires sve_abi> { return svcmpne(sve_true(), v, w); } -template -EVE_FORCEINLINE auto -self_greater(wide v, wide w) noexcept -> as_logical_t> -requires sve_abi> { return svcmpgt(sve_true(), v, w); } - template EVE_FORCEINLINE auto self_leq(wide v, wide w) noexcept -> as_logical_t> diff --git a/include/eve/detail/function/simd/common/friends.hpp b/include/eve/detail/function/simd/common/friends.hpp index a516f5146e..0396a6cbfa 100644 --- a/include/eve/detail/function/simd/common/friends.hpp +++ b/include/eve/detail/function/simd/common/friends.hpp @@ -113,20 +113,6 @@ namespace eve::detail } } - template - EVE_FORCEINLINE auto self_greater(Wide const& v,Wide const& w) noexcept - { - if constexpr( product_type ) - { - return convert(kumi::to_tuple(v) > kumi::to_tuple(w), as_element>()); - } - else - { - constexpr auto gt = [](E const& e, E const& f) { return as_logical_t(e > f); }; - return apply_over(gt, v, w); - } - } - template EVE_FORCEINLINE auto self_geq(Wide const& v,Wide const& w) noexcept { diff --git a/include/eve/detail/function/simd/ppc/friends.hpp b/include/eve/detail/function/simd/ppc/friends.hpp index 3cbc940e3c..f0f76ad0cf 100644 --- a/include/eve/detail/function/simd/ppc/friends.hpp +++ b/include/eve/detail/function/simd/ppc/friends.hpp @@ -28,13 +28,6 @@ namespace eve::detail return logical>(vec_cmpne(v.storage(), w.storage())); } - template - EVE_FORCEINLINE auto self_greater(wide const &v, wide const &w) noexcept - requires ppc_abi> - { - return logical>(vec_cmpgt(v.storage(), w.storage())); - } - template EVE_FORCEINLINE auto self_geq(wide const &v, wide const &w) noexcept requires ppc_abi> diff --git a/include/eve/detail/function/simd/riscv/friends.hpp b/include/eve/detail/function/simd/riscv/friends.hpp index 4d2476c49c..c77dad9219 100644 --- a/include/eve/detail/function/simd/riscv/friends.hpp +++ b/include/eve/detail/function/simd/riscv/friends.hpp @@ -12,37 +12,6 @@ namespace eve::detail // *_impl in separate functions, as otherwise compiler can not // choose overload between riscv-specific and common one -template -EVE_FORCEINLINE auto -self_greater_impl(wide lhs, U rhs) noexcept -> logical> -requires rvv_abi> && (std::same_as, U> || scalar_value) -{ - if constexpr( scalar_value && !std::same_as ) - return self_greater(lhs, static_cast(rhs)); - else - { - constexpr auto c = categorize>(); - if constexpr( match(c, category::int_) ) return __riscv_vmsgt(lhs, rhs, N::value); - else if constexpr( match(c, category::uint_) ) return __riscv_vmsgtu(lhs, rhs, N::value); - else if constexpr( match(c, category::float_) ) return __riscv_vmfgt(lhs, rhs, N::value); - } -} - -template -EVE_FORCEINLINE auto -self_greater(wide lhs, wide rhs) noexcept -> logical> -requires rvv_abi> -{ - return self_greater_impl(lhs, rhs); -} - -template -EVE_FORCEINLINE auto -self_greater(wide lhs, std::convertible_to auto rhs) noexcept -> logical> -requires rvv_abi> -{ - return self_greater_impl(lhs, rhs); -} template EVE_FORCEINLINE auto diff --git a/include/eve/detail/function/simd/x86/friends.hpp b/include/eve/detail/function/simd/x86/friends.hpp index 03901edc7f..fe18f49008 100644 --- a/include/eve/detail/function/simd/x86/friends.hpp +++ b/include/eve/detail/function/simd/x86/friends.hpp @@ -179,89 +179,6 @@ self_neq(logical> v, logical> w) noexcept requires x86_abi else { return bit_cast(v.bits() ^ w.bits(), as(v)); } } -//================================================================================================ -template -EVE_FORCEINLINE as_logical_t> - self_greater(wide v, wide w) noexcept requires x86_abi> -{ - constexpr auto c = categorize>(); - constexpr auto f = to_integer(cmp_flt::gt_oq); - - if constexpr( current_api >= avx512 ) - { - if constexpr( c == category::float32x16 ) return mask16 {_mm512_cmp_ps_mask(v, w, f)}; - else if constexpr( c == category::float32x8 ) return mask8 {_mm256_cmp_ps_mask(v, w, f)}; - else if constexpr( c == category::float32x4 ) return mask8 {_mm_cmp_ps_mask(v, w, f)}; - else if constexpr( c == category::float64x8 ) return mask8 {_mm512_cmp_pd_mask(v, w, f)}; - else if constexpr( c == category::float64x4 ) return mask8 {_mm256_cmp_pd_mask(v, w, f)}; - else if constexpr( c == category::float64x2 ) return mask8 {_mm_cmp_pd_mask(v, w, f)}; - else if constexpr( c == category::uint64x8 ) return mask8 {_mm512_cmpgt_epu64_mask(v, w)}; - else if constexpr( c == category::uint64x4 ) return mask8 {_mm256_cmpgt_epu64_mask(v, w)}; - else if constexpr( c == category::uint64x2 ) return mask8 {_mm_cmpgt_epu64_mask(v, w)}; - else if constexpr( c == category::uint32x16 ) return mask16 {_mm512_cmpgt_epu32_mask(v, w)}; - else if constexpr( c == category::uint32x8 ) return mask8 {_mm256_cmpgt_epu32_mask(v, w)}; - else if constexpr( c == category::uint32x4 ) return mask8 {_mm_cmpgt_epu32_mask(v, w)}; - else if constexpr( c == category::uint16x32 ) return mask32 {_mm512_cmpgt_epu16_mask(v, w)}; - else if constexpr( c == category::uint16x16 ) return mask16 {_mm256_cmpgt_epu16_mask(v, w)}; - else if constexpr( c == category::uint16x8 ) return mask8 {_mm_cmpgt_epu16_mask(v, w)}; - else if constexpr( c == category::uint8x64 ) return mask64 {_mm512_cmpgt_epu8_mask(v, w)}; - else if constexpr( c == category::uint8x32 ) return mask32 {_mm256_cmpgt_epu8_mask(v, w)}; - else if constexpr( c == category::uint8x16 ) return mask16 {_mm_cmpgt_epu8_mask(v, w)}; - else if constexpr( c == category::int64x8 ) return mask8 {_mm512_cmpgt_epi64_mask(v, w)}; - else if constexpr( c == category::int64x4 ) return mask8 {_mm256_cmpgt_epi64_mask(v, w)}; - else if constexpr( c == category::int64x2 ) return mask8 {_mm_cmpgt_epi64_mask(v, w)}; - else if constexpr( c == category::int32x16 ) return mask16 {_mm512_cmpgt_epi32_mask(v, w)}; - else if constexpr( c == category::int32x8 ) return mask8 {_mm256_cmpgt_epi32_mask(v, w)}; - else if constexpr( c == category::int32x4 ) return mask8 {_mm_cmpgt_epi32_mask(v, w)}; - else if constexpr( c == category::int16x32 ) return mask32 {_mm512_cmpgt_epi16_mask(v, w)}; - else if constexpr( c == category::int16x16 ) return mask16 {_mm256_cmpgt_epi16_mask(v, w)}; - else if constexpr( c == category::int16x8 ) return mask8 {_mm_cmpgt_epi16_mask(v, w)}; - else if constexpr( c == category::int8x64 ) return mask64 {_mm512_cmpgt_epi8_mask(v, w)}; - else if constexpr( c == category::int8x32 ) return mask32 {_mm256_cmpgt_epi8_mask(v, w)}; - else if constexpr( c == category::int8x16 ) return mask16 {_mm_cmpgt_epi8_mask(v, w)}; - } - else - { - if constexpr( c == category::float32x8 ) return _mm256_cmp_ps(v, w, f); - else if constexpr( c == category::float64x4 ) return _mm256_cmp_pd(v, w, f); - else if constexpr( c == category::float32x4 ) return _mm_cmpgt_ps(v, w); - else if constexpr( c == category::float64x2 ) return _mm_cmpgt_pd(v, w); - else - { - constexpr auto use_avx2 = current_api >= avx2; - constexpr auto use_sse4 = current_api >= sse4_2; - - constexpr auto gt = [](E ev, E fv) { return as_logical_t(ev > fv); }; - - [[maybe_unused]] auto unsigned_cmp = [](auto vv, auto vw) - { - using l_t = logical>; - auto const sm = signmask(as, signed>>()); - return bit_cast((bit_cast(vv, as(sm)) - sm) > (bit_cast(vw, as(sm)) - sm), as {}); - }; - - if constexpr( use_avx2 && c == category::int64x4 ) return _mm256_cmpgt_epi64(v, w); - else if constexpr( use_avx2 && c == category::uint64x4 ) return unsigned_cmp(v, w); - else if constexpr( use_avx2 && c == category::int32x8 ) return _mm256_cmpgt_epi32(v, w); - else if constexpr( use_avx2 && c == category::uint32x8 ) return unsigned_cmp(v, w); - else if constexpr( use_avx2 && c == category::int16x16 ) return _mm256_cmpgt_epi16(v, w); - else if constexpr( use_avx2 && c == category::uint16x16 ) return unsigned_cmp(v, w); - else if constexpr( use_avx2 && c == category::int8x32 ) return _mm256_cmpgt_epi8(v, w); - else if constexpr( use_avx2 && c == category::uint8x32 ) return unsigned_cmp(v, w); - else if constexpr( use_sse4 && c == category::int64x2 ) return _mm_cmpgt_epi64(v, w); - else if constexpr( c == category::int64x2 ) return map(gt, v, w); - else if constexpr( c == category::int32x4 ) return _mm_cmpgt_epi32(v, w); - else if constexpr( c == category::int16x8 ) return _mm_cmpgt_epi16(v, w); - else if constexpr( c == category::int8x16 ) return _mm_cmpgt_epi8(v, w); - else if constexpr( c == category::uint64x2 ) return unsigned_cmp(v, w); - else if constexpr( c == category::uint32x4 ) return unsigned_cmp(v, w); - else if constexpr( c == category::uint16x8 ) return unsigned_cmp(v, w); - else if constexpr( c == category::uint8x16 ) return unsigned_cmp(v, w); - else return aggregate(gt, v, w); - } - } -} - //================================================================================================ template EVE_FORCEINLINE as_logical_t> diff --git a/include/eve/module/core/regular/impl/is_greater.hpp b/include/eve/module/core/regular/impl/is_greater.hpp new file mode 100644 index 0000000000..e8fd745ab3 --- /dev/null +++ b/include/eve/module/core/regular/impl/is_greater.hpp @@ -0,0 +1,39 @@ +//================================================================================================== +/* + EVE - Expressive Vector Engine + Copyright : EVE Project Contributors + SPDX-License-Identifier: BSL-1.0 +*/ +//================================================================================================== +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace eve::detail +{ + template + EVE_FORCEINLINE constexpr as_logical_t is_greater_(EVE_REQUIRES(cpu_), O const & o, T a, T b) noexcept + { + if constexpr(O::contains(definitely)) + { + auto tol = o[definitely].value(T{}); + if constexpr(integral_value) return a > eve::next(b, tol); + else return a > fam(b, tol, eve::max(eve::abs(a), eve::abs(b))); + } + else if constexpr (product_type) + { + return kumi::to_tuple(a) > kumi::to_tuple(b); + } + else + { + if constexpr (scalar_value) return as_logical_t(a > b); + else return map([](E e, E f){ return as_logical_t(e > f); }, a, b); + } + } +} diff --git a/include/eve/module/core/regular/impl/simd/arm/neon/is_greater.hpp b/include/eve/module/core/regular/impl/simd/arm/neon/is_greater.hpp new file mode 100644 index 0000000000..4e1d6d5cec --- /dev/null +++ b/include/eve/module/core/regular/impl/simd/arm/neon/is_greater.hpp @@ -0,0 +1,55 @@ +//================================================================================================== +/* + EVE - Expressive Vector Engine + Copyright : EVE Project Contributors + SPDX-License-Identifier: BSL-1.0 +*/ +//================================================================================================== +#pragma once + +#include +#include +#include +#include + +namespace eve::detail +{ + template + EVE_FORCEINLINE logical> is_greater_(EVE_REQUIRES(neon128_), O const& opts, wide a, wide b) noexcept + requires arm_abi> + { + if constexpr (O::contains(definitely)) + { + return is_greater.behavior(cpu_{}, opts, a, b); + } + else + { + constexpr auto cat = categorize>(); + + if constexpr (cat == category::int32x4) return vcgtq_s32(a, b); + else if constexpr (cat == category::int16x8) return vcgtq_s16(a, b); + else if constexpr (cat == category::int8x16) return vcgtq_s8(a, b); + else if constexpr (cat == category::uint32x4) return vcgtq_u32(a, b); + else if constexpr (cat == category::uint16x8) return vcgtq_u16(a, b); + else if constexpr (cat == category::uint8x16) return vcgtq_u8(a, b); + else if constexpr (cat == category::float32x4) return vcgtq_f32(a, b); + else if constexpr (cat == category::int32x2) return vcgt_s32(a, b); + else if constexpr (cat == category::int16x4) return vcgt_s16(a, b); + else if constexpr (cat == category::int8x8) return vcgt_s8(a, b); + else if constexpr (cat == category::uint32x2) return vcgt_u32(a, b); + else if constexpr (cat == category::uint16x4) return vcgt_u16(a, b); + else if constexpr (cat == category::uint8x8) return vcgt_u8(a, b); + else if constexpr (cat == category::float32x2) return vcgt_f32(a, b); + else if constexpr (current_api >= asimd) + { + if constexpr (cat == category::float64x1) return vcgt_f64(a, b); + else if constexpr (cat == category::int64x1) return vcgt_s64(a, b); + else if constexpr (cat == category::uint64x1) return vcgt_u64(a, b); + else if constexpr (cat == category::float64x2) return vcgtq_f64(a, b); + else if constexpr (cat == category::int64x2) return vcgtq_s64(a, b); + else if constexpr (cat == category::uint64x2) return vcgtq_u64(a, b); + } + else return map([](E e, E f){ return as_logical_t(e > f); }, a, b); + } + } +} diff --git a/include/eve/module/core/regular/impl/simd/arm/sve/is_greater.hpp b/include/eve/module/core/regular/impl/simd/arm/sve/is_greater.hpp new file mode 100644 index 0000000000..2b3a415dc5 --- /dev/null +++ b/include/eve/module/core/regular/impl/simd/arm/sve/is_greater.hpp @@ -0,0 +1,22 @@ +//================================================================================================== +/* + EVE - Expressive Vector Engine + Copyright : EVE Project Contributors + SPDX-License-Identifier: BSL-1.0 +*/ +//================================================================================================== +#pragma once + +#include +#include + +namespace eve::detail +{ + template + EVE_FORCEINLINE logical> is_greater_(EVE_REQUIRES(sve_), O const& opts, wide a, wide b) noexcept + requires sve_abi> + { + if constexpr (O::contains(definitely)) return is_greater.behavior(cpu_{}, opts, a, b); + else return svcmpgt(sve_true(), a, b); + } +} diff --git a/include/eve/module/core/regular/impl/simd/ppc/is_greater.hpp b/include/eve/module/core/regular/impl/simd/ppc/is_greater.hpp new file mode 100644 index 0000000000..ea73d89a2e --- /dev/null +++ b/include/eve/module/core/regular/impl/simd/ppc/is_greater.hpp @@ -0,0 +1,24 @@ +//================================================================================================== +/* + EVE - Expressive Vector Engine + Copyright : EVE Project Contributors + SPDX-License-Identifier: BSL-1.0 +*/ +//================================================================================================== +#pragma once + +#include +#include +#include +#include + +namespace eve::detail +{ + template + EVE_FORCEINLINE logical> is_greater_(EVE_REQUIRES(vmx_), O const& opts, wide a, wide b) noexcept + requires ppc_abi> + { + if constexpr (O::contains(definitely)) return is_greater.behavior(cpu_{}, opts, a, b); + else return logical>(vec_cmpgt(a.storage(), b.storage())); + } +} diff --git a/include/eve/module/core/regular/impl/simd/riscv/is_greater.hpp b/include/eve/module/core/regular/impl/simd/riscv/is_greater.hpp new file mode 100644 index 0000000000..68c6fa76f8 --- /dev/null +++ b/include/eve/module/core/regular/impl/simd/riscv/is_greater.hpp @@ -0,0 +1,32 @@ +//================================================================================================== +/* + EVE - Expressive Vector Engine + Copyright : EVE Project Contributors + SPDX-License-Identifier: BSL-1.0 +*/ +//================================================================================================== +#pragma once + +#include +#include + +namespace eve::detail +{ + template + EVE_FORCEINLINE logical> is_greater_(EVE_REQUIRES(rvv_), O const& opts, wide a, U b) noexcept + requires (rvv_abi> && same_element_type) + { + if constexpr (O::contains(definitely)) + { + return is_greater.behavior(cpu_{}, opts, a, b); + } + else + { + constexpr auto c = categorize>(); + + if constexpr (match(c, category::int_)) return __riscv_vmsgt(a, b, N::value); + else if constexpr (match(c, category::uint_)) return __riscv_vmsgtu(a, b, N::value); + else if constexpr (match(c, category::float_)) return __riscv_vmfgt(a, b, N::value); + } + } +} diff --git a/include/eve/module/core/regular/impl/simd/x86/is_greater.hpp b/include/eve/module/core/regular/impl/simd/x86/is_greater.hpp index 4e340c7d86..7c0585556f 100644 --- a/include/eve/module/core/regular/impl/simd/x86/is_greater.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/is_greater.hpp @@ -18,59 +18,154 @@ namespace eve::detail { + template + EVE_FORCEINLINE logical> is_greater_(EVE_REQUIRES(sse2_), O const& opts, wide a, wide b) noexcept + requires x86_abi> + { + if constexpr (O::contains(definitely)) + { + return is_greater.behavior(cpu_{}, opts, a, b); + } + else + { + constexpr auto c = categorize>(); + constexpr auto f = to_integer(cmp_flt::gt_oq); + + if constexpr (current_api >= avx512) + { + if constexpr (c == category::float32x16) return mask16 {_mm512_cmp_ps_mask(v, w, f)}; + else if constexpr (c == category::float32x8) return mask8 {_mm256_cmp_ps_mask(v, w, f)}; + else if constexpr (c == category::float32x4) return mask8 {_mm_cmp_ps_mask(v, w, f)}; + else if constexpr (c == category::float64x8) return mask8 {_mm512_cmp_pd_mask(v, w, f)}; + else if constexpr (c == category::float64x4) return mask8 {_mm256_cmp_pd_mask(v, w, f)}; + else if constexpr (c == category::float64x2) return mask8 {_mm_cmp_pd_mask(v, w, f)}; + else if constexpr (c == category::uint64x8) return mask8 {_mm512_cmpgt_epu64_mask(v, w)}; + else if constexpr (c == category::uint64x4) return mask8 {_mm256_cmpgt_epu64_mask(v, w)}; + else if constexpr (c == category::uint64x2) return mask8 {_mm_cmpgt_epu64_mask(v, w)}; + else if constexpr (c == category::uint32x16) return mask16 {_mm512_cmpgt_epu32_mask(v, w)}; + else if constexpr (c == category::uint32x8) return mask8 {_mm256_cmpgt_epu32_mask(v, w)}; + else if constexpr (c == category::uint32x4) return mask8 {_mm_cmpgt_epu32_mask(v, w)}; + else if constexpr (c == category::uint16x32) return mask32 {_mm512_cmpgt_epu16_mask(v, w)}; + else if constexpr (c == category::uint16x16) return mask16 {_mm256_cmpgt_epu16_mask(v, w)}; + else if constexpr (c == category::uint16x8) return mask8 {_mm_cmpgt_epu16_mask(v, w)}; + else if constexpr (c == category::uint8x64) return mask64 {_mm512_cmpgt_epu8_mask(v, w)}; + else if constexpr (c == category::uint8x32) return mask32 {_mm256_cmpgt_epu8_mask(v, w)}; + else if constexpr (c == category::uint8x16) return mask16 {_mm_cmpgt_epu8_mask(v, w)}; + else if constexpr (c == category::int64x8) return mask8 {_mm512_cmpgt_epi64_mask(v, w)}; + else if constexpr (c == category::int64x4) return mask8 {_mm256_cmpgt_epi64_mask(v, w)}; + else if constexpr (c == category::int64x2) return mask8 {_mm_cmpgt_epi64_mask(v, w)}; + else if constexpr (c == category::int32x16) return mask16 {_mm512_cmpgt_epi32_mask(v, w)}; + else if constexpr (c == category::int32x8) return mask8 {_mm256_cmpgt_epi32_mask(v, w)}; + else if constexpr (c == category::int32x4) return mask8 {_mm_cmpgt_epi32_mask(v, w)}; + else if constexpr (c == category::int16x32) return mask32 {_mm512_cmpgt_epi16_mask(v, w)}; + else if constexpr (c == category::int16x16) return mask16 {_mm256_cmpgt_epi16_mask(v, w)}; + else if constexpr (c == category::int16x8) return mask8 {_mm_cmpgt_epi16_mask(v, w)}; + else if constexpr (c == category::int8x64) return mask64 {_mm512_cmpgt_epi8_mask(v, w)}; + else if constexpr (c == category::int8x32) return mask32 {_mm256_cmpgt_epi8_mask(v, w)}; + else if constexpr (c == category::int8x16) return mask16 {_mm_cmpgt_epi8_mask(v, w)}; + } + else + { + if constexpr (c == category::float32x8) return _mm256_cmp_ps(v, w, f); + else if constexpr (c == category::float64x4) return _mm256_cmp_pd(v, w, f); + else if constexpr (c == category::float32x4) return _mm_cmpgt_ps(v, w); + else if constexpr (c == category::float64x2) return _mm_cmpgt_pd(v, w); + else + { + constexpr auto use_avx2 = current_api >= avx2; + constexpr auto use_sse4_1 = current_api >= sse4_1; + constexpr auto use_sse4_2 = current_api >= sse4_2; + + constexpr auto gt = [](E ev, E fv) { return as_logical_t(ev > fv); }; + + [[maybe_unused]] auto unsigned_cmp = [](auto vv, auto vw) + { + using l_t = logical>; + auto const sm = signmask(as, signed>>()); + return bit_cast((bit_cast(vv, as(sm)) - sm) > (bit_cast(vw, as(sm)) - sm), as {}); + }; + + if constexpr (use_avx2 && c == category::int64x4) return _mm256_cmpgt_epi64(v, w); + else if constexpr (use_avx2 && c == category::uint64x4) return unsigned_cmp(v, w); + else if constexpr (use_avx2 && c == category::int32x8) return _mm256_cmpgt_epi32(v, w); + else if constexpr (use_avx2 && c == category::uint32x8) return unsigned_cmp(v, w); + else if constexpr (use_avx2 && c == category::int16x16) return _mm256_cmpgt_epi16(v, w); + else if constexpr (use_avx2 && c == category::uint16x16) return unsigned_cmp(v, w); + else if constexpr (use_avx2 && c == category::int8x32) return _mm256_cmpgt_epi8(v, w); + else if constexpr (use_avx2 && c == category::uint8x32) return unsigned_cmp(v, w); + else if constexpr (use_sse4_2 && c == category::int64x2) return _mm_cmpgt_epi64(v, w); + else if constexpr (c == category::int32x4) return _mm_cmpgt_epi32(v, w); + else if constexpr (c == category::int16x8) return _mm_cmpgt_epi16(v, w); + else if constexpr (c == category::int8x16) return _mm_cmpgt_epi8(v, w); + else if constexpr (c == category::uint32x4) + { + if constexpr (use_sse4_1) return eve::max(a, b) != b; + else return unsigned_cmp(a, b); + } + else if constexpr (c == category::uint16x8) + { + if constexpr (use_sse4_1) return eve::max(a, b) != b; + else return unsigned_cmp(a, b); + } + else if constexpr (c == category::uint8x16) + { + if constexpr (use_sse4_1) return eve::max(a, b) != b; + else return unsigned_cmp(a, b); + } + else return map(gt, a, b); + } + } + } + } -// ----------------------------------------------------------------------------------------------- -// masked implementation - template - EVE_FORCEINLINE as_logical_t> is_greater_(EVE_REQUIRES(avx512_), - C const& mask, - O const& opts, - wide const& v, - wide const& w) noexcept - requires x86_abi> + // ----------------------------------------------------------------------------------------------- + // masked implementation + template + EVE_FORCEINLINE as_logical_t> is_greater_(EVE_REQUIRES(avx512_), C const& mask, O const& opts, wide a, wide b) noexcept + requires x86_abi> { - if constexpr( C::has_alternative || O::contains(definitely)) + if constexpr (C::has_alternative || O::contains(definitely)) { - return is_greater.behavior(cpu_{}, opts, v, w); + return is_greater.behavior(cpu_{}, opts, a, b); } else { - auto const s = alternative(mask, v, as(to_logical(v))); - [[maybe_unused]] auto m = expand_mask(mask, as(v)).storage().value; + auto const s = alternative(mask, a, as(to_logical(a))); + [[maybe_unused]] auto m = expand_mask(mask, as(a)).storage().value; constexpr auto c = categorize>(); constexpr auto f = to_integer(cmp_flt::gt_oq); - if constexpr( C::is_complete ) return s; - else if constexpr( c == category::float32x16 ) return mask16 {_mm512_mask_cmp_ps_mask(m, v, w, f)}; - else if constexpr( c == category::float64x8 ) return mask8 {_mm512_mask_cmp_pd_mask(m, v, w, f)}; - else if constexpr( c == category::float32x8 ) return mask8 {_mm256_mask_cmp_ps_mask(m, v, w, f)}; - else if constexpr( c == category::float64x4 ) return mask8 {_mm256_mask_cmp_pd_mask(m, v, w, f)}; - else if constexpr( c == category::float32x4 ) return mask8 {_mm_mask_cmp_ps_mask(m, v, w, f)}; - else if constexpr( c == category::float64x2 ) return mask8 {_mm_mask_cmp_pd_mask(m, v, w, f)}; - else if constexpr( c == category::int64x8 ) return mask8 {_mm512_mask_cmpgt_epi64_mask(m, v, w)}; - else if constexpr( c == category::int64x4 ) return mask8 {_mm256_mask_cmpgt_epi64_mask(m, v, w)}; - else if constexpr( c == category::int64x2 ) return mask8 {_mm_mask_cmpgt_epi64_mask(m, v, w)}; - else if constexpr( c == category::int32x16 ) return mask16 {_mm512_mask_cmpgt_epi32_mask(m, v, w)}; - else if constexpr( c == category::int32x8 ) return mask8 {_mm256_mask_cmpgt_epi32_mask(m, v, w)}; - else if constexpr( c == category::int32x4 ) return mask8 {_mm_mask_cmpgt_epi32_mask(m, v, w)}; - else if constexpr( c == category::int16x32 ) return mask32 {_mm512_mask_cmpgt_epi16_mask(m, v, w)}; - else if constexpr( c == category::int16x16 ) return mask16 {_mm256_mask_cmpgt_epi16_mask(m, v, w)}; - else if constexpr( c == category::int16x8 ) return mask8 {_mm_mask_cmpgt_epi16_mask(m, v, w)}; - else if constexpr( c == category::int8x64 ) return mask64 {_mm512_mask_cmpgt_epi8_mask(m, v, w)}; - else if constexpr( c == category::int8x32 ) return mask32 {_mm256_mask_cmpgt_epi8_mask(m, v, w)}; - else if constexpr( c == category::int8x16 ) return mask16 {_mm_mask_cmpgt_epi8_mask(m, v, w)}; - else if constexpr( c == category::uint64x8 ) return mask8 {_mm512_mask_cmpgt_epu64_mask(m, v, w)}; - else if constexpr( c == category::uint64x4 ) return mask8 {_mm256_mask_cmpgt_epu64_mask(m, v, w)}; - else if constexpr( c == category::uint64x2 ) return mask8 {_mm_mask_cmpgt_epu64_mask(m, v, w)}; - else if constexpr( c == category::uint32x16 ) return mask16 {_mm512_mask_cmpgt_epu32_mask(m, v, w)}; - else if constexpr( c == category::uint32x8 ) return mask8 {_mm256_mask_cmpgt_epu32_mask(m, v, w)}; - else if constexpr( c == category::uint32x4 ) return mask8 {_mm_mask_cmpgt_epu32_mask(m, v, w)}; - else if constexpr( c == category::uint16x32 ) return mask32 {_mm512_mask_cmpgt_epu16_mask(m, v, w)}; - else if constexpr( c == category::uint16x16 ) return mask16 {_mm256_mask_cmpgt_epu16_mask(m, v, w)}; - else if constexpr( c == category::uint16x8 ) return mask8 {_mm_mask_cmpgt_epu16_mask(m, v, w)}; - else if constexpr( c == category::uint8x64 ) return mask64 {_mm512_mask_cmpgt_epu8_mask(m, v, w)}; - else if constexpr( c == category::uint8x32 ) return mask32 {_mm256_mask_cmpgt_epu8_mask(m, v, w)}; - else if constexpr( c == category::uint8x16 ) return mask16 {_mm_mask_cmpgt_epu8_mask(m, v, w)}; + if constexpr (C::is_complete) return s; + else if constexpr (c == category::float32x16) return mask16 {_mm512_mask_cmp_ps_mask(m, a, b, f)}; + else if constexpr (c == category::float64x8) return mask8 {_mm512_mask_cmp_pd_mask(m, a, b, f)}; + else if constexpr (c == category::float32x8) return mask8 {_mm256_mask_cmp_ps_mask(m, a, b, f)}; + else if constexpr (c == category::float64x4) return mask8 {_mm256_mask_cmp_pd_mask(m, a, b, f)}; + else if constexpr (c == category::float32x4) return mask8 {_mm_mask_cmp_ps_mask(m, a, b, f)}; + else if constexpr (c == category::float64x2) return mask8 {_mm_mask_cmp_pd_mask(m, a, b, f)}; + else if constexpr (c == category::int64x8) return mask8 {_mm512_mask_cmpgt_epi64_mask(m, a, b)}; + else if constexpr (c == category::int64x4) return mask8 {_mm256_mask_cmpgt_epi64_mask(m, a, b)}; + else if constexpr (c == category::int64x2) return mask8 {_mm_mask_cmpgt_epi64_mask(m, a, b)}; + else if constexpr (c == category::int32x16) return mask16 {_mm512_mask_cmpgt_epi32_mask(m, a, b)}; + else if constexpr (c == category::int32x8) return mask8 {_mm256_mask_cmpgt_epi32_mask(m, a, b)}; + else if constexpr (c == category::int32x4) return mask8 {_mm_mask_cmpgt_epi32_mask(m, a, b)}; + else if constexpr (c == category::int16x32) return mask32 {_mm512_mask_cmpgt_epi16_mask(m, a, b)}; + else if constexpr (c == category::int16x16) return mask16 {_mm256_mask_cmpgt_epi16_mask(m, a, b)}; + else if constexpr (c == category::int16x8) return mask8 {_mm_mask_cmpgt_epi16_mask(m, a, b)}; + else if constexpr (c == category::int8x64) return mask64 {_mm512_mask_cmpgt_epi8_mask(m, a, b)}; + else if constexpr (c == category::int8x32) return mask32 {_mm256_mask_cmpgt_epi8_mask(m, a, b)}; + else if constexpr (c == category::int8x16) return mask16 {_mm_mask_cmpgt_epi8_mask(m, a, b)}; + else if constexpr (c == category::uint64x8) return mask8 {_mm512_mask_cmpgt_epu64_mask(m, a, b)}; + else if constexpr (c == category::uint64x4) return mask8 {_mm256_mask_cmpgt_epu64_mask(m, a, b)}; + else if constexpr (c == category::uint64x2) return mask8 {_mm_mask_cmpgt_epu64_mask(m, a, b)}; + else if constexpr (c == category::uint32x16) return mask16 {_mm512_mask_cmpgt_epu32_mask(m, a, b)}; + else if constexpr (c == category::uint32x8) return mask8 {_mm256_mask_cmpgt_epu32_mask(m, a, b)}; + else if constexpr (c == category::uint32x4) return mask8 {_mm_mask_cmpgt_epu32_mask(m, a, b)}; + else if constexpr (c == category::uint16x32) return mask32 {_mm512_mask_cmpgt_epu16_mask(m, a, b)}; + else if constexpr (c == category::uint16x16) return mask16 {_mm256_mask_cmpgt_epu16_mask(m, a, b)}; + else if constexpr (c == category::uint16x8) return mask8 {_mm_mask_cmpgt_epu16_mask(m, a, b)}; + else if constexpr (c == category::uint8x64) return mask64 {_mm512_mask_cmpgt_epu8_mask(m, a, b)}; + else if constexpr (c == category::uint8x32) return mask32 {_mm256_mask_cmpgt_epu8_mask(m, a, b)}; + else if constexpr (c == category::uint8x16) return mask16 {_mm_mask_cmpgt_epu8_mask(m, a, b)}; } } } diff --git a/include/eve/module/core/regular/impl/simd/x86/is_less.hpp b/include/eve/module/core/regular/impl/simd/x86/is_less.hpp index 1465ccd66f..8ecc20658f 100644 --- a/include/eve/module/core/regular/impl/simd/x86/is_less.hpp +++ b/include/eve/module/core/regular/impl/simd/x86/is_less.hpp @@ -75,6 +75,7 @@ namespace eve::detail { constexpr auto use_avx2 = current_api >= avx2; constexpr auto use_sse4_1 = current_api >= sse4_1; + constexpr auto use_sse4_2 = current_api >= sse4_2; constexpr auto lt = [](E ev, E fv) { return as_logical_t(ev < fv); }; [[maybe_unused]] auto unsigned_cmp = [](auto lhs, auto rhs) @@ -92,6 +93,7 @@ namespace eve::detail else if constexpr (use_avx2 && c == category::uint16x16) return eve::min(a, b) != b; else if constexpr (use_avx2 && c == category::int8x32) return _mm256_cmpgt_epi8(b, a); else if constexpr (use_avx2 && c == category::uint8x32) return eve::min(a, b) != b; + else if constexpr (use_sse4_2 && c == category::int64x2) return _mm_cmpgt_epi64(b, a); else if constexpr (c == category::int32x4) return _mm_cmplt_epi32(a, b); else if constexpr (c == category::int16x8) return _mm_cmplt_epi16(a, b); else if constexpr (c == category::int8x16) return _mm_cmplt_epi8(a, b); diff --git a/include/eve/module/core/regular/is_greater.hpp b/include/eve/module/core/regular/is_greater.hpp index 3b5b338a14..f57e5b74aa 100644 --- a/include/eve/module/core/regular/is_greater.hpp +++ b/include/eve/module/core/regular/is_greater.hpp @@ -11,18 +11,24 @@ #include #include #include +#include namespace eve { template - struct is_greater_t : strict_elementwise_callable + struct is_greater_t : elementwise_callable { - template - requires(eve::same_lanes_or_scalar) - constexpr EVE_FORCEINLINE common_logical_t operator()(T a, U b) const + template + constexpr EVE_FORCEINLINE common_logical_t operator()(T a, U b) const + requires (compatible_arithmetic_values) { -// static_assert( valid_tolerance, Options>::value, "[eve::is_greater] simd tolerance requires at least one simd parameter." ); - return EVE_DISPATCH_CALL(a, b); + if constexpr (Options::contains(definitely)) + { + static_assert(floating_value, "[eve::is_less] The definitely option is only supported for floating types."); + // static_assert( valid_tolerance, Options>::value, "[eve::is_greater] simd tolerance requires at least one simd parameter." ); + } + + return EVE_DISPATCH_CALL(a, b); } @@ -89,48 +95,25 @@ namespace eve // Required for if_else optimisation detections using callable_is_greater_ = tag_t; } -//These include are there because max must see is_greater -#include -#include -#include -#include -#include -#include -#include -namespace eve::detail -{ - template - EVE_FORCEINLINE constexpr common_logical_t - is_greater_(EVE_REQUIRES(cpu_), O const&, logical a, logical b) noexcept - { - if constexpr( scalar_value && scalar_value) return common_logical_t(a > b); - else return a > b; - } +#include - template - EVE_FORCEINLINE constexpr common_logical_t - is_greater_(EVE_REQUIRES(cpu_), O const & o, T const& aa, U const& bb) noexcept - { - if constexpr(O::contains(definitely)) - { - using w_t = common_value_t; - auto a = w_t(aa); - auto b = w_t(bb); +#if defined(EVE_INCLUDE_X86_HEADER) +# include +#endif - auto tol = o[definitely].value(w_t{}); - if constexpr(integral_value) return a > eve::next(b, tol); - else return a > fam(b, tol, eve::max(eve::abs(a), eve::abs(b))); - } - else - { - if constexpr(scalar_value && scalar_value) return common_logical_t(aa > bb); - else return aa > bb; - } - } -} +#if defined(EVE_INCLUDE_POWERPC_HEADER) +# include +#endif +#if defined(EVE_INCLUDE_ARM_NEON_HEADER) +# include +#endif -#if defined(EVE_INCLUDE_X86_HEADER) -# include +#if defined(EVE_INCLUDE_ARM_SVE_HEADER) +# include +#endif + +#if defined(EVE_INCLUDE_RISCV_HEADER) +# include #endif