Squashed 'lib/simde/simde/' changes from 1f4a28c4..cbef1c15
cbef1c15 avx512/permutex2var: hard-code types in casts instead of using typeof
71a65cbd gfni: add cast to work around -Wimplicit-int-conversion warning
10dd508b avx512/scalef: work around for GCC bug #101614
277b303b neon/cvt: fix compilation with -ffast-math
9ec8c259 avx512/scalef: _mm_mask_scalef_round_ss is still missing in GCC
e821bee3 Wrap static assertions in code to disable -Wreserved-identifier
13cf2969 The fix for GCC bug #95483 wasn't in a release until 11.2
b66e3cb9 avx2: separate natural vector length for float, int, and double types
dda31b76 Add -Wdeclaration-after-statement to the list of ignored warnings.
9af03cd0 Work around compound literal warning with clang
74a4aa59 neon/clt: Add SSE/AVX512 fallbacks
02ce512d neon/mlsl_high_n: initial implementation
6472321c neon/mlal_high_n: initial implementation
2632bbc1 neon/subl_high: initial implementation
d1d2362d neon/types: remove duplicate NEON float16_t definitions
456812f8 sse: avoid including windows.h when possible
332dcc83 neon/reinterpret: change defines to work with templated callers
e369cd0c neon/cge: Improve some of the SSE2 fallbacks
3397efe1 deal with WASM SIMD128 API changes.
3aa4ae58 neon/rndn: Fix macros to workaround bugs
30b3607b neon/ld1: Fix macros in order to workaround bugs
8cac29c6 neon/cge: Implement f16 functions
c96b3ae6 neon/cagt: Implement f16 functions
f948d39a neon/bsl: Implement f16 functions
d6e025bd neon/reinterpret: f16_u16 and u16_f16 implementations
5e763da5 neon/add: Implement f16 functions
5a7c9e13 neon/ceqz: Implement f16 functions
1ba94bc4 neon/dup_n: Implement f16 functions
af26004a neon/ceq: Implement f16 functions
e41944f3 neon/st1: Add f16 functions
a660d577 neon/cvt: Implement f16 functions
412da5b3 neon/ld1: Implement f16 functions
068485c9 neon/cage: Initial f16 implementations
89fb99ee neon: Implement f16 types
50a56ef7 sse4.2: work around more warnings on old clang
fa54e7b3 avx512/permutex2var: work around incorrect definition on old clang
d20c7bf8 sse: use portable implementation to work around llvm bug #344589
371fd445 avx: work around incorrect maskload/store definitions on clang < 3.8
3bb373c8 Various fixes for -fno-lax-vector-conversions
f26ad2d1 avx512/fixupimm: initial implementation
f9182e3b Fix warnings with -fno-lax-vector-conversions
37c26d7f avx512/dpbusds: complete function family
0dc7eaf6 sse: replace _mm_prefetch implementation
b7fd63d9 neon/ld1q: u8_x2, u8_x3, u8_x4
6427473b neon/mul: add improved SSE2 vmulq_s8 implementation
b843d7e1 avx512/cvt: add _mm512_cvtepu32_ps
5df05510 simd128: improve many lt and gt implementations
495a0d2a neon/mul: implement unsigned multiplication using signed functions
2b087a1c neon/qadd: fix warning in ternarylogic call in vaddq_u32
f027c8da neon/qabs: add some faster implementations
bf6667b4 simd128: add fast sqrt implementations
d490ca7a simd128: add fast extmul_low/high implementations
2abd2cc0 simd128: add NEON and POWER shift implementations
3032eb33 simd128: add fast promote/demote implementations
e92273a6 simd128: add dedicated functions for unsigned extract_lane
34c5733c sse2, sse4.1: pull in improved packs/packus implementations from WASM
1bfc221c simd128: add fast narrow implementations
f333a089 simd128: add fast implementations of extend_low/extend_high
b4e0d0cc msa/madd: initial implementation
c09e6b0a neon/rndn: work around some missing functions in GCC on armv8
cc7afa77 avx512/4dpwssds: initial implementation
a9cec6fe avx512/dpbf16: implement remaining functions
371da5f8 avx512/dpwssds: initial implementation
ccef3bee common: Use AArch64 intrinsics if _M_ARM64EC is defined
f79c08c3 xop: fix NEON implementation of maccs functions to use NEON types
9eb0a88d sse4.1: use NEON types instead of vector in insert implementations
0bbae5ff avx512/roundscale: don't assume IEEE 754 storage
77673258 fma: use NEON types in simde_mm_fnmadd_ps NEON implementation
865412e7 sse2: remove statement expr requirement for NEON srli/srai macros
573c0a24 sse4.1: replace NEON implementations with shuffle-based implementations
534794b2 sse4.1: remove statement expr dependency in blend functions
a571ca8c fma: fix return value of simde_mm_fnmadd_ps on NEON
df95ab8e sse, sse2: clean up several shuffle macros
44e25b30 sse2: add parenthesis around macro arguments
305ac0a8 avx512/set, avx512/popcnt: use _mm512_set_epi8 only when available
98de6621 relaxed-simd: add blend functions
974f83d5 relaxed-simd: add fms functions
a46a04b7 relaxed-simd: add fma functions
54c62bf7 avx512/popcnt: implement remaining functions
d4dc926f avx512/dpbf16: initial implementation
b9a7904d avx512/4dpwssd: implement complete function family
f54cc98a avx512/dpwssd: initial implementation
7e877d17 avx512/bitshuffle: initial implementation
9e96b711 avx512/dpbusd: implement remaining functions
423572d5 simd128: use vec_cmpgt instead of vec_cmplt in pmin
73b6978f sse, sse2: fix vec_cpsign order test
7c0bdbff gfni: remove unintentional dependency on vector extensions
26fcfdb1 simd128: add fast ceil implementations
85035430 Improve widening pairwise addition implementations
8f35dc1a simd128: add fast max/pmax implementations
a8adeffc neon/cvt: disable some code on 32-bit x86 which uses _mm_cvttsd_si64
29955848 avx512/shldv: limit shuffle-based version to little endian
ae330dd9 simd128: add NEON, Altivec, & vector extension sub_sat implementations
9debe735 neon/cvt, relaxed-simd: add work-around for GCC bug #101614
eab383d9 avx512/dbsad: add vector extension impl. and improve scalar version
79c93ce0 sse, sse2: sync clang-12 changes for vec_cpsgn
7205c644 avx512/cvtt: _mm_cvttpd_epi64 is only available on x86_64
42538f0e simd128, sse2: more cvtpd_ps/f32x4_demote_f64x2_zero implementations
1bec285e simd128, sse2: add more madd_epi16 / i32x4_dot_i16x8 implementations
6dfdf3d2 simd128: vector extension implementation of floating-point abs
00c3b68b simd128, neon/neg: add VSX implementations of abs and neg functions
7f3a52d0 neon/cgt, simd128: improve some unsigned comparisons on x86
f5184634 neon/abd: add much better implementations
9b1974dd Add @aqrit's SSE2 min/max implementations
9caf5e6e simd128: add more pmin/pmax implementations
dcd00397 neon/qrdmulh: steal WASM q15mulr_sat implementation for qrdmulhq_s16
34dee780 simd128: add SSE2 q15mulr_sat implementation
fe3e623e neon/min: add SSE2 vminq_u32 implementation
4abbb4db neon/min: add SSE2 vqsubq_u32 implementation
c1158835 simd128: add improved min implementations on several architectures
c059f800 relaxed-simd: add trunc functions
0394e967 simd128: add several AArch64 and Altivec trunc_sat implementations
3fa2026b Fix several places where we assumed NEON used vector extensions.
6a183313 neon/qsub: add some SSE and vector extension implementations
313561fe msa/subv: initial implementation
8f1155e4 msa/andi: initial implementation
d20bca47 msa/and: initial implementation
82e93303 gfni: work around clang bug #50932
3a27037f arch: set SIMDE_ARCH_ARM for AArch64 on MSVC
d19a9d6a msa/adds: initial implementation
41f9ad33 neon/qadd: improve SSE implementation
eb55cce3 avx512/shldv: initial implementation
ee0a83e1 avx512/popcnt: initial implementation
48855d3a msa/adds_a: initial implementation
6133600b neon/qadd: add several improved x86 and vector extension versions
6b5814d9 avx512/ternarylogic: implement remaining functions
3fba9986 Add many fast floating point to integer conversion functions
b2f01b98 neon/st4_lane: Implement remaining functions
ccc9e2c8 neon/st3_lane: Implement remaining functions
3f0859be neon/st2_lane: Implement remaining functions
e136dfe7 neon/ld1_dup: Add f64 function implementations
4a2ceb45 neon/cvt: add some faster x86 float->int/uint conversions
b82b16ac neon/cvt: Add vcvt_f32_f64 and vcvt_f64_f32 implementations
477068c9 neon/st2: Implement remaining functions
3a93c5dd neon/ld4_lane: Implement remaining functions
75838c15 neon/qshlu_n: Add scalar function implementations
7d314092 simde/scalef: add scalef_ss/sd
d3547dac msa/add_a: initial implementation
8ba8dc84 msa/addvi: initial implementation
b1006161 Begin working on implementing MIPS MSA.
38088d10 fma: use fma/fms instead of mla/mls on NEON
76c4b7cd neon/cle: add some x86 implementations
d045a667 neon/cle: improve formatting of some x86 implementations
6fc12601 relaxed-simd: initial support for the WASM relaxed SIMD proposal
2d430eb4 neon/ld2: Implement remaining functions
fc3aef94 neon/ld1_lane: Implement remaining functions
0ec9c9c9 neon/rsqrte: Implement remaining functions
92e72c44 neon/rsqrts: Add remaining function implementations
e7cdccd0 neon/qdmulh_lane: Add remaining function implementations
905f1e4c neon/recpe: Add remaining function implementations
96cebc42 neon/recps: Add scalar function implementations
63ad6d0a neon/qrdmulh_lane: Add scalar function implementations
f8dacd07 simde-diagnostic: Include simde-arch
4ad3f10f neon/mul_lane: Add mul_laneq functions
25d0fe82 neon/sri_n: Add scalar function implementations
6fb9fa3a neon/shl_n: Add scalar function implementations
5738564f neon/shl: Add scalar implementations
fc2aed9b neon/rsra_n: Add scalar function implementations
7c7d8d80 neon/qshrn_n: Add scalar function implementations
76e65444 neon/qrshrn_n: Add scalar function implementations
25aa2124 neon/rshr_n: Add custom scalar function for utility
6d1c7aaf avx512/dbsad: initial implementation
4b1ba2ce avx512/dpbusd: initial implementation
02719bcc svml: remove some dead stores from cdfnorminv
803b29ac sse2: fix set but not used variable in _mm_cvtps_epi32
7ee622df Use SIMDE_HUGE_FUNCTION_ATTRIBUTES on several functions.
80439178 arch: fix SIMDE_ARCH_POWER_ALTIVEC_CHECK to include AltiVec check
604a90af neon/cvt: fix a couple of s390x implementations' NaN handling
a0fe7651 simd128: work around bad diagnostic from clang < 7
cd742d66 f16c: use __ARM_FEATURE_FP16_VECTOR_ARITHMETIC to detect Arm support
4f39e4fc Fix an assortment of small bugs
4bf12875 Remove all `&& 0`s in preprocessor macros.
8e0d0f93 simd128: remove stray `&& 0`
d98f81cb simd128: add optimized f32x4.floor implementations
b626266d simd128: add some Arm implementations of all_true
78957358 simd128: any_true implementations for Arm
20cd4d00 simd128: add improved add_sat implementations
ea364550 wasm128, sse2: disable -Wvector-conversion when calling vgetq_lane_s64
4e09afb4 neon/zip1: add armv7 implementations
f27932a7 simd128: add x86/Arm/POWER implementations
2bcd59bb avx512/conflict: implement missing functions
7da82adb avx512/multishift: initial implementation
e7229088 various: correct PPC and z/Arch versions plus typo
005d39c8 simd128: fix portable fallback for wasm_i8x16_swizzle
860127a1 Add NEON, SSE3, and AltiVec implementations of wasm_i8x16_swizzle
0959466e simd128: add AltiVec implementations of any/all_true
7f38c52e simd128: add vec_abs implementation of wasm_i8x16_abs
e2cb9632 simd128: work around clang bugs 50893 and 50901.
77e4f57d avx512/rol: implement remaining functions
1d60dc03 avx512/rolv: initial implementation
30681718 avx512: initial implementation
38f8ef8f avx512/ternarylogic: initial implementation
3efe186a Add constrained compilation mode
1faf7872 simd128: add simde_wasm_i64x2_ne
68616767 avx512/scalef: implement remaining functions
6ea919f8 avx512/conflict: implements mm_conflict_epi32
ad5d51c5 avx512/scalef: initial implementation
4f0f1e8f neon/qrshrun_n: Add scalar function implementations
dc278de7 neon/rshr_n: Add scalar function implementations
86f73e1e neon/rndn: Add macro corrections
189d7762 neon/qshrun_n: Add scalar function implementations
1fc63065 neon/rshl: Add scalar function implementations
4ca2973e neon/rndn: Add scalar function implementation
d78398c8 neon/qdmulh: Add scalar function implementations
7d43b7c9 neon/pmin: Add scalar function implementations
4dacfeff neon/pmax: Add scalar function implementations
abccc767 neon/padd: Add scalar function implementations
b3d97677 neon/neg: Complete implementation of function family
137afad7 neon/dup_lane: Complete implementation of function family
ef93f1bb neon/fma_lane: Implement fmaq_lane functions
e9dcfe8b neon/sra_n: Add scalar function implementations
44cf247c neon/shr_n: Add scalar function implementations
ca78eb82 neon/sub: Implements the two remaining scalar functions
65d8d52f avx512/rorv: implement _mm{256,512}{,_mask,_maskz}_rorv_epi{32,64}
1afa8148 Many work-arounds for GCC with MSA, and support in the docker image.
8bf571ac neon/ext: clean up shuffle-based implementation
51790ff8 avx512/rorv: initial implementation of _mm_rorv_epi32
952dab89 neon/st3: Add shuffle vector implementations
2229f4ba sse, sse2: work around GCC bug #100927
e0b88179 neon/ld{2,3,4}: disable -Wmaybe-uninitialized on all recent GCC
76c76bfa neon/fma_lane: portable and native implementations
002b4066 neon/mul_lane: finish implementation of function family
ae959e7e neon/qshlu_n: faster WASM implementations
7df8e3ab neon/qshlu_n: initial implementation
338eb083 neon/ld4: use conformant array parameters
049eaa9e neon/vld4: Wasm optimization of vld4q_u8
720db9ff neon/st3q_u8: Wasm optimization
ccf235e1 neon/qdmull: add WASM implementations
06a64a94 neon/movl: improve WASM implementation
e36a029e neon/tbl: add WASM implementation of vtbl1_u8
5debb615 neon/tst: implement scalar functions
cef74f3b neon/hadd,hsub: optimization for Wasm
502243a2 neon/qrdmulh_lane: fix typo in undefs
6eb625d7 fma: drop weird high-priority implementation in _mm_fmadd_ps
47ba41d6 neon/qshrn_n: initial implementation
b94e0298 neon/qrdmulh: native aliases for scalar functions should be A64
f27e9fcb neon/qrdmulh_lane: initial implementation
04e2ca66 neon/subhn: initial implementation
8b129a93 neon/sri_n: add 128-bit implementations
88dd65de neon/mull_lane: initial implementation
12c940ed neon/mlsl_lane: initial implementation
abc8dacf neon/mlal_lane: initial implementation
9438ea43 neon/dup_lane: fix macro for simde_vdup_laneq_u16
36e2ce5b neon/{add,sub}w_high: use vmovl_high instead of vmovl + get_high
d86492fa neon/sri_n: native and portable
60715735 neon/qshrun_n: native and portable implementations
de84bcd0 neon/qdmulh_lane: native and portable
4581232f avx512/roundscale_round: implement remaining functions
76b19b97 avx512/range_round,round: move range_round functions out of round
2ba2b7b8 neon/ld1_dup: native and portable (64-bit vectors)
f6fd4b67 neon/dup_lane: implement vdupq_lane_f64
07b4a2b3 neon/shll_n: native and portable implementations
58a0188d neon/dupq_lane: native and portable
623f2207 neon/st4_lane: portable and native *_{s,u}{8,16,32}
322663be neon/st3_lane: portable and native *_{s,u}{8,16,32}
7700b2e5 neon/st2_lane: portable and native for _{u,s}{8,16,32}
acc67df2 neon/cltz: Add scalar functions and natural vector fallbacks
fcf6e88e neon/clt: Add implementations of scalar functions
799e1629 neon/clez: Add implementations of scalar functions
f22ae740 neon/addhn: initial implementation
8774393f avx512/cmp{g,l}e: AVX-512 implementations of non-mask functions
1eb57468 avx512/cmple: finish implementations of all cmple functions
9b60d826 avx512/cmpge: fix bad _mm512_cmpge_epi64_mask implementation
6849da33 avx: use internal symbols in clang fallbacks for cmp_ps/pd functions
f2746208 avx512/cmpge: finish implementing all functions
135cbbf0 avx512/range: implement mm{,512}{,_mask,_maskz}_range_round*
6421a835 avx512/round, avx512/roundscale: add shorter vector fallbacks
5c6673f5 avx512/roundscale: implement simde_mm{256,512}_roundscale_ps
6fcb4433 neon/cle: Add implementations for remaining functions
a49bdc1c neon/fma_n: the 32-bit functions are missing on GCC on arm
05172a08 neon/ld4: work around spurious warning on clang < 10
2fa3d1d8 neon/qdmulh: add shuffle-based implementations
ea22a611 neon/qdmulh_n: native and portable implementations
5ef8e53d neon/qrshrn_n: native and portable implementations
fda538d1 neon/ld1_lane: portable and native implementations
8f118bbd neon/cgtz: Add implementations of remaining functions
31d5048c neon/cgt: Add implementation of remaining functions
79274d8d neon/ld4_lane: move private type usage to inside loop
bdcfccb7 neon/ld4_lane: native and portable implementations
bbc35b65 avx512/range: don't use masked comparisons for 128/256-bit versions
ef90404e avx512/range: fix fallback macros
5d00aa4c features: add z/arch to SIMDE_NATURAL_VECTOR_SIZE
83cab7c1 sve/cmplt: replace vec_and with & for s390 implementations
a636d0ae Fix gcc-10 compilation on s/390x
bb35d9f0 gfni: work around error with vec_bperm on clang-10 on POWER
2db3ba03 gfni: replace vec_and and vec_xor with & and ^ on z/arch
cdb3f68c sse, mmx: fix clang-11 on POWER
233fef43 gfni: add many x86, ARM, z/Arch, PPC and WASM implementations
c300a66e Don't set SIMDE_NO_CHECK_IMMEDIATE_CONSTANT in tests.
283c6e40 neon/qrshrun_n: native and portable implementations
9535e063 neon/rshrn_n: native and portable implementations
5c05d980 sse: prefer SIMDE_SHUFFLE_VECTOR implementation of _mm_shuffle_ps
cdfff167 sse: don't use armv7 impl of _MM_TRANSPOSE4_PS on armv8
921db75b neon/cvt: add out-of-range and NaN tests
b62dfb24 neon/dot_lane: add remaining implementation
b24c6d5c neon/recps: Use vector ops instead of relying on autovec
8acdce75 avx512/range: implement mm(256, 512)_mask(z)_range_p*
c0bc87e9 avx512/roundscale: initial implementation
f1fd8066 sse2: correct typos in simde_x_mm_broadcastlow_pd
29427e22 avx, avx512/cmp: properly handle NaN in _mm{,256,512}_cmp_{ps,pd,ss,sd}
c8d3304e neon/dot_lane: correct implementations of dot_lane functions
aa739959 neon/st2,st1: use zip + st1 to implement st2
375321b4 neon/dup_n: replace remaining functions with dup_n implementations
992541f9 neon/dup_lane: use dup_n
00602f7f neon/dup_lane: add shuffle-based implementations
9fcd7043 fma: add mls-based NEON implementations of fnmadd functions
7b88559a avx512/range: fix variable names in macro implementations
3a3cd210 sse2: prefer shuffle implementation of _mm_shuffle_epi32 to NEON
3061e315 common: move conversion functions for u32 <-> f32 into common
daa83861 neon/ld2: apply optimizations from previous commit to other extensions
53a7c6f7 neon/ld2: Wasm optimizations
b4fa3c6e neon/mls_n: initial implementation
849357cf neon/shrn_n: Wasm SIMD optimizations
c1841732 Add SIMDE_FAST_EXCEPTIONS option
c73b6b54 avx512/range: remove CONSTIFY macro usage
a8aac144 neon/shr_n: fix variable name in GFNI implementation of vshrq_n_s8
0d3ee449 avx512/range: implement mm512_range_ps/d functions
b71e9e44 neon/cgez: complete implementation of CGEZ family
7c6e0f4b neon/abd: Wasm SIMD implementation
79f39ad2 neon/ext: add __builtin_shufflevector implementation
e979809b neon/ld1: add Wasm SIMD implementation
829bc16b avx512: implement mm_mask(z)_unpack* funcs
c66c2fba Work around issues preventing compilation on NVCC
5edb0bc3 sse2: add fast-math WASM implementation of _mm_cvtps_epi32
dbdddf8a Fix compilation with clang on POWER
9f259118 neon/get_low: use __builtin_shufflevector if available
8bce4a4d neon/get_high: add __builtin_shufflevector optimizations
70f0acbf sse2: don't require constants for _mm_srai_epi{16,32}
7043be93 avx2: added vector size conditional for unpack
4f790ded neon/vmovq: define vmovq_n as aliases for vdup_n
60661a40 neon/vdup: vdupq_lane_f32 native and portable
c244411d neon/ld1_dup: split from ld1, dup_n fallbacks, WASM implementations
4dfa8110 neon/vld1q_dup: native and portable implementations
a33b557f avx512/unpack{hi,lo}: implement mm256_mask(z)_unpack* functions
b59facba avx512/unpack{hi,lo}: implement mask variants of unpacklo
fc1f74ec neon: port additional code to new style
1acf4544 neon/types: reverse logic for SIMDE_ARM_NEON_FORCE_NATIVE_TYPES
06ff80cd neon/types: use vector extensions for public types when available
30e49b1b neon: refactor to use different types on all targets
72fe8c3a neon/{min,max}nm: add some headers for -ffast-math
7ea2d71e sse2: vcvtnq_s32_f32 is armv8-specific
7c370654 neon/cvt: cast result of float/double comparison
339a10f7 neon/shrn_n: s16 s32 s64 u16 u32 u64 portable and native
ad4273b3 avx512/unpacklo: added vector size conditional
5809070a avx512/unpacklo: implement mm512_unpacklo_* functions
33984c24 neon/ld{2,3,4}: silence false positive diagnostic on GCC 7
91c2b412 neon/st2: vst2(q) f32 s8 s16 s32 u8 u16 u32
c6717a2b neon/vld2: vld2_{u16,u32} and vld2q_{u8,u16,u32,f32}
b3aae641 neon/fma_n: initial implementation
cc930ce3 neon/rsqrte: use vmls for fallbacks.
af29571a neon/fma: add a couple x86 and PPC implementations
05383e30 neon/mls: add _mm_fnmadd_* implementations of vmls*_f*
1951e4c7 neon/rsqrts: vrsqrts_f32 and vrsqrtsq_f32 native and portable
a8b09e16 neon: replace some more abs/labs/llabs usage with simde_math_* versions
de31fc3e math: use simde_math_-prefixed abs/labs/llabs
5a899528 neon/{min,max}nm: use simde_math_* prefixed min/max functions
4b64da56 diagnostic: silence -Wreserved-identifier warning from LLVM
a2520a85 simd128: add clang implementation of wasm_f64x2_promote_low_f32x4
359f1b18 simd128: cast to int ptrs instead of void* in wasm_v128_load*_lane
8682175b simd128: add some implementations of convert functions
ce8a9adb neon/rsqrte: vrsqrte_f32 and vrsqrteq_f32 on native and portable
ffc32e85 simd128: add missing WASM SIMD128 functions
84c95b06 neon/vld2_u8: native and portable implementation
c6161667 neon/recpe: Remove duplicate code and fix copyright year
bf3d202c neon/recps: recps/recpsq for native and portable
f44192f2 sve/add: initial implementation
614cbc36 neon/recpe: add some additional implementations stolen from SSE
75c24a80 neon/recpe: recpe_f32 and recpe_f64, native and portable
e846ddac Update WASM SIMD intrinsics to match new names.
89d22f41 common: don't use aligned OpenMP clause on MCST LCC
cc509617 fma: work around broken implementations of some functions on MCST LCC
e566aec7 x86: ignore warnings about inefficient functions on lcc
1a1b6c27 neon/cge: implement remaining functions in CGE family
b076027c sve/qadd: initial implementation
e95132fc sve/sel: add cast to make GCC on s/390x happy
492802a5 sve/add: switch some _x implementations to use _x instead of _z
d8fd62dc sve/dup: add *_m variants
9976c212 sve/dup: switch implementations to use svsel
a9fd997f sve/dup: rename from dup_n
f7c4646e common: improve check for C11 generic selections
ab5ce304 sve/ptest: simplify svptest_first
783f969d sve/whilelt: small optimizations for all whilelt functions
8c696906 sve: add native aliases for overloads
c3815d08 sve/add: add svadd_n_* functions
6aae2920 sve/add: switch to using svsel for implementations of _z/_m variants
86aa82b6 sve/whilelt: add svwhilelt_*_{u32,s64,u64} implementations
bf99fbce sve/and: switch implementations to use svsel
d60cdae7 sve/sel: initial implementation
6f47cffb sve/types: add mmask4 functions for 256-bit vectors
0ebc13f1 sve: some tweaks to get s390x working
53ea316c sve/and: initial implementation
dab8f553 sse4.1: add some casts to make clang -Weverything happy
38111a8b avx512/cvtt: add simde_mm{_mask,_maskz}_cvttpd_epi64
eed525d3 avx512/cvt: add simde_mm{_mask,_maskz}_cvtepi64_pd
e6a87ee7 Initial import of a portable SVE implementation.
101c3cbe avx512: add tests for previous commit (104a99bc)
ce57d61d avx512: add several new functions
dd6b78e7 neon/ceqz: finish implementation of ceqz family
1e6be96a avx512: implement mm*_mask(z)_compress(storeu)_*
9af89540 sse4.1: _mm_blendv_epi8: add sse2 and update wasm_simd128 implementations
6e4e66d6 avx512:compress: implement mm256_mask(z)_compress(storeu)_p*
89ef83e2 simd128: add movemask-based implementations of any/all_true functions
3979670d avx512/insert: convert macros to functions, regenerate old-style tests
9814181e simd128: add additional cast in wasm_i32x4_abs
071795e7 avx512/fmsub: implement _fmsub_ functions for AVX512VL
846727ee avx512/compress: implement _mm256_mask_compress_pd
6694ae06 avx512/cmpeq: implement _mm512_mask_cmpeq_epi8_mask
09a6fec2 avx512/cmpneq: initial implementation of 128-bit and 256-bit functions
8f7a6be3 avx512/abs: work around buggy pd functions in GCC 7 - 8.2
0e64c1b5 avx2: fix undefs for many native aliases
cc007898 sse2: ignore broken _mm_loadu_si{16,32} on GCC
42205923 sse2: use simde_math_{add,sub}s_* for _mm_{add,sub}s_* functions
ce930565 {neon,simd128,avx512/abs}: provide vector versions of i64 abs
d390e3f4 avx2: add vector/shuffle implementation of _mm256_madd_epi16
ec395e86 avx512/insert: implement inserti{,_mask,_maskz}_{32x8,64x2}
60e9fed6 avx512/insert: implement mm512{_mask,_maskz}_insert{f32x8,64x2}
f62bb037 neon/cnt: add x86 implementations of vcntq_s8
c0b43a38 avx512/xor: implement mm512_mask(z)_xor_pd/s functions
fb3f2fbc avx512/or: implement mm512_mask(z)_or_ps/d functions
42032d0e avx512/mullo: implement mm512_mullo_epi64 with mask(z)
0b3d7f8a neon/cmla, neon/cmla_rot{90,180,270}: CMLA requires armv8.3+
de120cf1 neon/fma: add more extensive feature checking
fbf4bc3c neon/rndi, sse2: work around several functions missing in GCC
85d8190e arch: __ARM_ARCH now (v8.1+) encodes the minor version
9d7b0dad neon/cmla, neon/cmla_rot{90,180,270}: check compiler versions
71128dca neon/cmla, neon/cmla_rot{90,180,270}, neon/fma: initial implementation
adaf41a1 avx512/or: implement _mm512_mask_or_pd function
b33bd046 neon/bcax: Adds WASM and SSE2 implementations

git-subtree-dir: lib/simde/simde
git-subtree-split: cbef1c152ad2e73bc8971c306208f55f44b16088
milot-mirdita committed Nov 1, 2022
1 parent 1524231 commit 845e09b
Showing 250 changed files with 61,173 additions and 8,670 deletions.
40 changes: 40 additions & 0 deletions arm/neon.h
@@ -34,6 +34,7 @@
#include "neon/abdl.h"
#include "neon/abs.h"
#include "neon/add.h"
#include "neon/addhn.h"
#include "neon/addl.h"
#include "neon/addlv.h"
#include "neon/addl_high.h"
@@ -58,6 +59,10 @@
#include "neon/clt.h"
#include "neon/cltz.h"
#include "neon/clz.h"
#include "neon/cmla.h"
#include "neon/cmla_rot90.h"
#include "neon/cmla_rot180.h"
#include "neon/cmla_rot270.h"
#include "neon/cnt.h"
#include "neon/cvt.h"
#include "neon/combine.h"
@@ -68,14 +73,21 @@
#include "neon/dup_n.h"
#include "neon/eor.h"
#include "neon/ext.h"
#include "neon/fma.h"
#include "neon/fma_lane.h"
#include "neon/fma_n.h"
#include "neon/get_high.h"
#include "neon/get_lane.h"
#include "neon/get_low.h"
#include "neon/hadd.h"
#include "neon/hsub.h"
#include "neon/ld1.h"
#include "neon/ld1_dup.h"
#include "neon/ld1_lane.h"
#include "neon/ld2.h"
#include "neon/ld3.h"
#include "neon/ld4.h"
#include "neon/ld4_lane.h"
#include "neon/max.h"
#include "neon/maxnm.h"
#include "neon/maxv.h"
@@ -86,10 +98,15 @@
#include "neon/mla_n.h"
#include "neon/mlal.h"
#include "neon/mlal_high.h"
#include "neon/mlal_high_n.h"
#include "neon/mlal_lane.h"
#include "neon/mlal_n.h"
#include "neon/mls.h"
#include "neon/mls_n.h"
#include "neon/mlsl.h"
#include "neon/mlsl_high.h"
#include "neon/mlsl_high_n.h"
#include "neon/mlsl_lane.h"
#include "neon/mlsl_n.h"
#include "neon/movl.h"
#include "neon/movl_high.h"
@@ -100,6 +117,7 @@
#include "neon/mul_n.h"
#include "neon/mull.h"
#include "neon/mull_high.h"
#include "neon/mull_lane.h"
#include "neon/mull_n.h"
#include "neon/mvn.h"
#include "neon/neg.h"
@@ -113,18 +131,28 @@
#include "neon/qabs.h"
#include "neon/qadd.h"
#include "neon/qdmulh.h"
#include "neon/qdmulh_lane.h"
#include "neon/qdmulh_n.h"
#include "neon/qdmull.h"
#include "neon/qrdmulh.h"
#include "neon/qrdmulh_lane.h"
#include "neon/qrdmulh_n.h"
#include "neon/qrshrn_n.h"
#include "neon/qrshrun_n.h"
#include "neon/qmovn.h"
#include "neon/qmovun.h"
#include "neon/qmovn_high.h"
#include "neon/qneg.h"
#include "neon/qsub.h"
#include "neon/qshl.h"
#include "neon/qshlu_n.h"
#include "neon/qshrn_n.h"
#include "neon/qshrun_n.h"
#include "neon/qtbl.h"
#include "neon/qtbx.h"
#include "neon/rbit.h"
#include "neon/recpe.h"
#include "neon/recps.h"
#include "neon/reinterpret.h"
#include "neon/rev16.h"
#include "neon/rev32.h"
@@ -137,19 +165,31 @@
#include "neon/rndp.h"
#include "neon/rshl.h"
#include "neon/rshr_n.h"
#include "neon/rshrn_n.h"
#include "neon/rsqrte.h"
#include "neon/rsqrts.h"
#include "neon/rsra_n.h"
#include "neon/set_lane.h"
#include "neon/shl.h"
#include "neon/shl_n.h"
#include "neon/shll_n.h"
#include "neon/shr_n.h"
#include "neon/shrn_n.h"
#include "neon/sqadd.h"
#include "neon/sra_n.h"
#include "neon/sri_n.h"
#include "neon/st1.h"
#include "neon/st1_lane.h"
#include "neon/st2.h"
#include "neon/st2_lane.h"
#include "neon/st3.h"
#include "neon/st3_lane.h"
#include "neon/st4.h"
#include "neon/st4_lane.h"
#include "neon/sub.h"
#include "neon/subhn.h"
#include "neon/subl.h"
#include "neon/subl_high.h"
#include "neon/subw.h"
#include "neon/subw_high.h"
#include "neon/tbl.h"
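The arm/neon.h hunks above only wire the new function families into the umbrella header: each per-intrinsic header (abd.h, cmla.h, fma.h, ld2.h, st2_lane.h, and so on) is included once, so a single include exposes the whole portable NEON surface. A minimal consumer sketch using one of the families touched by this update (vabd); the include path, build setup, and main() harness are assumptions for illustration, not part of the commit:

#include <stdint.h>
#include <stdio.h>
/* Assumed include path; in this repository the tree lives under lib/simde/simde. */
#include "simde/arm/neon.h"

int main(void) {
  int8_t a_buf[16] = {  0,  1,  2,  3,  4,  5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  int8_t b_buf[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,  5,  4,  3,  2,  1,  0 };

  /* Lane-wise absolute difference; on x86, WASM, POWER, etc. this resolves to
   * the fallbacks added in arm/neon/abd.h below. */
  simde_int8x16_t d = simde_vabdq_s8(simde_vld1q_s8(a_buf), simde_vld1q_s8(b_buf));

  int8_t out[16];
  simde_vst1q_s8(out, d);
  printf("%d %d\n", out[0], out[15]); /* prints: 15 15 */
  return 0;
}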
167 changes: 136 additions & 31 deletions arm/neon/abd.h
@@ -100,6 +100,23 @@ simde_int8x8_t
simde_vabd_s8(simde_int8x8_t a, simde_int8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabd_s8(a, b);
#elif defined(SIMDE_X86_MMX_NATIVE)
simde_int8x8_private
r_,
a_ = simde_int8x8_to_private(a),
b_ = simde_int8x8_to_private(b);

const __m64 m = _mm_cmpgt_pi8(b_.m64, a_.m64);
r_.m64 =
_mm_xor_si64(
_mm_add_pi8(
_mm_sub_pi8(a_.m64, b_.m64),
m
),
m
);

return simde_int8x8_from_private(r_);
#else
return simde_vmovn_s16(simde_vabsq_s16(simde_vsubl_s8(a, b)));
#endif
@@ -114,6 +131,15 @@ simde_int16x4_t
simde_vabd_s16(simde_int16x4_t a, simde_int16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabd_s16(a, b);
#elif defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_X86_SSE_NATIVE)
simde_int16x4_private
r_,
a_ = simde_int16x4_to_private(a),
b_ = simde_int16x4_to_private(b);

r_.m64 = _mm_sub_pi16(_mm_max_pi16(a_.m64, b_.m64), _mm_min_pi16(a_.m64, b_.m64));

return simde_int16x4_from_private(r_);
#else
return simde_vmovn_s32(simde_vabsq_s32(simde_vsubl_s16(a, b)));
#endif
@@ -227,17 +253,37 @@ simde_int8x16_t
simde_vabdq_s8(simde_int8x16_t a, simde_int8x16_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdq_s8(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_int8x16_private
r_,
a_ = simde_int8x16_to_private(a),
b_ = simde_int8x16_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int16_t tmp = HEDLEY_STATIC_CAST(int16_t, a_.values[i]) - HEDLEY_STATIC_CAST(int16_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(int8_t, tmp < 0 ? -tmp : tmp);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
r_.m128i = _mm_sub_epi8(_mm_max_epi8(a_.m128i, b_.m128i), _mm_min_epi8(a_.m128i, b_.m128i));
#elif defined(SIMDE_X86_SSE2_NATIVE)
const __m128i m = _mm_cmpgt_epi8(b_.m128i, a_.m128i);
r_.m128i =
_mm_xor_si128(
_mm_add_epi8(
_mm_sub_epi8(a_.m128i, b_.m128i),
m
),
m
);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i8x16_sub(wasm_i8x16_max(a_.v128, b_.v128), wasm_i8x16_min(a_.v128, b_.v128));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int16_t tmp = HEDLEY_STATIC_CAST(int16_t, a_.values[i]) - HEDLEY_STATIC_CAST(int16_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(int8_t, tmp < 0 ? -tmp : tmp);
}
#endif

return simde_int8x16_from_private(r_);
#endif
@@ -252,18 +298,31 @@ simde_int16x8_t
simde_vabdq_s16(simde_int16x8_t a, simde_int16x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdq_s16(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_int16x8_private
r_,
a_ = simde_int16x8_to_private(a),
b_ = simde_int16x8_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int32_t tmp = HEDLEY_STATIC_CAST(int32_t, a_.values[i]) - HEDLEY_STATIC_CAST(int32_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(int16_t, tmp < 0 ? -tmp : tmp);
}

#if defined(SIMDE_X86_SSE2_NATIVE)
/* https://github.com/simd-everywhere/simde/issues/855#issuecomment-881658604 */
r_.m128i = _mm_sub_epi16(_mm_max_epi16(a_.m128i, b_.m128i), _mm_min_epi16(a_.m128i, b_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i16x8_sub(wasm_i16x8_max(a_.v128, b_.v128), wasm_i16x8_min(a_.v128, b_.v128));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] =
(a_.values[i] < b_.values[i]) ?
(b_.values[i] - a_.values[i]) :
(a_.values[i] - b_.values[i]);
}

#endif
return simde_int16x8_from_private(r_);
#endif
}
@@ -277,17 +336,35 @@ simde_int32x4_t
simde_vabdq_s32(simde_int32x4_t a, simde_int32x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdq_s32(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_int32x4_private
r_,
a_ = simde_int32x4_to_private(a),
b_ = simde_int32x4_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int64_t tmp = HEDLEY_STATIC_CAST(int64_t, a_.values[i]) - HEDLEY_STATIC_CAST(int64_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(int32_t, tmp < 0 ? -tmp : tmp);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
r_.m128i = _mm_sub_epi32(_mm_max_epi32(a_.m128i, b_.m128i), _mm_min_epi32(a_.m128i, b_.m128i));
#elif defined(SIMDE_X86_SSE2_NATIVE)
const __m128i m = _mm_cmpgt_epi32(b_.m128i, a_.m128i);
r_.m128i =
_mm_xor_si128(
_mm_add_epi32(
_mm_sub_epi32(a_.m128i, b_.m128i),
m
),
m
);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int64_t tmp = HEDLEY_STATIC_CAST(int64_t, a_.values[i]) - HEDLEY_STATIC_CAST(int64_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(int32_t, tmp < 0 ? -tmp : tmp);
}
#endif

return simde_int32x4_from_private(r_);
#endif
@@ -304,17 +381,27 @@ simde_vabdq_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
return vabdq_u8(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
return vec_absd(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_uint8x16_private
r_,
a_ = simde_uint8x16_to_private(a),
b_ = simde_uint8x16_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int16_t tmp = HEDLEY_STATIC_CAST(int16_t, a_.values[i]) - HEDLEY_STATIC_CAST(int16_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, tmp < 0 ? -tmp : tmp);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
r_.m128i = _mm_sub_epi8(_mm_max_epu8(a_.m128i, b_.m128i), _mm_min_epu8(a_.m128i, b_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i8x16_sub(wasm_u8x16_max(a_.v128, b_.v128), wasm_u8x16_min(a_.v128, b_.v128));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int16_t tmp = HEDLEY_STATIC_CAST(int16_t, a_.values[i]) - HEDLEY_STATIC_CAST(int16_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, tmp < 0 ? -tmp : tmp);
}
#endif

return simde_uint8x16_from_private(r_);
#endif
@@ -331,17 +418,27 @@ simde_vabdq_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
return vabdq_u16(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
return vec_absd(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_uint16x8_private
r_,
a_ = simde_uint16x8_to_private(a),
b_ = simde_uint16x8_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int32_t tmp = HEDLEY_STATIC_CAST(int32_t, a_.values[i]) - HEDLEY_STATIC_CAST(int32_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, tmp < 0 ? -tmp : tmp);
}
#if defined(SIMDE_X86_SSE4_2_NATIVE)
r_.m128i = _mm_sub_epi16(_mm_max_epu16(a_.m128i, b_.m128i), _mm_min_epu16(a_.m128i, b_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i16x8_sub(wasm_u16x8_max(a_.v128, b_.v128), wasm_u16x8_min(a_.v128, b_.v128));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int32_t tmp = HEDLEY_STATIC_CAST(int32_t, a_.values[i]) - HEDLEY_STATIC_CAST(int32_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, tmp < 0 ? -tmp : tmp);
}
#endif

return simde_uint16x8_from_private(r_);
#endif
@@ -358,17 +455,25 @@ simde_vabdq_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
return vabdq_u32(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
return vec_absd(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_uint32x4_private
r_,
a_ = simde_uint32x4_to_private(a),
b_ = simde_uint32x4_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int64_t tmp = HEDLEY_STATIC_CAST(int64_t, a_.values[i]) - HEDLEY_STATIC_CAST(int64_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, tmp < 0 ? -tmp : tmp);
}
#if defined(SIMDE_X86_SSE4_2_NATIVE)
r_.m128i = _mm_sub_epi32(_mm_max_epu32(a_.m128i, b_.m128i), _mm_min_epu32(a_.m128i, b_.m128i));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int64_t tmp = HEDLEY_STATIC_CAST(int64_t, a_.values[i]) - HEDLEY_STATIC_CAST(int64_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, tmp < 0 ? -tmp : tmp);
}
#endif

return simde_uint32x4_from_private(r_);
#endif
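Several of the SSE2/MMX fallbacks above compute the signed absolute difference with a compare-and-flip idiom instead of max/min: m = cmpgt(b, a) is all ones exactly in the lanes where b > a, and ((a - b) + m) ^ m negates the wrapped difference in those lanes, since with m all ones, (x + m) ^ m == ~(x - 1) == -x in two's complement. The remaining paths use the simpler identity |a - b| == max(a, b) - min(a, b) wherever per-type max/min intrinsics are available. A scalar sketch of one int8_t lane of the signed idiom, written here only to illustrate the identity (the helper name is made up, not taken from the diff):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of _mm_xor_si128(_mm_add_epi8(_mm_sub_epi8(a, b), m), m)
 * for a single lane, where m = _mm_cmpgt_epi8(b, a). */
static int8_t abd_s8_lane(int8_t a, int8_t b) {
  int8_t x = (int8_t) (a - b);              /* may wrap, like _mm_sub_epi8 */
  int8_t m = (b > a) ? (int8_t) -1 : 0;     /* all ones when b > a         */
  return (int8_t) (((int8_t) (x + m)) ^ m); /* m == -1 turns x into -x     */
}

int main(void) {
  printf("%d %d\n", abd_s8_lane(3, 10), abd_s8_lane(10, 3)); /* 7 7 */
  return 0;
}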
[Diffs for the remaining changed files are not shown.]
