Squashed 'lib/simde/simde/' changes from 1f4a28c4..cbef1c15
cbef1c15 avx512/permutex2var: hard-code types in casts instead of using typeof
71a65cbd gfni: add cast to work around -Wimplicit-int-conversion warning
10dd508b avx512/scalef: work around for GCC bug #101614
277b303b neon/cvt: fix compilation with -ffast-math
9ec8c259 avx512/scalef: _mm_mask_scalef_round_ss is still missing in GCC
e821bee3 Wrap static assertions in code to disable -Wreserved-identifier
13cf2969 The fix for GCC bug #95483 wasn't in a release until 11.2
b66e3cb9 avx2: separate natural vector length for float, int, and double types
dda31b76 Add -Wdeclaration-after-statement to the list of ignored warnings.
9af03cd0 Work around compound literal warning with clang
74a4aa59 neon/clt: Add SSE/AVX512 fallbacks
02ce512d neon/mlsl_high_n: initial implementation
6472321c neon/mlal_high_n: initial implementation
2632bbc1 neon/subl_high: initial implementation
d1d2362d neon/types: remove duplicate NEON float16_t definitions
456812f8 sse: avoid including windows.h when possible
332dcc83 neon/reinterpret: change defines to work with templated callers
e369cd0c neon/cge: Improve some of the SSE2 fallbacks
3397efe1 deal with WASM SIMD128 API changes.
3aa4ae58 neon/rndn: Fix macros to workaround bugs
30b3607b neon/ld1: Fix macros in order to workaround bugs
8cac29c6 neon/cge: Implement f16 functions
c96b3ae6 neon/cagt: Implement f16 functions
f948d39a neon/bsl: Implement f16 functions
d6e025bd neon/reinterpret: f16_u16 and u16_f16 implementations
5e763da5 neon/add: Implement f16 functions
5a7c9e13 neon/ceqz: Implement f16 functions
1ba94bc4 neon/dup_n: Implement f16 functions
af26004a neon/ceq: Implement f16 functions
e41944f3 neon/st1: Add f16 functions
a660d577 neon/cvt: Implement f16 functions
412da5b3 neon/ld1: Implement f16 functions
068485c9 neon/cage: Initial f16 implementations
89fb99ee neon: Implement f16 types
50a56ef7 sse4.2: work around more warnings on old clang
fa54e7b3 avx512/permutex2var: work around incorrect definition on old clang
d20c7bf8 sse: use portable implementation to work around llvm bug #344589
371fd445 avx: work around incorrect maskload/store definitions on clang < 3.8
3bb373c8 Various fixes for -fno-lax-vector-conversions
f26ad2d1 avx512/fixupimm: initial implementation
f9182e3b Fix warnings with -fno-lax-vector-conversions
37c26d7f avx512/dpbusds: complete function family
0dc7eaf6 sse: replace _mm_prefetch implementation
b7fd63d9 neon/ld1q: u8_x2, u8_x3, u8_x4
6427473b neon/mul: add improved SSE2 vmulq_s8 implementation
b843d7e1 avx512/cvt: add _mm512_cvtepu32_ps
5df05510 simd128: improve many lt and gt implementations
495a0d2a neon/mul: implement unsigned multiplication using signed functions
2b087a1c neon/qadd: fix warning in ternarylogic call in vaddq_u32
f027c8da neon/qabs: add some faster implementations
bf6667b4 simd128: add fast sqrt implementations
d490ca7a simd128: add fast extmul_low/high implementations
2abd2cc0 simd128: add NEON and POWER shift implementations
3032eb33 simd128: add fast promote/demote implementations
e92273a6 simd128: add dedicated functions for unsigned extract_lane
34c5733c sse2, sse4.1: pull in improved packs/packus implementations from WASM
1bfc221c simd128: add fast narrow implementations
f333a089 simd128: add fast implementations of extend_low/extend_high
b4e0d0cc msa/madd: initial implementation
c09e6b0a neon/rndn: work around some missing functions in GCC on armv8
cc7afa77 avx512/4dpwssds: initial implementation
a9cec6fe avx512/dpbf16: implement remaining functions
371da5f8 avx512/dpwssds: initial implementation
ccef3bee common: Use AArch64 intrinsics if _M_ARM64EC is defined
f79c08c3 xop: fix NEON implementation of maccs functions to use NEON types
9eb0a88d sse4.1: use NEON types instead of vector in insert implementations
0bbae5ff avx512/roundscale: don't assume IEEE 754 storage
77673258 fma: use NEON types in simde_mm_fnmadd_ps NEON implementation
865412e7 sse2: remove statement expr requirement for NEON srli/srai macros
573c0a24 sse4.1: replace NEON implementations with shuffle-based implementations
534794b2 sse4.1: remove statement expr dependency in blend functions
a571ca8c fma: fix return value of simde_mm_fnmadd_ps on NEON
df95ab8e sse, sse2: clean up several shuffle macros
44e25b30 sse2: add parenthesis around macro arguments
305ac0a8 avx512/set, avx512/popcnt: use _mm512_set_epi8 only when available
98de6621 relaxed-simd: add blend functions
974f83d5 relaxed-simd: add fms functions
a46a04b7 relaxed-simd: add fma functions
54c62bf7 avx512/popcnt: implement remaining functions
d4dc926f avx512/dpbf16: initial implementation
b9a7904d avx512/4dpwssd: implement complete function family
f54cc98a avx512/dpwssd: initial implementation
7e877d17 avx512/bitshuffle: initial implementation
9e96b711 avx512/dpbusd: implement remaining functions
423572d5 simd128: use vec_cmpgt instead of vec_cmplt in pmin
73b6978f sse, sse2: fix vec_cpsign order test
7c0bdbff gfni: remove unintentional dependency on vector extensions
26fcfdb1 simd128: add fast ceil implementations
85035430 Improve widening pairwise addition implementations
8f35dc1a simd128: add fast max/pmax implementations
a8adeffc neon/cvt: disable some code on 32-bit x86 which uses _mm_cvttsd_si64
29955848 avx512/shldv: limit shuffle-based version to little endian
ae330dd9 simd128: add NEON, Altivec, & vector extension sub_sat implementations
9debe735 neon/cvt, relaxed-simd: add work-around for GCC bug #101614
eab383d9 avx512/dbsad: add vector extension impl. and improve scalar version
79c93ce0 sse, sse2: sync clang-12 changes for vec_cpsgn
7205c644 avx512/cvtt: _mm_cvttpd_epi64 is only available on x86_64
42538f0e simd128, sse2: more cvtpd_ps/f32x4_demote_f64x2_zero implementations
1bec285e simd128, sse2: add more madd_epi16 / i32x4_dot_i16x8 implementations
6dfdf3d2 simd128: vector extension implementation of floating-point abs
00c3b68b simd128, neon/neg: add VSX implementations of abs and neg functions
7f3a52d0 neon/cgt, simd128: improve some unsigned comparisons on x86
f5184634 neon/abd: add much better implementations
9b1974dd Add @aqrit's SSE2 min/max implementations
9caf5e6e simd128: add more pmin/pmax implementations
dcd00397 neon/qrdmulh: steal WASM q15mulr_sat implementation for qrdmulhq_s16
34dee780 simd128: add SSE2 q15mulr_sat implementation
fe3e623e neon/min: add SSE2 vminq_u32 implementation
4abbb4db neon/min: add SSE2 vqsubq_u32 implementation
c1158835 simd128: add improved min implementations on several architectures
c059f800 relaxed-simd: add trunc functions
0394e967 simd128: add several AArch64 and Altivec trunc_sat implementations
3fa2026b Fix several places where we assumed NEON used vector extensions.
6a183313 neon/qsub: add some SSE and vector extension implementations
313561fe msa/subv: initial implementation
8f1155e4 msa/andi: initial implementation
d20bca47 msa/and: initial implementation
82e93303 gfni: work around clang bug #50932
3a27037f arch: set SIMDE_ARCH_ARM for AArch64 on MSVC
d19a9d6a msa/adds: initial implementation
41f9ad33 neon/qadd: improve SSE implementation
eb55cce3 avx512/shldv: initial implementation
ee0a83e1 avx512/popcnt: initial implementation
48855d3a msa/adds_a: initial implementation
6133600b neon/qadd: add several improved x86 and vector extension versions
6b5814d9 avx512/ternarylogic: implement remaining functions
3fba9986 Add many fast floating point to integer conversion functions
b2f01b98 neon/st4_lane: Implement remaining functions
ccc9e2c8 neon/st3_lane: Implement remaining functions
3f0859be neon/st2_lane: Implement remaining functions
e136dfe7 neon/ld1_dup: Add f64 function implementations
4a2ceb45 neon/cvt: add some faster x86 float->int/uint conversions
b82b16ac neon/cvt: Add vcvt_f32_f64 and vcvt_f64_f32 implementations
477068c9 neon/st2: Implement remaining functions
3a93c5dd neon/ld4_lane: Implement remaining functions
75838c15 neon/qshlu_n: Add scalar function implementations
7d314092 simde/scalef: add scalef_ss/sd
d3547dac msa/add_a: initial implementation
8ba8dc84 msa/addvi: initial implementation
b1006161 Begin working on implementing MIPS MSA.
38088d10 fma: use fma/fms instead of mla/mls on NEON
76c4b7cd neon/cle: add some x86 implementations
d045a667 neon/cle: improve formatting of some x86 implementations
6fc12601 relaxed-simd: initial support for the WASM relaxed SIMD proposal
2d430eb4 neon/ld2: Implement remaining functions
fc3aef94 neon/ld1_lane: Implement remaining functions
0ec9c9c9 neon/rsqrte: Implement remaining functions
92e72c44 neon/rsqrts: Add remaining function implementations
e7cdccd0 neon/qdmulh_lane: Add remaining function implementations
905f1e4c neon/recpe: Add remaining function implementations
96cebc42 neon/recps: Add scalar function implementations
63ad6d0a neon/qrdmulh_lane: Add scalar function implementations
f8dacd07 simde-diagnostic: Include simde-arch
4ad3f10f neon/mul_lane: Add mul_laneq functions
25d0fe82 neon/sri_n: Add scalar function implementations
6fb9fa3a neon/shl_n: Add scalar function implementations
5738564f neon/shl: Add scalar implementations
fc2aed9b neon/rsra_n: Add scalar function implementations
7c7d8d80 neon/qshrn_n: Add scalar function implementations
76e65444 neon/qrshrn_n: Add scalar function implementations
25aa2124 neon/rshr_n: Add custom scalar function for utility
6d1c7aaf avx512/dbsad: initial implementation
4b1ba2ce avx512/dpbusd: initial implementation
02719bcc svml: remove some dead stores from cdfnorminv
803b29ac sse2: fix set but not used variable in _mm_cvtps_epi32
7ee622df Use SIMDE_HUGE_FUNCTION_ATTRIBUTES on several functions.
80439178 arch: fix SIMDE_ARCH_POWER_ALTIVEC_CHECK to include AltiVec check
604a90af neon/cvt: fix a couple of s390x implementations' NaN handling
a0fe7651 simd128: work around bad diagnostic from clang < 7
cd742d66 f16c: use __ARM_FEATURE_FP16_VECTOR_ARITHMETIC to detect Arm support
4f39e4fc Fix an assortment of small bugs
4bf12875 Remove all `&& 0`s in preprocessor macros.
8e0d0f93 simd128: remove stray `&& 0`
d98f81cb simd128: add optimized f32x4.floor implementations
b626266d simd128: add some Arm implementations of all_true
78957358 simd128: any_true implementations for Arm
20cd4d00 simd128: add improved add_sat implementations
ea364550 wasm128, sse2: disable -Wvector-conversion when calling vgetq_lane_s64
4e09afb4 neon/zip1: add armv7 implementations
f27932a7 simd128: add x86/Arm/POWER implementations
2bcd59bb avx512/conflict: implement missing functions
7da82adb avx512/multishift: initial implementation
e7229088 various: correct PPC and z/Arch versions plus typo
005d39c8 simd128: fix portable fallback for wasm_i8x16_swizzle
860127a1 Add NEON, SSE3, and AltiVec implementations of wasm_i8x16_swizzle
0959466e simd128: add AltiVec implementations of any/all_true
7f38c52e simd128: add vec_abs implementation of wasm_i8x16_abs
e2cb9632 simd128: work around clang bugs 50893 and 50901.
77e4f57d avx512/rol: implement remaining functions
1d60dc03 avx512/rolv: initial implementation
30681718 avx512: initial implementation
38f8ef8f avx512/ternarylogic: initial implementation
3efe186a Add constrained compilation mode
1faf7872 simd128: add simde_wasm_i64x2_ne
68616767 avx512/scalef: implement remaining functions
6ea919f8 avx512/conflict: implements mm_conflict_epi32
ad5d51c5 avx512/scalef: initial implementation
4f0f1e8f neon/qrshrun_n: Add scalar function implementations
dc278de7 neon/rshr_n: Add scalar function implementations
86f73e1e neon/rndn: Add macro corrections
189d7762 neon/qshrun_n: Add scalar function implementations
1fc63065 neon/rshl: Add scalar function implementations
4ca2973e neon/rndn: Add scalar function implementation
d78398c8 neon/qdmulh: Add scalar function implementations
7d43b7c9 neon/pmin: Add scalar function implementations
4dacfeff neon/pmax: Add scalar function implementations
abccc767 neon/padd: Add scalar function implementations
b3d97677 neon/neg: Complete implementation of function family
137afad7 neon/dup_lane: Complete implementation of function family
ef93f1bb neon/fma_lane: Implement fmaq_lane functions
e9dcfe8b neon/sra_n: Add scalar function implementations
44cf247c neon/shr_n: Add scalar function implementations
ca78eb82 neon/sub: Implements the two remaining scalar functions
65d8d52f avx512/rorv: implement _mm{256,512}{,_mask,_maskz}_rorv_epi{32,64}
1afa8148 Many work-arounds for GCC with MSA, and support in the docker image.
8bf571ac neon/ext: clean up shuffle-based implementation
51790ff8 avx512/rorv: initial implementation of _mm_rorv_epi32
952dab89 neon/st3: Add shuffle vector implementations
2229f4ba sse, sse2: work around GCC bug #100927
e0b88179 neon/ld{2,3,4}: disable -Wmaybe-uninitialized on all recent GCC
76c76bfa neon/fma_lane: portable and native implementations
002b4066 neon/mul_lane: finish implementation of function family
ae959e7e neon/qshlu_n: faster WASM implementations
7df8e3ab neon/qshlu_n: initial implementation
338eb083 neon/ld4: use conformant array parameters
049eaa9e neon/vld4: Wasm optimization of vld4q_u8
720db9ff neon/st3q_u8: Wasm optimization
ccf235e1 neon/qdmull: add WASM implementations
06a64a94 neon/movl: improve WASM implementation
e36a029e neon/tbl: add WASM implementation of vtbl1_u8
5debb615 neon/tst: implement scalar functions
cef74f3b neon/hadd,hsub: optimization for Wasm
502243a2 neon/qrdmulh_lane: fix typo in undefs
6eb625d7 fma: drop weird high-priority implementation in _mm_fmadd_ps
47ba41d6 neon/qshrn_n: initial implementation
b94e0298 neon/qrdmulh: native aliases for scalar functions should be A64
f27e9fcb neon/qrdmulh_lane: initial implementation
04e2ca66 neon/subhn: initial implementation
8b129a93 neon/sri_n: add 128-bit implementations
88dd65de neon/mull_lane: initial implementation
12c940ed neon/mlsl_lane: initial implementation
abc8dacf neon/mlal_lane: initial implementation
9438ea43 neon/dup_lane: fix macro for simde_vdup_laneq_u16
36e2ce5b neon/{add,sub}w_high: use vmovl_high instead of vmovl + get_high
d86492fa neon/sri_n: native and portable
60715735 neon/qshrun_n: native and portable implementations
de84bcd0 neon/qdmulh_lane: native and portable
4581232f avx512/roundscale_round: implement remaining functions
76b19b97 avx512/range_round,round: move range_round functions out of round
2ba2b7b8 neon/ld1_dup: native and portable (64-bit vectors)
f6fd4b67 neon/dup_lane: implement vdupq_lane_f64
07b4a2b3 neon/shll_n: native and portable implementations
58a0188d neon/dupq_lane: native and portable
623f2207 neon/st4_lane: portable and native *_{s,u}{8,16,32}
322663be neon/st3_lane: portable and native *_{s,u}{8,16,32}
7700b2e5 neon/st2_lane: portable and native for _{u,s}{8,16,32}
acc67df2 neon/cltz: Add scalar functions and natural vector fallbacks
fcf6e88e neon/clt: Add implementations of scalar functions
799e1629 neon/clez: Add implementations of scalar functions
f22ae740 neon/addhn: initial implementation
8774393f avx512/cmp{g,l}e: AVX-512 implementations of non-mask functions
1eb57468 avx512/cmple: finish implementations of all cmple functions
9b60d826 avx512/cmpge: fix bad _mm512_cmpge_epi64_mask implementation
6849da33 avx: use internal symbols in clang fallbacks for cmp_ps/pd functions
f2746208 avx512/cmpge: finish implementing all functions
135cbbf0 avx512/range: implement mm{,512}{,_mask,_maskz}_range_round*
6421a835 avx512/round, avx512/roundscale: add shorter vector fallbacks
5c6673f5 avx512/roundscale: implement simde_mm{256,512}_roundscale_ps
6fcb4433 neon/cle: Add implementations for remaining functions
a49bdc1c neon/fma_n: the 32-bit functions are missing on GCC on arm
05172a08 neon/ld4: work around spurious warning on clang < 10
2fa3d1d8 neon/qdmulh: add shuffle-based implementations
ea22a611 neon/qdmulh_n: native and portable implementations
5ef8e53d neon/qrshrn_n: native and portable implementations
fda538d1 neon/ld1_lane: portable and native implementations
8f118bbd neon/cgtz: Add implementations of remaining functions
31d5048c neon/cgt: Add implementation of remaining functions
79274d8d neon/ld4_lane: move private type usage to inside loop
bdcfccb7 neon/ld4_lane: native and portable implementations
bbc35b65 avx512/range: don't use masked comparisons for 128/256-bit versions
ef90404e avx512/range: fix fallback macros
5d00aa4c features: add z/arch to SIMDE_NATURAL_VECTOR_SIZE
83cab7c1 sve/cmplt: replace vec_and with & for s390 implementations
a636d0ae Fix gcc-10 compilation on s/390x
bb35d9f0 gfni: work around error with vec_bperm on clang-10 on POWER
2db3ba03 gfni: replace vec_and and vec_xor with & and ^ on z/arch
cdb3f68c sse, mmx: fix clang-11 on POWER
233fef43 gfni: add many x86, ARM, z/Arch, PPC and WASM implementations
c300a66e Don't set SIMDE_NO_CHECK_IMMEDIATE_CONSTANT in tests.
283c6e40 neon/qrshrun_n: native and portable implementations
9535e063 neon/rshrn_n: native and portable implementations
5c05d980 sse: prefer SIMDE_SHUFFLE_VECTOR implementation of _mm_shuffle_ps
cdfff167 sse: don't use armv7 impl of _MM_TRANSPOSE4_PS on armv8
921db75b neon/cvt: add out-of-range and NaN tests
b62dfb24 neon/dot_lane: add remaining implementation
b24c6d5c neon/recps: Use vector ops instead of relying on autovec
8acdce75 avx512/range: implement mm(256, 512)_mask(z)_range_p*
c0bc87e9 avx512/roundscale: initial implementation
f1fd8066 sse2: correct typos in simde_x_mm_broadcastlow_pd
29427e22 avx, avx512/cmp: properly handle NaN in _mm{,256,512}_cmp_{ps,pd,ss,sd}
c8d3304e neon/dot_lane: correct implementations of dot_lane functions
aa739959 neon/st2,st1: use zip + st1 to implement st2
375321b4 neon/dup_n: replace remaining functions with dup_n implementations
992541f9 neon/dup_lane: use dup_n
00602f7f neon/dup_lane: add shuffle-based implementations
9fcd7043 fma: add mls-based NEON implementations of fnmadd functions
7b88559a avx512/range: fix variable names in macro implementations
3a3cd210 sse2: prefer shuffle implementation of _mm_shuffle_epi32 to NEON
3061e315 common: move conversion functions for u32 <-> f32 into common
daa83861 neon/ld2: apply optimizations from previous commit to other extensions
53a7c6f7 neon/ld2: Wasm optimizations
b4fa3c6e neon/mls_n: initial implementation
849357cf neon/shrn_n: Wasm SIMD optimizations
c1841732 Add SIMDE_FAST_EXCEPTIONS option
c73b6b54 avx512/range: remove CONSTIFY macro usage
a8aac144 neon/shr_n: fix variable name in GFNI implementation of vshrq_n_s8
0d3ee449 avx512/range: implement mm512_range_ps/d functions
b71e9e44 neon/cgez: complete implementation of CGEZ family
7c6e0f4b neon/abd: Wasm SIMD implementation
79f39ad2 neon/ext: add __builtin_shufflevector implementation
e979809b neon/ld1: add Wasm SIMD implementation
829bc16b avx512: implement mm_mask(z)_unpack* funcs
c66c2fba Work around issues preventing compilation on NVCC
5edb0bc3 sse2: add fast-math WASM implementation of _mm_cvtps_epi32
dbdddf8a Fix compilation with clang on POWER
9f259118 neon/get_low: use __builtin_shufflevector if available
8bce4a4d neon/get_high: add __builtin_shufflevector optimizations
70f0acbf sse2: don't require constants for _mm_srai_epi{16,32}
7043be93 avx2: added vector size conditional for unpack
4f790ded neon/vmovq: define vmovq_n as aliases for vdup_n
60661a40 neon/vdup: vdupq_lane_f32 native and portable
c244411d neon/ld1_dup: split from ld1, dup_n fallbacks, WASM implementations
4dfa8110 neon/vld1q_dup: native and portable implementations
a33b557f avx512/unpack{hi,lo}: implement mm256_mask(z)_unpack* functions
b59facba avx512/unpack{hi,lo}: implement mask variants of unpacklo
fc1f74ec neon: port additional code to new style
1acf4544 neon/types: reverse logic for SIMDE_ARM_NEON_FORCE_NATIVE_TYPES
06ff80cd neon/types: use vector extensions for public types when available
30e49b1b neon: refactor to use different types on all targets
72fe8c3a neon/{min,max}nm: add some headers for -ffast-math
7ea2d71e sse2: vcvtnq_s32_f32 is armv8-specific
7c370654 neon/cvt: cast result of float/double comparison
339a10f7 neon/shrn_n: s16 s32 s64 u16 u32 u64 portable and native
ad4273b3 avx512/unpacklo: added vector size conditional
5809070a avx512/unpacklo: implement mm512_unpacklo_* functions
33984c24 neon/ld{2,3,4}: silence false positive diagnostic on GCC 7
91c2b412 neon/st2: vst2(q) f32 s8 s16 s32 u8 u16 u32
c6717a2b neon/vld2: vld2_{u16,u32} and vld2q_{u8,u16,u32,f32}
b3aae641 neon/fma_n: initial implementation
cc930ce3 neon/rsqrte: use vmls for fallbacks.
af29571a neon/fma: add a couple x86 and PPC implementations
05383e30 neon/mls: add _mm_fnmadd_* implementations of vmls*_f*
1951e4c7 neon/rsqrts: vrsqrts_f32 and vrsqrtsq_f32 native and portable
a8b09e16 neon: replace some more abs/labs/llabs usage with simde_math_* versions
de31fc3e math: use simde_math_-prefixed abs/labs/llabs
5a899528 neon/{min,max}nm: use simde_math_* prefixed min/max functions
4b64da56 diagnostic: silence -Wreserved-identifier warning from LLVM
a2520a85 simd128: add clang implementation of wasm_f64x2_promote_low_f32x4
359f1b18 simd128: cast to int ptrs instead of void* in wasm_v128_load*_lane
8682175b simd128: add some implementations of convert functions
ce8a9adb neon/rsqrte: vrsqrte_f32 and vrsqrteq_f32 on native and portable
ffc32e85 simd128: add missing WASM SIMD128 functions
84c95b06 neon/vld2_u8: native and portable implementation
c6161667 neon/recpe: Remove duplicate code and fix copyright year
bf3d202c neon/recps: recps/recpsq for native and portable
f44192f2 sve/add: initial implementation
614cbc36 neon/recpe: add some additional implementations stolen from SSE
75c24a80 neon/recpe: recpe_f32 and recpe_f64, native and portable
e846ddac Update WASM SIMD intrinsics to match new names.
89d22f41 common: don't use aligned OpenMP clause on MCST LCC
cc509617 fma: work around broken implementations of some functions on MCST LCC
e566aec7 x86: ignore warnings about inefficient functions on lcc
1a1b6c27 neon/cge: implement remaining functions in CGE family
b076027c sve/qadd: initial implementation
e95132fc sve/sel: add cast to make GCC on s/390x happy
492802a5 sve/add: switch some _x implementations to use _x instead of _z
d8fd62dc sve/dup: add *_m variants
9976c212 sve/dup: switch implementations to use svsel
a9fd997f sve/dup: rename from dup_n
f7c4646e common: improve check for C11 generic selections
ab5ce304 sve/ptest: simplify svptest_first
783f969d sve/whilelt: small optimizations for all whilelt functions
8c696906 sve: add native aliases for overloads
c3815d08 sve/add: add svadd_n_* functions
6aae2920 sve/add: switch to using svsel for implementations of _z/_m variants
86aa82b6 sve/whilelt: add svwhilelt_*_{u32,s64,u64} implementations
bf99fbce sve/and: switch implementations to use svsel
d60cdae7 sve/sel: initial implementation
6f47cffb sve/types: add mmask4 functions for 256-bit vectors
0ebc13f1 sve: some tweaks to get s390x working
53ea316c sve/and: initial implementation
dab8f553 sse4.1: add some casts to make clang -Weverything happy
38111a8b avx512/cvtt: add simde_mm{_mask,_maskz}_cvttpd_epi64
eed525d3 avx512/cvt: add simde_mm{_mask,_maskz}_cvtepi64_pd
e6a87ee7 Initial import of a portable SVE implementation.
101c3cbe avx512: add tests for previous commit (104a99bc)
ce57d61d avx512: add several new functions
dd6b78e7 neon/ceqz: finish implementation of ceqz family
1e6be96a avx512: implement mm*_mask(z)_compress(storeu)_*
9af89540 sse4.1: _mm_blendv_epi8: add sse2 and update wasm_simd128 implementations
6e4e66d6 avx512:compress: implement mm256_mask(z)_compress(storeu)_p*
89ef83e2 simd128: add movemask-based implementations of any/all_true functions
3979670d avx512/insert: convert macros to functions, regenerate old-style tests
9814181e simd128: add additional cast in wasm_i32x4_abs
071795e7 avx512/fmsub: implement _fmsub_ functions for AVX512VL
846727ee avx512/compress: implement _mm256_mask_compress_pd
6694ae06 avx512/cmpeq: implement _mm512_mask_cmpeq_epi8_mask
09a6fec2 avx512/cmpneq: initial implementation of 128-bit and 256-bit functions
8f7a6be3 avx512/abs: work around buggy pd functions in GCC 7 - 8.2
0e64c1b5 avx2: fix undefs for many native aliases
cc007898 sse2: ignore broken _mm_loadu_si{16,32} on GCC
42205923 sse2: use simde_math_{add,sub}s_* for _mm_{add,sub}s_* functions
ce930565 {neon,simd128,avx512/abs}: provide vector versions of i64 abs
d390e3f4 avx2: add vector/shuffle implementation of _mm256_madd_epi16
ec395e86 avx512/insert: implement inserti{,_mask,_maskz}_{32x8,64x2}
60e9fed6 avx512/insert: implement mm512{_mask,_maskz}_insert{f32x8,64x2}
f62bb037 neon/cnt: add x86 implementations of vcntq_s8
c0b43a38 avx512/xor: implement mm512_mask(z)_xor_pd/s functions
fb3f2fbc avx512/or: implement mm512_mask(z)_or_ps/d functions
42032d0e avx512/mullo: implement mm512_mullo_epi64 with mask(z)
0b3d7f8a neon/cmla, neon/cmla_rot{90,180,270}: CMLA requires armv8.3+
de120cf1 neon/fma: add more extensive feature checking
fbf4bc3c neon/rndi, sse2: work around several functions missing in GCC
85d8190e arch: __ARM_ARCH now (v8.1+) encodes the minor version
9d7b0dad neon/cmla, neon/cmla_rot{90,180,270}: check compiler versions
71128dca neon/cmla, neon/cmla_rot{90,180,270}, neon/fma: initial implementation
adaf41a1 avx512/or: implement _mm512_mask_or_pd function
b33bd046 neon/bcax: Adds WASM and SSE2 implementations

git-subtree-dir: lib/simde/simde
git-subtree-split: cbef1c152ad2e73bc8971c306208f55f44b16088
milot-mirdita committed Nov 1, 2022
1 parent 1524231 commit 845e09b
Showing 250 changed files with 61,173 additions and 8,670 deletions.
40 changes: 40 additions & 0 deletions arm/neon.h
@@ -34,6 +34,7 @@
#include "neon/abdl.h"
#include "neon/abs.h"
#include "neon/add.h"
#include "neon/addhn.h"
#include "neon/addl.h"
#include "neon/addlv.h"
#include "neon/addl_high.h"
@@ -58,6 +59,10 @@
#include "neon/clt.h"
#include "neon/cltz.h"
#include "neon/clz.h"
#include "neon/cmla.h"
#include "neon/cmla_rot90.h"
#include "neon/cmla_rot180.h"
#include "neon/cmla_rot270.h"
#include "neon/cnt.h"
#include "neon/cvt.h"
#include "neon/combine.h"
@@ -68,14 +73,21 @@
#include "neon/dup_n.h"
#include "neon/eor.h"
#include "neon/ext.h"
#include "neon/fma.h"
#include "neon/fma_lane.h"
#include "neon/fma_n.h"
#include "neon/get_high.h"
#include "neon/get_lane.h"
#include "neon/get_low.h"
#include "neon/hadd.h"
#include "neon/hsub.h"
#include "neon/ld1.h"
#include "neon/ld1_dup.h"
#include "neon/ld1_lane.h"
#include "neon/ld2.h"
#include "neon/ld3.h"
#include "neon/ld4.h"
#include "neon/ld4_lane.h"
#include "neon/max.h"
#include "neon/maxnm.h"
#include "neon/maxv.h"
@@ -86,10 +98,15 @@
#include "neon/mla_n.h"
#include "neon/mlal.h"
#include "neon/mlal_high.h"
#include "neon/mlal_high_n.h"
#include "neon/mlal_lane.h"
#include "neon/mlal_n.h"
#include "neon/mls.h"
#include "neon/mls_n.h"
#include "neon/mlsl.h"
#include "neon/mlsl_high.h"
#include "neon/mlsl_high_n.h"
#include "neon/mlsl_lane.h"
#include "neon/mlsl_n.h"
#include "neon/movl.h"
#include "neon/movl_high.h"
@@ -100,6 +117,7 @@
#include "neon/mul_n.h"
#include "neon/mull.h"
#include "neon/mull_high.h"
#include "neon/mull_lane.h"
#include "neon/mull_n.h"
#include "neon/mvn.h"
#include "neon/neg.h"
@@ -113,18 +131,28 @@
#include "neon/qabs.h"
#include "neon/qadd.h"
#include "neon/qdmulh.h"
#include "neon/qdmulh_lane.h"
#include "neon/qdmulh_n.h"
#include "neon/qdmull.h"
#include "neon/qrdmulh.h"
#include "neon/qrdmulh_lane.h"
#include "neon/qrdmulh_n.h"
#include "neon/qrshrn_n.h"
#include "neon/qrshrun_n.h"
#include "neon/qmovn.h"
#include "neon/qmovun.h"
#include "neon/qmovn_high.h"
#include "neon/qneg.h"
#include "neon/qsub.h"
#include "neon/qshl.h"
#include "neon/qshlu_n.h"
#include "neon/qshrn_n.h"
#include "neon/qshrun_n.h"
#include "neon/qtbl.h"
#include "neon/qtbx.h"
#include "neon/rbit.h"
#include "neon/recpe.h"
#include "neon/recps.h"
#include "neon/reinterpret.h"
#include "neon/rev16.h"
#include "neon/rev32.h"
@@ -137,19 +165,31 @@
#include "neon/rndp.h"
#include "neon/rshl.h"
#include "neon/rshr_n.h"
#include "neon/rshrn_n.h"
#include "neon/rsqrte.h"
#include "neon/rsqrts.h"
#include "neon/rsra_n.h"
#include "neon/set_lane.h"
#include "neon/shl.h"
#include "neon/shl_n.h"
#include "neon/shll_n.h"
#include "neon/shr_n.h"
#include "neon/shrn_n.h"
#include "neon/sqadd.h"
#include "neon/sra_n.h"
#include "neon/sri_n.h"
#include "neon/st1.h"
#include "neon/st1_lane.h"
#include "neon/st2.h"
#include "neon/st2_lane.h"
#include "neon/st3.h"
#include "neon/st3_lane.h"
#include "neon/st4.h"
#include "neon/st4_lane.h"
#include "neon/sub.h"
#include "neon/subhn.h"
#include "neon/subl.h"
#include "neon/subl_high.h"
#include "neon/subw.h"
#include "neon/subw_high.h"
#include "neon/tbl.h"
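The arm/neon.h hunks above only wire the new function families into the umbrella header: each per-intrinsic header (abd.h, cmla.h, fma.h, ld2.h, st2_lane.h, and so on) is included once, so a single include exposes the whole portable NEON surface. A minimal consumer sketch using one of the families touched by this update (vabd); the include path, build setup, and main() harness are assumptions for illustration, not part of the commit:

#include <stdint.h>
#include <stdio.h>
/* Assumed include path; in this repository the tree lives under lib/simde/simde. */
#include "simde/arm/neon.h"

int main(void) {
  int8_t a_buf[16] = {  0,  1,  2,  3,  4,  5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
  int8_t b_buf[16] = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6,  5,  4,  3,  2,  1,  0 };

  /* Lane-wise absolute difference; on x86, WASM, POWER, etc. this resolves to
   * the fallbacks added in arm/neon/abd.h below. */
  simde_int8x16_t d = simde_vabdq_s8(simde_vld1q_s8(a_buf), simde_vld1q_s8(b_buf));

  int8_t out[16];
  simde_vst1q_s8(out, d);
  printf("%d %d\n", out[0], out[15]); /* prints: 15 15 */
  return 0;
}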
167 changes: 136 additions & 31 deletions arm/neon/abd.h
@@ -100,6 +100,23 @@ simde_int8x8_t
simde_vabd_s8(simde_int8x8_t a, simde_int8x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabd_s8(a, b);
#elif defined(SIMDE_X86_MMX_NATIVE)
simde_int8x8_private
r_,
a_ = simde_int8x8_to_private(a),
b_ = simde_int8x8_to_private(b);

const __m64 m = _mm_cmpgt_pi8(b_.m64, a_.m64);
r_.m64 =
_mm_xor_si64(
_mm_add_pi8(
_mm_sub_pi8(a_.m64, b_.m64),
m
),
m
);

return simde_int8x8_from_private(r_);
#else
return simde_vmovn_s16(simde_vabsq_s16(simde_vsubl_s8(a, b)));
#endif
@@ -114,6 +131,15 @@ simde_int16x4_t
simde_vabd_s16(simde_int16x4_t a, simde_int16x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabd_s16(a, b);
#elif defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_X86_SSE_NATIVE)
simde_int16x4_private
r_,
a_ = simde_int16x4_to_private(a),
b_ = simde_int16x4_to_private(b);

r_.m64 = _mm_sub_pi16(_mm_max_pi16(a_.m64, b_.m64), _mm_min_pi16(a_.m64, b_.m64));

return simde_int16x4_from_private(r_);
#else
return simde_vmovn_s32(simde_vabsq_s32(simde_vsubl_s16(a, b)));
#endif
@@ -227,17 +253,37 @@ simde_int8x16_t
simde_vabdq_s8(simde_int8x16_t a, simde_int8x16_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdq_s8(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_int8x16_private
r_,
a_ = simde_int8x16_to_private(a),
b_ = simde_int8x16_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int16_t tmp = HEDLEY_STATIC_CAST(int16_t, a_.values[i]) - HEDLEY_STATIC_CAST(int16_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(int8_t, tmp < 0 ? -tmp : tmp);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
r_.m128i = _mm_sub_epi8(_mm_max_epi8(a_.m128i, b_.m128i), _mm_min_epi8(a_.m128i, b_.m128i));
#elif defined(SIMDE_X86_SSE2_NATIVE)
const __m128i m = _mm_cmpgt_epi8(b_.m128i, a_.m128i);
r_.m128i =
_mm_xor_si128(
_mm_add_epi8(
_mm_sub_epi8(a_.m128i, b_.m128i),
m
),
m
);
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i8x16_sub(wasm_i8x16_max(a_.v128, b_.v128), wasm_i8x16_min(a_.v128, b_.v128));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int16_t tmp = HEDLEY_STATIC_CAST(int16_t, a_.values[i]) - HEDLEY_STATIC_CAST(int16_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(int8_t, tmp < 0 ? -tmp : tmp);
}
#endif

return simde_int8x16_from_private(r_);
#endif
@@ -252,18 +298,31 @@ simde_int16x8_t
simde_vabdq_s16(simde_int16x8_t a, simde_int16x8_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdq_s16(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_int16x8_private
r_,
a_ = simde_int16x8_to_private(a),
b_ = simde_int16x8_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int32_t tmp = HEDLEY_STATIC_CAST(int32_t, a_.values[i]) - HEDLEY_STATIC_CAST(int32_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(int16_t, tmp < 0 ? -tmp : tmp);
}

#if defined(SIMDE_X86_SSE2_NATIVE)
/* https://github.com/simd-everywhere/simde/issues/855#issuecomment-881658604 */
r_.m128i = _mm_sub_epi16(_mm_max_epi16(a_.m128i, b_.m128i), _mm_min_epi16(a_.m128i, b_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i16x8_sub(wasm_i16x8_max(a_.v128, b_.v128), wasm_i16x8_min(a_.v128, b_.v128));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
r_.values[i] =
(a_.values[i] < b_.values[i]) ?
(b_.values[i] - a_.values[i]) :
(a_.values[i] - b_.values[i]);
}

#endif
return simde_int16x8_from_private(r_);
#endif
}
@@ -277,17 +336,35 @@ simde_int32x4_t
simde_vabdq_s32(simde_int32x4_t a, simde_int32x4_t b) {
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
return vabdq_s32(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_int32x4_private
r_,
a_ = simde_int32x4_to_private(a),
b_ = simde_int32x4_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int64_t tmp = HEDLEY_STATIC_CAST(int64_t, a_.values[i]) - HEDLEY_STATIC_CAST(int64_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(int32_t, tmp < 0 ? -tmp : tmp);
}
#if defined(SIMDE_X86_SSE4_1_NATIVE)
r_.m128i = _mm_sub_epi32(_mm_max_epi32(a_.m128i, b_.m128i), _mm_min_epi32(a_.m128i, b_.m128i));
#elif defined(SIMDE_X86_SSE2_NATIVE)
const __m128i m = _mm_cmpgt_epi32(b_.m128i, a_.m128i);
r_.m128i =
_mm_xor_si128(
_mm_add_epi32(
_mm_sub_epi32(a_.m128i, b_.m128i),
m
),
m
);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int64_t tmp = HEDLEY_STATIC_CAST(int64_t, a_.values[i]) - HEDLEY_STATIC_CAST(int64_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(int32_t, tmp < 0 ? -tmp : tmp);
}
#endif

return simde_int32x4_from_private(r_);
#endif
@@ -304,17 +381,27 @@ simde_vabdq_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
return vabdq_u8(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
return vec_absd(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_uint8x16_private
r_,
a_ = simde_uint8x16_to_private(a),
b_ = simde_uint8x16_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int16_t tmp = HEDLEY_STATIC_CAST(int16_t, a_.values[i]) - HEDLEY_STATIC_CAST(int16_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, tmp < 0 ? -tmp : tmp);
}
#if defined(SIMDE_X86_SSE2_NATIVE)
r_.m128i = _mm_sub_epi8(_mm_max_epu8(a_.m128i, b_.m128i), _mm_min_epu8(a_.m128i, b_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i8x16_sub(wasm_u8x16_max(a_.v128, b_.v128), wasm_u8x16_min(a_.v128, b_.v128));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int16_t tmp = HEDLEY_STATIC_CAST(int16_t, a_.values[i]) - HEDLEY_STATIC_CAST(int16_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, tmp < 0 ? -tmp : tmp);
}
#endif

return simde_uint8x16_from_private(r_);
#endif
@@ -331,17 +418,27 @@ simde_vabdq_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
return vabdq_u16(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
return vec_absd(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_uint16x8_private
r_,
a_ = simde_uint16x8_to_private(a),
b_ = simde_uint16x8_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int32_t tmp = HEDLEY_STATIC_CAST(int32_t, a_.values[i]) - HEDLEY_STATIC_CAST(int32_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, tmp < 0 ? -tmp : tmp);
}
#if defined(SIMDE_X86_SSE4_2_NATIVE)
r_.m128i = _mm_sub_epi16(_mm_max_epu16(a_.m128i, b_.m128i), _mm_min_epu16(a_.m128i, b_.m128i));
#elif defined(SIMDE_WASM_SIMD128_NATIVE)
r_.v128 = wasm_i16x8_sub(wasm_u16x8_max(a_.v128, b_.v128), wasm_u16x8_min(a_.v128, b_.v128));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int32_t tmp = HEDLEY_STATIC_CAST(int32_t, a_.values[i]) - HEDLEY_STATIC_CAST(int32_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, tmp < 0 ? -tmp : tmp);
}
#endif

return simde_uint16x8_from_private(r_);
#endif
@@ -358,17 +455,25 @@ simde_vabdq_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
return vabdq_u32(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P9_NATIVE)
return vec_absd(a, b);
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
return vec_sub(vec_max(a, b), vec_min(a, b));
#elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
return vec_max(a, b) - vec_min(a, b);
#else
simde_uint32x4_private
r_,
a_ = simde_uint32x4_to_private(a),
b_ = simde_uint32x4_to_private(b);

SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int64_t tmp = HEDLEY_STATIC_CAST(int64_t, a_.values[i]) - HEDLEY_STATIC_CAST(int64_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, tmp < 0 ? -tmp : tmp);
}
#if defined(SIMDE_X86_SSE4_2_NATIVE)
r_.m128i = _mm_sub_epi32(_mm_max_epu32(a_.m128i, b_.m128i), _mm_min_epu32(a_.m128i, b_.m128i));
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
int64_t tmp = HEDLEY_STATIC_CAST(int64_t, a_.values[i]) - HEDLEY_STATIC_CAST(int64_t, b_.values[i]);
r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, tmp < 0 ? -tmp : tmp);
}
#endif

return simde_uint32x4_from_private(r_);
#endif
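Several of the SSE2/MMX fallbacks above compute the signed absolute difference with a compare-and-flip idiom instead of max/min: m = cmpgt(b, a) is all ones exactly in the lanes where b > a, and ((a - b) + m) ^ m negates the wrapped difference in those lanes, since with m all ones, (x + m) ^ m == ~(x - 1) == -x in two's complement. The remaining paths use the simpler identity |a - b| == max(a, b) - min(a, b) wherever per-type max/min intrinsics are available. A scalar sketch of one int8_t lane of the signed idiom, written here only to illustrate the identity (the helper name is made up, not taken from the diff):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of _mm_xor_si128(_mm_add_epi8(_mm_sub_epi8(a, b), m), m)
 * for a single lane, where m = _mm_cmpgt_epi8(b, a). */
static int8_t abd_s8_lane(int8_t a, int8_t b) {
  int8_t x = (int8_t) (a - b);              /* may wrap, like _mm_sub_epi8 */
  int8_t m = (b > a) ? (int8_t) -1 : 0;     /* all ones when b > a         */
  return (int8_t) (((int8_t) (x + m)) ^ m); /* m == -1 turns x into -x     */
}

int main(void) {
  printf("%d %d\n", abd_s8_lane(3, 10), abd_s8_lane(10, 3)); /* 7 7 */
  return 0;
}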
[Diffs for the remaining changed files are not shown.]
