diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 46c2477efb2..229e8b90ec9 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -151,7 +151,8 @@ jobs: HOMEBREW_NO_AUTO_UPDATE=1 brew install dylibbundler lipo -create ./macosx-x86_64/supertuxkart.app/Contents/MacOS/supertuxkart ./macosx-arm64/supertuxkart.app/Contents/MacOS/supertuxkart -output ./macosx-arm64/supertuxkart.app/Contents/MacOS/supertuxkart chmod 755 ./macosx-arm64/supertuxkart.app/Contents/MacOS/supertuxkart - dylibbundler -od -b -x ./macosx-arm64/supertuxkart.app/Contents/MacOS/supertuxkart -d ./macosx-arm64/supertuxkart.app/Contents/libs/ -p @executable_path/../libs/ -s dependencies-macosx/lib -ns + install_name_tool -change libcurl.4.dylib @rpath/libcurl.4.dylib ./macosx-arm64/supertuxkart.app/Contents/MacOS/supertuxkart + dylibbundler -od -b -x ./macosx-arm64/supertuxkart.app/Contents/MacOS/supertuxkart -d ./macosx-arm64/supertuxkart.app/Contents/libs/ -p @executable_path/../libs/ -s ./dependencies-macosx/lib -ns # We use SDL_Vulkan_LoadLibrary for 10.9 compatibility, so otool -L supertuxkart has no libMoltenVK.dylib cp ./dependencies-macosx/lib/libMoltenVK.dylib ./macosx-arm64/supertuxkart.app/Contents/libs/ cd ./macosx-arm64/supertuxkart.app/Contents/Resources/data diff --git a/android/build.gradle b/android/build.gradle index f846541156e..8241548db24 100644 --- a/android/build.gradle +++ b/android/build.gradle @@ -11,7 +11,7 @@ buildscript // 4.1.2 is the minimum version to support native debug symbols file // https://developer.android.com/studio/build/shrink-code#android_gradle_plugin_version_41_or_later // 7.0.0 to fix https://stackoverflow.com/questions/68387270/android-studio-error-installed-build-tools-revision-31-0-0-is-corrupted - classpath 'com.android.tools.build:gradle:8.2.1' + classpath 'com.android.tools.build:gradle:8.5.1' } } @@ -48,6 +48,7 @@ android versionCode project.getProperty('version_code').toInteger() versionName project.getProperty('version_name') minSdkVersion min_sdk_version.toInteger() + compileSdkVersion compile_sdk_version.toInteger() targetSdkVersion target_sdk_version.toInteger() externalNativeBuild { diff --git a/android/gradle/wrapper/gradle-wrapper.jar b/android/gradle/wrapper/gradle-wrapper.jar index 249e5832f09..e6441136f3d 100644 Binary files a/android/gradle/wrapper/gradle-wrapper.jar and b/android/gradle/wrapper/gradle-wrapper.jar differ diff --git a/android/gradle/wrapper/gradle-wrapper.properties b/android/gradle/wrapper/gradle-wrapper.properties index c65b8841c00..09523c0e549 100644 --- a/android/gradle/wrapper/gradle-wrapper.properties +++ b/android/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,7 @@ -#Sun Dec 03 18:24:53 EET 2023 distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.5-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.9-bin.zip +networkTimeout=10000 +validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/android/gradlew b/android/gradlew index a69d9cb6c20..b740cf13397 100755 --- a/android/gradlew +++ b/android/gradlew @@ -55,7 +55,7 @@ # Darwin, MinGW, and NonStop. 
# # (3) This script is generated from the Groovy template -# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# https://github.com/gradle/gradle/blob/HEAD/platforms/jvm/plugins-application/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt # within the Gradle project. # # You can find Gradle at https://github.com/gradle/gradle/. @@ -80,13 +80,11 @@ do esac done -APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit - -APP_NAME="Gradle" +# This is normally unused +# shellcheck disable=SC2034 APP_BASE_NAME=${0##*/} - -# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. -DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' +# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) +APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit # Use the maximum available, or set MAX_FD != -1 to use that value. MAX_FD=maximum @@ -133,22 +131,29 @@ location of your Java installation." fi else JAVACMD=java - which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + if ! command -v java >/dev/null 2>&1 + then + die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the location of your Java installation." + fi fi # Increase the maximum file descriptors if we can. if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then case $MAX_FD in #( max*) + # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. + # shellcheck disable=SC2039,SC3045 MAX_FD=$( ulimit -H -n ) || warn "Could not query maximum file descriptor limit" esac case $MAX_FD in #( '' | soft) :;; #( *) + # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. + # shellcheck disable=SC2039,SC3045 ulimit -n "$MAX_FD" || warn "Could not set maximum file descriptor limit to $MAX_FD" esac @@ -193,11 +198,15 @@ if "$cygwin" || "$msys" ; then done fi -# Collect all arguments for the java command; -# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of -# shell script including quotes and variable substitutions, so put them in -# double quotes to make sure that they get re-expanded; and -# * put everything else in single quotes, so that it's not re-expanded. + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Collect all arguments for the java command: +# * DEFAULT_JVM_OPTS, JAVA_OPTS, JAVA_OPTS, and optsEnvironmentVar are not allowed to contain shell fragments, +# and any embedded shellness will be escaped. +# * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be +# treated as '${Hostname}' itself on the command line. 
set -- \ "-Dorg.gradle.appname=$APP_BASE_NAME" \ diff --git a/android/make.sh b/android/make.sh index 31c62ae79ed..f5a8f267056 100755 --- a/android/make.sh +++ b/android/make.sh @@ -435,6 +435,7 @@ convert -scale 432x432 "$APP_ICON_ADAPTIVE_FG" "$DIRNAME/res/drawable-xxxhdpi/ic export ANDROID_HOME="$SDK_PATH" ./gradlew -Pcompile_sdk_version="$COMPILE_SDK_VERSION" \ -Pmin_sdk_version="$STK_MIN_ANDROID_SDK" \ + -Pcompile_sdk_version="$STK_TARGET_ANDROID_SDK"\ -Ptarget_sdk_version="$STK_TARGET_ANDROID_SDK" \ -Pstorepass="$STK_STOREPASS" \ -Pkeystore="$STK_KEYSTORE" \ @@ -450,6 +451,7 @@ export ANDROID_HOME="$SDK_PATH" if [ "$GRADLE_BUILD_TYPE" = "assembleRelease" ]; then ./gradlew -Pcompile_sdk_version="$COMPILE_SDK_VERSION" \ -Pmin_sdk_version="$STK_MIN_ANDROID_SDK" \ + -Pcompile_sdk_version="$STK_TARGET_ANDROID_SDK"\ -Ptarget_sdk_version="$STK_TARGET_ANDROID_SDK" \ -Pstorepass="$STK_STOREPASS" \ -Pkeystore="$STK_KEYSTORE" \ diff --git a/data/gui/screens/arenas.stkgui b/data/gui/screens/arenas.stkgui index 9746e93081a..6c71e81cf7f 100644 --- a/data/gui/screens/arenas.stkgui +++ b/data/gui/screens/arenas.stkgui @@ -3,9 +3,16 @@
diff --git a/data/supertuxkart.appdata.xml b/data/supertuxkart.appdata.xml index a57e6e1e4e7..c3393a97262 100644 --- a/data/supertuxkart.appdata.xml +++ b/data/supertuxkart.appdata.xml @@ -3,6 +3,7 @@ supertuxkart.desktop CC0-1.0 GPL-3.0+ + supertuxkart.desktop SuperTuxKart A 3D open-source kart racing game 3D 開源卡丁車賽車遊戲 diff --git a/lib/irrlicht/source/Irrlicht/CIrrDeviceSDL.cpp b/lib/irrlicht/source/Irrlicht/CIrrDeviceSDL.cpp index 955b1694dd3..6f8b7178306 100644 --- a/lib/irrlicht/source/Irrlicht/CIrrDeviceSDL.cpp +++ b/lib/irrlicht/source/Irrlicht/CIrrDeviceSDL.cpp @@ -92,6 +92,10 @@ CIrrDeviceSDL::CIrrDeviceSDL(const SIrrlichtCreationParameters& param) // Switch SDL disables this hint by default: https://github.com/devkitPro/SDL/pull/55#issuecomment-633775255 SDL_SetHint(SDL_HINT_TOUCH_MOUSE_EVENTS, "1"); +#ifdef ANDROID + SDL_SetHint(SDL_HINT_ORIENTATIONS, "LandscapeLeft LandscapeRight"); +#endif + #ifndef MOBILE_STK // Prevent fullscreen minimizes when losing focus if (CreationParams.Fullscreen) diff --git a/lib/simd_wrapper/simde/README.md b/lib/simd_wrapper/simde/README.md index 52278c866a1..303893a01bc 100644 --- a/lib/simd_wrapper/simde/README.md +++ b/lib/simd_wrapper/simde/README.md @@ -1,10 +1,513 @@ -# SIMDe Without Test Cases +# SIMD Everywhere + +[![All Contributors](https://img.shields.io/badge/all_contributors-73-orange.svg?style=flat-square)](#contributors-) + +[![Chat](https://badges.gitter.im/gitterHQ/gitter.png)](https://matrix.to/#/#simd-everywhere_community:gitter.im) +[![codecov](https://codecov.io/gh/simd-everywhere/simde/branch/master/graph/badge.svg?token=jcMBoRk0ui)](https://codecov.io/gh/simd-everywhere/simde) -This repository contains only the core of -[SIMDe](https://github.com/simd-everywhere/simde). -It is generated automatically for every commit to master, and is -intended to be used as a submodule in projects which don't want to -include the (rather large) test cases. +The SIMDe header-only library provides fast, portable implementations of +[SIMD intrinsics](https://en.wikipedia.org/wiki/SIMD) on hardware which +doesn't natively support them, such as calling [SSE](https://en.wikipedia.org/wiki/Streaming_SIMD_Extensions) +functions on ARM. There is no performance penalty if the hardware +supports the native implementation (*e.g.*, SSE/[AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) +runs at full speed on [x86](https://en.wikipedia.org/wiki/X86), +[NEON](https://en.wikipedia.org/wiki/ARM_architecture#Advanced_SIMD_(Neon)) on [ARM](https://en.wikipedia.org/wiki/ARM_architecture), +*etc.*). -All development work happens in the main repository, please do not -file issues or create pull requests against this repository. +This makes porting code to other architectures much easier in a few +key ways: + +First, instead of forcing you to rewrite everything for each +architecture, SIMDe lets you get a port up and running almost +effortlessly. You can then start working on switching the most +performance-critical sections to native intrinsics, improving +performance gradually. SIMDe lets (for example) SSE/AVX and NEON code +exist side-by-side, in the same implementation. + +Second, SIMDe makes it easier to write code targeting [ISA](https://en.wikipedia.org/wiki/Instruction_set_architecture) +extensions you don't have convenient access to. You can run NEON code on your +x86 machine *without an emulator*. Obviously you'll eventually want +to test on the actual hardware you're targeting, but for most +development, SIMDe can provide a much easier path. 
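To make the scenario above concrete, here is a minimal, hedged sketch (not part of this diff or of the upstream README) of NEON-style code that builds unmodified on x86 through SIMDe. It assumes only that the `simde/` directory is on the compiler's include path, as it would be with the copy vendored under `lib/simd_wrapper`.

```c
/* Hedged example: NEON intrinsics on any C99 target via SIMDe.
 * On real ARM/NEON hardware SIMDe forwards these calls to the native
 * intrinsics; elsewhere it uses its portable fallbacks. */
#include <stdio.h>
#include <stdint.h>
#include "simde/arm/neon.h"

int main(void) {
    int32_t a[4] = {1, 2, 3, 4};
    int32_t b[4] = {10, 20, 30, 40};
    int32_t r[4];

    simde_int32x4_t va = simde_vld1q_s32(a);     /* load 4 lanes */
    simde_int32x4_t vb = simde_vld1q_s32(b);
    simde_vst1q_s32(r, simde_vaddq_s32(va, vb)); /* element-wise add, store */

    printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);
    return 0;
}
```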
+ +SIMDe takes a very different approach from most other SIMD abstraction +layers in that it aims to expose the entire functionality of the +underlying instruction set. Instead of limiting functionality to the +lowest common denominator, SIMDe tries to minimize the amount of +effort required to port while still allowing you the space to optimize +as needed. + +The current focus is on writing complete portable implementations, +though a large number of functions already have accelerated +implementations using one (or more) of the following: + + * SIMD intrinsics from other ISA extensions (e.g., using NEON to + implement SSE). + * Compiler-specific vector extensions and built-ins such as + [`__builtin_shufflevector`](http://clang.llvm.org/docs/LanguageExtensions.html#langext-builtin-shufflevector) + and + [`__builtin_convertvector`](http://clang.llvm.org/docs/LanguageExtensions.html#langext-builtin-convertvector) + * Compiler auto-vectorization hints, using: + * [OpenMP 4 SIMD](http://www.openmp.org/) + * [Cilk Plus](https://www.cilkplus.org/) + * [GCC loop-specific pragmas](https://gcc.gnu.org/onlinedocs/gcc/Loop-Specific-Pragmas.html) + * [clang pragma loop hint directives](http://llvm.org/docs/Vectorizers.html#pragma-loop-hint-directives) + +You can [try SIMDe online](https://simde.netlify.app/godbolt/demo) +using Compiler Explorer and an amalgamated SIMDe header. + +If you have any questions, please feel free to use the +[issue tracker](https://github.com/simd-everywhere/simde/issues) or the +[mailing list](https://groups.google.com/forum/#!forum/simde). + +## Current Status + +There are currently complete implementations of the following instruction +set extensions: + +* ARM + * [NEON](https://en.wikipedia.org/wiki/ARM_architecture_family#Advanced_SIMD_(Neon)) [List](https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]) +* x86 / x86_64 + * [MMX](https://en.wikipedia.org/wiki/MMX_(instruction_set)) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=MMX) + * [SSE](https://en.wikipedia.org/wiki/Streaming_SIMD_Extensions) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ssetechs=SSE) + * [SSE2](https://en.wikipedia.org/wiki/SSE2) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ssetechs=SSE2) + * [SSE3](https://en.wikipedia.org/wiki/SSE3) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ssetechs=SSE3) + * [SSSE3](https://en.wikipedia.org/wiki/SSSE3) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ssetechs=SSSE3) + * [SSE4.1](https://en.wikipedia.org/wiki/SSE4#SSE4.1) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ssetechs=SSE4_1) + * [CRC32](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=7131&othertechs=CRC32) + * [AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=AVX) + * [AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#Advanced_Vector_Extensions_2) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=AVX2) + * [F16C](https://en.wikipedia.org/wiki/F16C) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=F16C) + * [FMA](https://en.wikipedia.org/wiki/FMA_instruction_set) 
[List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=FMA) + * [GFNI](https://en.wikipedia.org/wiki/AVX-512#GFNI) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=GFNI) + * [XOP](https://en.wikipedia.org/wiki/XOP_instruction_set) + * [SVML](https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions-512-intel-avx-512-instructions/intrinsics-for-arithmetic-operations-1/intrinsics-for-short-vector-math-library-svml-operations.html) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=770&techs=SVML) + * [AVX512VPOPCNTDQ](https://en.wikipedia.org/wiki/AVX-512#VPOPCNTDQ_and_BITALG) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=7131&avx512techs=AVX512VPOPCNTDQ) + * [AVX512_BITALG](https://en.wikipedia.org/wiki/AVX-512#VPOPCNTDQ_and_BITALG) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=7131&avx512techs=AVX512_BITALG) + * [AVX512_VBMI](https://en.wikipedia.org/wiki/AVX-512#Permute) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=7131&avx512techs=AVX512_VBMI) + * [AVX512_VNNI](https://en.wikipedia.org/wiki/AVX-512#VNNI) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=7131&avx512techs=AVX512_VNNI) + * [AVX512_VP2INTERSECT](https://en.wikipedia.org/wiki/AVX-512#VP2INTERSECT) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=7131&avx512techs=AVX512_VP2INTERSECT) + * [VPCLMULQDQ](https://en.wikipedia.org/wiki/AVX-512#VPCLMULQDQ) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=VPCLMULQDQ) +* WebAssembly + * [SIMD128](https://github.com/WebAssembly/simd) + +As well as partial support for many others, including AES-ni, [CLMUL](https://en.wikipedia.org/wiki/CLMUL_instruction_set), SSE4.2, SVE, [MSA](https://en.wikipedia.org/wiki/MIPS_architecture#Application-specific_extensions) in +addition to several AVX-512 extensions. See the +[instruction-set-support](https://github.com/simd-everywhere/simde/issues?q=is%3Aissue+is%3Aopen+label%3Ainstruction-set-support+sort%3Aupdated-desc) +label in the issue tracker for details on progress. If you'd like to +be notified when an instruction set is available you may subscribe to +the relevant issue. + +If you have a project you're interested in using with SIMDe but we +don't yet support all the functions you need, please file an issue +with a list of what's missing so we know what to prioritize. + +The default branch is protected so commits never reach it unless +they have passed extensive CI checks. 
Status badges don't really +make sense since they will always be green, but here are the links: + +* [GitHub Actions](https://github.com/simd-everywhere/simde/actions) +* [Cirrus CI](https://cirrus-ci.com/github/simd-everywhere/simde) +* [Semaphore CI](https://nemequ.semaphoreci.com/projects/simde) +* [Circle CI](https://app.circleci.com/pipelines/github/simd-everywhere/simde) +* [AppVeyor](https://ci.appveyor.com/project/nemequ/simde) +* [Azure Pipelines](https://dev.azure.com/simd-everywhere/SIMDe/_build) +* [Drone CI](https://cloud.drone.io/simd-everywhere/simde/) +* [Travis CI](https://app.travis-ci.com/github/simd-everywhere/simde/) +* [Packit CI](https://dashboard.packit.dev/projects/github.com/simd-everywhere/simde) + +If you're adding a new build I suggest Cirrus CI, which is where we +currently have the most room given the number of builds currently on +the platform and the quotas for free/open-source usage. Alternately, +feel free to set up another provider (such as +[Codefresh](https://codefresh.io/), +[Shippable](https://www.shippable.com/), +[Bitrise](https://www.bitrise.io/), +[Werkaer](https://app.wercker.com/), etc.). + +*Notice*: we plan on changing the name of the default branch from +"master" to something else soon; we are just trying to wait to see what +name git settles on so we can be consistent. + +## Contributing + +First off, if you're reading this: thank you! Even considering +contributing to SIMDe is very much appreciated! + +SIMDe is a fairly large undertaking; there are a *lot* of functions to +get through and a lot of opportunities for optimization on different +platforms, so we're very happy for any help you can provide. + +Programmers of all skill levels are welcome, there are lots of tasks +which are pretty straightforward and don't require any special +expertise. + +If you're not sure how you'd like to contribute, please consider taking +a look at [the issue tracker](https://github.com/simd-everywhere/simde/issues). +There is a [good first issue](https://github.com/simd-everywhere/simde/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) +tag if you want to ease into a your first contributions, but if you're +interested in something else please get in touch via the issue tracker; +we're happy to help you get a handle on whatever you are interested in. + +If you're interested in implementing currently unimplemented functions, +there is [a +guide](https://github.com/simd-everywhere/simde/wiki/Implementing-a-New-Function) +explaining how to add new functions and how to quickly and easily get +a test case in place. It's a bit rough right now, but if anything is +unclear please feel free to use the issue tracker to ask about +anything you're not clear on. + +## Usage + +First, it is important to note that *you do not need two separate +versions* (one using SIMDe, the other native). If the native functions +are available SIMDe will use them, and compilers easily optimize away +any overhead from SIMDe; all they have to do is some basic inlining. +`-O2` should be enough, but we strongly recommend `-O3` (or whatever +flag instructs your compiler to aggressizely optimize) since many of +the portable fallbacks are substantially faster with aggressive +auto-vectorization that isn't enabled at lower optimization levels. + +Each instruction set has a separate file; `x86/mmx.h` for MMX, +`x86/sse.h` for SSE, `x86/sse2.h` for SSE2, and so on. 
Just include +the header for whichever instruction set(s) you want *instead of the +native version* (if you include the native version after SIMDe it will +result in compile-time errors if native aliases are enabled). SIMDe +will provide the fastest implementation it can given which extensions +you've enabled in your compiler (i.e., if you want to use NEON to +implement SSE, you may need to pass something like `-mfpu=neon` +or `-march=armv8-a+simd`. See +[GCC ARM-Options](https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html) +for more information). + +If you define `SIMDE_ENABLE_NATIVE_ALIASES` before including SIMDe +you can use the same names as the native functions. Unfortunately, +this is somewhat error-prone due to portability issues in the APIs, so +it's recommended to only do this for testing. When +`SIMDE_ENABLE_NATIVE_ALIASES` is undefined only the versions prefixed +with `simde_` will be available; for example, the MMX `_mm_add_pi8` +intrinsic becomes `simde_mm_add_pi8`, and `__m64` becomes `simde__m64`. + +Since SIMDe is meant to be portable, many functions which assume types +are of a specific size have been altered to use fixed-width types +instead. For example, Intel's APIs use `char` for signed 8-bit +integers, but `char` on ARM is generally unsigned. SIMDe uses `int8_t` +to make the API portable, but that means your code may require some +minor changes (such as using `int8_t` instead of `char`) to work on +other platforms. + +That said, the changes are usually quite minor. It's often enough to +just use search and replace, manual changes are required pretty +infrequently. + +### OpenMP 4 SIMD + +SIMDe makes extensive use of annotations to help the compiler vectorize +code. By far the best annotations use the SIMD support built in to +OpenMP 4, so if your compiler supports these annotations we strongly +recommend you enable them. + +If you are already using OpenMP, SIMDe will automatically detect it +using the `_OPENMP` macro and no further action is required. + +Some compilers allow you to enable OpenMP SIMD *without* enabling the +full OpenMP. In such cases there is no runtime dependency on OpenMP +and no runtime overhead; SIMDe will just be faster. Unfortunately, +SIMDe has no way to detect such situations (the `_OPENMP` macro is not +defined), so after enabling it in your compiler you'll need to define +`SIMDE_ENABLE_OPENMP` (e.g., by passing `-DSIMDE_ENABLE_OPENMP`) to get +SIMDe to output the relevant pragmas. + +Enabling OpenMP SIMD support varies by compiler: + + * GCC 4.9+ and clang 6+ support a `-fopenmp-simd` command line flag. + * ICC supports a `-qopenmp-simd` command line flag. + * MCST's LCC enables OpenMP SIMD by default, so no flags are needed + (technically you don't even need to pass `-DSIMDE_ENABLE_OPENMP`). + +We are not currently aware of any other compilers which allow you to +enable OpenMP SIMD support without enabling full OpenMP (if you are +please file an issue to let us know). You should determine whether you +wish to enable full OpenMP support on a case-by-case basis, but it is +likely that the overhead of linking to (but not using) the OpenMP +runtime library will be dwarfed by the performance improvements from +using the OpenMP SIMD annotations in SIMDe. 
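As a rough illustration of the usage rules above, the following hedged sketch (not taken from the README) sticks to the `simde_`-prefixed SSE2 names rather than native aliases. Compiling it with `-O3`, and optionally `-fopenmp-simd -DSIMDE_ENABLE_OPENMP` where the compiler supports OpenMP SIMD, only changes how well the portable fallbacks vectorize; the source is the same either way.

```c
/* Hedged example: SSE2 through SIMDe without SIMDE_ENABLE_NATIVE_ALIASES,
 * so every identifier carries the simde_ prefix (simde__m128i, simde_mm_*). */
#include <stdio.h>
#include <stdint.h>
#include "simde/x86/sse2.h"

int main(void) {
    int32_t out[4];
    simde__m128i a = simde_mm_set_epi32(4, 3, 2, 1);
    simde__m128i b = simde_mm_set_epi32(40, 30, 20, 10);

    /* add packed 32-bit integers and store the result unaligned */
    simde_mm_storeu_si128((simde__m128i *) out, simde_mm_add_epi32(a, b));

    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
    return 0;
}
```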
+ +If you choose not to use OpenMP SIMD, SIMDe also supports +using [Cilk Plus](https://www.cilkplus.org/), [GCC loop-specific +pragmas](https://gcc.gnu.org/onlinedocs/gcc/Loop-Specific-Pragmas.html), +or [clang pragma loop hint +directives](http://llvm.org/docs/Vectorizers.html#pragma-loop-hint-directives), +though these are not nearly as effective as OpenMP SIMD and depending +on them will likely result in less efficient code. All of these are +detected automatically by SIMDe, so if they are enabled in your +compiler nothing more is required. + +If for some reason you do not wish to enable OpenMP 4 SIMD support even +though SIMDe detects it, you should define `SIMDE_DISABLE_OPENMP` prior +to including SIMDe. + +## Portability + +### Compilers + +SIMDe does depend on some C99 features, though the subset supported by +MSVC also works. While we do our best to make sure we provide optimized +implementations where they are supported, SIMDe does contain portable +fallbacks which are designed to work on any C99 compiler. + +Every commit is tested in CI on multiple compilers, platforms, and +configurations, and our test coverage is extremely extensive. +Currently tested compilers include: + + * GCC versions back to 4.8 + * Clang versions back to 3.8 + * Microsoft Visual Studio back to 12 (2013) + * IBM XL C/C++ + * Intel C/C++ Compiler (ICC) + +I'm generally willing to accept patches to add support for other +compilers, as long as they're not too disruptive, *especially* if we +can get CI support going. If using one of our existing CI providers +isn't an option then other CI platforms can be added. + +### Hardware + +The following architectures are tested in CI for every commit: + + * x86_64/amd64 + * x86 + * AArch64 + * ARMv8 + * ARMv7 with VFPv3-D16 floating point + * ARMv5 EABI + * PPC64 + * z/Architecture (with "-mzvector") + * MIPS Loongson 64 + * RISC-V 64 + * emscripten 32- & 64-bit; regular and relaxed + +We would love to add more, so patches are extremely welcome! + +## Related Projects + + * The "builtins" module in + [portable-snippets](https://github.com/nemequ/portable-snippets) + does much the same thing, but for compiler-specific intrinsics + (think `__builtin_clz` and `_BitScanForward`), **not** SIMD + intrinsics. + * Intel offers an emulator, the [Intel® Software Development + Emulator](https://software.intel.com/en-us/articles/intel-software-development-emulator/) + which can be used to develop software which uses Intel intrinsics + without having to own hardware which supports them, though it + doesn't help for deployment. + * [Iris](https://github.com/AlexYaruki/iris) is the only other project + I'm aware of which is attempting to create portable implementations + like SIMDe. SIMDe is much further along on the Intel side, but Iris + looks to be in better shape on ARM. C++-only, Apache 2.0 license. + AFAICT there are no accelerated fallbacks, nor is there a good way to + add them since it relies extensively on templates. + * There are a few projects trying to implement one set with another: + * [ARM_NEON_2_x86_SSE](https://github.com/intel/ARM_NEON_2_x86_SSE) + — implementing NEON using SSE. Quite extensive, Apache 2.0 + license. + * [sse2neon](https://github.com/jratcliff63367/sse2neon) — + implementing SSE using NEON. This code has already been merged + into SIMDe. 
+ * [veclib](https://github.com/IvantheDugtrio/veclib) — implementing + SSE2 using AltiVec/VMX, using a non-free IBM library called + [powerveclib](https://www.ibm.com/developerworks/community/groups/community/powerveclib/) + * [SSE-to-NEON](https://github.com/otim/SSE-to-NEON) — implementing + SSE with NEON. Non-free, C++. + * [AvxToNeon](https://github.com/kunpengcompute/AvxToNeon) — Popular + AVX+ intrinsincs implemented in NEON. C, Apache 2.0 license. + * [neon2rvv](https://github.com/howjmay/neon2rvv) - A C/C++ header file that converts Arm/Aarch64 NEON intrinsics to RISC-V Vector (RVV) Extension, MIT license + * [sse2rvv](https://github.com/pattonkan/sse2rvv) - A C/C++ header file that converts Intel SSE intrinsics to RISCV-V Extension intrinsics, MIT license. + * [arm-neon-tests](https://github.com/christophe-lyon/arm-neon-tests) + contains tests to verify NEON implementations. + +If you know of any other related projects, please [let us +know](https://github.com/simd-everywhere/simde/issues/new)! + +## Caveats + +Sometime features can't be emulated. If SIMDe is operating in native +mode the functions will work as expected, but if there is no native +support some caveats apply: + + * Many functions require and/or . SIMDe will still + work without those headers, but the results of those functions are + undefined. + * x86 / x86_64 + * SSE + * `SIMDE_MM_SET_ROUNDING_MODE()` will use `fesetround()`, altering + the global rounding mode. + * `simde_mm_getcsr` and `simde_mm_setcsr` only implement bits 13 + and 14 (rounding mode). + * AVX + * `simde_mm256_test*` do not set the CF/ZF registers as there is + no portable way to implement that functionality. + * `simde_mm256_zeroall` and `simde_mm256_zeroupper` are not + implemented as there is no portable way to implement that + functionality. + +Additionally, there are some known limitations which apply when using +native aliases (`SIMDE_ENABLE_NATIVE_ALIASES`): + +* On Windows x86 (but not x86_64), some MMX functions and SSE/SSE2 + functions which use MMX types (__m64) other than for pointers may + return incorrect results. + +Also, as mentioned earlier, while some APIs make assumptions about +basic types (*e.g.*, `int` is 32 bits), SIMDe does not, so many types +have been altered to use portable fixed-width versions such as +`int32_t`. + +If you find any other differences, please file an issue so we can either fix +it or add it to the list above. + +## Benefactors + +SIMDe uses resources provided for free by a number of organizations. +While this shouldn't be taken to imply endorsement of SIMDe, we're +tremendously grateful for their support: + + * [IntegriCloud](https://integricloud.com/) — provides access to a very + fast POWER9 server for developing AltiVec/VMX support. + * [GCC Compile Farm](https://gcc.gnu.org/wiki/CompileFarm) — provides + access to a wide range of machines with different architectures for + developing support for various ISA extensions. + * [CodeCov.io](https://codecov.io/) — provides code coverage analysis + for our test cases. + * [Google](https://www.google.com/) ­— financing + [Summer of Code](https://summerofcode.withgoogle.com/), substantial + amounts of code (Sean Maher's contributions), and an [Open Source Peer + Bonus](https://opensource.google/docs/growing/peer-bonus/). + +Without such organizations donating resources, SIMDe wouldn't be nearly +as useful or usable as it is today. 
+ +We would also like to thank anyone who has helped develop the myriad +of software on which SIMDe relies, including compilers and analysis +tools. + +Finally, a special thank you to +[anyone who has contributed](https://github.com/simd-everywhere/simde/graphs/contributors) +to SIMDe, filed bugs, provided suggestions, or helped with SIMDe +development in any way. + +## License + +SIMDe is distributed under an MIT-style license; see COPYING for +details. + +## Contributors ✨ + +Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)): + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

* Evan Nemerson: 💻 🖋 📖 💡 🤔 💬 👀 ⚠️ 📢 🐛 🚇 🚧 📆
* Michael R. Crusoe: 🐛 💻 📋 🔍 🤔 🚇 📦 ⚠️ 🚧 📆 👀
* HIMANSHI MATHUR: 💻 ⚠️
* Hidayat Khan: 💻 ⚠️
* rosbif: 💻 ⚠️ 🐛 🤔 📖
* Jun Aruga: 💻 🤔 📦 🚇 🚧 ⚠️ 🐛
* Élie ROUDNINSKI: 💻 ⚠️
* Jesper Storm Bache: 💻
* Jeff Daily: 💻 🚇
* Pavel: 💻
* Sabarish Bollapragada: 💻
* Gavin Li: 💻
* Yining Karl Li: 💻
* Anirban Dey: 📖
* Darren Ng: 📖
* FaresSalem: 📖
* Pradnyesh Gore: 💻
* Sean Maher: 💻
* Mingye Wang: 📖
* Ng Zhi An: 💻 📖
* Atharva Nimbalkar: 💻 ⚠️
* simba611: 💻 ⚠️
* Ashleigh Newman-Jones: 💻 ⚠️
* Willy R. Vasquez: 💻 🚧 ⚠️
* Keith Winstein: 💻 🚧 ⚠️
* David Seifert: 🚧
* Milot Mirdita: 💻 🚧 ⚠️
* aqrit: 💻 🚧
* Décio Luiz Gazzoni Filho: 💻 🚧 ⚠️
* Igor Molchanov: 💻 🚧 📦
* Andrew Rodriguez: 💻 🚧 ⚠️
* Changqing Jing: 🚧
* JP Cimalando: 💻 🚇
* Jiaxun Yang: 💻 📦
* Masahiro Kitagawa: 💻 ⚠️
* Pavel Iatchenii: 💻 ⚠️
* Tommy Vercetti: 🚧
* Robert Cohn: 🚧
* Adam Novak: 📖
* boris-kuz: 🚧
* Dimo Markov: 🚧
* dblue: 🚧
* zekehul: 💻 🚧
* Laurent Thomas: 💻
* Max Bachmann: 📖
* psaab: 🚧
* Sam Clegg: 🚧
* Thomas Lively: 🐛 🤔 🚧
* coderzh: 💻 ⚠️
* Dominik Kutra: 💻 ⚠️
* Lithrein: 🚧
* Nick: 🚧
* thomasdwu: 🚧
* Stephen: 🐛
* John Platts: 🐛
* Steven Noonan: 🐛
* p0nce: 🐛
* Paul Wise: 🐛
* easyaspi314 (Devin): 🐛 💻
* JonLiu1993: 📦
* Cheney Wang: 📦
* myd7349: 📦
* chausner: 📦
* Yi-Yen Chung: 💻 ⚠️
* Chi-Wei Chu: 💻 ⚠️
* M-HT: 💻
* Simon Gene Gottlieb: 💻
* Chris Bielow: 💻
* gu xiwei: 📦 ⚠️
* George Vinokhodov: 💻
* Cœur: 💻
* Florian @Proudsalsa: 💻
* Thomas Schlichter: 🐛 💻
+ + + + + +This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind are welcome! diff --git a/lib/simd_wrapper/simde/arm/neon.h b/lib/simd_wrapper/simde/arm/neon.h index df91b0d9334..5835db69642 100644 --- a/lib/simd_wrapper/simde/arm/neon.h +++ b/lib/simd_wrapper/simde/arm/neon.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_H) @@ -30,23 +31,32 @@ #include "neon/types.h" #include "neon/aba.h" +#include "neon/abal.h" +#include "neon/abal_high.h" #include "neon/abd.h" #include "neon/abdl.h" +#include "neon/abdl_high.h" #include "neon/abs.h" #include "neon/add.h" #include "neon/addhn.h" +#include "neon/addhn_high.h" #include "neon/addl.h" #include "neon/addlv.h" #include "neon/addl_high.h" #include "neon/addv.h" #include "neon/addw.h" #include "neon/addw_high.h" +#include "neon/aes.h" #include "neon/and.h" #include "neon/bcax.h" #include "neon/bic.h" #include "neon/bsl.h" +#include "neon/cadd_rot270.h" +#include "neon/cadd_rot90.h" #include "neon/cage.h" #include "neon/cagt.h" +#include "neon/cale.h" +#include "neon/calt.h" #include "neon/ceq.h" #include "neon/ceqz.h" #include "neon/cge.h" @@ -60,13 +70,24 @@ #include "neon/cltz.h" #include "neon/clz.h" #include "neon/cmla.h" -#include "neon/cmla_rot90.h" +#include "neon/cmla_lane.h" #include "neon/cmla_rot180.h" +#include "neon/cmla_rot180_lane.h" #include "neon/cmla_rot270.h" +#include "neon/cmla_rot270_lane.h" +#include "neon/cmla_rot90.h" +#include "neon/cmla_rot90_lane.h" #include "neon/cnt.h" #include "neon/cvt.h" +#include "neon/cvt_n.h" +#include "neon/cvtm.h" +#include "neon/cvtn.h" +#include "neon/cvtp.h" #include "neon/combine.h" +#include "neon/copy_lane.h" +#include "neon/crc32.h" #include "neon/create.h" +#include "neon/div.h" #include "neon/dot.h" #include "neon/dot_lane.h" #include "neon/dup_lane.h" @@ -76,6 +97,11 @@ #include "neon/fma.h" #include "neon/fma_lane.h" #include "neon/fma_n.h" +#include "neon/fmlal.h" +#include "neon/fmlsl.h" +#include "neon/fms.h" +#include "neon/fms_lane.h" +#include "neon/fms_n.h" #include "neon/get_high.h" #include "neon/get_lane.h" #include "neon/get_low.h" @@ -84,30 +110,48 @@ #include "neon/ld1.h" #include "neon/ld1_dup.h" #include "neon/ld1_lane.h" +#include "neon/ld1_x2.h" +#include "neon/ld1_x3.h" +#include "neon/ld1_x4.h" +#include "neon/ld1q_x2.h" +#include "neon/ld1q_x3.h" +#include "neon/ld1q_x4.h" #include "neon/ld2.h" +#include "neon/ld2_dup.h" +#include "neon/ld2_lane.h" #include "neon/ld3.h" +#include "neon/ld3_dup.h" +#include "neon/ld3_lane.h" #include "neon/ld4.h" +#include "neon/ld4_dup.h" #include "neon/ld4_lane.h" #include "neon/max.h" #include "neon/maxnm.h" +#include "neon/maxnmv.h" #include "neon/maxv.h" #include "neon/min.h" #include "neon/minnm.h" +#include "neon/minnmv.h" #include "neon/minv.h" #include "neon/mla.h" +#include "neon/mla_lane.h" #include "neon/mla_n.h" #include "neon/mlal.h" #include "neon/mlal_high.h" +#include "neon/mlal_high_lane.h" #include "neon/mlal_high_n.h" #include "neon/mlal_lane.h" #include "neon/mlal_n.h" #include "neon/mls.h" +#include "neon/mls_lane.h" #include "neon/mls_n.h" #include "neon/mlsl.h" #include "neon/mlsl_high.h" +#include "neon/mlsl_high_lane.h" #include "neon/mlsl_high_n.h" #include "neon/mlsl_lane.h" #include "neon/mlsl_n.h" +#include "neon/mmlaq.h" #include "neon/movl.h" #include "neon/movl_high.h" #include "neon/movn.h" @@ -117,8 +161,13 @@ 
#include "neon/mul_n.h" #include "neon/mull.h" #include "neon/mull_high.h" +#include "neon/mull_high_lane.h" +#include "neon/mull_high_n.h" #include "neon/mull_lane.h" #include "neon/mull_n.h" +#include "neon/mulx.h" +#include "neon/mulx_lane.h" +#include "neon/mulx_n.h" #include "neon/mvn.h" #include "neon/neg.h" #include "neon/orn.h" @@ -127,59 +176,117 @@ #include "neon/padd.h" #include "neon/paddl.h" #include "neon/pmax.h" +#include "neon/pmaxnm.h" #include "neon/pmin.h" +#include "neon/pminnm.h" #include "neon/qabs.h" #include "neon/qadd.h" +#include "neon/qdmlal.h" +#include "neon/qdmlal_high.h" +#include "neon/qdmlal_high_lane.h" +#include "neon/qdmlal_high_n.h" +#include "neon/qdmlal_lane.h" +#include "neon/qdmlal_n.h" +#include "neon/qdmlsl.h" +#include "neon/qdmlsl_high.h" +#include "neon/qdmlsl_high_lane.h" +#include "neon/qdmlsl_high_n.h" +#include "neon/qdmlsl_lane.h" +#include "neon/qdmlsl_n.h" #include "neon/qdmulh.h" #include "neon/qdmulh_lane.h" #include "neon/qdmulh_n.h" #include "neon/qdmull.h" +#include "neon/qdmull_high.h" +#include "neon/qdmull_high_lane.h" +#include "neon/qdmull_high_n.h" +#include "neon/qdmull_lane.h" +#include "neon/qdmull_n.h" +#include "neon/qrdmlah.h" +#include "neon/qrdmlah_lane.h" +#include "neon/qrdmlsh.h" +#include "neon/qrdmlsh_lane.h" #include "neon/qrdmulh.h" #include "neon/qrdmulh_lane.h" #include "neon/qrdmulh_n.h" +#include "neon/qrshl.h" +#include "neon/qrshrn_high_n.h" #include "neon/qrshrn_n.h" +#include "neon/qrshrun_high_n.h" #include "neon/qrshrun_n.h" #include "neon/qmovn.h" -#include "neon/qmovun.h" #include "neon/qmovn_high.h" +#include "neon/qmovun.h" +#include "neon/qmovun_high.h" #include "neon/qneg.h" #include "neon/qsub.h" #include "neon/qshl.h" +#include "neon/qshl_n.h" #include "neon/qshlu_n.h" +#include "neon/qshrn_high_n.h" #include "neon/qshrn_n.h" +#include "neon/qshrun_high_n.h" #include "neon/qshrun_n.h" #include "neon/qtbl.h" #include "neon/qtbx.h" +#include "neon/raddhn.h" +#include "neon/raddhn_high.h" +#include "neon/rax.h" #include "neon/rbit.h" #include "neon/recpe.h" #include "neon/recps.h" +#include "neon/recpx.h" #include "neon/reinterpret.h" #include "neon/rev16.h" #include "neon/rev32.h" #include "neon/rev64.h" #include "neon/rhadd.h" #include "neon/rnd.h" +#include "neon/rnd32x.h" +#include "neon/rnd32z.h" +#include "neon/rnd64x.h" +#include "neon/rnd64z.h" +#include "neon/rnda.h" #include "neon/rndm.h" #include "neon/rndi.h" #include "neon/rndn.h" #include "neon/rndp.h" +#include "neon/rndx.h" #include "neon/rshl.h" #include "neon/rshr_n.h" +#include "neon/rshrn_high_n.h" #include "neon/rshrn_n.h" #include "neon/rsqrte.h" #include "neon/rsqrts.h" #include "neon/rsra_n.h" +#include "neon/rsubhn.h" +#include "neon/rsubhn_high.h" #include "neon/set_lane.h" +#include "neon/sha1.h" +#include "neon/sha256.h" +#include "neon/sha512.h" #include "neon/shl.h" #include "neon/shl_n.h" +#include "neon/shll_high_n.h" #include "neon/shll_n.h" #include "neon/shr_n.h" +#include "neon/shrn_high_n.h" #include "neon/shrn_n.h" +#include "neon/sli_n.h" +#include "neon/sm3.h" +#include "neon/sm4.h" #include "neon/sqadd.h" +#include "neon/sqrt.h" #include "neon/sra_n.h" #include "neon/sri_n.h" #include "neon/st1.h" #include "neon/st1_lane.h" +#include "neon/st1_x2.h" +#include "neon/st1_x3.h" +#include "neon/st1_x4.h" +#include "neon/st1q_x2.h" +#include "neon/st1q_x3.h" +#include "neon/st1q_x4.h" #include "neon/st2.h" #include "neon/st2_lane.h" #include "neon/st3.h" @@ -188,10 +295,12 @@ #include "neon/st4_lane.h" #include 
"neon/sub.h" #include "neon/subhn.h" +#include "neon/subhn_high.h" #include "neon/subl.h" #include "neon/subl_high.h" #include "neon/subw.h" #include "neon/subw_high.h" +#include "neon/sudot_lane.h" #include "neon/tbl.h" #include "neon/tbx.h" #include "neon/trn.h" @@ -199,6 +308,8 @@ #include "neon/trn2.h" #include "neon/tst.h" #include "neon/uqadd.h" +#include "neon/usdot.h" +#include "neon/usdot_lane.h" #include "neon/uzp.h" #include "neon/uzp1.h" #include "neon/uzp2.h" diff --git a/lib/simd_wrapper/simde/arm/neon/abal.h b/lib/simd_wrapper/simde/arm/neon/abal.h new file mode 100644 index 00000000000..7e5093d37ec --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/abal.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_ABAL_H) +#define SIMDE_ARM_NEON_ABAL_H + +#include "abdl.h" +#include "add.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vabal_s8(simde_int16x8_t a, simde_int8x8_t b, simde_int8x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vabal_s8(a, b, c); + #else + return simde_vaddq_s16(simde_vabdl_s8(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vabal_s8 + #define vabal_s8(a, b, c) simde_vabal_s8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vabal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vabal_s16(a, b, c); + #else + return simde_vaddq_s32(simde_vabdl_s16(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vabal_s16 + #define vabal_s16(a, b, c) simde_vabal_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vabal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vabal_s32(a, b, c); + #else + return simde_vaddq_s64(simde_vabdl_s32(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vabal_s32 + #define vabal_s32(a, b, c) simde_vabal_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vabal_u8(simde_uint16x8_t a, simde_uint8x8_t b, simde_uint8x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vabal_u8(a, b, c); + #else + return simde_vaddq_u16(simde_vabdl_u8(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vabal_u8 + #define vabal_u8(a, b, c) simde_vabal_u8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vabal_u16(simde_uint32x4_t a, simde_uint16x4_t b, simde_uint16x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vabal_u16(a, b, c); + #else + return simde_vaddq_u32(simde_vabdl_u16(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vabal_u16 + #define vabal_u16(a, b, c) simde_vabal_u16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vabal_u32(simde_uint64x2_t a, simde_uint32x2_t b, simde_uint32x2_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vabal_u32(a, b, c); + #else + return simde_vaddq_u64(simde_vabdl_u32(b, c), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vabal_u32 + #define vabal_u32(a, b, c) simde_vabal_u32((a), (b), (c)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_abal_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/abal_high.h b/lib/simd_wrapper/simde/arm/neon/abal_high.h new file mode 100644 index 00000000000..78f538dc410 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/abal_high.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following 
conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_ABAL_HIGH_H) +#define SIMDE_ARM_NEON_ABAL_HIGH_H + +#include "abdl.h" +#include "add.h" +#include "movl_high.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vabal_high_s8(simde_int16x8_t a, simde_int8x16_t b, simde_int8x16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabal_high_s8(a, b, c); + #else + return simde_vaddq_s16(simde_vabdl_s8(simde_vget_high_s8(b), simde_vget_high_s8(c)), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabal_high_s8 + #define vabal_high_s8(a, b, c) simde_vabal_high_s8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vabal_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabal_high_s16(a, b, c); + #else + return simde_vaddq_s32(simde_vabdl_s16(simde_vget_high_s16(b), simde_vget_high_s16(c)), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabal_high_s16 + #define vabal_high_s16(a, b, c) simde_vabal_high_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vabal_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabal_high_s32(a, b, c); + #else + return simde_vaddq_s64(simde_vabdl_s32(simde_vget_high_s32(b), simde_vget_high_s32(c)), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabal_high_s32 + #define vabal_high_s32(a, b, c) simde_vabal_high_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vabal_high_u8(simde_uint16x8_t a, simde_uint8x16_t b, simde_uint8x16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabal_high_u8(a, b, c); + #else + return simde_vaddq_u16(simde_vabdl_u8(simde_vget_high_u8(b), simde_vget_high_u8(c)), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabal_high_u8 + #define vabal_high_u8(a, b, c) simde_vabal_high_u8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vabal_high_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabal_high_u16(a, b, c); + #else + return simde_vaddq_u32(simde_vabdl_u16(simde_vget_high_u16(b), simde_vget_high_u16(c)), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabal_high_u16 + #define vabal_high_u16(a, b, c) simde_vabal_high_u16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vabal_high_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabal_high_u32(a, b, c); + #else + 
return simde_vaddq_u64(simde_vabdl_u32(simde_vget_high_u32(b), simde_vget_high_u32(c)), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabal_high_u32 + #define vabal_high_u32(a, b, c) simde_vabal_high_u32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_abal_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/abd.h b/lib/simd_wrapper/simde/arm/neon/abd.h index 0a814e8d97f..fdb1131adc9 100644 --- a/lib/simd_wrapper/simde/arm/neon/abd.h +++ b/lib/simd_wrapper/simde/arm/neon/abd.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ABD_H) @@ -37,6 +38,23 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vabdh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vabdh_f16(a, b); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + simde_float32_t r_ = a_ - b_; + return r_ < 0 ? simde_float16_from_float32(-r_) : simde_float16_from_float32(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabdh_f16 + #define vabdh_f16(a, b) simde_vabdh_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vabds_f32(simde_float32_t a, simde_float32_t b) { @@ -67,6 +85,20 @@ simde_vabdd_f64(simde_float64_t a, simde_float64_t b) { #define vabdd_f64(a, b) simde_vabdd_f64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vabd_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vabd_f16(a, b); + #else + return simde_vabs_f16(simde_vsub_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vabd_f16 + #define vabd_f16(a, b) simde_vabd_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vabd_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -116,6 +148,15 @@ simde_vabd_s8(simde_int8x8_t a, simde_int8x8_t b) { m ); + return simde_int8x8_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x8_private r_, max_, min_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + + max_.sv64 = __riscv_vmax_vv_i8m1(a_.sv64, b_.sv64, 8); + min_.sv64 = __riscv_vmin_vv_i8m1(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vsub_vv_i8m1(max_.sv64, min_.sv64, 8); return simde_int8x8_from_private(r_); #else return simde_vmovn_s16(simde_vabsq_s16(simde_vsubl_s8(a, b))); @@ -139,6 +180,15 @@ simde_vabd_s16(simde_int16x4_t a, simde_int16x4_t b) { r_.m64 = _mm_sub_pi16(_mm_max_pi16(a_.m64, b_.m64), _mm_min_pi16(a_.m64, b_.m64)); + return simde_int16x4_from_private(r_); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x4_private r_, max_, min_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + + max_.sv64 = __riscv_vmax_vv_i16m1(a_.sv64, b_.sv64, 4); + min_.sv64 = __riscv_vmin_vv_i16m1(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vsub_vv_i16m1(max_.sv64, min_.sv64, 4); return simde_int16x4_from_private(r_); #else return simde_vmovn_s32(simde_vabsq_s32(simde_vsubl_s16(a, b))); @@ -154,6 +204,15 @@ simde_int32x2_t simde_vabd_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return 
vabd_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private r_, max_, min_; + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + + max_.sv64 = __riscv_vmax_vv_i32m1(a_.sv64, b_.sv64, 2); + min_.sv64 = __riscv_vmin_vv_i32m1(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vsub_vv_i32m1(max_.sv64, min_.sv64, 2); + return simde_int32x2_from_private(r_); #else return simde_vmovn_s64(simde_vabsq_s64(simde_vsubl_s32(a, b))); #endif @@ -168,6 +227,15 @@ simde_uint8x8_t simde_vabd_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vabd_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x8_private r_, max_, min_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + + max_.sv64 = __riscv_vmaxu_vv_u8m1(a_.sv64, b_.sv64, 8); + min_.sv64 = __riscv_vminu_vv_u8m1(a_.sv64, b_.sv64, 8); + r_.sv64 = __riscv_vsub_vv_u8m1(max_.sv64, min_.sv64, 8); + return simde_uint8x8_from_private(r_); #else return simde_vmovn_u16( simde_vreinterpretq_u16_s16( @@ -187,6 +255,15 @@ simde_uint16x4_t simde_vabd_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vabd_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x4_private r_, max_, min_; + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_uint16x4_private b_ = simde_uint16x4_to_private(b); + + max_.sv64 = __riscv_vmaxu_vv_u16m1(a_.sv64, b_.sv64, 4); + min_.sv64 = __riscv_vminu_vv_u16m1(a_.sv64, b_.sv64, 4); + r_.sv64 = __riscv_vsub_vv_u16m1(max_.sv64, min_.sv64, 4); + return simde_uint16x4_from_private(r_); #else return simde_vmovn_u32( simde_vreinterpretq_u32_s32( @@ -206,6 +283,15 @@ simde_uint32x2_t simde_vabd_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vabd_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private r_, max_, min_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_uint32x2_private b_ = simde_uint32x2_to_private(b); + + max_.sv64 = __riscv_vmaxu_vv_u32m1(a_.sv64, b_.sv64, 2); + min_.sv64 = __riscv_vminu_vv_u32m1(a_.sv64, b_.sv64, 2); + r_.sv64 = __riscv_vsub_vv_u32m1(max_.sv64, min_.sv64, 2); + return simde_uint32x2_from_private(r_); #else return simde_vmovn_u64( simde_vreinterpretq_u64_s64( @@ -220,6 +306,20 @@ simde_vabd_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vabd_u32(a, b) simde_vabd_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vabdq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vabdq_f16(a, b); + #else + return simde_vabsq_f16(simde_vsubq_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vabdq_f16 + #define vabdq_f16(a, b) simde_vabdq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vabdq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -277,6 +377,12 @@ simde_vabdq_s8(simde_int8x16_t a, simde_int8x16_t b) { ); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_sub(wasm_i8x16_max(a_.v128, b_.v128), wasm_i8x16_min(a_.v128, b_.v128)); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private max_, min_; + + max_.sv128 = __riscv_vmax_vv_i8m1(a_.sv128, b_.sv128, 16); + min_.sv128 = __riscv_vmin_vv_i8m1(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vsub_vv_i8m1(max_.sv128, min_.sv128, 16); #else 
SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -313,6 +419,12 @@ simde_vabdq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_sub_epi16(_mm_max_epi16(a_.m128i, b_.m128i), _mm_min_epi16(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_sub(wasm_i16x8_max(a_.v128, b_.v128), wasm_i16x8_min(a_.v128, b_.v128)); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private max_, min_; + + max_.sv128 = __riscv_vmax_vv_i16m1(a_.sv128, b_.sv128, 8); + min_.sv128 = __riscv_vmin_vv_i16m1(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vsub_vv_i16m1(max_.sv128, min_.sv128, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -348,6 +460,8 @@ simde_vabdq_s32(simde_int32x4_t a, simde_int32x4_t b) { #if defined(SIMDE_X86_SSE4_1_NATIVE) r_.m128i = _mm_sub_epi32(_mm_max_epi32(a_.m128i, b_.m128i), _mm_min_epi32(a_.m128i, b_.m128i)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.v128 = wasm_i32x4_sub(wasm_i32x4_max(a_.v128, b_.v128), wasm_i32x4_min(a_.v128, b_.v128)); #elif defined(SIMDE_X86_SSE2_NATIVE) const __m128i m = _mm_cmpgt_epi32(b_.m128i, a_.m128i); r_.m128i = @@ -358,6 +472,12 @@ simde_vabdq_s32(simde_int32x4_t a, simde_int32x4_t b) { ), m ); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private max_, min_; + + max_.sv128 = __riscv_vmax_vv_i32m1(a_.sv128, b_.sv128, 4); + min_.sv128 = __riscv_vmin_vv_i32m1(a_.sv128, b_.sv128, 4); + r_.sv128 = __riscv_vsub_vv_i32m1(max_.sv128, min_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -395,6 +515,12 @@ simde_vabdq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { r_.m128i = _mm_sub_epi8(_mm_max_epu8(a_.m128i, b_.m128i), _mm_min_epu8(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_sub(wasm_u8x16_max(a_.v128, b_.v128), wasm_u8x16_min(a_.v128, b_.v128)); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private max_, min_; + + max_.sv128 = __riscv_vmaxu_vv_u8m1(a_.sv128, b_.sv128, 16); + min_.sv128 = __riscv_vminu_vv_u8m1(a_.sv128, b_.sv128, 16); + r_.sv128 = __riscv_vsub_vv_u8m1(max_.sv128, min_.sv128, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -432,6 +558,12 @@ simde_vabdq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.m128i = _mm_sub_epi16(_mm_max_epu16(a_.m128i, b_.m128i), _mm_min_epu16(a_.m128i, b_.m128i)); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_sub(wasm_u16x8_max(a_.v128, b_.v128), wasm_u16x8_min(a_.v128, b_.v128)); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private max_, min_; + + max_.sv128 = __riscv_vmaxu_vv_u16m1(a_.sv128, b_.sv128, 8); + min_.sv128 = __riscv_vminu_vv_u16m1(a_.sv128, b_.sv128, 8); + r_.sv128 = __riscv_vsub_vv_u16m1(max_.sv128, min_.sv128, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -467,6 +599,14 @@ simde_vabdq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #if defined(SIMDE_X86_SSE4_2_NATIVE) r_.m128i = _mm_sub_epi32(_mm_max_epu32(a_.m128i, b_.m128i), _mm_min_epu32(a_.m128i, b_.m128i)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.v128 = wasm_i32x4_sub(wasm_u32x4_max(a_.v128, b_.v128), wasm_u32x4_min(a_.v128, b_.v128)); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private max_, min_; + + max_.sv128 = __riscv_vmaxu_vv_u32m1(a_.sv128, b_.sv128, 4); + min_.sv128 = __riscv_vminu_vv_u32m1(a_.sv128, b_.sv128, 4); + 
r_.sv128 = __riscv_vsub_vv_u32m1(max_.sv128, min_.sv128, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/lib/simd_wrapper/simde/arm/neon/abdl_high.h b/lib/simd_wrapper/simde/arm/neon/abdl_high.h new file mode 100644 index 00000000000..826b1ba33dd --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/abdl_high.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_ABDL_HIGH_H) +#define SIMDE_ARM_NEON_ABDL_HIGH_H + +#include "abdl.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vabdl_high_s8(simde_int8x16_t a, simde_int8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabdl_high_s8(a, b); + #else + return simde_vabdl_s8(simde_vget_high_s8(a), simde_vget_high_s8(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabdl_high_s8 + #define vabdl_high_s8(a, b) simde_vabdl_high_s8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vabdl_high_s16(simde_int16x8_t a, simde_int16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabdl_high_s16(a, b); + #else + return simde_vabdl_s16(simde_vget_high_s16(a), simde_vget_high_s16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabdl_high_s16 + #define vabdl_high_s16(a, b) simde_vabdl_high_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vabdl_high_s32(simde_int32x4_t a, simde_int32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabdl_high_s32(a, b); + #else + return simde_vabdl_s32(simde_vget_high_s32(a), simde_vget_high_s32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabdl_high_s32 + #define vabdl_high_s32(a, b) simde_vabdl_high_s32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vabdl_high_u8(simde_uint8x16_t a, simde_uint8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabdl_high_u8(a, b); + #else + return simde_vabdl_u8(simde_vget_high_u8(a), simde_vget_high_u8(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabdl_high_u8 + #define vabdl_high_u8(a, b) simde_vabdl_high_u8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t 
+simde_vabdl_high_u16(simde_uint16x8_t a, simde_uint16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabdl_high_u16(a, b); + #else + return simde_vabdl_u16(simde_vget_high_u16(a), simde_vget_high_u16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabdl_high_u16 + #define vabdl_high_u16(a, b) simde_vabdl_high_u16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vabdl_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vabdl_high_u32(a, b); + #else + return simde_vabdl_u32(simde_vget_high_u32(a), simde_vget_high_u32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vabdl_high_u32 + #define vabdl_high_u32(a, b) simde_vabdl_high_u32((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ABDL_HIGH_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/abs.h b/lib/simd_wrapper/simde/arm/neon/abs.h index 3c705e98b4e..16250da787b 100644 --- a/lib/simd_wrapper/simde/arm/neon/abs.h +++ b/lib/simd_wrapper/simde/arm/neon/abs.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ABS_H) @@ -47,6 +48,45 @@ simde_vabsd_s64(int64_t a) { #define vabsd_s64(a) simde_vabsd_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vabsh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vabsh_f16(a); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + + return (a_ >= 0.0f) ? simde_float16_from_float32(a_) : simde_float16_from_float32(-a_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vabsh_f16 + #define vabsh_f16(a) simde_vabsh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vabs_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vabs_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vabsh_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vabs_f16 + #define vabs_f16(a) simde_vabs_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vabs_f32(simde_float32x2_t a) { @@ -211,6 +251,29 @@ simde_vabs_s64(simde_int64x1_t a) { #define vabs_s64(a) simde_vabs_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vabsq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vabsq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vabsh_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vabsq_f16 + #define vabsq_f16(a) simde_vabsq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vabsq_f32(simde_float32x4_t a) { @@ -374,7 +437,7 @@ simde_vabsq_s32(simde_int32x4_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] < 0 ? 
-a_.values[i] : a_.values[i]; + r_.values[i] = a_.values[i] < 0 ? HEDLEY_STATIC_CAST(int32_t, 0 - HEDLEY_STATIC_CAST(uint32_t, a_.values[i])) : a_.values[i]; } #endif @@ -413,7 +476,7 @@ simde_vabsq_s64(simde_int64x2_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] < 0 ? -a_.values[i] : a_.values[i]; + r_.values[i] = a_.values[i] < 0 ? HEDLEY_STATIC_CAST(int64_t, 0 - HEDLEY_STATIC_CAST(uint64_t, a_.values[i])) : a_.values[i]; } #endif diff --git a/lib/simd_wrapper/simde/arm/neon/add.h b/lib/simd_wrapper/simde/arm/neon/add.h index d3660f66085..8b4fe3499ae 100644 --- a/lib/simd_wrapper/simde/arm/neon/add.h +++ b/lib/simd_wrapper/simde/arm/neon/add.h @@ -22,6 +22,8 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ADD_H) @@ -35,7 +37,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_float16 -simde_vaddh_f16(simde_float16 a, simde_float16 b) { +simde_vaddh_f16(simde_float16_t a, simde_float16_t b) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vaddh_f16(a, b); #else @@ -88,10 +90,14 @@ simde_vadd_f16(simde_float16x4_t a, simde_float16x4_t b) { a_ = simde_float16x4_to_private(a), b_ = simde_float16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]); - } + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv64 = __riscv_vfadd_vv_f16m1(a_.sv64, b_.sv64, 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]); + } + #endif return simde_float16x4_from_private(r_); #endif @@ -112,7 +118,9 @@ simde_vadd_f32(simde_float32x2_t a, simde_float32x2_t b) { a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfadd_vv_f32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -140,7 +148,9 @@ simde_vadd_f64(simde_float64x1_t a, simde_float64x1_t b) { a_ = simde_float64x1_to_private(a), b_ = simde_float64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfadd_vv_f64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -168,7 +178,9 @@ simde_vadd_s8(simde_int8x8_t a, simde_int8x8_t b) { a_ = simde_int8x8_to_private(a), b_ = simde_int8x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_i8m1(a_.sv64, b_.sv64, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #elif defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_add_pi8(a_.m64, b_.m64); @@ -198,7 +210,9 @@ simde_vadd_s16(simde_int16x4_t a, simde_int16x4_t b) { a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_i16m1(a_.sv64, b_.sv64, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #elif defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_add_pi16(a_.m64, b_.m64); @@ -228,7 +242,9 @@ 
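Note on the vabsq_s32/vabsq_s64 change just above: negating the most negative value directly is signed overflow (undefined behaviour in C), so the scalar fallback now performs the negation in unsigned arithmetic and casts back, which wraps INT32_MIN to INT32_MIN just as the hardware VABS instruction does. A standalone sketch of the same idiom (function name is illustrative; the final cast is implementation-defined but behaves this way on all common two's-complement targets):

    #include <stdint.h>

    /* Sketch: UB-free negation for the scalar |x| fallback. */
    static int32_t safe_abs_s32(int32_t v) {
      /* 0u - (uint32_t)v is defined for every input; casting back maps INT32_MIN to itself. */
      return (v < 0) ? (int32_t)(0u - (uint32_t)v) : v;
    }
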
simde_vadd_s32(simde_int32x2_t a, simde_int32x2_t b) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_i32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #elif defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_add_pi32(a_.m64, b_.m64); @@ -258,7 +274,9 @@ simde_vadd_s64(simde_int64x1_t a, simde_int64x1_t b) { a_ = simde_int64x1_to_private(a), b_ = simde_int64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_i64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -286,7 +304,9 @@ simde_vadd_u8(simde_uint8x8_t a, simde_uint8x8_t b) { a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_u8m1(a_.sv64, b_.sv64, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -314,7 +334,10 @@ simde_vadd_u16(simde_uint16x4_t a, simde_uint16x4_t b) { a_ = simde_uint16x4_to_private(a), b_ = simde_uint16x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_u16m1(a_.sv64, b_.sv64, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -342,7 +365,9 @@ simde_vadd_u32(simde_uint32x2_t a, simde_uint32x2_t b) { a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_u32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -370,7 +395,9 @@ simde_vadd_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vadd_vv_u64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -397,10 +424,15 @@ simde_vaddq_f16(simde_float16x8_t a, simde_float16x8_t b) { r_, a_ = simde_float16x8_to_private(a), b_ = simde_float16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]); - } + + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv128 = __riscv_vfadd_vv_f16m1(a_.sv128, b_.sv128, 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vaddh_f16(a_.values[i], b_.values[i]); + } + #endif return simde_float16x8_from_private(r_); #endif @@ -431,6 +463,8 @@ simde_vaddq_f32(simde_float32x4_t a, simde_float32x4_t b) { r_.m128 = _mm_add_ps(a_.m128, b_.m128); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfadd_vv_f32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -465,6 +499,8 @@ simde_vaddq_f64(simde_float64x2_t a, simde_float64x2_t b) { r_.m128d = _mm_add_pd(a_.m128d, b_.m128d); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = 
wasm_f64x2_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfadd_vv_f64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -499,6 +535,8 @@ simde_vaddq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_add_epi8(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i8x16_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_i8m1(a_.sv128, b_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -533,6 +571,8 @@ simde_vaddq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_add_epi16(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i16x8_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_i16m1(a_.sv128, b_.sv128, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -567,6 +607,8 @@ simde_vaddq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_add_epi32(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_i32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -601,6 +643,8 @@ simde_vaddq_s64(simde_int64x2_t a, simde_int64x2_t b) { r_.m128i = _mm_add_epi64(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i64x2_add(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_i64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else @@ -631,7 +675,9 @@ simde_vaddq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { a_ = simde_uint8x16_to_private(a), b_ = simde_uint8x16_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_u8m1(a_.sv128, b_.sv128, 16); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -661,7 +707,9 @@ simde_vaddq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { a_ = simde_uint16x8_to_private(a), b_ = simde_uint16x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_u16m1(a_.sv128, b_.sv128, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -691,7 +739,9 @@ simde_vaddq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { a_ = simde_uint32x4_to_private(a), b_ = simde_uint32x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_u32m1(a_.sv128, b_.sv128, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -721,7 +771,9 @@ simde_vaddq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { a_ = simde_uint64x2_to_private(a), b_ = simde_uint64x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vadd_vv_u64m1(a_.sv128, b_.sv128, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values + b_.values; #else SIMDE_VECTORIZE @@ -738,6 +790,172 @@ simde_vaddq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vaddq_u64(a, b) simde_vaddq_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vadd_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(_GCC_ARM_NEON_H) + return vadd_p8(a, b); + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFF); + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vadd_p8 + #define vadd_p8(a, b) simde_vadd_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vadd_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(_GCC_ARM_NEON_H) + return vadd_p16(a, b); + #else + simde_poly16x4_private + r_, + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFFFF); + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vadd_p16 + #define vadd_p16(a, b) simde_vadd_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vadd_p64(simde_poly64x1_t a, simde_poly64x1_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_CRYPTO) && \ + !defined(_GCC_ARM_NEON_H) + return vadd_p64(a, b); + #else + simde_poly64x1_private + r_, + a_ = simde_poly64x1_to_private(a), + b_ = simde_poly64x1_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFFFFFFFFFFFFFFFF); + } + + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vadd_p64 + #define vadd_p64(a, b) simde_vadd_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vaddq_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(_GCC_ARM_NEON_H) + return vaddq_p8(a, b); + #else + simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFF); + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vaddq_p8 + #define vaddq_p8(a, b) simde_vaddq_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vaddq_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(_GCC_ARM_NEON_H) + return vaddq_p16(a, b); + #else + simde_poly16x8_private + r_, + a_ = simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFFFF); + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vaddq_p16 + #define vaddq_p16(a, b) simde_vaddq_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vaddq_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_CRYPTO) && \ + !defined(_GCC_ARM_NEON_H) + return vaddq_p64(a, b); + #else + simde_poly64x2_private + 
r_, + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = b_.values[i] ^ ((0 ^ a_.values[i]) & 0xFFFFFFFFFFFFFFFF); + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vaddq_p64 + #define vaddq_p64(a, b) simde_vaddq_p64((a), (b)) +#endif + +#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vaddq_p128(simde_poly128_t a, simde_poly128_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_CRYPTO) && \ + !defined(_GCC_ARM_NEON_H) + return vaddq_p128(a, b); + #else + simde_poly128_t mask = 0xFFFFFFFFFFFFFFFFull; + mask = mask << 64; + mask = mask | 0xFFFFFFFFFFFFFFFFull; + return b ^ ((0 ^ a) & mask); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vaddq_p128 + #define vaddq_p128(a, b) simde_vaddq_p128((a), (b)) +#endif +#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */ + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/addhn_high.h b/lib/simd_wrapper/simde/arm/neon/addhn_high.h new file mode 100644 index 00000000000..0c96a24d456 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/addhn_high.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
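Context for the vadd_p8/p16/p64 and vaddq_p128 fallbacks added to add.h above: polynomial addition over GF(2) is carry-less, so it reduces to a bitwise XOR; the `b ^ ((0 ^ a) & mask)` form in the patch is that XOR written with an explicit lane-width mask. A hypothetical one-lane equivalent:

    #include <stdint.h>

    /* Sketch: poly8 addition is plain XOR -- there are no carries in GF(2). */
    static uint8_t poly8_add(uint8_t a, uint8_t b) {
      return (uint8_t)(a ^ b);
    }
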
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_ADDHN_HIGH_H) +#define SIMDE_ARM_NEON_ADDHN_HIGH_H + +#include "addhn.h" +#include "combine.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vaddhn_high_s16(simde_int8x8_t r, simde_int16x8_t a, simde_int16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vaddhn_high_s16(r, a, b); + #else + return simde_vcombine_s8(r, simde_vaddhn_s16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vaddhn_high_s16 + #define vaddhn_high_s16(r, a, b) simde_vaddhn_high_s16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vaddhn_high_s32(simde_int16x4_t r, simde_int32x4_t a, simde_int32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vaddhn_high_s32(r, a, b); + #else + return simde_vcombine_s16(r, simde_vaddhn_s32(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vaddhn_high_s32 + #define vaddhn_high_s32(r, a, b) simde_vaddhn_high_s32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vaddhn_high_s64(simde_int32x2_t r, simde_int64x2_t a, simde_int64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vaddhn_high_s64(r, a, b); + #else + return simde_vcombine_s32(r, simde_vaddhn_s64(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vaddhn_high_s64 + #define vaddhn_high_s64(r, a, b) simde_vaddhn_high_s64((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vaddhn_high_u16(simde_uint8x8_t r, simde_uint16x8_t a, simde_uint16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vaddhn_high_u16(r, a, b); + #else + return simde_vcombine_u8(r, simde_vaddhn_u16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vaddhn_high_u16 + #define vaddhn_high_u16(r, a, b) simde_vaddhn_high_u16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vaddhn_high_u32(simde_uint16x4_t r, simde_uint32x4_t a, simde_uint32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vaddhn_high_u32(r, a, b); + #else + return simde_vcombine_u16(r, simde_vaddhn_u32(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vaddhn_high_u32 + #define vaddhn_high_u32(r, a, b) simde_vaddhn_high_u32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vaddhn_high_u64(simde_uint32x2_t r, simde_uint64x2_t a, simde_uint64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vaddhn_high_u64(r, a, b); + #else + return simde_vcombine_u32(r, simde_vaddhn_u64(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vaddhn_high_u64 + #define vaddhn_high_u64(r, a, b) simde_vaddhn_high_u64((r), (a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ADDHN_HIGH_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/addlv.h b/lib/simd_wrapper/simde/arm/neon/addlv.h index 79d9451b0d2..dc7de0c45c9 100644 --- a/lib/simd_wrapper/simde/arm/neon/addlv.h +++ b/lib/simd_wrapper/simde/arm/neon/addlv.h @@ -184,6 +184,12 @@ int16_t simde_vaddlvq_s8(simde_int8x16_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlvq_s8(a); + #elif defined(SIMDE_X86_SSE2_NATIVE) + __m128i a_ = simde_int8x16_to_m128i(a); + a_ = _mm_xor_si128(a_, _mm_set1_epi8('\x80')); + a_ = 
_mm_sad_epu8(a_, _mm_setzero_si128()); + a_ = _mm_add_epi16(a_, _mm_shuffle_epi32(a_, 0xEE)); + return HEDLEY_STATIC_CAST(int16_t, _mm_cvtsi128_si32(a_) - 2048); #else simde_int8x16_private a_ = simde_int8x16_to_private(a); int16_t r = 0; @@ -206,6 +212,13 @@ int32_t simde_vaddlvq_s16(simde_int16x8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlvq_s16(a); + #elif defined(SIMDE_X86_SSSE3_NATIVE) && !defined(HEDLEY_MSVC_VERSION) + __m128i a_ = simde_int16x8_to_m128i(a); + a_ = _mm_xor_si128(a_, _mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, 0x8000))); + a_ = _mm_shuffle_epi8(a_, _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0)); + a_ = _mm_sad_epu8(a_, _mm_setzero_si128()); + a_ = _mm_add_epi32(a_, _mm_srli_si128(a_, 7)); + return _mm_cvtsi128_si32(a_) - 262144; #else simde_int16x8_private a_ = simde_int16x8_to_private(a); int32_t r = 0; @@ -250,6 +263,11 @@ uint16_t simde_vaddlvq_u8(simde_uint8x16_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlvq_u8(a); + #elif defined(SIMDE_X86_SSE2_NATIVE) + __m128i a_ = simde_uint8x16_to_m128i(a); + a_ = _mm_sad_epu8(a_, _mm_setzero_si128()); + a_ = _mm_add_epi16(a_, _mm_shuffle_epi32(a_, 0xEE)); + return HEDLEY_STATIC_CAST(uint16_t, _mm_cvtsi128_si32(a_)); #else simde_uint8x16_private a_ = simde_uint8x16_to_private(a); uint16_t r = 0; @@ -272,6 +290,12 @@ uint32_t simde_vaddlvq_u16(simde_uint16x8_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vaddlvq_u16(a); + #elif defined(SIMDE_X86_SSSE3_NATIVE) + __m128i a_ = simde_uint16x8_to_m128i(a); + a_ = _mm_shuffle_epi8(a_, _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0)); + a_ = _mm_sad_epu8(a_, _mm_setzero_si128()); + a_ = _mm_add_epi32(a_, _mm_srli_si128(a_, 7)); + return HEDLEY_STATIC_CAST(uint32_t, _mm_cvtsi128_si32(a_)); #else simde_uint16x8_private a_ = simde_uint16x8_to_private(a); uint32_t r = 0; diff --git a/lib/simd_wrapper/simde/arm/neon/addv.h b/lib/simd_wrapper/simde/arm/neon/addv.h index bcc082b34f1..6beb9836c48 100644 --- a/lib/simd_wrapper/simde/arm/neon/addv.h +++ b/lib/simd_wrapper/simde/arm/neon/addv.h @@ -352,6 +352,11 @@ simde_vaddvq_u8(simde_uint8x16_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r = vaddvq_u8(a); + #elif defined(SIMDE_X86_SSE2_NATIVE) + __m128i a_ = simde_uint8x16_to_m128i(a); + a_ = _mm_sad_epu8(a_, _mm_setzero_si128()); + a_ = _mm_add_epi8(a_, _mm_shuffle_epi32(a_, 0xEE)); + return HEDLEY_STATIC_CAST(uint8_t, _mm_cvtsi128_si32(a_)); #else simde_uint8x16_private a_ = simde_uint8x16_to_private(a); diff --git a/lib/simd_wrapper/simde/arm/neon/aes.h b/lib/simd_wrapper/simde/arm/neon/aes.h new file mode 100644 index 00000000000..4e6896fc879 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/aes.h @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
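Note on the SSE2 fallbacks for vaddlvq_s8/vaddlvq_u8 above: `_mm_sad_epu8` against a zero vector sums each 8-byte half of the register, and for the signed variant the bytes are first biased by +128 (the XOR with 0x80), so the combined result has 16 * 128 = 2048 subtracted to undo the bias. A scalar model of the same arithmetic, illustrative only:

    #include <stdint.h>
    #include <stddef.h>

    /* Sketch: widening sum of 16 signed bytes via an unsigned sum of biased bytes. */
    static int16_t addlv_s8(const int8_t v[16]) {
      uint32_t biased = 0;
      for (size_t i = 0; i < 16; i++) {
        biased += (uint8_t)(v[i] + 128);       /* what the per-lane XOR with 0x80 achieves */
      }
      return (int16_t)((int32_t)biased - 16 * 128);  /* remove the bias, matching "- 2048" above */
    }
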
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_AES_H) +#define SIMDE_ARM_NEON_AES_H + +#include "types.h" +#include "../../simde-aes.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +static uint8_t simde_xtime(uint8_t x) +{ + return HEDLEY_STATIC_CAST(uint8_t, (x<<1) ^ (((x>>7) & 1) * 0x1b)); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vaeseq_u8(simde_uint8x16_t data, simde_uint8x16_t key) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) + return vaeseq_u8(data, key); + #else + /* ref: https://github.com/kokke/tiny-AES-c/blob/master/aes.c */ + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(data), + b_ = simde_uint8x16_to_private(key); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i]; + } + // AESShiftRows + uint8_t tmp; + tmp = r_.values[1]; + r_.values[1] = r_.values[5]; + r_.values[5] = r_.values[9]; + r_.values[9] = r_.values[13]; + r_.values[13] = tmp; + + tmp = r_.values[2]; + r_.values[2] = r_.values[10]; + r_.values[10] = tmp; + + tmp = r_.values[6]; + r_.values[6] = r_.values[14]; + r_.values[14] = tmp; + + tmp = r_.values[3]; + r_.values[3] = r_.values[15]; + r_.values[15] = r_.values[11]; + r_.values[11] = r_.values[7]; + r_.values[7] = tmp; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_x_aes_s_box[r_.values[i]]; + } + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vaeseq_u8 + #define vaeseq_u8(data, key) simde_vaeseq_u8((data), (key)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vaesdq_u8(simde_uint8x16_t data, simde_uint8x16_t key) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) + return vaesdq_u8(data, key); + #else + /* ref: https://github.com/kokke/tiny-AES-c/blob/master/aes.c */ + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(data), + b_ = simde_uint8x16_to_private(key); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i]; + } + // AESInvShiftRows + uint8_t tmp; + tmp = r_.values[13]; + r_.values[13] = r_.values[9]; + r_.values[9] = r_.values[5]; + r_.values[5] = r_.values[1]; + r_.values[1] = tmp; + + tmp = r_.values[2]; + r_.values[2] = r_.values[10]; + r_.values[10] = tmp; + + tmp = r_.values[6]; + r_.values[6] = r_.values[14]; + r_.values[14] = tmp; + + tmp = r_.values[3]; + r_.values[3] = r_.values[7]; + r_.values[7] = r_.values[11]; + r_.values[11] = r_.values[15]; + r_.values[15] = tmp; + for(int i = 0; i < 16; ++i) { + r_.values[i] = simde_x_aes_inv_s_box[r_.values[i]]; + } + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vaesdq_u8 + #define vaesdq_u8(data, key) 
simde_vaesdq_u8((data), (key)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vaesmcq_u8(simde_uint8x16_t data) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) + return vaesmcq_u8(data); + #else + /* ref: https://github.com/kokke/tiny-AES-c/blob/master/aes.c */ + simde_uint8x16_private + a_ = simde_uint8x16_to_private(data); + uint8_t i; + uint8_t Tmp, Tm, t; + for (i = 0; i < 4; ++i) + { + t = a_.values[i*4+0]; + Tmp = a_.values[i*4+0] ^ a_.values[i*4+1] ^ a_.values[i*4+2] ^ a_.values[i*4+3] ; + Tm = a_.values[i*4+0] ^ a_.values[i*4+1] ; Tm = simde_xtime(Tm); a_.values[i*4+0] ^= Tm ^ Tmp ; + Tm = a_.values[i*4+1] ^ a_.values[i*4+2] ; Tm = simde_xtime(Tm); a_.values[i*4+1] ^= Tm ^ Tmp ; + Tm = a_.values[i*4+2] ^ a_.values[i*4+3] ; Tm = simde_xtime(Tm); a_.values[i*4+2] ^= Tm ^ Tmp ; + Tm = a_.values[i*4+3] ^ t ; Tm = simde_xtime(Tm); a_.values[i*4+3] ^= Tm ^ Tmp ; + } + return simde_uint8x16_from_private(a_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vaesmcq_u8 + #define vaesmcq_u8(data) simde_vaesmcq_u8((data)) +#endif + +static uint8_t Multiply(uint8_t x, uint8_t y) +{ + return (((y & 1) * x) ^ + ((y>>1 & 1) * simde_xtime(x)) ^ + ((y>>2 & 1) * simde_xtime(simde_xtime(x))) ^ + ((y>>3 & 1) * simde_xtime(simde_xtime(simde_xtime(x)))) ^ + ((y>>4 & 1) * simde_xtime(simde_xtime(simde_xtime(simde_xtime(x)))))); /* this last call to simde_xtime() can be omitted */ +} + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vaesimcq_u8(simde_uint8x16_t data) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) + return vaesimcq_u8(data); + #else + simde_uint8x16_private + a_ = simde_uint8x16_to_private(data), + r_; + /* ref: simde/simde/x86/aes.h */ + #if defined(SIMDE_X86_AES_NATIVE) + r_.m128i = _mm_aesimc_si128(a_.m128i); + #else + int Nb = simde_x_aes_Nb; + // uint8_t k[] = {0x0e, 0x09, 0x0d, 0x0b}; // a(x) = {0e} + {09}x + {0d}x2 + {0b}x3 + uint8_t i, j, col[4], res[4]; + + for (j = 0; j < Nb; j++) { + for (i = 0; i < 4; i++) { + col[i] = a_.values[Nb*j+i]; + } + + //coef_mult(k, col, res); + simde_x_aes_coef_mult_lookup(4, col, res); + + for (i = 0; i < 4; i++) { + r_.values[Nb*j+i] = res[i]; + } + } + #endif + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vaesimcq_u8 + #define vaesimcq_u8(data) simde_vaesimcq_u8((data)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_AES_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/and.h b/lib/simd_wrapper/simde/arm/neon/and.h index 381154228f2..185683d75a8 100644 --- a/lib/simd_wrapper/simde/arm/neon/and.h +++ b/lib/simd_wrapper/simde/arm/neon/and.h @@ -47,6 +47,8 @@ simde_vand_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i8m1(a_.sv64, b_.sv64, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -77,6 +79,8 @@ simde_vand_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i16m1(a_.sv64, b_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -107,6 +111,8 @@ simde_vand_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif 
defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i32m1(a_.sv64, b_.sv64, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -137,6 +143,8 @@ simde_vand_s64(simde_int64x1_t a, simde_int64x1_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_i64m1(a_.sv64, b_.sv64, 1); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -167,6 +175,8 @@ simde_vand_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u8m1(a_.sv64, b_.sv64, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -197,6 +207,8 @@ simde_vand_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u16m1(a_.sv64, b_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -227,6 +239,8 @@ simde_vand_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u32m1(a_.sv64, b_.sv64, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -257,6 +271,8 @@ simde_vand_u64(simde_uint64x1_t a, simde_uint64x1_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _mm_and_si64(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vand_vv_u64m1(a_.sv64, b_.sv64, 1); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -291,6 +307,8 @@ simde_vandq_s8(simde_int8x16_t a, simde_int8x16_t b) { r_.m128i = _mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i8m1(a_.sv128, b_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -325,6 +343,8 @@ simde_vandq_s16(simde_int16x8_t a, simde_int16x8_t b) { r_.m128i = _mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i16m1(a_.sv128, b_.sv128, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -359,6 +379,8 @@ simde_vandq_s32(simde_int32x4_t a, simde_int32x4_t b) { r_.m128i = _mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -393,6 +415,8 @@ simde_vandq_s64(simde_int64x2_t a, simde_int64x2_t b) { r_.m128i = _mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_i64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -427,6 +451,8 @@ simde_vandq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { r_.m128i = _mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + 
#elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u8m1(a_.sv128, b_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -461,6 +487,8 @@ simde_vandq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { r_.m128i = _mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u16m1(a_.sv128, b_.sv128, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -495,6 +523,8 @@ simde_vandq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { r_.m128i = _mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else @@ -529,6 +559,8 @@ simde_vandq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { r_.m128i = _mm_and_si128(a_.m128i, b_.m128i); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_and(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vand_vv_u64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values & b_.values; #else diff --git a/lib/simd_wrapper/simde/arm/neon/bcax.h b/lib/simd_wrapper/simde/arm/neon/bcax.h index 929d8f8d887..b9e84ccba34 100644 --- a/lib/simd_wrapper/simde/arm/neon/bcax.h +++ b/lib/simd_wrapper/simde/arm/neon/bcax.h @@ -39,13 +39,13 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_uint8x16_t simde_vbcaxq_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_u8(a, b, c); #else return simde_veorq_u8(a, simde_vbicq_u8(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_u8 #define vbcaxq_u8(a, b, c) simde_vbcaxq_u8(a, b, c) #endif @@ -53,13 +53,13 @@ simde_vbcaxq_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_uint16x8_t simde_vbcaxq_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_u16(a, b, c); #else return simde_veorq_u16(a, simde_vbicq_u16(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_u16 #define vbcaxq_u16(a, b, c) simde_vbcaxq_u16(a, b, c) #endif @@ -67,13 +67,13 @@ simde_vbcaxq_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vbcaxq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_u32(a, b, c); #else return simde_veorq_u32(a, simde_vbicq_u32(b, 
c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_u32 #define vbcaxq_u32(a, b, c) simde_vbcaxq_u32(a, b, c) #endif @@ -81,13 +81,13 @@ simde_vbcaxq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_uint64x2_t simde_vbcaxq_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_u64(a, b, c); #else return simde_veorq_u64(a, simde_vbicq_u64(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_u64 #define vbcaxq_u64(a, b, c) simde_vbcaxq_u64(a, b, c) #endif @@ -95,13 +95,13 @@ simde_vbcaxq_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_int8x16_t simde_vbcaxq_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_s8(a, b, c); #else return simde_veorq_s8(a, simde_vbicq_s8(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_s8 #define vbcaxq_s8(a, b, c) simde_vbcaxq_s8(a, b, c) #endif @@ -109,13 +109,13 @@ simde_vbcaxq_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_int16x8_t simde_vbcaxq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_s16(a, b, c); #else return simde_veorq_s16(a,simde_vbicq_s16(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_s16 #define vbcaxq_s16(a, b, c) simde_vbcaxq_s16(a, b, c) #endif @@ -123,13 +123,13 @@ simde_vbcaxq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_int32x4_t simde_vbcaxq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_s32(a, b, c); #else return simde_veorq_s32(a, simde_vbicq_s32(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_s32 #define vbcaxq_s32(a, b, c) simde_vbcaxq_s32(a, b, c) #endif 
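The vbcaxq_* fallbacks in these hunks all share one shape: BCAX ("bit clear and XOR") computes a ^ (b & ~c), i.e. EOR(a, BIC(b, c)); the patch only swaps the feature guard from __ARM_FEATURE_SHA3 to SIMDE_ARCH_ARM_SHA3. A one-lane scalar sketch, with an illustrative name:

    #include <stdint.h>

    /* Sketch: BCAX on a single 64-bit lane -- XOR a with (b AND NOT c). */
    static uint64_t bcax_u64(uint64_t a, uint64_t b, uint64_t c) {
      return a ^ (b & ~c);
    }
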
@@ -137,13 +137,13 @@ simde_vbcaxq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_int64x2_t simde_vbcaxq_s64(simde_int64x2_t a, simde_int64x2_t b, simde_int64x2_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) return vbcaxq_s64(a, b, c); #else return simde_veorq_s64(a, simde_vbicq_s64(b, c)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vbcaxq_s64 #define vbcaxq_s64(a, b, c) simde_vbcaxq_s64(a, b, c) #endif diff --git a/lib/simd_wrapper/simde/arm/neon/bsl.h b/lib/simd_wrapper/simde/arm/neon/bsl.h index 0fc4ff270fd..40cdac89f1e 100644 --- a/lib/simd_wrapper/simde/arm/neon/bsl.h +++ b/lib/simd_wrapper/simde/arm/neon/bsl.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_BSL_H) @@ -755,6 +756,156 @@ simde_vbslq_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) { #define vbslq_u64(a, b, c) simde_vbslq_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vbsl_p8(simde_uint8x8_t a, simde_poly8x8_t b, simde_poly8x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vbsl_p8(a, b, c); + #else + simde_poly8x8_private + r_, + b_ = simde_poly8x8_to_private(b), + c_ = simde_poly8x8_to_private(c); + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]); + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vbsl_p8 + #define vbsl_p8(a, b, c) simde_vbsl_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vbsl_p16(simde_uint16x4_t a, simde_poly16x4_t b, simde_poly16x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vbsl_p16(a, b, c); + #else + simde_poly16x4_private + r_, + b_ = simde_poly16x4_to_private(b), + c_ = simde_poly16x4_to_private(c); + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]); + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vbsl_p16 + #define vbsl_p16(a, b, c) simde_vbsl_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vbsl_p64(simde_uint64x1_t a, simde_poly64x1_t b, simde_poly64x1_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vbsl_p64(a, b, c); + #else + simde_poly64x1_private + r_, + b_ = simde_poly64x1_to_private(b), + c_ = simde_poly64x1_to_private(c); + simde_uint64x1_private a_ = simde_uint64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]); + } + + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbsl_p64 + 
#define vbsl_p64(a, b, c) simde_vbsl_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vbslq_p8(simde_uint8x16_t a, simde_poly8x16_t b, simde_poly8x16_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vbslq_p8(a, b, c); + #else + simde_poly8x16_private + r_, + b_ = simde_poly8x16_to_private(b), + c_ = simde_poly8x16_to_private(c); + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]); + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vbslq_p8 + #define vbslq_p8(a, b, c) simde_vbslq_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vbslq_p16(simde_uint16x8_t a, simde_poly16x8_t b, simde_poly16x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vbslq_p16(a, b, c); + #else + simde_poly16x8_private + r_, + b_ = simde_poly16x8_to_private(b), + c_ = simde_poly16x8_to_private(c); + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]); + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vbslq_p16 + #define vbslq_p16(a, b, c) simde_vbslq_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vbslq_p64(simde_uint64x2_t a, simde_poly64x2_t b, simde_poly64x2_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vbslq_p64(a, b, c); + #else + simde_poly64x2_private + r_, + b_ = simde_poly64x2_to_private(b), + c_ = simde_poly64x2_to_private(c); + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (b_.values[i] & a_.values[i]) | (c_.values[i] & ~a_.values[i]); + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbslq_p64 + #define vbslq_p64(a, b, c) simde_vbslq_p64((a), (b), (c)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/cadd_rot270.h b/lib/simd_wrapper/simde/arm/neon/cadd_rot270.h new file mode 100644 index 00000000000..17995f48a84 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/cadd_rot270.h @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Chi-Wei Chu + */ + +#if !defined(SIMDE_ARM_NEON_CADD_ROT270_H) +#define SIMDE_ARM_NEON_CADD_ROT270_H + +#include "add.h" +#include "types.h" +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(__clang__) && SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 +SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ +_Pragma("clang diagnostic ignored \"-Wimplicit-float-conversion\"") +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vcadd_rot270_f16(simde_float16x4_t a, simde_float16x4_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + return vcadd_rot270_f16(a, b); + #else + simde_float16x4_private r_, a_ = simde_float16x4_to_private(a), b_ = simde_float16x4_to_private(b); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = simde_vaddh_f16(b_.values[2 * i + 1], a_.values[2 * i]); + r_.values[2 * i + 1] = + simde_vaddh_f16(simde_float16_from_float32(-simde_float16_to_float32(b_.values[2 * i])), a_.values[2 * i + 1]); + } + #endif + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcadd_rot270_f16 + #define vcadd_rot270_f16(a, b) simde_vcadd_rot270_f16(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vcaddq_rot270_f16(simde_float16x8_t a, simde_float16x8_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + return vcaddq_rot270_f16(a, b); + #else + simde_float16x8_private r_, a_ = simde_float16x8_to_private(a), b_ = simde_float16x8_to_private(b); + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 9, 0, 11, 2, 13, 4, 15, 6); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = simde_vaddh_f16(b_.values[2 * i + 1], a_.values[2 * i]); + r_.values[2 * i + 1] = + simde_vaddh_f16(simde_float16_from_float32(-simde_float16_to_float32(b_.values[2 * i])), a_.values[2 * i + 1]); + } + #endif + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcaddq_rot270_f16 + #define vcaddq_rot270_f16(a, b) simde_vcaddq_rot270_f16(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t simde_vcadd_rot270_f32(simde_float32x2_t a, simde_float32x2_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + 
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + return vcadd_rot270_f32(a, b); + #else + simde_float32x2_private r_, a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 3, 0); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = b_.values[2 * i + 1] + a_.values[2 * i]; + r_.values[2 * i + 1] = -(b_.values[2 * i]) + a_.values[2 * i + 1]; + } + #endif + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcadd_rot270_f32 + #define vcadd_rot270_f32(a, b) simde_vcadd_rot270_f32(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t simde_vcaddq_rot270_f32(simde_float32x4_t a, simde_float32x4_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + return vcaddq_rot270_f32(a, b); + #else + simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); + #if defined(SIMDE_SHUFFLE_VECTOR_) + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = b_.values[2 * i + 1] + a_.values[2 * i]; + r_.values[2 * i + 1] = -(b_.values[2 * i]) + a_.values[2 * i + 1]; + } + #endif + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcaddq_rot270_f32 + #define vcaddq_rot270_f32(a, b) simde_vcaddq_rot270_f32(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t simde_vcaddq_rot270_f64(simde_float64x2_t a, simde_float64x2_t b) +{ + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + return vcaddq_rot270_f64(a, b); + #else + simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); + #if defined(SIMDE_SHUFFLE_VECTOR_) + b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, b_.values, 3, 0); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = b_.values[2 * i + 1] + a_.values[2 * i]; + r_.values[2 * i + 1] = -(b_.values[2 * i]) + a_.values[2 * i + 1]; + } + #endif + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcaddq_rot270_f64 + #define vcaddq_rot270_f64(a, b) simde_vcaddq_rot270_f64(a, b) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CADD_ROT270_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/cadd_rot90.h b/lib/simd_wrapper/simde/arm/neon/cadd_rot90.h new file mode 100644 index 00000000000..0c448a52191 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/cadd_rot90.h @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, 
to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Chi-Wei Chu + */ + +#if !defined(SIMDE_ARM_NEON_CADD_ROT90_H) +#define SIMDE_ARM_NEON_CADD_ROT90_H + +#include "add.h" +#include "types.h" +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(__clang__) && SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 +SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ +_Pragma("clang diagnostic ignored \"-Wimplicit-float-conversion\"") +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vcadd_rot90_f16(simde_float16x4_t a, simde_float16x4_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + return vcadd_rot90_f16(a, b); + #else + simde_float16x4_private r_, a_ = simde_float16x4_to_private(a), b_ = simde_float16x4_to_private(b); + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = + simde_vaddh_f16(simde_float16_from_float32(-simde_float16_to_float32(b_.values[2 * i + 1])), a_.values[2 * i]); + r_.values[2 * i + 1] = simde_vaddh_f16(b_.values[2 * i], a_.values[2 * i + 1]); + } + #endif + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcadd_rot90_f16 + #define vcadd_rot90_f16(a, b) simde_vcadd_rot90_f16(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vcaddq_rot90_f16(simde_float16x8_t a, simde_float16x8_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + return vcaddq_rot90_f16(a, b); + #else + simde_float16x8_private r_, a_ = simde_float16x8_to_private(a), b_ = simde_float16x8_to_private(b); + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 8, -b_.values, b_.values, 1, 8, 3, 10, 5, 12, 7, 14); + r_.values = b_.values + a_.values; + #else + 
SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = + simde_vaddh_f16(simde_float16_from_float32(-simde_float16_to_float32(b_.values[2 * i + 1])), a_.values[2 * i]); + r_.values[2 * i + 1] = simde_vaddh_f16(b_.values[2 * i], a_.values[2 * i + 1]); + } + #endif + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcaddq_rot90_f16 + #define vcaddq_rot90_f16(a, b) simde_vcaddq_rot90_f16(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t simde_vcadd_rot90_f32(simde_float32x2_t a, simde_float32x2_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + return vcadd_rot90_f32(a, b); + #else + simde_float32x2_private r_, a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 1, 2); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = -(b_.values[2 * i + 1]) + a_.values[2 * i]; + r_.values[2 * i + 1] = b_.values[2 * i] + a_.values[2 * i + 1]; + } + #endif + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcadd_rot90_f32 + #define vcadd_rot90_f32(a, b) simde_vcadd_rot90_f32(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t simde_vcaddq_rot90_f32(simde_float32x4_t a, simde_float32x4_t b) +{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + return vcaddq_rot90_f32(a, b); + #else + simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); + #if defined(SIMDE_SHUFFLE_VECTOR_) + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = -(b_.values[2 * i + 1]) + a_.values[2 * i]; + r_.values[2 * i + 1] = b_.values[2 * i] + a_.values[2 * i + 1]; + } + #endif + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcaddq_rot90_f32 + #define vcaddq_rot90_f32(a, b) simde_vcaddq_rot90_f32(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t simde_vcaddq_rot90_f64(simde_float64x2_t a, simde_float64x2_t b) +{ + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + return vcaddq_rot90_f64(a, b); + #else + simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); + #if defined(SIMDE_SHUFFLE_VECTOR_) + b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, b_.values, 1, 2); + r_.values = b_.values + a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] = -(b_.values[2 * i + 1]) + a_.values[2 
* i]; + r_.values[2 * i + 1] = b_.values[2 * i] + a_.values[2 * i + 1]; + } + #endif + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) +#undef vcaddq_rot90_f64 +#define vcaddq_rot90_f64(a, b) simde_vcaddq_rot90_f64(a, b) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CADD_ROT90_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/cale.h b/lib/simd_wrapper/simde/arm/neon/cale.h new file mode 100644 index 00000000000..f2baa51581a --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/cale.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_CALE_H) +#define SIMDE_ARM_NEON_CALE_H + +#include "cage.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcaleh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcaleh_f16(a, b); + #else + return simde_vcageh_f16(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcaleh_f16 + #define vcaleh_f16(a, b) simde_vcaleh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcales_f32(simde_float32_t a, simde_float32_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcales_f32(a, b); + #else + return simde_vcages_f32(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcales_f32 + #define vcales_f32(a, b) simde_vcales_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcaled_f64(simde_float64_t a, simde_float64_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcaled_f64(a, b); + #else + return simde_vcaged_f64(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcaled_f64 + #define vcaled_f64(a, b) simde_vcaled_f64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcale_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcale_f16(a, b); + #else + return simde_vcage_f16(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcale_f16 + #define vcale_f16(a, b) simde_vcale_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t 
+simde_vcale_f32(simde_float32x2_t a, simde_float32x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcale_f32(a, b); + #else + return simde_vcage_f32(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcale_f32 + #define vcale_f32(a, b) simde_vcale_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcale_f64(simde_float64x1_t a, simde_float64x1_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcale_f64(a, b); + #else + return simde_vcage_f64(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcale_f64 + #define vcale_f64(a, b) simde_vcale_f64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcaleq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcaleq_f16(a, b); + #else + return simde_vcageq_f16(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcaleq_f16 + #define vcaleq_f16(a, b) simde_vcaleq_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcaleq_f32(simde_float32x4_t a, simde_float32x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcaleq_f32(a, b); + #else + return simde_vcageq_f32(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcaleq_f32 + #define vcaleq_f32(a, b) simde_vcaleq_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcaleq_f64(simde_float64x2_t a, simde_float64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcaleq_f64(a, b); + #else + return simde_vcageq_f64(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcaleq_f64 + #define vcaleq_f64(a, b) simde_vcaleq_f64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_cale_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/calt.h b/lib/simd_wrapper/simde/arm/neon/calt.h new file mode 100644 index 00000000000..99fa3841932 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/calt.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_CALT_H) +#define SIMDE_ARM_NEON_CALT_H + +#include "cagt.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcalth_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcalth_f16(a, b); + #else + return simde_vcagth_f16(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcalth_f16 + #define vcalth_f16(a, b) simde_vcalth_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcalts_f32(simde_float32_t a, simde_float32_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcalts_f32(a, b); + #else + return simde_vcagts_f32(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcalts_f32 + #define vcalts_f32(a, b) simde_vcalts_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcaltd_f64(simde_float64_t a, simde_float64_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcaltd_f64(a, b); + #else + return simde_vcagtd_f64(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcaltd_f64 + #define vcaltd_f64(a, b) simde_vcaltd_f64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcalt_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcalt_f16(a, b); + #else + return simde_vcagt_f16(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcalt_f16 + #define vcalt_f16(a, b) simde_vcalt_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcalt_f32(simde_float32x2_t a, simde_float32x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcalt_f32(a, b); + #else + return simde_vcagt_f32(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcalt_f32 + #define vcalt_f32(a, b) simde_vcalt_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcalt_f64(simde_float64x1_t a, simde_float64x1_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcalt_f64(a, b); + #else + return simde_vcagt_f64(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcalt_f64 + #define vcalt_f64(a, b) simde_vcalt_f64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcaltq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcaltq_f16(a, b); + #else + return simde_vcagtq_f16(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcaltq_f16 + #define vcaltq_f16(a, b) simde_vcaltq_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcaltq_f32(simde_float32x4_t a, simde_float32x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcaltq_f32(a, b); + #else + return simde_vcagtq_f32(b, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcaltq_f32 + #define vcaltq_f32(a, b) simde_vcaltq_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcaltq_f64(simde_float64x2_t a, simde_float64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcaltq_f64(a, b); + #else + return simde_vcagtq_f64(b, a); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcaltq_f64 + #define vcaltq_f64(a, b) simde_vcaltq_f64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CAGT_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/ceq.h b/lib/simd_wrapper/simde/arm/neon/ceq.h index e60a4bf79d2..03a9c861223 100644 --- a/lib/simd_wrapper/simde/arm/neon/ceq.h +++ b/lib/simd_wrapper/simde/arm/neon/ceq.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CEQ_H) @@ -766,6 +767,102 @@ simde_vceqq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vceqq_u64(a, b) simde_vceqq_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vceq_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vceq_p8(a, b); + #else + simde_uint8x8_private r_; + simde_poly8x8_private + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] == b_.values[i]) ? HEDLEY_STATIC_CAST(uint8_t, ~UINT8_C(0)) : HEDLEY_STATIC_CAST(uint8_t, UINT8_C(0)); + } + + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vceq_p8 + #define vceq_p8(a, b) simde_vceq_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vceqq_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vceqq_p8(a, b); + #else + simde_uint8x16_private r_; + simde_poly8x16_private + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] == b_.values[i]) ? HEDLEY_STATIC_CAST(uint8_t, ~UINT8_C(0)) : HEDLEY_STATIC_CAST(uint8_t, UINT8_C(0)); + } + + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vceqq_p8 + #define vceqq_p8(a, b) simde_vceqq_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vceq_p64(simde_poly64x1_t a, simde_poly64x1_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vceq_p64(a, b); + #else + simde_uint64x1_private r_; + simde_poly64x1_private + a_ = simde_poly64x1_to_private(a), + b_ = simde_poly64x1_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] == b_.values[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vceq_p64 + #define vceq_p64(a, b) simde_vceq_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vceqq_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vceqq_p64(a, b); + #else + simde_uint64x2_private r_; + simde_poly64x2_private + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] == b_.values[i]) ? 
~UINT64_C(0) : UINT64_C(0); + } + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vceqq_p64 + #define vceqq_p64(a, b) simde_vceqq_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/ceqz.h b/lib/simd_wrapper/simde/arm/neon/ceqz.h index 176ecce0f8a..54f3ce8fbb4 100644 --- a/lib/simd_wrapper/simde/arm/neon/ceqz.h +++ b/lib/simd_wrapper/simde/arm/neon/ceqz.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CEQZ_H) @@ -375,7 +376,7 @@ simde_vceqzd_u64(uint64_t a) { SIMDE_FUNCTION_ATTRIBUTES uint16_t -simde_vceqzh_f16(simde_float16 a) { +simde_vceqzh_f16(simde_float16_t a) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vceqzh_f16(a); #else @@ -415,6 +416,62 @@ simde_vceqzd_f64(simde_float64_t a) { #define vceqzd_f64(a) simde_vceqzd_f64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vceqz_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vceqz_p8(a); + #else + return simde_vceq_p8(a, simde_vdup_n_p8(0)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vceqz_p8 + #define vceqz_p8(a) simde_vceqz_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vceqzq_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vceqzq_p8(a); + #else + return simde_vceqq_p8(a, simde_vdupq_n_p8(0)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vceqzq_p8 + #define vceqzq_p8(a) simde_vceqzq_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vceqz_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vceqz_p64(a); + #else + return simde_vceq_p64(a, simde_vdup_n_p64(0)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vceqz_p64 + #define vceqz_p64(a) simde_vceqz_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vceqzq_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vceqzq_p64(a); + #else + return simde_vceqq_p64(a, simde_vdupq_n_p64(0)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vceqzq_p64 + #define vceqzq_p64(a) simde_vceqzq_p64((a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/cgez.h b/lib/simd_wrapper/simde/arm/neon/cgez.h index b8440836165..04024c48ec8 100644 --- a/lib/simd_wrapper/simde/arm/neon/cgez.h +++ b/lib/simd_wrapper/simde/arm/neon/cgez.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CGEZ_H) @@ -78,6 +79,42 @@ simde_vcgezs_f32(simde_float32_t a) { #define vcgezs_f32(a) simde_vcgezs_f32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcgezh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return HEDLEY_STATIC_CAST(uint16_t, vcgezh_f16(a)); + #else + return (simde_float16_to_float32(a) >= SIMDE_FLOAT32_C(0.0)) ? 
UINT16_MAX : 0; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcgezh_f16 + #define vcgezh_f16(a) simde_vcgezh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcgezq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcgezq_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgezh_f16(a_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcgezq_f16 + #define vcgezq_f16(a) simde_vcgezq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vcgezq_f32(simde_float32x4_t a) { @@ -246,6 +283,28 @@ simde_vcgezq_s64(simde_int64x2_t a) { #define vcgezq_s64(a) simde_vcgezq_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcgez_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcgez_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgezh_f16(a_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcgez_f16 + #define vcgez_f16(a) simde_vcgez_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vcgez_f32(simde_float32x2_t a) { diff --git a/lib/simd_wrapper/simde/arm/neon/cgt.h b/lib/simd_wrapper/simde/arm/neon/cgt.h index a090dca5b85..465cdb91786 100644 --- a/lib/simd_wrapper/simde/arm/neon/cgt.h +++ b/lib/simd_wrapper/simde/arm/neon/cgt.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CGT_H) @@ -78,6 +79,23 @@ simde_vcgtd_u64(uint64_t a, uint64_t b) { #define vcgtd_u64(a, b) simde_vcgtd_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcgth_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return HEDLEY_STATIC_CAST(uint16_t, vcgth_f16(a, b)); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + + return (a_ > b_) ? 
UINT16_MAX : 0; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcgth_f16 + #define vcgth_f16(a, b) simde_vcgth_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES uint32_t simde_vcgts_f32(simde_float32_t a, simde_float32_t b) { @@ -92,6 +110,30 @@ simde_vcgts_f32(simde_float32_t a, simde_float32_t b) { #define vcgts_f32(a, b) simde_vcgts_f32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcgtq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcgtq_f16(a, b); + #else + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgth_f16(a_.values[i], b_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcgtq_f16 + #define vcgtq_f16(a, b) simde_vcgtq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vcgtq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -442,6 +484,30 @@ simde_vcgtq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vcgtq_u64(a, b) simde_vcgtq_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcgt_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcgt_f16(a, b); + #else + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgth_f16(a_.values[i], b_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcgt_f16 + #define vcgt_f16(a, b) simde_vcgt_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vcgt_f32(simde_float32x2_t a, simde_float32x2_t b) { diff --git a/lib/simd_wrapper/simde/arm/neon/cgtz.h b/lib/simd_wrapper/simde/arm/neon/cgtz.h index 125e009b2a1..30c6e5dd04f 100644 --- a/lib/simd_wrapper/simde/arm/neon/cgtz.h +++ b/lib/simd_wrapper/simde/arm/neon/cgtz.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CGTZ_H) @@ -66,6 +67,42 @@ simde_vcgtzd_f64(simde_float64_t a) { #define vcgtzd_f64(a) simde_vcgtzd_f64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcgtzh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return HEDLEY_STATIC_CAST(uint16_t, vcgtzh_f16(a)); + #else + return (simde_float16_to_float32(a) > SIMDE_FLOAT32_C(0.0)) ? 
UINT16_MAX : 0; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcgtzh_f16 + #define vcgtzh_f16(a) simde_vcgtzh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcgtzq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcgtzq_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgtzh_f16(a_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcgtzq_f16 + #define vcgtzq_f16(a) simde_vcgtzq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES uint32_t simde_vcgtzs_f32(simde_float32_t a) { @@ -248,6 +285,28 @@ simde_vcgtzq_s64(simde_int64x2_t a) { #define vcgtzq_s64(a) simde_vcgtzq_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcgtz_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcgtz_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcgtzh_f16(a_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcgtz_f16 + #define vcgtz_f16(a) simde_vcgtz_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vcgtz_f32(simde_float32x2_t a) { diff --git a/lib/simd_wrapper/simde/arm/neon/cle.h b/lib/simd_wrapper/simde/arm/neon/cle.h index 5a1591b3039..fedfcc52245 100644 --- a/lib/simd_wrapper/simde/arm/neon/cle.h +++ b/lib/simd_wrapper/simde/arm/neon/cle.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CLE_H) @@ -90,6 +91,44 @@ simde_vcles_f32(simde_float32_t a, simde_float32_t b) { #define vcles_f32(a, b) simde_vcles_f32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcleh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return HEDLEY_STATIC_CAST(uint16_t, vcleh_f16(a, b)); + #else + return (simde_float16_to_float32(a) <= simde_float16_to_float32(b)) ? 
UINT16_MAX : 0; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcleh_f16 + #define vcleh_f16(a, b) simde_vcleh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcleq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcleq_f16(a, b); + #else + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcleh_f16(a_.values[i], b_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcleq_f16 + #define vcleq_f16(a, b) simde_vcleq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vcleq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -475,6 +514,30 @@ simde_vcleq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vcleq_u64(a, b) simde_vcleq_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcle_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcle_f16(a, b); + #else + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcleh_f16(a_.values[i], b_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcle_f16 + #define vcle_f16(a, b) simde_vcle_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vcle_f32(simde_float32x2_t a, simde_float32x2_t b) { diff --git a/lib/simd_wrapper/simde/arm/neon/clez.h b/lib/simd_wrapper/simde/arm/neon/clez.h index ae3eea9b8a7..dd308c7f4ba 100644 --- a/lib/simd_wrapper/simde/arm/neon/clez.h +++ b/lib/simd_wrapper/simde/arm/neon/clez.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CLEZ_H) @@ -78,6 +79,44 @@ simde_vclezs_f32(simde_float32_t a) { #define vclezs_f32(a) simde_vclezs_f32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vclezh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return HEDLEY_STATIC_CAST(uint16_t, vclezh_f16(a)); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + + return (a_ <= 0.0f) ? 
UINT16_MAX : 0; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vclezh_f16 + #define vclezh_f16(a) simde_vclezh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vclezq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vclezq_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vclezh_f16(a_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vclezq_f16 + #define vclezq_f16(a) simde_vclezq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vclezq_f32(simde_float32x4_t a) { @@ -246,6 +285,28 @@ simde_vclezq_s64(simde_int64x2_t a) { #define vclezq_s64(a) simde_vclezq_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vclez_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vclez_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vclezh_f16(a_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vclez_f16 + #define vclez_f16(a) simde_vclez_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vclez_f32(simde_float32x2_t a) { diff --git a/lib/simd_wrapper/simde/arm/neon/clt.h b/lib/simd_wrapper/simde/arm/neon/clt.h index ae36027327b..9d3cf407647 100644 --- a/lib/simd_wrapper/simde/arm/neon/clt.h +++ b/lib/simd_wrapper/simde/arm/neon/clt.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CLT_H) @@ -77,6 +78,23 @@ simde_vcltd_u64(uint64_t a, uint64_t b) { #define vcltd_u64(a, b) simde_vcltd_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vclth_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return HEDLEY_STATIC_CAST(uint16_t, vclth_f16(a, b)); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + + return (a_ < b_) ? 
UINT16_MAX : 0; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vclth_f16 + #define vclth_f16(a, b) simde_vclth_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES uint32_t simde_vclts_f32(simde_float32_t a, simde_float32_t b) { @@ -91,6 +109,30 @@ simde_vclts_f32(simde_float32_t a, simde_float32_t b) { #define vclts_f32(a, b) simde_vclts_f32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcltq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcltq_f16(a, b); + #else + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vclth_f16(a_.values[i], b_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcltq_f16 + #define vcltq_f16(a, b) simde_vcltq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vcltq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -450,6 +492,30 @@ simde_vcltq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vcltq_u64(a, b) simde_vcltq_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vclt_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vclt_f16(a, b); + #else + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vclth_f16(a_.values[i], b_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vclt_f16 + #define vclt_f16(a, b) simde_vclt_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vclt_f32(simde_float32x2_t a, simde_float32x2_t b) { diff --git a/lib/simd_wrapper/simde/arm/neon/cltz.h b/lib/simd_wrapper/simde/arm/neon/cltz.h index a9c94984e98..2c61d1a1622 100644 --- a/lib/simd_wrapper/simde/arm/neon/cltz.h +++ b/lib/simd_wrapper/simde/arm/neon/cltz.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ /* TODO: float fallbacks should use vclt(a, vdup_n(0.0)) */ @@ -81,6 +82,42 @@ simde_vcltzs_f32(simde_float32_t a) { #define vcltzs_f32(a) simde_vcltzs_f32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcltzh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return HEDLEY_STATIC_CAST(uint16_t, vcltzh_f16(a)); + #else + return (simde_float16_to_float32(a) < SIMDE_FLOAT32_C(0.0)) ? 
UINT16_MAX : 0; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcltzh_f16 + #define vcltzh_f16(a) simde_vcltzh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcltz_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcltz_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcltzh_f16(a_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcltz_f16 + #define vcltz_f16(a) simde_vcltz_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vcltz_f32(simde_float32x2_t a) { @@ -201,6 +238,28 @@ simde_vcltz_s64(simde_int64x1_t a) { #define vcltz_s64(a) simde_vcltz_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcltzq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcltzq_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcltzh_f16(a_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcltzq_f16 + #define vcltzq_f16(a) simde_vcltzq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vcltzq_f32(simde_float32x4_t a) { diff --git a/lib/simd_wrapper/simde/arm/neon/cmla.h b/lib/simd_wrapper/simde/arm/neon/cmla.h index 559e607032c..68b9a0065c8 100644 --- a/lib/simd_wrapper/simde/arm/neon/cmla.h +++ b/lib/simd_wrapper/simde/arm/neon/cmla.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CMLA_H) @@ -33,12 +34,47 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcmla_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + return vcmla_f16(r, a, b); + #else + simde_float16x4_private + r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0]) / 2) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) + + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) + + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i])); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_f16 + #define vcmla_f16(r, a, b) simde_vcmla_f16(r, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcmla_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) 
{ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmla_f32(r, a, b); #else simde_float32x2_private @@ -64,12 +100,47 @@ simde_vcmla_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) { #define vcmla_f32(r, a, b) simde_vcmla_f32(r, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcmlaq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + return vcmlaq_f16(r, a, b); + #else + simde_float16x8_private + r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0]) / 2) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) + + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) + + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i])); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_f16 + #define vcmlaq_f16(r, a, b) simde_vcmlaq_f16(r, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcmlaq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmlaq_f32(r, a, b); #else simde_float32x4_private @@ -77,7 +148,9 @@ simde_vcmlaq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.v128 = wasm_f32x4_add(r_.v128, wasm_f32x4_mul(b_.v128, wasm_i32x4_shuffle(a_.v128, a_.v128, 0, 0, 2, 2))); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); r_.values += b_.values * a_.values; #else @@ -100,7 +173,8 @@ simde_float64x2_t simde_vcmlaq_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmlaq_f64(r, a, b); #else simde_float64x2_private @@ -108,7 +182,9 @@ simde_vcmlaq_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); - #if 
defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.v128 = wasm_f64x2_add(r_.v128, wasm_f64x2_mul(b_.v128, wasm_i64x2_shuffle(a_.v128, a_.v128, 0, 0))); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 0); r_.values += b_.values * a_.values; #else diff --git a/lib/simd_wrapper/simde/arm/neon/cmla_lane.h b/lib/simd_wrapper/simde/arm/neon/cmla_lane.h new file mode 100644 index 00000000000..4355bf7a5f7 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/cmla_lane.h @@ -0,0 +1,304 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Chi-Wei Chu + */ + +#if !defined(SIMDE_ARM_NEON_CMLA_LANE_H) +#define SIMDE_ARM_NEON_CMLA_LANE_H + +#include "add.h" +#include "combine.h" +#include "cvt.h" +#include "dup_lane.h" +#include "get_high.h" +#include "get_low.h" +#include "mul.h" +#include "types.h" +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vcmla_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; + r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_lane_f16 + #define vcmla_lane_f16(r, a, b, lane) simde_vcmla_lane_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_lane_f16(r, a, b, lane) vcmla_lane_f16(r, a, b, 
lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t simde_vcmla_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +{ + simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; + r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; + } + #endif + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_lane_f32 + #define vcmla_lane_f32(r, a, b, lane) simde_vcmla_lane_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_lane_f32(r, a, b, lane) vcmla_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vcmla_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; + r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_laneq_f16 + #define vcmla_laneq_f16(r, a, b, lane) simde_vcmla_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_laneq_f16(r, a, b, lane) vcmla_laneq_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t simde_vcmla_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; + r_.values[2 * i + 1] += 
b_.values[lane] * a_.values[2 * i]; + } + #endif + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_laneq_f32 + #define vcmla_laneq_f32(r, a, b, lane) simde_vcmla_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_laneq_f32(r, a, b, lane) vcmla_laneq_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vcmlaq_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += b_.values[lane] * a_low.values[2 * i]; + r_low.values[2 * i + 1] += b_.values[lane] * a_low.values[2 * i]; + r_high.values[2 * i] += b_.values[lane] * a_high.values[2 * i]; + r_high.values[2 * i + 1] += b_.values[lane] * a_high.values[2 * i]; + } + #endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_lane_f16 + #define vcmlaq_lane_f16(r, a, b, lane) simde_vcmlaq_lane_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_lane_f16(r, a, b, lane) vcmlaq_lane_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t simde_vcmlaq_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; + r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; + } + #endif + return simde_float32x4_from_private(r_); +} +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_lane_f32 + #define vcmlaq_lane_f32(r, a, b, lane) simde_vcmlaq_lane_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_lane_f32(r, a, b, lane) vcmlaq_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vcmlaq_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) +{ + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); + r_low.values += b_.values * a_low.values; + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); + r_high.values += b_.values * a_high.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += b_.values[lane] * a_low.values[2 * i]; + r_low.values[2 * i + 1] += b_.values[lane] * a_low.values[2 * i]; + r_high.values[2 * i] += b_.values[lane] * a_high.values[2 * i]; + r_high.values[2 * i + 1] += b_.values[lane] * a_high.values[2 * i]; + } + #endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_laneq_f16 + #define vcmlaq_laneq_f16(r, a, b, lane) simde_vcmlaq_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_laneq_f16(r, a, b, lane) vcmlaq_laneq_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t simde_vcmlaq_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane])); + + #if defined(SIMDE_SHUFFLE_VECTOR_) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[lane] * a_.values[2 * i]; + r_.values[2 * i + 1] += b_.values[lane] * a_.values[2 * i]; + } + #endif + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_laneq_f32 + #define vcmlaq_laneq_f32(r, a, b, lane)
simde_vcmlaq_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_laneq_f32(r, a, b, lane) vcmlaq_laneq_f32(r, a, b, lane) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CMLA_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/cmla_rot180.h b/lib/simd_wrapper/simde/arm/neon/cmla_rot180.h index 5a5fa3f85a2..44cf283121c 100644 --- a/lib/simd_wrapper/simde/arm/neon/cmla_rot180.h +++ b/lib/simd_wrapper/simde/arm/neon/cmla_rot180.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CMLA_ROT180_H) @@ -33,12 +34,82 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcmla_rot180_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + return vcmla_rot180_f16(r, a, b); + #else + simde_float16x4_private + r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) - + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) - + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i])); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_rot180_f16 + #define vcmla_rot180_f16(r, a, b) simde_vcmla_rot180_f16(r, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcmlaq_rot180_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + return vcmlaq_rot180_f16(r, a, b); + #else + simde_float16x8_private + r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) - + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) - + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i])); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_rot180_f16 + #define 
vcmlaq_rot180_f16(r, a, b) simde_vcmlaq_rot180_f16(r, a, b) +#endif + + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcmla_rot180_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmla_rot180_f32(r, a, b); #else simde_float32x2_private @@ -71,7 +142,8 @@ simde_float32x4_t simde_vcmlaq_rot180_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmlaq_rot180_f32(r, a, b); #else simde_float32x4_private @@ -79,7 +151,11 @@ simde_vcmlaq_rot180_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + a_.v128 = wasm_i32x4_shuffle(a_.v128, a_.v128, 0, 0, 2, 2); + b_.v128 = wasm_i32x4_shuffle(wasm_f32x4_neg(b_.v128), wasm_f32x4_neg(b_.v128), 0, 1, 2, 3); + r_.v128 = wasm_f32x4_add(r_.v128, wasm_f32x4_mul(b_.v128, a_.v128)); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, -b_.values, 0, 1, 2, 3); r_.values += b_.values * a_.values; @@ -104,7 +180,8 @@ simde_float64x2_t simde_vcmlaq_rot180_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmlaq_rot180_f64(r, a, b); #else simde_float64x2_private @@ -112,7 +189,11 @@ simde_vcmlaq_rot180_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + a_.v128 = wasm_i64x2_shuffle(a_.v128, a_.v128, 0, 0); + b_.v128 = wasm_i64x2_shuffle(wasm_f64x2_neg(b_.v128), wasm_f64x2_neg(b_.v128), 0, 1); + r_.v128 = wasm_f64x2_add(r_.v128, wasm_f64x2_mul(b_.v128, a_.v128)); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 0); b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, -b_.values, 0, 1); r_.values += b_.values * a_.values; diff --git a/lib/simd_wrapper/simde/arm/neon/cmla_rot180_lane.h b/lib/simd_wrapper/simde/arm/neon/cmla_rot180_lane.h new file mode 100644 index 00000000000..d7222591786 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/cmla_rot180_lane.h @@ -0,0 +1,310 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to 
use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Chi-Wei Chu + */ + +#if !defined(SIMDE_ARM_NEON_CMLA_ROT180_LANE_H) +#define SIMDE_ARM_NEON_CMLA_ROT180_LANE_H + +#include "add.h" +#include "combine.h" +#include "cvt.h" +#include "dup_lane.h" +#include "get_high.h" +#include "get_low.h" +#include "mul.h" +#include "types.h" +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vcmla_rot180_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; + r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_rot180_lane_f16 + #define vcmla_rot180_lane_f16(r, a, b, lane) simde_vcmla_rot180_lane_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot180_lane_f16(r, a, b, lane) vcmla_rot180_lane_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t simde_vcmla_rot180_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +{ + simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, -b_.values, 0, 1); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 
0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; + r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; + } + #endif + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_rot180_lane_f32 + #define vcmla_rot180_lane_f32(r, a, b, lane) simde_vcmla_rot180_lane_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot180_lane_f32(r, a, b, lane) vcmla_rot180_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vcmlaq_rot180_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += -(b_.values[2 * i]) * a_low.values[2 * i]; + r_low.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_low.values[2 * i]; + r_high.values[2 * i] += -(b_.values[2 * i]) * a_high.values[2 * i]; + r_high.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_high.values[2 * i]; + } + #endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_rot180_lane_f16 + #define vcmlaq_rot180_lane_f16(r, a, b, lane) simde_vcmlaq_rot180_lane_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot180_lane_f16(r, a, b, lane) vcmlaq_rot180_lane_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t simde_vcmlaq_rot180_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, 
a_.values, a_.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; + r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; + } + #endif + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_rot180_lane_f32 + #define vcmlaq_rot180_lane_f32(r, a, b, lane) simde_vcmlaq_rot180_lane_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot180_lane_f32(r, a, b, lane) vcmlaq_rot180_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vcmla_rot180_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; + r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_rot180_laneq_f16 + #define vcmla_rot180_laneq_f16(r, a, b, lane) simde_vcmla_rot180_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot180_laneq_f16(r, a, b, lane) vcmla_rot180_laneq_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t simde_vcmla_rot180_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 0, 0); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, -b_.values, 0, 1); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; + r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; + } + #endif + return 
simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_rot180_laneq_f32 + #define vcmla_rot180_laneq_f32(r, a, b, lane) simde_vcmla_rot180_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot180_laneq_f32(r, a, b, lane) vcmla_rot180_laneq_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vcmlaq_rot180_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, + const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) +{ + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 0, 0, 2, 2); + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 0, 1, 2, 3); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += -(b_.values[2 * i]) * a_low.values[2 * i]; + r_low.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_low.values[2 * i]; + r_high.values[2 * i] += -(b_.values[2 * i]) * a_high.values[2 * i]; + r_high.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_high.values[2 * i]; + } + #endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_rot180_laneq_f16 + #define vcmlaq_rot180_laneq_f16(r, a, b, lane) simde_vcmlaq_rot180_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot180_laneq_f16(r, a, b, lane) vcmlaq_rot180_laneq_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t simde_vcmlaq_rot180_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, + const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 0, 2, 2); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 0, 1, 2, 3); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < 
(sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i]) * a_.values[2 * i]; + r_.values[2 * i + 1] += -(b_.values[2 * i + 1]) * a_.values[2 * i]; + } + #endif + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_rot180_laneq_f32 + #define vcmlaq_rot180_laneq_f32(r, a, b, lane) simde_vcmlaq_rot180_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot180_laneq_f32(r, a, b, lane) vcmlaq_rot180_laneq_f32(r, a, b, lane) +#endif +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CMLA_ROT180_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/cmla_rot270.h b/lib/simd_wrapper/simde/arm/neon/cmla_rot270.h index cb9835c1fe5..530a30ae95d 100644 --- a/lib/simd_wrapper/simde/arm/neon/cmla_rot270.h +++ b/lib/simd_wrapper/simde/arm/neon/cmla_rot270.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CMLA_ROT270_H) @@ -33,12 +34,81 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcmla_rot270_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + return vcmla_rot270_f16(r, a, b); + #else + simde_float16x4_private + r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) + + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) - + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_rot270_f16 + #define vcmla_rot270_f16(r, a, b) simde_vcmla_rot270_f16(r, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcmlaq_rot270_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + return vcmlaq_rot270_f16(r, a, b); + #else + simde_float16x8_private + r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) + + 
simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) - + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_rot270_f16 + #define vcmlaq_rot270_f16(r, a, b) simde_vcmlaq_rot270_f16(r, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcmla_rot270_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmla_rot270_f32(r, a, b); #else simde_float32x2_private @@ -71,7 +141,8 @@ simde_float32x4_t simde_vcmlaq_rot270_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmlaq_rot270_f32(r, a, b); #else simde_float32x4_private @@ -79,7 +150,11 @@ simde_vcmlaq_rot270_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + a_.v128 = wasm_i32x4_shuffle(a_.v128, a_.v128, 1, 1, 3, 3); + b_.v128 = wasm_i32x4_shuffle(wasm_f32x4_neg(b_.v128), b_.v128, 5, 0, 7, 2); + r_.v128 = wasm_f32x4_add(r_.v128, wasm_f32x4_mul(b_.v128, a_.v128)); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); r_.values += b_.values * a_.values; @@ -104,7 +179,8 @@ simde_float64x2_t simde_vcmlaq_rot270_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmlaq_rot270_f64(r, a, b); #else simde_float64x2_private @@ -112,7 +188,11 @@ simde_vcmlaq_rot270_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + a_.v128 = wasm_i64x2_shuffle(a_.v128, a_.v128, 1, 1); + b_.v128 = wasm_i64x2_shuffle(wasm_f64x2_neg(b_.v128), b_.v128, 3, 0); + r_.v128 = wasm_f64x2_add(r_.v128, wasm_f64x2_mul(b_.v128, a_.v128)); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 1, 1); b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, b_.values, 3, 0); r_.values += b_.values * a_.values; diff --git a/lib/simd_wrapper/simde/arm/neon/cmla_rot270_lane.h b/lib/simd_wrapper/simde/arm/neon/cmla_rot270_lane.h new 
file mode 100644 index 00000000000..d8d64dd388e --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/cmla_rot270_lane.h @@ -0,0 +1,311 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Chi-Wei Chu + */ + +#if !defined(SIMDE_ARM_NEON_CMLA_ROT270_LANE_H) +#define SIMDE_ARM_NEON_CMLA_ROT270_LANE_H + +#include "add.h" +#include "combine.h" +#include "cvt.h" +#include "dup_lane.h" +#include "get_high.h" +#include "get_low.h" +#include "mul.h" +#include "types.h" +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vcmla_rot270_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_rot270_lane_f16 + #define vcmla_rot270_lane_f16(r, a, b, lane) simde_vcmla_rot270_lane_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot270_lane_f16(r, a, b, lane) vcmla_rot270_lane_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t simde_vcmla_rot270_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +{ + simde_float32x2_private r_ = 
simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 3, 0); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1]; + } + #endif + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_rot270_lane_f32 + #define vcmla_rot270_lane_f32(r, a, b, lane) simde_vcmla_rot270_lane_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot270_lane_f32(r, a, b, lane) vcmla_rot270_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vcmlaq_rot270_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += b_.values[2 * i + 1] * a_low.values[2 * i + 1]; + r_low.values[2 * i + 1] += -(b_.values[2 * i]) * a_low.values[2 * i + 1]; + r_high.values[2 * i] += b_.values[2 * i + 1] * a_high.values[2 * i + 1]; + r_high.values[2 * i + 1] += -(b_.values[2 * i]) * a_high.values[2 * i + 1]; + } + #endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_rot270_lane_f16 + #define vcmlaq_rot270_lane_f16(r, a, b, lane) simde_vcmlaq_rot270_lane_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot270_lane_f16(r, a, b, lane) vcmlaq_rot270_lane_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES 
+simde_float32x4_t simde_vcmlaq_rot270_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1]; + } + #endif + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_rot270_lane_f32 + #define vcmlaq_rot270_lane_f32(r, a, b, lane) simde_vcmlaq_rot270_lane_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot270_lane_f32(r, a, b, lane) vcmlaq_rot270_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vcmla_rot270_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_rot270_laneq_f16 + #define vcmla_rot270_laneq_f16(r, a, b, lane) simde_vcmla_rot270_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot270_laneq_f16(r, a, b, lane) vcmla_rot270_laneq_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t simde_vcmla_rot270_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane])); + #if 
defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 3, 0); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1]; + } + #endif + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_rot270_laneq_f32 + #define vcmla_rot270_laneq_f32(r, a, b, lane) simde_vcmla_rot270_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot270_laneq_f32(r, a, b, lane) vcmla_rot270_laneq_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vcmlaq_rot270_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, + const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) +{ + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 5, 0, 7, 2); + r_high.values += b_.values * a_high.values; + r_low.values += b_.values * a_low.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += b_.values[2 * i + 1] * a_low.values[2 * i + 1]; + r_low.values[2 * i + 1] += -(b_.values[2 * i]) * a_low.values[2 * i + 1]; + r_high.values[2 * i] += b_.values[2 * i + 1] * a_high.values[2 * i + 1]; + r_high.values[2 * i + 1] += -(b_.values[2 * i]) * a_high.values[2 * i + 1]; + } + #endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_rot270_laneq_f16 + #define vcmlaq_rot270_laneq_f16(r, a, b, lane) simde_vcmlaq_rot270_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot270_laneq_f16(r, a, b, lane) vcmlaq_rot270_laneq_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t simde_vcmlaq_rot270_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, + const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 
0, 1) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 5, 0, 7, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += b_.values[2 * i + 1] * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += -(b_.values[2 * i]) * a_.values[2 * i + 1]; + } + #endif + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_rot270_laneq_f32 + #define vcmlaq_rot270_laneq_f32(r, a, b, lane) simde_vcmlaq_rot270_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot270_laneq_f32(r, a, b, lane) vcmlaq_rot270_laneq_f32(r, a, b, lane) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CMLA_ROT270_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/cmla_rot90.h b/lib/simd_wrapper/simde/arm/neon/cmla_rot90.h index f4ebd13df19..d16a09b20dd 100644 --- a/lib/simd_wrapper/simde/arm/neon/cmla_rot90.h +++ b/lib/simd_wrapper/simde/arm/neon/cmla_rot90.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CMLA_ROT90_H) @@ -33,12 +34,81 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcmla_rot90_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + return vcmla_rot90_f16(r, a, b); + #else + simde_float16x4_private + r_ = simde_float16x4_to_private(r), + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) - + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) + + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_rot90_f16 + #define vcmla_rot90_f16(r, a, b) simde_vcmla_rot90_f16(r, a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcmlaq_rot90_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,5,0)) && \ + (!defined(__clang__) || 
SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_COMPLEX) + return vcmlaq_rot90_f16(r, a, b); + #else + simde_float16x8_private + r_ = simde_float16x8_to_private(r), + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))) ; i++) { + r_.values[2 * i] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i]) - + simde_float16_to_float32(b_.values[2 * i + 1]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + r_.values[2 * i + 1] = simde_float16_from_float32( + simde_float16_to_float32(r_.values[2 * i + 1]) + + simde_float16_to_float32(b_.values[2 * i]) * + simde_float16_to_float32(a_.values[2 * i + 1])); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_rot90_f16 + #define vcmlaq_rot90_f16(r, a, b) simde_vcmlaq_rot90_f16(r, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcmla_rot90_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmla_rot90_f32(r, a, b); #else simde_float32x2_private @@ -71,7 +141,8 @@ simde_float32x4_t simde_vcmlaq_rot90_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmlaq_rot90_f32(r, a, b); #else simde_float32x4_private @@ -79,7 +150,11 @@ simde_vcmlaq_rot90_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4 a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + a_.v128 = wasm_i32x4_shuffle(a_.v128, a_.v128, 1, 1, 3, 3); + b_.v128 = wasm_i32x4_shuffle(wasm_f32x4_neg(b_.v128), b_.v128, 1, 4, 3, 6); + r_.v128 = wasm_f32x4_add(r_.v128, wasm_f32x4_mul(b_.v128, a_.v128)); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); r_.values += b_.values * a_.values; @@ -104,7 +179,8 @@ simde_float64x2_t simde_vcmlaq_rot90_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8,3) && \ (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9,0,0)) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0)) && \ + defined(SIMDE_ARCH_ARM_COMPLEX) return vcmlaq_rot90_f64(r, a, b); #else simde_float64x2_private @@ -112,7 +188,11 @@ simde_vcmlaq_rot90_f64(simde_float64x2_t r, simde_float64x2_t a, simde_float64x2 a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + a_.v128 = wasm_i64x2_shuffle(a_.v128, a_.v128, 1, 
1); + b_.v128 = wasm_i64x2_shuffle(wasm_f64x2_neg(b_.v128), b_.v128, 1, 2); + r_.v128 = wasm_f64x2_add(r_.v128, wasm_f64x2_mul(b_.v128, a_.v128)); + #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 1, 1); b_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, -b_.values, b_.values, 1, 2); r_.values += b_.values * a_.values; diff --git a/lib/simd_wrapper/simde/arm/neon/cmla_rot90_lane.h b/lib/simd_wrapper/simde/arm/neon/cmla_rot90_lane.h new file mode 100644 index 00000000000..45df8c0ed48 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/cmla_rot90_lane.h @@ -0,0 +1,311 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Chi-Wei Chu + */ + +#if !defined(SIMDE_ARM_NEON_CMLA_ROT90_LANE_H) +#define SIMDE_ARM_NEON_CMLA_ROT90_LANE_H + +#include "add.h" +#include "combine.h" +#include "cvt.h" +#include "dup_lane.h" +#include "get_high.h" +#include "get_low.h" +#include "mul.h" +#include "types.h" +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vcmla_rot90_lane_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_rot90_lane_f16 + #define vcmla_rot90_lane_f16(r, a, b, lane) simde_vcmla_rot90_lane_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && 
\ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot90_lane_f16(r, a, b, lane) vcmla_rot90_lane_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t simde_vcmla_rot90_lane_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +{ + simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x2_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 1, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; + } + #endif + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_rot90_lane_f32 + #define vcmla_rot90_lane_f32(r, a, b, lane) simde_vcmla_rot90_lane_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot90_lane_f32(r, a, b, lane) vcmla_rot90_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vcmla_rot90_laneq_f16(simde_float16x4_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(simde_vcvt_f32_f16(r)), + a_ = simde_float32x4_to_private(simde_vcvt_f32_f16(a)), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; + } + #endif + return simde_vcvt_f16_f32(simde_float32x4_from_private(r_)); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_rot90_laneq_f16 + #define vcmla_rot90_laneq_f16(r, a, b, lane) simde_vcmla_rot90_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot90_laneq_f16(r, a, b, lane) vcmla_rot90_laneq_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t simde_vcmla_rot90_laneq_f32(simde_float32x2_t r, simde_float32x2_t a, simde_float32x4_t b, const int lane) + 
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x2_private r_ = simde_float32x2_to_private(r), a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(simde_vdup_n_f32(simde_float32x4_to_private(b).values[lane])); + + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.values, a_.values, 1, 1); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 8, -b_.values, b_.values, 1, 2); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; + } + #endif + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmla_rot90_laneq_f32 + #define vcmla_rot90_laneq_f32(r, a, b, lane) simde_vcmla_rot90_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmla_rot90_laneq_f32(r, a, b, lane) vcmla_rot90_laneq_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vcmlaq_rot90_lane_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x4_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += -(b_.values[2 * i + 1]) * a_low.values[2 * i + 1]; + r_low.values[2 * i + 1] += b_.values[2 * i] * a_low.values[2 * i + 1]; + r_high.values[2 * i] += -(b_.values[2 * i + 1]) * a_high.values[2 * i + 1]; + r_high.values[2 * i + 1] += b_.values[2 * i] * a_high.values[2 * i + 1]; + } + #endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_rot90_lane_f16 + #define vcmlaq_rot90_lane_f16(r, a, b, lane) simde_vcmlaq_rot90_lane_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot90_lane_f16(r, a, b, lane) 
vcmlaq_rot90_lane_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t simde_vcmlaq_rot90_lane_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x2_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; + } + #endif + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_rot90_lane_f32 + #define vcmlaq_rot90_lane_f32(r, a, b, lane) simde_vcmlaq_rot90_lane_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot90_lane_f32(r, a, b, lane) vcmlaq_rot90_lane_f32(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vcmlaq_rot90_laneq_f16(simde_float16x8_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) +{ + simde_float32x4_private r_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(r))), + a_low = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_low_f16(a))), + r_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(r))), + a_high = simde_float32x4_to_private(simde_vcvt_f32_f16(simde_vget_high_f16(a))), + b_ = simde_float32x4_to_private( + simde_vcvt_f32_f16(simde_vdup_n_f16(simde_float16x8_to_private(b).values[lane]))); + #if defined(SIMDE_SHUFFLE_VECTOR_) && \ + ((SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) || (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16)) + a_low.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_low.values, a_low.values, 1, 1, 3, 3); + a_high.values = SIMDE_SHUFFLE_VECTOR_(16, 4, a_high.values, a_high.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(16, 4, -b_.values, b_.values, 1, 4, 3, 6); + r_low.values += b_.values * a_low.values; + r_high.values += b_.values * a_high.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_low.values) / (2 * sizeof(r_low.values[0]))); i++) + { + r_low.values[2 * i] += -(b_.values[2 * i + 1]) * a_low.values[2 * i + 1]; + r_low.values[2 * i + 1] += b_.values[2 * i] * a_low.values[2 * i + 1]; + r_high.values[2 * i] += -(b_.values[2 * i + 1]) * a_high.values[2 * i + 1]; + r_high.values[2 * i + 1] += b_.values[2 * i] * a_high.values[2 * i + 1]; + } + #endif + return simde_vcombine_f16(simde_vcvt_f16_f32(simde_float32x4_from_private(r_low)), + simde_vcvt_f16_f32(simde_float32x4_from_private(r_high))); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_rot90_laneq_f16 + #define vcmlaq_rot90_laneq_f16(r, a, b, lane) simde_vcmlaq_rot90_laneq_f16(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + 
(!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot90_laneq_f16(r, a, b, lane) vcmlaq_rot90_laneq_f16(r, a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t simde_vcmlaq_rot90_laneq_f32(simde_float32x4_t r, simde_float32x4_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) +{ + simde_float32x4_private r_ = simde_float32x4_to_private(r), a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(simde_vdupq_n_f32(simde_float32x4_to_private(b).values[lane])); + #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100760) + a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 1, 1, 3, 3); + b_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, -b_.values, b_.values, 1, 4, 3, 6); + r_.values += b_.values * a_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0; i < (sizeof(r_.values) / (2 * sizeof(r_.values[0]))); i++) + { + r_.values[2 * i] += -(b_.values[2 * i + 1]) * a_.values[2 * i + 1]; + r_.values[2 * i + 1] += b_.values[2 * i] * a_.values[2 * i + 1]; + } + #endif + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcmlaq_rot90_laneq_f32 + #define vcmlaq_rot90_laneq_f32(r, a, b, lane) simde_vcmlaq_rot90_laneq_f32(r, a, b, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && SIMDE_ARCH_ARM_CHECK(8, 3) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(9, 0, 0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(12, 0, 0)) + #define simde_vcmlaq_rot90_laneq_f32(r, a, b, lane) vcmlaq_rot90_laneq_f32(r, a, b, lane) +#endif +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CMLA_ROT90_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/cnt.h b/lib/simd_wrapper/simde/arm/neon/cnt.h index e1fda38e758..9169f7e24ec 100644 --- a/lib/simd_wrapper/simde/arm/neon/cnt.h +++ b/lib/simd_wrapper/simde/arm/neon/cnt.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CNT_H) @@ -164,6 +165,34 @@ simde_vcntq_u8(simde_uint8x16_t a) { #define vcntq_u8(a) simde_vcntq_u8((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vcnt_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcnt_p8(a); + #else + return simde_vreinterpret_p8_s8(simde_vcnt_s8(simde_vreinterpret_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcnt_p8 + #define vcnt_p8(a) simde_vcnt_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vcntq_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcntq_p8(a); + #else + return simde_vreinterpretq_p8_s8(simde_vcntq_s8(simde_vreinterpretq_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcntq_p8 + #define vcntq_p8(a) simde_vcntq_p8((a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/combine.h b/lib/simd_wrapper/simde/arm/neon/combine.h index 66c1df646cb..1a92187846b 100644 --- a/lib/simd_wrapper/simde/arm/neon/combine.h +++ b/lib/simd_wrapper/simde/arm/neon/combine.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_COMBINE_H) @@ -34,6 +35,32 @@ 
HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcombine_f16(simde_float16x4_t low, simde_float16x4_t high) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcombine_f16(low, high); + #else + simde_float16x8_private r_; + simde_float16x4_private + low_ = simde_float16x4_to_private(low), + high_ = simde_float16x4_to_private(high); + + size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway ; i++) { + r_.values[i] = low_.values[i]; + r_.values[i + halfway] = high_.values[i]; + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcombine_f16 + #define vcombine_f16(low, high) simde_vcombine_f16((low), (high)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vcombine_f32(simde_float32x2_t low, simde_float32x2_t high) { @@ -337,6 +364,110 @@ simde_vcombine_u64(simde_uint64x1_t low, simde_uint64x1_t high) { #define vcombine_u64(low, high) simde_vcombine_u64((low), (high)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vcombine_p8(simde_poly8x8_t low, simde_poly8x8_t high) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcombine_p8(low, high); + #else + simde_poly8x16_private r_; + simde_poly8x8_private + low_ = simde_poly8x8_to_private(low), + high_ = simde_poly8x8_to_private(high); + + size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway ; i++) { + r_.values[i] = low_.values[i]; + r_.values[i + halfway] = high_.values[i]; + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcombine_p8 + #define vcombine_p8(low, high) simde_vcombine_p8((low), (high)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vcombine_p16(simde_poly16x4_t low, simde_poly16x4_t high) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcombine_p16(low, high); + #else + simde_poly16x8_private r_; + simde_poly16x4_private + low_ = simde_poly16x4_to_private(low), + high_ = simde_poly16x4_to_private(high); + + size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway ; i++) { + r_.values[i] = low_.values[i]; + r_.values[i + halfway] = high_.values[i]; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcombine_p16 + #define vcombine_p16(low, high) simde_vcombine_p16((low), (high)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vcombine_p64(simde_poly64x1_t low, simde_poly64x1_t high) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vcombine_p64(low, high); + #else + simde_poly64x2_private r_; + simde_poly64x1_private + low_ = simde_poly64x1_to_private(low), + high_ = simde_poly64x1_to_private(high); + + size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway ; i++) { + r_.values[i] = low_.values[i]; + r_.values[i + halfway] = high_.values[i]; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcombine_p64 + #define vcombine_p64(low, high) simde_vcombine_p64((low), (high)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vcombine_bf16(simde_bfloat16x4_t low, simde_bfloat16x4_t high) { + #if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcombine_bf16(low, high); + #else + simde_bfloat16x8_private r_; + simde_bfloat16x4_private + low_ = simde_bfloat16x4_to_private(low), + high_ = simde_bfloat16x4_to_private(high); + + size_t halfway = (sizeof(r_.values) / sizeof(r_.values[0])) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway ; i++) { + r_.values[i] = low_.values[i]; + r_.values[i + halfway] = high_.values[i]; + } + + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcombine_bf16 + #define vcombine_bf16(low, high) simde_vcombine_bf16((low), (high)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/copy_lane.h b/lib/simd_wrapper/simde/arm/neon/copy_lane.h new file mode 100644 index 00000000000..7195c8076fb --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/copy_lane.h @@ -0,0 +1,1184 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_COPY_LANE_H) +#define SIMDE_ARM_NEON_COPY_LANE_H + +#include "types.h" +#include "cvt.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vcopy_lane_s8(simde_int8x8_t a, const int lane1, simde_int8x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_int8x8_private + b_ = simde_int8x8_to_private(b), + r_ = simde_int8x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_s8(a, lane1, b, lane2) vcopy_lane_s8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_s8 + #define vcopy_lane_s8(a, lane1, b, lane2) simde_vcopy_lane_s8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vcopy_lane_s16(simde_int16x4_t a, const int lane1, simde_int16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_int16x4_private + b_ = simde_int16x4_to_private(b), + r_ = simde_int16x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_s16(a, lane1, b, lane2) vcopy_lane_s16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_s16 + #define vcopy_lane_s16(a, lane1, b, lane2) simde_vcopy_lane_s16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vcopy_lane_s32(simde_int32x2_t a, const int lane1, simde_int32x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_int32x2_private + b_ = simde_int32x2_to_private(b), + r_ = simde_int32x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_s32(a, lane1, b, lane2) vcopy_lane_s32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_s32 + #define vcopy_lane_s32(a, lane1, b, lane2) simde_vcopy_lane_s32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vcopy_lane_s64(simde_int64x1_t a, const int lane1, simde_int64x1_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_int64x1_private + b_ = simde_int64x1_to_private(b), + r_ = simde_int64x1_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_s64(a, lane1, b, lane2) vcopy_lane_s64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_s64 + #define vcopy_lane_s64(a, lane1, b, lane2) simde_vcopy_lane_s64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vcopy_lane_u8(simde_uint8x8_t a, const int lane1, simde_uint8x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_uint8x8_private + b_ = simde_uint8x8_to_private(b), + r_ = simde_uint8x8_to_private(a); + + 
r_.values[lane1] = b_.values[lane2]; + return simde_uint8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_u8(a, lane1, b, lane2) vcopy_lane_u8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_u8 + #define vcopy_lane_u8(a, lane1, b, lane2) simde_vcopy_lane_u8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcopy_lane_u16(simde_uint16x4_t a, const int lane1, simde_uint16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_uint16x4_private + b_ = simde_uint16x4_to_private(b), + r_ = simde_uint16x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_u16(a, lane1, b, lane2) vcopy_lane_u16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_u16 + #define vcopy_lane_u16(a, lane1, b, lane2) simde_vcopy_lane_u16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcopy_lane_u32(simde_uint32x2_t a, const int lane1, simde_uint32x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_uint32x2_private + b_ = simde_uint32x2_to_private(b), + r_ = simde_uint32x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_u32(a, lane1, b, lane2) vcopy_lane_u32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_u32 + #define vcopy_lane_u32(a, lane1, b, lane2) simde_vcopy_lane_u32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcopy_lane_u64(simde_uint64x1_t a, const int lane1, simde_uint64x1_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_uint64x1_private + b_ = simde_uint64x1_to_private(b), + r_ = simde_uint64x1_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_u64(a, lane1, b, lane2) vcopy_lane_u64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_u64 + #define vcopy_lane_u64(a, lane1, b, lane2) simde_vcopy_lane_u64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcopy_lane_f32(simde_float32x2_t a, const int lane1, simde_float32x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_float32x2_private + b_ = simde_float32x2_to_private(b), + r_ = simde_float32x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_f32(a, lane1, b, lane2) vcopy_lane_f32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_f32 + #define vcopy_lane_f32(a, lane1, b, lane2) simde_vcopy_lane_f32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vcopy_lane_f64(simde_float64x1_t a, const int lane1, simde_float64x1_t b, const int lane2) + 
SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_float64x1_private + b_ = simde_float64x1_to_private(b), + r_ = simde_float64x1_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_float64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_lane_f64(a, lane1, b, lane2) vcopy_lane_f64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_f64 + #define vcopy_lane_f64(a, lane1, b, lane2) simde_vcopy_lane_f64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vcopy_laneq_s8(simde_int8x8_t a, const int lane1, simde_int8x16_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) { + simde_int8x8_private + r_ = simde_int8x8_to_private(a); + simde_int8x16_private + b_ = simde_int8x16_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_int8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_s8(a, lane1, b, lane2) vcopy_laneq_s8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_s8 + #define vcopy_laneq_s8(a, lane1, b, lane2) simde_vcopy_laneq_s8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vcopy_laneq_s16(simde_int16x4_t a, const int lane1, simde_int16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_int16x4_private + r_ = simde_int16x4_to_private(a); + simde_int16x8_private + b_ = simde_int16x8_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_int16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_s16(a, lane1, b, lane2) vcopy_laneq_s16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_s16 + #define vcopy_laneq_s16(a, lane1, b, lane2) simde_vcopy_laneq_s16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vcopy_laneq_s32(simde_int32x2_t a, const int lane1, simde_int32x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_int32x2_private + r_ = simde_int32x2_to_private(a); + simde_int32x4_private + b_ = simde_int32x4_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_int32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_s32(a, lane1, b, lane2) vcopy_laneq_s32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_s32 + #define vcopy_laneq_s32(a, lane1, b, lane2) simde_vcopy_laneq_s32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vcopy_laneq_s64(simde_int64x1_t a, const int lane1, simde_int64x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_int64x1_private + r_ = simde_int64x1_to_private(a); + simde_int64x2_private + b_ = simde_int64x2_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_int64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_s64(a, lane1, b, lane2) vcopy_laneq_s64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef 
vcopy_laneq_s64 + #define vcopy_laneq_s64(a, lane1, b, lane2) simde_vcopy_laneq_s64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vcopy_laneq_u8(simde_uint8x8_t a, const int lane1, simde_uint8x16_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) { + simde_uint8x8_private + r_ = simde_uint8x8_to_private(a); + simde_uint8x16_private + b_ = simde_uint8x16_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_u8(a, lane1, b, lane2) vcopy_laneq_u8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_u8 + #define vcopy_laneq_u8(a, lane1, b, lane2) simde_vcopy_laneq_u8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcopy_laneq_u16(simde_uint16x4_t a, const int lane1, simde_uint16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_uint16x4_private + r_ = simde_uint16x4_to_private(a); + simde_uint16x8_private + b_ = simde_uint16x8_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_u16(a, lane1, b, lane2) vcopy_laneq_u16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_u16 + #define vcopy_laneq_u16(a, lane1, b, lane2) simde_vcopy_laneq_u16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcopy_laneq_u32(simde_uint32x2_t a, const int lane1, simde_uint32x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_uint32x2_private + r_ = simde_uint32x2_to_private(a); + simde_uint32x4_private + b_ = simde_uint32x4_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_u32(a, lane1, b, lane2) vcopy_laneq_u32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_u32 + #define vcopy_laneq_u32(a, lane1, b, lane2) simde_vcopy_laneq_u32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcopy_laneq_u64(simde_uint64x1_t a, const int lane1, simde_uint64x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_uint64x1_private + r_ = simde_uint64x1_to_private(a); + simde_uint64x2_private + b_ = simde_uint64x2_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_u64(a, lane1, b, lane2) vcopy_laneq_u64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_u64 + #define vcopy_laneq_u64(a, lane1, b, lane2) simde_vcopy_laneq_u64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcopy_laneq_f32(simde_float32x2_t a, const int lane1, simde_float32x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_float32x2_private + r_ = simde_float32x2_to_private(a); + simde_float32x4_private + b_ = 
simde_float32x4_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_f32(a, lane1, b, lane2) vcopy_laneq_f32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_f32 + #define vcopy_laneq_f32(a, lane1, b, lane2) simde_vcopy_laneq_f32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vcopy_laneq_f64(simde_float64x1_t a, const int lane1, simde_float64x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_float64x1_private + r_ = simde_float64x1_to_private(a); + simde_float64x2_private + b_ = simde_float64x2_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_float64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopy_laneq_f64(a, lane1, b, lane2) vcopy_laneq_f64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_f64 + #define vcopy_laneq_f64(a, lane1, b, lane2) simde_vcopy_laneq_f64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vcopyq_lane_s8(simde_int8x16_t a, const int lane1, simde_int8x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_int8x8_private + b_ = simde_int8x8_to_private(b); + simde_int8x16_private + r_ = simde_int8x16_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_s8(a, lane1, b, lane2) vcopyq_lane_s8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_s8 + #define vcopyq_lane_s8(a, lane1, b, lane2) simde_vcopyq_lane_s8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vcopyq_lane_s16(simde_int16x8_t a, const int lane1, simde_int16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_int16x4_private + b_ = simde_int16x4_to_private(b); + simde_int16x8_private + r_ = simde_int16x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_s16(a, lane1, b, lane2) vcopyq_lane_s16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_s16 + #define vcopyq_lane_s16(a, lane1, b, lane2) simde_vcopyq_lane_s16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vcopyq_lane_s32(simde_int32x4_t a, const int lane1, simde_int32x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_int32x2_private + b_ = simde_int32x2_to_private(b); + simde_int32x4_private + r_ = simde_int32x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_s32(a, lane1, b, lane2) vcopyq_lane_s32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_s32 + #define vcopyq_lane_s32(a, lane1, b, lane2) simde_vcopyq_lane_s32((a), (lane1), (b), (lane2)) +#endif + 
+SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vcopyq_lane_s64(simde_int64x2_t a, const int lane1, simde_int64x1_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_int64x1_private + b_ = simde_int64x1_to_private(b); + simde_int64x2_private + r_ = simde_int64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_s64(a, lane1, b, lane2) vcopyq_lane_s64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_s64 + #define vcopyq_lane_s64(a, lane1, b, lane2) simde_vcopyq_lane_s64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vcopyq_lane_u8(simde_uint8x16_t a, const int lane1, simde_uint8x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_uint8x8_private + b_ = simde_uint8x8_to_private(b); + simde_uint8x16_private + r_ = simde_uint8x16_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_u8(a, lane1, b, lane2) vcopyq_lane_u8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_u8 + #define vcopyq_lane_u8(a, lane1, b, lane2) simde_vcopyq_lane_u8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcopyq_lane_u16(simde_uint16x8_t a, const int lane1, simde_uint16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_uint16x4_private + b_ = simde_uint16x4_to_private(b); + simde_uint16x8_private + r_ = simde_uint16x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_u16(a, lane1, b, lane2) vcopyq_lane_u16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_u16 + #define vcopyq_lane_u16(a, lane1, b, lane2) simde_vcopyq_lane_u16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcopyq_lane_u32(simde_uint32x4_t a, const int lane1, simde_uint32x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_uint32x2_private + b_ = simde_uint32x2_to_private(b); + simde_uint32x4_private + r_ = simde_uint32x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_u32(a, lane1, b, lane2) vcopyq_lane_u32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_u32 + #define vcopyq_lane_u32(a, lane1, b, lane2) simde_vcopyq_lane_u32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcopyq_lane_u64(simde_uint64x2_t a, const int lane1, simde_uint64x1_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_uint64x1_private + b_ = simde_uint64x1_to_private(b); + simde_uint64x2_private + r_ = simde_uint64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint64x2_from_private(r_); +} +#if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_u64(a, lane1, b, lane2) vcopyq_lane_u64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_u64 + #define vcopyq_lane_u64(a, lane1, b, lane2) simde_vcopyq_lane_u64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcopyq_lane_f32(simde_float32x4_t a, const int lane1, simde_float32x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_float32x2_private + b_ = simde_float32x2_to_private(b); + simde_float32x4_private + r_ = simde_float32x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_f32(a, lane1, b, lane2) vcopyq_lane_f32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_f32 + #define vcopyq_lane_f32(a, lane1, b, lane2) simde_vcopyq_lane_f32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vcopyq_lane_f64(simde_float64x2_t a, const int lane1, simde_float64x1_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_float64x1_private + b_ = simde_float64x1_to_private(b); + simde_float64x2_private + r_ = simde_float64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_float64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_lane_f64(a, lane1, b, lane2) vcopyq_lane_f64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_f64 + #define vcopyq_lane_f64(a, lane1, b, lane2) simde_vcopyq_lane_f64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vcopyq_laneq_s8(simde_int8x16_t a, const int lane1, simde_int8x16_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) { + simde_int8x16_private + b_ = simde_int8x16_to_private(b), + r_ = simde_int8x16_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_s8(a, lane1, b, lane2) vcopyq_laneq_s8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_s8 + #define vcopyq_laneq_s8(a, lane1, b, lane2) simde_vcopyq_laneq_s8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vcopyq_laneq_s16(simde_int16x8_t a, const int lane1, simde_int16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_int16x8_private + b_ = simde_int16x8_to_private(b), + r_ = simde_int16x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_s16(a, lane1, b, lane2) vcopyq_laneq_s16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_s16 + #define vcopyq_laneq_s16(a, lane1, b, lane2) simde_vcopyq_laneq_s16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vcopyq_laneq_s32(simde_int32x4_t a, const int lane1, simde_int32x4_t b, const int lane2) + 
SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_int32x4_private + b_ = simde_int32x4_to_private(b), + r_ = simde_int32x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_s32(a, lane1, b, lane2) vcopyq_laneq_s32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_s32 + #define vcopyq_laneq_s32(a, lane1, b, lane2) simde_vcopyq_laneq_s32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vcopyq_laneq_s64(simde_int64x2_t a, const int lane1, simde_int64x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_int64x2_private + b_ = simde_int64x2_to_private(b), + r_ = simde_int64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_int64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_s64(a, lane1, b, lane2) vcopyq_laneq_s64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_s64 + #define vcopyq_laneq_s64(a, lane1, b, lane2) simde_vcopyq_laneq_s64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vcopyq_laneq_u8(simde_uint8x16_t a, const int lane1, simde_uint8x16_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) { + simde_uint8x16_private + b_ = simde_uint8x16_to_private(b), + r_ = simde_uint8x16_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_u8(a, lane1, b, lane2) vcopyq_laneq_u8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_u8 + #define vcopyq_laneq_u8(a, lane1, b, lane2) simde_vcopyq_laneq_u8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcopyq_laneq_u16(simde_uint16x8_t a, const int lane1, simde_uint16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_uint16x8_private + b_ = simde_uint16x8_to_private(b), + r_ = simde_uint16x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_u16(a, lane1, b, lane2) vcopyq_laneq_u16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_u16 + #define vcopyq_laneq_u16(a, lane1, b, lane2) simde_vcopyq_laneq_u16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcopyq_laneq_u32(simde_uint32x4_t a, const int lane1, simde_uint32x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_uint32x4_private + b_ = simde_uint32x4_to_private(b), + r_ = simde_uint32x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_u32(a, lane1, b, lane2) vcopyq_laneq_u32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_u32 + #define vcopyq_laneq_u32(a, lane1, 
b, lane2) simde_vcopyq_laneq_u32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcopyq_laneq_u64(simde_uint64x2_t a, const int lane1, simde_uint64x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_uint64x2_private + b_ = simde_uint64x2_to_private(b), + r_ = simde_uint64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_uint64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_u64(a, lane1, b, lane2) vcopyq_laneq_u64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_u64 + #define vcopyq_laneq_u64(a, lane1, b, lane2) simde_vcopyq_laneq_u64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcopyq_laneq_f32(simde_float32x4_t a, const int lane1, simde_float32x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_float32x4_private + b_ = simde_float32x4_to_private(b), + r_ = simde_float32x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_f32(a, lane1, b, lane2) vcopyq_laneq_f32((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_f32 + #define vcopyq_laneq_f32(a, lane1, b, lane2) simde_vcopyq_laneq_f32((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vcopyq_laneq_f64(simde_float64x2_t a, const int lane1, simde_float64x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_float64x2_private + b_ = simde_float64x2_to_private(b), + r_ = simde_float64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_float64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcopyq_laneq_f64(a, lane1, b, lane2) vcopyq_laneq_f64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_f64 + #define vcopyq_laneq_f64(a, lane1, b, lane2) simde_vcopyq_laneq_f64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vcopy_lane_p8(simde_poly8x8_t a, const int lane1, simde_poly8x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_poly8x8_private + b_ = simde_poly8x8_to_private(b), + r_ = simde_poly8x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopy_lane_p8(a, lane1, b, lane2) vcopy_lane_p8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_p8 + #define vcopy_lane_p8(a, lane1, b, lane2) simde_vcopy_lane_p8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vcopy_lane_p16(simde_poly16x4_t a, const int lane1, simde_poly16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_poly16x4_private + b_ = simde_poly16x4_to_private(b), + r_ = simde_poly16x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly16x4_from_private(r_); +} +#if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopy_lane_p16(a, lane1, b, lane2) vcopy_lane_p16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_p16 + #define vcopy_lane_p16(a, lane1, b, lane2) simde_vcopy_lane_p16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vcopy_lane_p64(simde_poly64x1_t a, const int lane1, simde_poly64x1_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_poly64x1_private + b_ = simde_poly64x1_to_private(b), + r_ = simde_poly64x1_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopy_lane_p64(a, lane1, b, lane2) vcopy_lane_p64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_p64 + #define vcopy_lane_p64(a, lane1, b, lane2) simde_vcopy_lane_p64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vcopy_laneq_p8(simde_poly8x8_t a, const int lane1, simde_poly8x16_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) { + simde_poly8x8_private + r_ = simde_poly8x8_to_private(a); + simde_poly8x16_private + b_ = simde_poly8x16_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopy_laneq_p8(a, lane1, b, lane2) vcopy_laneq_p8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_p8 + #define vcopy_laneq_p8(a, lane1, b, lane2) simde_vcopy_laneq_p8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vcopy_laneq_p16(simde_poly16x4_t a, const int lane1, simde_poly16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_poly16x4_private + r_ = simde_poly16x4_to_private(a); + simde_poly16x8_private + b_ = simde_poly16x8_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopy_laneq_p16(a, lane1, b, lane2) vcopy_laneq_p16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_p16 + #define vcopy_laneq_p16(a, lane1, b, lane2) simde_vcopy_laneq_p16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vcopy_laneq_p64(simde_poly64x1_t a, const int lane1, simde_poly64x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 0) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_poly64x1_private + r_ = simde_poly64x1_to_private(a); + simde_poly64x2_private + b_ = simde_poly64x2_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopy_laneq_p64(a, lane1, b, lane2) vcopy_laneq_p64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_p64 + #define vcopy_laneq_p64(a, lane1, b, lane2) simde_vcopy_laneq_p64((a), (lane1), (b), (lane2)) 
+#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vcopyq_lane_p8(simde_poly8x16_t a, const int lane1, simde_poly8x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_poly8x8_private + b_ = simde_poly8x8_to_private(b); + simde_poly8x16_private + r_ = simde_poly8x16_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopyq_lane_p8(a, lane1, b, lane2) vcopyq_lane_p8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_p8 + #define vcopyq_lane_p8(a, lane1, b, lane2) simde_vcopyq_lane_p8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vcopyq_lane_p16(simde_poly16x8_t a, const int lane1, simde_poly16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_poly16x4_private + b_ = simde_poly16x4_to_private(b); + simde_poly16x8_private + r_ = simde_poly16x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopyq_lane_p16(a, lane1, b, lane2) vcopyq_lane_p16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_p16 + #define vcopyq_lane_p16(a, lane1, b, lane2) simde_vcopyq_lane_p16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vcopyq_lane_p64(simde_poly64x2_t a, const int lane1, simde_poly64x1_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 0) { + simde_poly64x1_private + b_ = simde_poly64x1_to_private(b); + simde_poly64x2_private + r_ = simde_poly64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopyq_lane_p64(a, lane1, b, lane2) vcopyq_lane_p64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_p64 + #define vcopyq_lane_p64(a, lane1, b, lane2) simde_vcopyq_lane_p64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vcopyq_laneq_p8(simde_poly8x16_t a, const int lane1, simde_poly8x16_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 15) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 15) { + simde_poly8x16_private + b_ = simde_poly8x16_to_private(b), + r_ = simde_poly8x16_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopyq_laneq_p8(a, lane1, b, lane2) vcopyq_laneq_p8((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_p8 + #define vcopyq_laneq_p8(a, lane1, b, lane2) simde_vcopyq_laneq_p8((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vcopyq_laneq_p16(simde_poly16x8_t a, const int lane1, simde_poly16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_poly16x8_private + b_ = simde_poly16x8_to_private(b), + r_ = simde_poly16x8_to_private(a); 
+ + r_.values[lane1] = b_.values[lane2]; + return simde_poly16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopyq_laneq_p16(a, lane1, b, lane2) vcopyq_laneq_p16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_p16 + #define vcopyq_laneq_p16(a, lane1, b, lane2) simde_vcopyq_laneq_p16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vcopyq_laneq_p64(simde_poly64x2_t a, const int lane1, simde_poly64x2_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 1) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 1) { + simde_poly64x2_private + b_ = simde_poly64x2_to_private(b), + r_ = simde_poly64x2_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_poly64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vcopyq_laneq_p64(a, lane1, b, lane2) vcopyq_laneq_p64((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_p64 + #define vcopyq_laneq_p64(a, lane1, b, lane2) simde_vcopyq_laneq_p64((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vcopy_lane_bf16(simde_bfloat16x4_t a, const int lane1, simde_bfloat16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_bfloat16x4_private + b_ = simde_bfloat16x4_to_private(b), + r_ = simde_bfloat16x4_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_bfloat16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vcopy_lane_bf16(a, lane1, b, lane2) vcopy_lane_bf16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_lane_bf16 + #define vcopy_lane_bf16(a, lane1, b, lane2) simde_vcopy_lane_bf16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vcopy_laneq_bf16(simde_bfloat16x4_t a, const int lane1, simde_bfloat16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 3) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_bfloat16x4_private r_ = simde_bfloat16x4_to_private(a); + simde_bfloat16x8_private b_ = simde_bfloat16x8_to_private(b); + + r_.values[lane1] = b_.values[lane2]; + return simde_bfloat16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vcopy_laneq_bf16(a, lane1, b, lane2) vcopy_laneq_bf16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopy_laneq_bf16 + #define vcopy_laneq_bf16(a, lane1, b, lane2) simde_vcopy_laneq_bf16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vcopyq_lane_bf16(simde_bfloat16x8_t a, const int lane1, simde_bfloat16x4_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 3) { + simde_bfloat16x4_private b_ = simde_bfloat16x4_to_private(b); + simde_bfloat16x8_private r_ = simde_bfloat16x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_bfloat16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vcopyq_lane_bf16(a, lane1, b, lane2) vcopyq_lane_bf16((a), (lane1), (b), (lane2)) +#endif +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_lane_bf16 + #define vcopyq_lane_bf16(a, lane1, b, lane2) simde_vcopyq_lane_bf16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vcopyq_laneq_bf16(simde_bfloat16x8_t a, const int lane1, simde_bfloat16x8_t b, const int lane2) + SIMDE_REQUIRE_CONSTANT_RANGE(lane1, 0, 7) + SIMDE_REQUIRE_CONSTANT_RANGE(lane2, 0, 7) { + simde_bfloat16x8_private + b_ = simde_bfloat16x8_to_private(b), + r_ = simde_bfloat16x8_to_private(a); + + r_.values[lane1] = b_.values[lane2]; + return simde_bfloat16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vcopyq_laneq_bf16(a, lane1, b, lane2) vcopyq_laneq_bf16((a), (lane1), (b), (lane2)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcopyq_laneq_bf16 + #define vcopyq_laneq_bf16(a, lane1, b, lane2) simde_vcopyq_laneq_bf16((a), (lane1), (b), (lane2)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* SIMDE_ARM_NEON_COPY_LANE_H */ diff --git a/lib/simd_wrapper/simde/arm/neon/crc32.h b/lib/simd_wrapper/simde/arm/neon/crc32.h new file mode 100644 index 00000000000..1223190c189 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/crc32.h @@ -0,0 +1,282 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_CRC32_H) +#define SIMDE_ARM_NEON_CRC32_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t simde_crc32_reverseBits(uint64_t num, int num_of_bits) +{ + uint64_t reverse_num = 0; + for (int i = 0; i < num_of_bits; i++) { + if (num & (1ULL << i)) + reverse_num |= 1ULL << (num_of_bits - 1 - i); + } + return reverse_num; +} + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t simde_crc32_eor_mask(uint32_t a, uint32_t b, uint32_t mask) { + uint32_t part_a = a & mask; + uint32_t part_result = part_a ^ b; + uint32_t result = (a & ~mask) | part_result; + return result; +} + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32b(uint32_t a, uint8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE) + return __crc32b(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, (simde_crc32_reverseBits(b, 8) << 24)); + uint32_t head = r_acc ^ r_val; + uint32_t tail = 0; + const uint32_t poly = 0x04C11DB7; + for(int i = 31; i >= 24; --i) { + if ((head>>i) & 1) { + head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + uint32_t result = ((head & 0x00FFFFFF) << 8) | ((tail & 0xFF000000) >> 24); + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef __crc32b + #define __crc32b(a, b) simde___crc32b((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32h(uint32_t a, uint16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE) + return __crc32h(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, (simde_crc32_reverseBits(b, 16) << 16)); + uint32_t head = r_acc ^ r_val; + uint32_t tail = 0; + const uint32_t poly = 0x04C11DB7; + for(int i = 31; i >= 16; --i) { + if ((head>>i) & 1) { + head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + uint32_t result = ((head & 0x0000FFFF) << 16) | ((tail & 0xFFFF0000) >> 16); + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef __crc32h + #define __crc32h(a, b) simde___crc32h((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32w(uint32_t a, uint32_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE) + return __crc32w(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(b, 32)); + uint32_t head = r_acc ^ r_val; + uint32_t tail = 0; + const uint32_t poly = 0x04C11DB7; + for(int i = 31; i >= 0; --i) { + if ((head>>i) & 1) { + head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef __crc32w + #define __crc32w(a, b) simde___crc32w((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32d(uint32_t a, uint64_t b) 
{ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE) + return __crc32d(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint64_t r_val = simde_crc32_reverseBits(b, 64); + uint32_t val_head = HEDLEY_STATIC_CAST(uint32_t, r_val >> 32); + uint32_t val_mid = HEDLEY_STATIC_CAST(uint32_t, r_val & 0x00000000FFFFFFFF); + uint32_t head = r_acc ^ val_head; + uint32_t mid = 0u ^ val_mid; + uint32_t tail = 0u; + const uint32_t poly = 0x04C11DB7; + for(int i = 31; i >= 0; --i) { + if ((head>>i) & 1) { + head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + mid = simde_crc32_eor_mask(mid, poly << i, 0xFFFFFFFF); + tail = simde_crc32_eor_mask(tail, 0x0, 0xFFFFFFFF); + } + } + for(int i = 31; i >= 0; --i) { + if ((mid>>i) & 1) { + mid = simde_crc32_eor_mask(mid, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef __crc32d + #define __crc32d(a, b) simde___crc32d((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32cb(uint32_t a, uint8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE) + return __crc32cb(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, (simde_crc32_reverseBits(b, 8) << 24)); + uint32_t head = r_acc ^ r_val; + uint32_t tail = 0; + const uint32_t poly = 0x1EDC6F41; + for(int i = 31; i >= 24; --i) { + if ((head>>i) & 1) { + head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + uint32_t result = ((head & 0x00FFFFFF) << 8) | ((tail & 0xFF000000) >> 24); + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef __crc32cb + #define __crc32cb(a, b) simde___crc32cb((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32ch(uint32_t a, uint16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE) + return __crc32ch(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(b, 16) << 16); + uint32_t head = r_acc ^ r_val; + uint32_t tail = 0; + const uint32_t poly = 0x1EDC6F41; + for(int i = 31; i >= 16; --i) { + if ((head>>i) & 1) { + head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + uint32_t result = ((head & 0x0000FFFF) << 16) | ((tail & 0xFFFF0000) >> 16); + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(result, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef __crc32ch + #define __crc32ch(a, b) simde___crc32ch((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32cw(uint32_t a, uint32_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE) + return __crc32cw(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint32_t r_val = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(b, 32)); + uint32_t head = r_acc ^ r_val; + uint32_t tail = 0; + const uint32_t poly = 0x1EDC6F41; + for(int i = 31; i >= 0; --i) { + if ((head>>i) & 1) { + head = 
simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef __crc32cw + #define __crc32cw(a, b) simde___crc32cw((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde___crc32cd(uint32_t a, uint64_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_ACLE) + return __crc32cd(a, b); + #else + uint32_t r_acc = HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(a, 32)); + uint64_t r_val = simde_crc32_reverseBits(b, 64); + uint32_t val_head = HEDLEY_STATIC_CAST(uint32_t, r_val >> 32); + uint32_t val_mid = HEDLEY_STATIC_CAST(uint32_t, r_val & 0x00000000FFFFFFFF); + uint32_t head = r_acc ^ val_head; + uint32_t mid = 0u ^ val_mid; + uint32_t tail = 0u; + const uint32_t poly = 0x1EDC6F41; + for(int i = 31; i >= 0; --i) { + if ((head>>i) & 1) { + head = simde_crc32_eor_mask(head, poly >> (32-i), (1u << (i)) - 1); + mid = simde_crc32_eor_mask(mid, poly << i, 0xFFFFFFFF); + tail = simde_crc32_eor_mask(tail, 0x0, 0xFFFFFFFF); + } + } + for(int i = 31; i >= 0; --i) { + if ((mid>>i) & 1) { + mid = simde_crc32_eor_mask(mid, poly >> (32-i), (1u << (i)) - 1); + tail = simde_crc32_eor_mask(tail, poly << i, 0xFFFFFFFF); + } + } + return HEDLEY_STATIC_CAST(uint32_t, simde_crc32_reverseBits(tail, 32)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef __crc32cd + #define __crc32cd(a, b) simde___crc32cd((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_CRC32_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/create.h b/lib/simd_wrapper/simde/arm/neon/create.h index 57f6f6ebaa9..5954922bb14 100644 --- a/lib/simd_wrapper/simde/arm/neon/create.h +++ b/lib/simd_wrapper/simde/arm/neon/create.h @@ -23,12 +23,9 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ -/* N.B. CM: vcreate_f16 and vcreate_bf16 are omitted as - * SIMDe has no 16-bit floating point support. - * Idem for the poly types. 
*/ - #if !defined(SIMDE_ARM_NEON_CREATE_H) #define SIMDE_ARM_NEON_CREATE_H @@ -152,6 +149,20 @@ simde_vcreate_u64(uint64_t a) { #define vcreate_u64(a) simde_vcreate_u64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcreate_f16(uint64_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcreate_f16(a); + #else + return simde_vreinterpret_f16_u64(simde_vdup_n_u64(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcreate_f16 + #define vcreate_f16(a) simde_vcreate_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vcreate_f32(uint64_t a) { @@ -180,6 +191,62 @@ simde_vcreate_f64(uint64_t a) { #define vcreate_f64(a) simde_vcreate_f64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vcreate_p8(simde_poly64_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcreate_p8(a); + #else + return simde_vreinterpret_p8_p64(simde_vdup_n_p64(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcreate_p8 + #define vcreate_p8(a) simde_vcreate_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vcreate_p16(simde_poly64_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vcreate_p16(a); + #else + return simde_vreinterpret_p16_p64(simde_vdup_n_p64(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcreate_p16 + #define vcreate_p16(a) simde_vcreate_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vcreate_p64(simde_poly64_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vcreate_p64(a); + #else + return simde_vdup_n_p64(a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcreate_p64 + #define vcreate_p64(a) simde_vcreate_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vcreate_bf16(uint64_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcreate_bf16(a); + #else + return simde_vreinterpret_bf16_u64(simde_vdup_n_u64(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcreate_bf16 + #define vcreate_bf16(a) simde_vcreate_bf16(a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/cvt.h b/lib/simd_wrapper/simde/arm/neon/cvt.h index 55693c86943..ab5122527f4 100644 --- a/lib/simd_wrapper/simde/arm/neon/cvt.h +++ b/lib/simd_wrapper/simde/arm/neon/cvt.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Sean Maher * 2020-2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_CVT_H) @@ -43,7 +44,7 @@ simde_vcvt_f16_f32(simde_float32x4_t a) { simde_float32x4_private a_ = simde_float32x4_to_private(a); simde_float16x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) + #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -69,7 +70,7 @@ simde_vcvt_f32_f16(simde_float16x4_t a) { simde_float16x4_private a_ = simde_float16x4_to_private(a); simde_float32x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) + #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -139,42 +140,134 @@ simde_vcvt_f64_f32(simde_float32x2_t a) { #endif SIMDE_FUNCTION_ATTRIBUTES -int16_t -simde_x_vcvts_s16_f16(simde_float16 a) { - #if 
defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_ARM_NEON_FP16) - return HEDLEY_STATIC_CAST(int16_t, a); +uint16_t +simde_vcvth_u16_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_u16_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint16_t, + simde_float16_to_float32(a)); #else simde_float32 af = simde_float16_to_float32(a); - if (HEDLEY_UNLIKELY(af < HEDLEY_STATIC_CAST(simde_float32, INT16_MIN))) { - return INT16_MIN; - } else if (HEDLEY_UNLIKELY(af > HEDLEY_STATIC_CAST(simde_float32, INT16_MAX))) { - return INT16_MAX; - } else if (HEDLEY_UNLIKELY(simde_math_isnanf(af))) { + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) { + return UINT16_MAX; + } else if (simde_isnanhf(a)) { return 0; } else { - return HEDLEY_STATIC_CAST(int16_t, af); + return HEDLEY_STATIC_CAST(uint16_t, af); } #endif } +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvth_u16_f16 + #define vcvth_u16_f16(a) simde_vcvth_u16_f16(a) +#endif SIMDE_FUNCTION_ATTRIBUTES -uint16_t -simde_x_vcvts_u16_f16(simde_float16 a) { - #if defined(SIMDE_FAST_CONVERSION_RANGE) - return HEDLEY_STATIC_CAST(uint16_t, simde_float16_to_float32(a)); +int32_t +simde_vcvth_s32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_s32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int32_t, + simde_float16_to_float32(a)); #else simde_float32 af = simde_float16_to_float32(a); - if (HEDLEY_UNLIKELY(af < SIMDE_FLOAT32_C(0.0))) { + if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) { + return INT32_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) { + return INT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { return 0; - } else if (HEDLEY_UNLIKELY(af > HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) { - return UINT16_MAX; - } else if (simde_math_isnanf(af)) { + } else { + return HEDLEY_STATIC_CAST(int32_t, af); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvth_s32_f16 + #define vcvth_s32_f16(a) simde_vcvth_s32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvth_u32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_u32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, + simde_float16_to_float32(a)); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { return 0; } else { - return HEDLEY_STATIC_CAST(uint16_t, af); + return HEDLEY_STATIC_CAST(uint32_t, af); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvth_u32_f16 + #define vcvth_u32_f16(a) simde_vcvth_u32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvth_s64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_s64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int64_t, + simde_float16_to_float32(a)); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= 
HEDLEY_STATIC_CAST(simde_float32, INT64_MIN))) { + return INT64_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT64_MAX))) { + return INT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int64_t, af); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvth_s64_f16 + #define vcvth_s64_f16(a) simde_vcvth_s64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvth_u64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_u64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, + simde_float16_to_float32(a)); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT64_MAX))) { + return UINT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, af); } #endif } +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvth_u64_f16 + #define vcvth_u64_f16(a) simde_vcvth_u64_f16(a) +#endif SIMDE_FUNCTION_ATTRIBUTES int32_t @@ -265,7 +358,7 @@ simde_vcvtd_s64_f64(simde_float64 a) { return INT64_MIN; } else if (HEDLEY_UNLIKELY(a > HEDLEY_STATIC_CAST(simde_float64, INT64_MAX))) { return INT64_MAX; - } else if (simde_math_isnanf(a)) { + } else if (simde_math_isnan(a)) { return 0; } else { return HEDLEY_STATIC_CAST(int64_t, a); @@ -330,29 +423,99 @@ simde_vcvtd_f64_u64(uint64_t a) { #endif SIMDE_FUNCTION_ATTRIBUTES -simde_int16x4_t -simde_vcvt_s16_f16(simde_float16x4_t a) { +simde_float16_t +simde_vcvth_f16_u32(uint32_t a) { #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) - return vcvt_s16_f16(a); + return vcvth_f16_u32(a); + #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI + return HEDLEY_STATIC_CAST(simde_float16_t, a); #else - simde_float16x4_private a_ = simde_float16x4_to_private(a); - simde_int16x4_private r_; + return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvth_f16_u32 + #define vcvth_f16_u32(a) simde_vcvth_f16_u32(a) +#endif - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) - SIMDE_CONVERT_VECTOR_(r_.values, a_.values); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_x_vcvts_s16_f16(a_.values[i]); - } - #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vcvth_f16_u64(uint64_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_f16_u64(a); + #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI + return HEDLEY_STATIC_CAST(simde_float16_t, a); + #else + return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvth_f16_u64 + #define vcvth_f16_u64(a) simde_vcvth_f16_u64(a) +#endif - return simde_int16x4_from_private(r_); +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vcvth_f16_s32(int32_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_f16_s32(a); + #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != 
SIMDE_FLOAT16_API_FP16_NO_ABI + return HEDLEY_STATIC_CAST(simde_float16_t, a); + #else + return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); #endif } #if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef vcvt_s16_f16 - #define vcvt_s16_f16(a) simde_vcvt_s16_f16(a) + #undef vcvth_f16_s32 + #define vcvth_f16_s32(a) simde_vcvth_f16_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vcvth_f16_s64(int64_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_f16_s64(a); + #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI + return HEDLEY_STATIC_CAST(simde_float16_t, a); + #else + return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvth_f16_s64 + #define vcvth_f16_s64(a) simde_vcvth_f16_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vcvth_f16_s16(int16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_f16_s16(a); + #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI + return HEDLEY_STATIC_CAST(simde_float16_t, a); + #else + return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvth_f16_s16 + #define vcvth_f16_s16(a) simde_vcvth_f16_s16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vcvth_f16_u16(uint16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvth_f16_u16(a); + #elif SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE && SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI + return HEDLEY_STATIC_CAST(simde_float16_t, a); + #else + return simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvth_f16_u16 + #define vcvth_f16_u16(a) simde_vcvth_f16_u16(a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -390,12 +553,12 @@ simde_vcvt_u16_f16(simde_float16x4_t a) { simde_float16x4_private a_ = simde_float16x4_to_private(a); simde_uint16x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) + #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_x_vcvts_u16_f16(a_.values[i]); + r_.values[i] = simde_vcvth_u16_f16(a_.values[i]); } #endif @@ -486,33 +649,6 @@ simde_vcvt_u64_f64(simde_float64x1_t a) { #define vcvt_u64_f64(a) simde_vcvt_u64_f64(a) #endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_int16x8_t -simde_vcvtq_s16_f16(simde_float16x8_t a) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) - return vcvtq_s16_f16(a); - #else - simde_float16x8_private a_ = simde_float16x8_to_private(a); - simde_int16x8_private r_; - - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) - SIMDE_CONVERT_VECTOR_(r_.values, a_.values); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_x_vcvts_s16_f16(a_.values[i]); - } - #endif - - return simde_int16x8_from_private(r_); - #endif -} -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) - #undef 
vcvtq_s16_f16 - #define vcvtq_s16_f16(a) simde_vcvtq_s16_f16(a) -#endif - SIMDE_FUNCTION_ATTRIBUTES simde_int32x4_t simde_vcvtq_s32_f32(simde_float32x4_t a) { @@ -600,12 +736,12 @@ simde_vcvtq_u16_f16(simde_float16x8_t a) { simde_float16x8_private a_ = simde_float16x8_to_private(a); simde_uint16x8_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) + #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = simde_x_vcvts_u16_f16(a_.values[i]); + r_.values[i] = simde_vcvth_u16_f16(a_.values[i]); } #endif @@ -850,7 +986,7 @@ simde_vcvt_f16_s16(simde_int16x4_t a) { simde_int16x4_private a_ = simde_int16x4_to_private(a); simde_float16x4_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) + #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -1010,7 +1146,7 @@ simde_vcvtq_f16_s16(simde_int16x8_t a) { simde_int16x8_private a_ = simde_int16x8_to_private(a); simde_float16x8_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) + #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -1066,7 +1202,7 @@ simde_vcvtq_f16_u16(simde_uint16x8_t a) { simde_uint16x8_private a_ = simde_uint16x8_to_private(a); simde_float16x8_private r_; - #if defined(SIMDE_CONVERT_VECTOR_) && (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) + #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FLOAT16_VECTOR) SIMDE_CONVERT_VECTOR_(r_.values, a_.values); #else SIMDE_VECTORIZE @@ -1169,6 +1305,785 @@ simde_vcvtq_f64_u64(simde_uint64x2_t a) { #define vcvtq_f64_u64(a) simde_vcvtq_f64_u64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcvtah_u16_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16) + return vcvtah_u16_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint16_t, + simde_math_roundf(simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) { + return UINT16_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint16_t, simde_math_roundf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtah_u16_f16 + #define vcvtah_u16_f16(a) simde_vcvtah_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vcvtah_s32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtah_s32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int32_t, + simde_math_roundf(simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) { + return INT32_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) { + return INT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int32_t, 
simde_math_roundf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtah_s32_f16 + #define vcvtah_s32_f16(a) simde_vcvtah_s32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtah_u32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16) + return vcvtah_u32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, + simde_math_roundf(simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtah_u32_f16 + #define vcvtah_u32_f16(a) simde_vcvtah_u32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvtah_s64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtah_s64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int64_t, + simde_math_roundf(simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT64_MIN))) { + return INT64_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT64_MAX))) { + return INT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int64_t, simde_math_roundf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtah_s64_f16 + #define vcvtah_s64_f16(a) simde_vcvtah_s64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtah_u64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) && defined(SIMDE_ARM_NEON_FP16) + return vcvtah_u64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, + simde_math_roundf(simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT64_MAX))) { + return UINT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_roundf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtah_u64_f16 + #define vcvtah_u64_f16(a) simde_vcvtah_u64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvtad_s64_f64(simde_float64 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtad_s64_f64(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int64_t, simde_math_round(a)); + #else + if (HEDLEY_UNLIKELY(a <= HEDLEY_STATIC_CAST(simde_float64, INT64_MIN))) { + return INT64_MIN; + } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float64, INT64_MAX))) { + return INT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_math_isnan(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int64_t, simde_math_round(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtad_s64_f64 + #define vcvtad_s64_f64(a) simde_vcvtad_s64_f64(a) +#endif + 
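+/* Note: the remaining vcvta* helpers below follow the same portable-fallback shape as the scalar conversions above: without the native intrinsic, and unless SIMDE_FAST_CONVERSION_RANGE is defined, they saturate to the destination type's range, map NaN to 0, and round to nearest with ties away from zero via simde_math_round()/simde_math_roundf(); the vector forms simply apply these scalar helpers lane by lane. */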
+SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtad_u64_f64(simde_float64 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) + return vcvtad_u64_f64(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, simde_math_round(a)); + #else + if (HEDLEY_UNLIKELY(a <= SIMDE_FLOAT64_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float64, UINT64_MAX))) { + return UINT64_MAX; + } else if (simde_math_isnan(a)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_round(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtad_u64_f64 + #define vcvtad_u64_f64(a) simde_vcvtad_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vcvtas_s32_f32(simde_float32 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtas_s32_f32(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int32_t, simde_math_roundf(a)); + #else + if (HEDLEY_UNLIKELY(a <= HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) { + return INT32_MIN; + } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) { + return INT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int32_t, simde_math_roundf(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtas_s32_f32 + #define vcvtas_s32_f32(a) simde_vcvtas_s32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtas_u32_f32(simde_float32 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtas_u32_f32(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundf(a)); + #else + if (HEDLEY_UNLIKELY(a < SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) { + return 0; + } else { + if (a < 0) return 0; + return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundf(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtas_u32_f32 + #define vcvtas_u32_f32(a) simde_vcvtas_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcvta_u16_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvta_u16_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtah_u16_f16(a_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvta_u16_f16 + #define vcvta_u16_f16(a) simde_vcvta_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vcvta_s64_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvta_s64_f64(a); + #else + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_int64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtad_s64_f64(a_.values[i]); + } + + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvta_s64_f64 + #define vcvta_s64_f64(a) simde_vcvta_s64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t 
+simde_vcvta_u64_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvta_u64_f64(a); + #else + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_uint64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtad_u64_f64(a_.values[i]); + } + + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvta_u64_f64 + #define vcvta_u64_f64(a) simde_vcvta_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vcvta_s32_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvta_s32_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_int32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtas_s32_f32(a_.values[i]); + } + + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvta_s32_f32 + #define vcvta_s32_f32(a) simde_vcvta_s32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcvtaq_u16_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtaq_u16_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtah_u16_f16(a_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtaq_u16_f16 + #define vcvtaq_u16_f16(a) simde_vcvtaq_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vcvtaq_s32_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtaq_s32_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_int32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtas_s32_f32(a_.values[i]); + } + + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtaq_s32_f32 + #define vcvtaq_s32_f32(a) simde_vcvtaq_s32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vcvtaq_s64_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtaq_s64_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_int64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtad_s64_f64(a_.values[i]); + } + + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtaq_s64_f64 + #define vcvtaq_s64_f64(a) simde_vcvtaq_s64_f64(a) +#endif + + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcvtaq_u64_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtaq_u64_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_uint64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtad_u64_f64(a_.values[i]); + } + + return simde_uint64x2_from_private(r_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtaq_u64_f64 + #define vcvtaq_u64_f64(a) simde_vcvtaq_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcvta_u32_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvta_u32_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_uint32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtas_u32_f32(a_.values[i]); + } + + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvta_u32_f32 + #define vcvta_u32_f32(a) simde_vcvta_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcvtaq_u32_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtaq_u32_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_uint32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtas_u32_f32(a_.values[i]); + } + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtaq_u32_f32 + #define vcvtaq_u32_f32(a) simde_vcvtaq_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcvt_high_f16_f32(simde_float16x4_t r, simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvt_high_f16_f32(r, a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_float16x4_private b_ = simde_float16x4_to_private(r); + simde_float16x8_private r_; + + size_t half_pos = (sizeof(r_.values) / sizeof(r_.values[0]) / 2); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < half_pos; i++) { + r_.values[i] = b_.values[i]; + } + SIMDE_VECTORIZE + for (size_t i = half_pos; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32(a_.values[i-half_pos]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_high_f16_f32 + #define vcvt_high_f16_f32(r, a) simde_vcvt_high_f16_f32((r), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvt_high_f32_f64(simde_float32x2_t r, simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvt_high_f32_f64(r, a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_float32x2_private b_ = simde_float32x2_to_private(r); + simde_float32x4_private r_; + + size_t half_pos = (sizeof(r_.values) / sizeof(r_.values[0]) / 2); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < half_pos; i++) { + r_.values[i] = b_.values[i]; + } + SIMDE_VECTORIZE + for (size_t i = half_pos; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float32, a_.values[i-half_pos]); + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_high_f32_f64 + #define vcvt_high_f32_f64(r, a) simde_vcvt_high_f32_f64((r), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvt_high_f32_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvt_high_f32_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_float32x4_private 
r_; + + size_t rsize = (sizeof(r_.values) / sizeof(r_.values[0])); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < rsize; i++) { + r_.values[i] = simde_float16_to_float32(a_.values[i+rsize]); + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_high_f32_f16 + #define vcvt_high_f32_f16(a) simde_vcvt_high_f32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vcvt_high_f64_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvt_high_f64_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_float64x2_private r_; + + size_t rsize = (sizeof(r_.values) / sizeof(r_.values[0])); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float64, a_.values[i+rsize]); + } + + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_high_f64_f32 + #define vcvt_high_f64_f32(a) simde_vcvt_high_f64_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vcvtxd_f32_f64(simde_float64_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtxd_f32_f64(a); + #else + return HEDLEY_STATIC_CAST(simde_float32_t, a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtxd_f32_f64 + #define vcvtxd_f32_f64(a) simde_vcvtxd_f32_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcvtx_f32_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtx_f32_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_float32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtxd_f32_f64(a_.values[i]); + } + + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtx_f32_f64 + #define vcvtx_f32_f64(a) simde_vcvtx_f32_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvtx_high_f32_f64(simde_float32x2_t r, simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtx_high_f32_f64(r, a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_float32x2_private r_ = simde_float32x2_to_private(r); + simde_float32x4_private ret; + + size_t half_pos = (sizeof(ret.values) / sizeof(ret.values[0]) / 2); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < half_pos; i++) { + ret.values[i] = r_.values[i]; + } + SIMDE_VECTORIZE + for (size_t i = half_pos; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) { + ret.values[i] = simde_vcvtxd_f32_f64(a_.values[i-half_pos]); + } + + return simde_float32x4_from_private(ret); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtx_high_f32_f64 + #define vcvtx_high_f32_f64(r, a) simde_vcvtx_high_f32_f64((r), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vcvt_bf16_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvt_bf16_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_bfloat16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_bfloat16_from_float32(a_.values[i]); + } + + return simde_bfloat16x4_from_private(r_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_bf16_f32 + #define vcvt_bf16_f32(a) simde_vcvt_bf16_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvt_f32_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvt_f32_bf16(a); + #else + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_float32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_bfloat16_to_float32(a_.values[i]); + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_f32_bf16 + #define vcvt_f32_bf16(a) simde_vcvt_f32_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vcvtah_f32_bf16(simde_bfloat16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvtah_f32_bf16(a); + #else + return simde_bfloat16_to_float32(a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtah_f32_bf16 + #define vcvtah_f32_bf16(a) simde_vcvtah_f32_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16_t +simde_vcvth_bf16_f32(float a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvth_bf16_f32(a); + #else + return simde_bfloat16_from_float32(a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvth_bf16_f32 + #define vcvth_bf16_f32(a) simde_vcvth_bf16_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvtq_low_f32_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvtq_low_f32_bf16(a); + #else + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_float32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_bfloat16_to_float32(a_.values[i]); + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtq_low_f32_bf16 + #define vcvtq_low_f32_bf16(a) simde_vcvtq_low_f32_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvtq_high_f32_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvtq_high_f32_bf16(a); + #else + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_float32x4_private r_; + + size_t rsize = (sizeof(r_.values) / sizeof(r_.values[0])); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_bfloat16_to_float32(a_.values[i + rsize]); + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtq_high_f32_bf16 + #define vcvtq_high_f32_bf16(a) simde_vcvtq_high_f32_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vcvtq_low_bf16_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvtq_low_bf16_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_bfloat16x8_private r_; + + size_t asize = (sizeof(a_.values) / sizeof(a_.values[0])); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < asize; i++) { + r_.values[i] = simde_bfloat16_from_float32(a_.values[i]); + r_.values[i + asize] = SIMDE_BFLOAT16_VALUE(0.0); + } + + return 
simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtq_low_bf16_f32 + #define vcvtq_low_bf16_f32(a) simde_vcvtq_low_bf16_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vcvtq_high_bf16_f32(simde_bfloat16x8_t inactive, simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvtq_high_bf16_f32(inactive, a); + #else + simde_bfloat16x8_private inactive_ = simde_bfloat16x8_to_private(inactive); + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_bfloat16x8_private r_; + + size_t asize = (sizeof(a_.values) / sizeof(a_.values[0])); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r_.values[i] = inactive_.values[i]; + r_.values[i + asize] = simde_bfloat16_from_float32(a_.values[i]); + } + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtq_high_bf16_f32 + #define vcvtq_high_bf16_f32(inactive, a) simde_vcvtq_high_bf16_f32((inactive), (a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/cvt_n.h b/lib/simd_wrapper/simde/arm/neon/cvt_n.h new file mode 100644 index 00000000000..677751525b0 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/cvt_n.h @@ -0,0 +1,691 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_CVT_N_H) +#define SIMDE_ARM_NEON_CVT_N_H + +#include "types.h" +#include "cvt.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcvth_n_u16_f16(simde_float16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + return simde_vcvth_u16_f16( + simde_float16_from_float32( + simde_float16_to_float32(a) * HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n)))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvth_n_u16_f16(a, n) vcvth_n_u16_f16(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvth_n_u16_f16 + #define vcvth_n_u16_f16(a, n) simde_vcvth_n_u16_f16(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vcvth_n_f16_s16(int16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + return simde_float16_from_float32( + HEDLEY_STATIC_CAST(simde_float32_t, + HEDLEY_STATIC_CAST(simde_float64_t, a) / pow(2, n))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvth_n_f16_s16(a, n) vcvth_n_f16_s16(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvth_n_f16_s16 + #define vcvth_n_f16_s16(a, n) simde_vcvth_n_f16_s16(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vcvth_n_f16_u16(uint16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + return simde_float16_from_float32( + HEDLEY_STATIC_CAST(simde_float32_t, + HEDLEY_STATIC_CAST(simde_float64_t, a) / pow(2, n))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvth_n_f16_u16(a, n) vcvth_n_f16_u16(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvth_n_f16_u16 + #define vcvth_n_f16_u16(a, n) simde_vcvth_n_f16_u16(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vcvts_n_s32_f32(simde_float32_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + return simde_vcvts_s32_f32(a * HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvts_n_s32_f32(a, n) vcvts_n_s32_f32(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvts_n_s32_f32 + #define vcvts_n_s32_f32(a, n) simde_vcvts_n_s32_f32(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvts_n_u32_f32(simde_float32_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + return simde_vcvts_u32_f32(a * HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvts_n_u32_f32(a, n) vcvts_n_u32_f32(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvts_n_u32_f32 + #define vcvts_n_u32_f32(a, n) simde_vcvts_n_u32_f32(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vcvts_n_f32_s32(int32_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + return HEDLEY_STATIC_CAST(simde_float32_t, + HEDLEY_STATIC_CAST(simde_float64_t, a) / pow(2, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvts_n_f32_s32(a, n) vcvts_n_f32_s32(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvts_n_f32_s32 + #define vcvts_n_f32_s32(a, n) simde_vcvts_n_f32_s32(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vcvts_n_f32_u32(uint32_t a, const int n) + 
SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + return HEDLEY_STATIC_CAST(simde_float32_t, + HEDLEY_STATIC_CAST(simde_float64_t, a) / pow(2, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvts_n_f32_u32(a, n) vcvts_n_f32_u32(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvts_n_f32_u32 + #define vcvts_n_f32_u32(a, n) simde_vcvts_n_f32_u32(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvtd_n_s64_f64(simde_float64_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + return simde_vcvtd_s64_f64(a * pow(2, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvtd_n_s64_f64(a, n) vcvtd_n_s64_f64(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtd_n_s64_f64 + #define vcvtd_n_s64_f64(a, n) simde_vcvtd_n_s64_f64(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtd_n_u64_f64(simde_float64_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + return simde_vcvtd_u64_f64(a * pow(2, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvtd_n_u64_f64(a, n) vcvtd_n_u64_f64(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtd_n_u64_f64 + #define vcvtd_n_u64_f64(a, n) simde_vcvtd_n_u64_f64(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vcvtd_n_f64_s64(int64_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + return HEDLEY_STATIC_CAST(simde_float64_t, a) / pow(2, n); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvtd_n_f64_s64(a, n) vcvtd_n_f64_s64(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtd_n_f64_s64 + #define vcvtd_n_f64_s64(a, n) simde_vcvtd_n_f64_s64(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vcvtd_n_f64_u64(uint64_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + return HEDLEY_STATIC_CAST(simde_float64_t, a) / pow(2, n); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvtd_n_f64_u64(a, n) vcvtd_n_f64_u64(a, n) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtd_n_f64_u64 + #define vcvtd_n_f64_u64(a, n) simde_vcvtd_n_f64_u64(a, n) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vcvt_n_s32_f32(simde_float32x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_int32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvts_s32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n))); + } + + return simde_int32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vcvt_n_s32_f32(a, n) vcvt_n_s32_f32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_s32_f32 + #define vcvt_n_s32_f32(a, n) simde_vcvt_n_s32_f32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vcvt_n_s64_f64(simde_float64x1_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_int64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtd_s64_f64(a_.values[i] * pow(2, n)); + } + + return simde_int64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvt_n_s64_f64(a, n) vcvt_n_s64_f64((a), (n)) +#endif +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_s64_f64 + #define vcvt_n_s64_f64(a, n) simde_vcvt_n_s64_f64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcvt_n_u16_f16(simde_float16x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvth_u16_f16(simde_float16_from_float32( + simde_float16_to_float32(a_.values[i]) * + HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n)))); + } + + return simde_uint16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvt_n_u16_f16(a, n) vcvt_n_u16_f16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_u16_f16 + #define vcvt_n_u16_f16(a, n) simde_vcvt_n_u16_f16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcvt_n_u32_f32(simde_float32x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_uint32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvts_u32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n))); + } + + return simde_uint32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vcvt_n_u32_f32(a, n) vcvt_n_u32_f32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_u32_f32 + #define vcvt_n_u32_f32(a, n) simde_vcvt_n_u32_f32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcvt_n_u64_f64(simde_float64x1_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_uint64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtd_u64_f64(a_.values[i] * pow(2, n)); + } + + return simde_uint64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) + #define simde_vcvt_n_u64_f64(a, n) vcvt_n_u64_f64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_u64_f64 + #define vcvt_n_u64_f64(a, n) simde_vcvt_n_u64_f64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vcvtq_n_s32_f32(simde_float32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_int32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvts_s32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n))); + } + + return simde_int32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vcvtq_n_s32_f32(a, n) vcvtq_n_s32_f32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_s32_f32 + #define vcvtq_n_s32_f32(a, n) simde_vcvtq_n_s32_f32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vcvtq_n_s64_f64(simde_float64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_int64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / 
sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtd_s64_f64(a_.values[i] * pow(2, n)); + } + + return simde_int64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvtq_n_s64_f64(a, n) vcvtq_n_s64_f64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_s64_f64 + #define vcvtq_n_s64_f64(a, n) simde_vcvtq_n_s64_f64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcvtq_n_u16_f16(simde_float16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvth_u16_f16(simde_float16_from_float32( + simde_float16_to_float32(a_.values[i]) * + HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n)))); + } + + return simde_uint16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) +#define simde_vcvtq_n_u16_f16(a, n) vcvtq_n_u16_f16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_u16_f16 + #define vcvtq_n_u16_f16(a, n) simde_vcvtq_n_u16_f16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcvtq_n_u32_f32(simde_float32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_uint32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvts_u32_f32(a_.values[i] * HEDLEY_STATIC_CAST(simde_float32_t, pow(2, n))); + } + + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) + #define simde_vcvtq_n_u32_f32(a, n) vcvtq_n_u32_f32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_u32_f32 + #define vcvtq_n_u32_f32(a, n) simde_vcvtq_n_u32_f32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcvtq_n_u64_f64(simde_float64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_uint64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtd_u64_f64(a_.values[i] * pow(2, n)); + } + + return simde_uint64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) + #define simde_vcvtq_n_u64_f64(a, n) vcvtq_n_u64_f64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_u64_f64 + #define vcvtq_n_u64_f64(a, n) simde_vcvtq_n_u64_f64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcvt_n_f16_u16(simde_uint16x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_float16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n))); + } + + return simde_float16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvt_n_f16_u16(a, n) vcvt_n_f16_u16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef 
vcvt_n_f16_u16 + #define vcvt_n_f16_u16(a, n) simde_vcvt_n_f16_u16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vcvt_n_f16_s16(simde_int16x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_float16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n))); + } + + return simde_float16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvt_n_f16_s16(a, n) vcvt_n_f16_s16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_f16_s16 + #define vcvt_n_f16_s16(a, n) simde_vcvt_n_f16_s16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcvtq_n_f16_u16(simde_uint16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_float16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n))); + } + + return simde_float16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvtq_n_f16_u16(a, n) vcvtq_n_f16_u16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_f16_u16 + #define vcvtq_n_f16_u16(a, n) simde_vcvtq_n_f16_u16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vcvtq_n_f16_s16(simde_int16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_float16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32(HEDLEY_STATIC_CAST(simde_float32_t, (a_.values[i] / pow(2, n)))); + } + + return simde_float16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vcvtq_n_f16_s16(a, n) vcvtq_n_f16_s16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_f16_s16 + #define vcvtq_n_f16_s16(a, n) simde_vcvtq_n_f16_s16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcvt_n_f32_u32(simde_uint32x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_float32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n)); + } + + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vcvt_n_f32_u32(a, n) vcvt_n_f32_u32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_f32_u32 + #define vcvt_n_f32_u32(a, n) simde_vcvt_n_f32_u32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vcvt_n_f32_s32(simde_int32x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_float32x2_private r_; + + SIMDE_VECTORIZE + 
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n)); + } + + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vcvt_n_f32_s32(a, n) vcvt_n_f32_s32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_f32_s32 + #define vcvt_n_f32_s32(a, n) simde_vcvt_n_f32_s32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vcvt_n_f64_u64(simde_uint64x1_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_uint64x1_private a_ = simde_uint64x1_to_private(a); + simde_float64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n)); + } + + return simde_float64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvt_n_f64_u64(a, n) vcvt_n_f64_u64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_f64_u64 + #define vcvt_n_f64_u64(a, n) simde_vcvt_n_f64_u64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vcvtq_n_f64_u64(simde_uint64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_float64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n)); + } + + return simde_float64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvtq_n_f64_u64(a, n) vcvtq_n_f64_u64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_f64_u64 + #define vcvtq_n_f64_u64(a, n) simde_vcvtq_n_f64_u64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vcvt_n_f64_s64(simde_int64x1_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_int64x1_private a_ = simde_int64x1_to_private(a); + simde_float64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n)); + } + + return simde_float64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvt_n_f64_s64(a, n) vcvt_n_f64_s64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvt_n_f64_s64 + #define vcvt_n_f64_s64(a, n) simde_vcvt_n_f64_s64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vcvtq_n_f64_s64(simde_int64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 64) { + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_float64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float64_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n)); + } + + return simde_float64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vcvtq_n_f64_s64(a, n) vcvtq_n_f64_s64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_f64_s64 + #define vcvtq_n_f64_s64(a, n) 
simde_vcvtq_n_f64_s64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvtq_n_f32_s32(simde_int32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_float32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n)); + } + + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vcvtq_n_f32_s32(a, n) vcvtq_n_f32_s32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_f32_s32 + #define vcvtq_n_f32_s32(a, n) simde_vcvtq_n_f32_s32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vcvtq_n_f32_u32(simde_uint32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_float32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(simde_float32_t, HEDLEY_STATIC_CAST(simde_float64_t, a_.values[i]) / pow(2, n)); + } + + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vcvtq_n_f32_u32(a, n) vcvtq_n_f32_u32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vcvtq_n_f32_u32 + #define vcvtq_n_f32_u32(a, n) simde_vcvtq_n_f32_u32((a), (n)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* SIMDE_ARM_NEON_CVT_N_H */ diff --git a/lib/simd_wrapper/simde/arm/neon/cvtm.h b/lib/simd_wrapper/simde/arm/neon/cvtm.h new file mode 100644 index 00000000000..ae2c98ae02f --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/cvtm.h @@ -0,0 +1,381 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_CVTM_H) +#define SIMDE_ARM_NEON_CVTM_H + +#include "types.h" +#include "cvt.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvtmh_s64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtmh_s64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int64_t, + simde_math_floorf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT64_MIN))) { + return INT64_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT64_MAX))) { + return INT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int64_t, simde_math_floorf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtmh_s64_f16 + #define vcvtmh_s64_f16(a) simde_vcvtmh_s64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vcvtmh_s32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtmh_s32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int32_t, + simde_math_floorf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) { + return INT32_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) { + return INT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int32_t, simde_math_floorf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtmh_s32_f16 + #define vcvtmh_s32_f16(a) simde_vcvtmh_s32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtmh_u64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtmh_u64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, + simde_math_floorf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT64_MAX))) { + return UINT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_floorf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtmh_u64_f16 + #define vcvtmh_u64_f16(a) simde_vcvtmh_u64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtmh_u32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtmh_u32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, + simde_math_floorf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint32_t, simde_math_floorf(af)); + } + 
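/* Note on the fallback just above: IEEE comparisons involving NaN are false,
 * so a NaN input skips both range clamps and reaches the explicit
 * simde_isnanhf() branch, converting to 0; this matches the 0 that the
 * AArch64 FCVTMU instruction produces for NaN inputs. */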
#endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtmh_u32_f16 + #define vcvtmh_u32_f16(a) simde_vcvtmh_u32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcvtmh_u16_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtmh_u16_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint16_t, + simde_math_floorf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) { + return UINT16_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint16_t, simde_math_floorf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtmh_u16_f16 + #define vcvtmh_u16_f16(a) simde_vcvtmh_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtms_u32_f32(simde_float32 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtms_u32_f32(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, simde_math_floorf(a)); + #else + if (HEDLEY_UNLIKELY(a <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint32_t, simde_math_floorf(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtms_u32_f32 + #define vcvtms_u32_f32(a) simde_vcvtms_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtmd_u64_f64(simde_float64 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtmd_u64_f64(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, simde_math_floor(a)); + #else + if (HEDLEY_UNLIKELY(a <= SIMDE_FLOAT64_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float64, UINT64_MAX))) { + return UINT64_MAX; + } else if (simde_math_isnan(a)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_floor(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtmd_u64_f64 + #define vcvtmd_u64_f64(a) simde_vcvtmd_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcvtmq_u16_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtmq_u16_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtmh_u16_f16(a_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtmq_u16_f16 + #define vcvtmq_u16_f16(a) simde_vcvtmq_u16_f16(a) +#endif + + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcvtmq_u32_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) + return vcvtmq_u32_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_uint32x4_private r_; + + #if 0 && defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + // Hmm.. 
this doesn't work, unlike the signed versions + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtps_epu32(a_.m128); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtps_epu32(a_.m128); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtms_u32_f32(a_.values[i]); + } + #endif + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtmq_u32_f32 + #define vcvtmq_u32_f32(a) simde_vcvtmq_u32_f32(a) +#endif + + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcvtmq_u64_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtmq_u64_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_uint64x2_private r_; + + #if 0 && defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + // Hmm.. this doesn't work, unlike the signed versions + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtpd_epu64(a_.m128d); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtpd_epu64(a_.m128d); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtmd_u64_f64(a_.values[i]); + } + #endif + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtmq_u64_f64 + #define vcvtmq_u64_f64(a) simde_vcvtmq_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcvtm_u16_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtm_u16_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtmh_u16_f16(a_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtm_u16_f16 + #define vcvtm_u16_f16(a) simde_vcvtm_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcvtm_u32_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtm_u32_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_uint32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtms_u32_f32(a_.values[i]); + } + + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtm_u32_f32 + #define vcvtm_u32_f32(a) simde_vcvtm_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcvtm_u64_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtm_u64_f64(a); + #else + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_uint64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtmd_u64_f64(a_.values[i]); + } + + return simde_uint64x1_from_private(r_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtm_u64_f64 + #define vcvtm_u64_f64(a) simde_vcvtm_u64_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* SIMDE_ARM_NEON_CVTM_H */ diff --git a/lib/simd_wrapper/simde/arm/neon/cvtn.h b/lib/simd_wrapper/simde/arm/neon/cvtn.h new file mode 100644 index 00000000000..8198a9721b7 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/cvtn.h @@ -0,0 +1,530 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. Crusoe + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_CVTN_H) +#define SIMDE_ARM_NEON_CVTN_H + +#include "types.h" +#include "cvt.h" +#include "calt.h" +#include "cagt.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vcvtnq_s32_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtnq_s32_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_int32x4_private r_; + + #if defined(SIMDE_X86_SSE2_NATIVE) + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtps_epi32(a_.m128); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtps_epi32(a_.m128); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int32_t, simde_math_roundevenf(a_.values[i])); + } + #endif + + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnq_s32_f32 + #define vcvtnq_s32_f32(a) simde_vcvtnq_s32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vcvtnq_s64_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtnq_s64_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_int64x2_private r_; + + #if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtpd_epi64(a_.m128d); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtpd_epi64(a_.m128d); + 
} + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int64_t, simde_math_roundeven(a_.values[i])); + } + #endif + + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnq_s64_f64 + #define vcvtnq_s64_f64(a) simde_vcvtnq_s64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvtnh_s64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtnh_s64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int64_t, simde_math_roundevenf(simde_float16_to_float32(a))); + #else + simde_float32 a_ = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(a_ < HEDLEY_STATIC_CAST(simde_float32, INT64_MIN))) { + return INT64_MIN; + } else if (HEDLEY_UNLIKELY(a_ > HEDLEY_STATIC_CAST(simde_float32, INT64_MAX))) { + return INT64_MAX; + } else if (simde_math_isnanf(a_)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int64_t, simde_math_roundevenf(a_)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnh_s64_f16 + #define vcvtnh_s64_f16(a) simde_vcvtnh_s64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vcvtnh_s32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtnh_s32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int32_t, simde_math_roundevenf(simde_float16_to_float32(a))); + #else + simde_float32 a_ = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(a_ < HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) { + return INT32_MIN; + } else if (HEDLEY_UNLIKELY(a_ > HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) { + return INT32_MAX; + } else if (simde_math_isnanf(a_)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int32_t, simde_math_roundevenf(a_)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnh_s32_f16 + #define vcvtnh_s32_f16(a) simde_vcvtnh_s32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtnh_u64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtnh_u64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, simde_math_roundevenf(simde_float16_to_float32(a))); + #else + simde_float32 a_ = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(a_ < HEDLEY_STATIC_CAST(simde_float32, 0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a_ > HEDLEY_STATIC_CAST(simde_float32, UINT64_MAX))) { + return UINT64_MAX; + } else if (simde_math_isnanf(a_)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_roundevenf(a_)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnh_u64_f16 + #define vcvtnh_u64_f16(a) simde_vcvtnh_u64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtnh_u32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtnh_u32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundevenf(simde_float16_to_float32(a))); + #else + simde_float32 a_ = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(a_ < HEDLEY_STATIC_CAST(simde_float32, 0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a_ > HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if 
(simde_math_isnanf(a_)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundevenf(a_)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnh_u32_f16 + #define vcvtnh_u32_f16(a) simde_vcvtnh_u32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcvtnh_u16_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtnh_u16_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint16_t, simde_math_roundevenf(simde_float16_to_float32(a))); + #else + simde_float32 a_ = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(a_ < HEDLEY_STATIC_CAST(simde_float32, 0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a_ > HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) { + return UINT16_MAX; + } else if (simde_math_isnanf(a_)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint16_t, simde_math_roundevenf(a_)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnh_u16_f16 + #define vcvtnh_u16_f16(a) simde_vcvtnh_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vcvtns_s32_f32(simde_float32 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtns_s32_f32(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int32_t, simde_math_roundevenf(a)); + #else + if (HEDLEY_UNLIKELY(a < HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) { + return INT32_MIN; + } else if (HEDLEY_UNLIKELY(a > HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) { + return INT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int32_t, simde_math_roundevenf(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtns_s32_f32 + #define vcvtns_s32_f32(a) simde_vcvtns_s32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtns_u32_f32(simde_float32 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtns_u32_f32(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundevenf(a)); + #else + if (HEDLEY_UNLIKELY(a < SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a > HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint32_t, simde_math_roundevenf(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtns_u32_f32 + #define vcvtns_u32_f32(a) simde_vcvtns_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcvtnq_u32_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) + return vcvtnq_u32_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_uint32x4_private r_; + + #if 0 && defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + // Hmm.. 
this doesn't work, unlike the signed versions + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtps_epu32(a_.m128); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtps_epu32(a_.m128); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtns_u32_f32(a_.values[i]); + } + #endif + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnq_u32_f32 + #define vcvtnq_u32_f32(a) simde_vcvtnq_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvtnd_s64_f64(simde_float64 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtnd_s64_f64(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int64_t, simde_math_roundeven(a)); + #else + if (HEDLEY_UNLIKELY(a < HEDLEY_STATIC_CAST(simde_float64, INT64_MIN))) { + return INT64_MIN; + } else if (HEDLEY_UNLIKELY(a > HEDLEY_STATIC_CAST(simde_float64, INT64_MAX))) { + return INT64_MAX; + } else if (simde_math_isnan(a)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int64_t, simde_math_roundeven(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnd_s64_f64 + #define vcvtnd_s64_f64(a) simde_vcvtnd_s64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtnd_u64_f64(simde_float64 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtnd_u64_f64(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, simde_math_roundeven(a)); + #else + if (HEDLEY_UNLIKELY(a < SIMDE_FLOAT64_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a > HEDLEY_STATIC_CAST(simde_float64, UINT64_MAX))) { + return UINT64_MAX; + } else if (simde_math_isnan(a)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_roundeven(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnd_u64_f64 + #define vcvtnd_u64_f64(a) simde_vcvtnd_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcvtnq_u64_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtnq_u64_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_uint64x2_private r_; + + #if 0 && defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + // Hmm.. 
this doesn't work, unlike the signed versions + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtpd_epu64(a_.m128d); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtpd_epu64(a_.m128d); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtnd_u64_f64(a_.values[i]); + } + #endif + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnq_u64_f64 + #define vcvtnq_u64_f64(a) simde_vcvtnq_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcvtnq_u16_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtnq_u16_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtnh_u16_f16(a_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtnq_u16_f16 + #define vcvtnq_u16_f16(a) simde_vcvtnq_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcvtn_u16_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtn_u16_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtnh_u16_f16(a_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtn_u16_f16 + #define vcvtn_u16_f16(a) simde_vcvtn_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcvtn_u32_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtn_u32_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_uint32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtns_u32_f32(a_.values[i]); + } + + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtn_u32_f32 + #define vcvtn_u32_f32(a) simde_vcvtn_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vcvtn_s32_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtn_s32_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_int32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtns_s32_f32(a_.values[i]); + } + + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtn_s32_f32 + #define vcvtn_s32_f32(a) simde_vcvtn_s32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vcvtn_s64_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtn_s64_f64(a); + #else + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_int64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / 
sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtnd_s64_f64(a_.values[i]); + } + + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtn_s64_f64 + #define vcvtn_s64_f64(a) simde_vcvtn_s64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcvtn_u64_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtn_u64_f64(a); + #else + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_uint64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtnd_u64_f64(a_.values[i]); + } + + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtn_u64_f64 + #define vcvtn_u64_f64(a) simde_vcvtn_u64_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* SIMDE_ARM_NEON_CVTN_H */ diff --git a/lib/simd_wrapper/simde/arm/neon/cvtp.h b/lib/simd_wrapper/simde/arm/neon/cvtp.h new file mode 100644 index 00000000000..92bcb2b99f9 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/cvtp.h @@ -0,0 +1,379 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_CVTP_H) +#define SIMDE_ARM_NEON_CVTP_H + +#include "types.h" +#include "cvt.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vcvtph_s64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtph_s64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int64_t, + simde_math_ceilf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT64_MIN))) { + return INT64_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT64_MAX))) { + return INT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int64_t, simde_math_ceilf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtph_s64_f16 + #define vcvtph_s64_f16(a) simde_vcvtph_s64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vcvtph_s32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtph_s32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(int32_t, + simde_math_ceilf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))) { + return INT32_MIN; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) { + return INT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(int32_t, simde_math_ceilf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtph_s32_f16 + #define vcvtph_s32_f16(a) simde_vcvtph_s32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtph_u64_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtph_u64_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, + simde_math_ceilf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT64_MAX))) { + return UINT64_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_ceilf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtph_u64_f16 + #define vcvtph_u64_f16(a) simde_vcvtph_u64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtph_u32_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtph_u32_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, + simde_math_ceilf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint32_t, simde_math_ceilf(af)); + } + #endif +} 
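The portable paths in this header all follow the same clamp-then-round-up pattern. As a hedged illustration only (plain C, independent of SIMDe's types; the helper name ceil_to_u32 is invented for this sketch and is not part of the patch), the scalar logic behind a vcvtps_u32_f32-style conversion looks roughly like this:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Minimal sketch of the clamp-then-ceil pattern used by the scalar
 * fallbacks above: non-positive inputs and NaN map to 0, values beyond
 * the destination range saturate, and everything else is rounded toward
 * +infinity before the integer cast. */
static uint32_t ceil_to_u32(float a) {
  if (a <= 0.0f) return 0;                        /* negatives and -0.0 clamp to 0 */
  if (a >= (float) UINT32_MAX) return UINT32_MAX; /* saturate out-of-range values */
  if (isnan(a)) return 0;                         /* NaN converts to 0 */
  return (uint32_t) ceilf(a);
}

int main(void) {
  printf("%u\n", ceil_to_u32(1.25f));  /* 2 */
  printf("%u\n", ceil_to_u32(-3.5f));  /* 0 */
  printf("%u\n", ceil_to_u32(1e20f));  /* 4294967295 */
  return 0;
}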
+#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtph_u32_f16 + #define vcvtph_u32_f16(a) simde_vcvtph_u32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vcvtph_u16_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtph_u16_f16(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint16_t, + simde_math_ceilf( + simde_float16_to_float32(a))); + #else + simde_float32 af = simde_float16_to_float32(a); + if (HEDLEY_UNLIKELY(af <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(af >= HEDLEY_STATIC_CAST(simde_float32, UINT16_MAX))) { + return UINT16_MAX; + } else if (HEDLEY_UNLIKELY(simde_isnanhf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint16_t, simde_math_ceilf(af)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtph_u16_f16 + #define vcvtph_u16_f16(a) simde_vcvtph_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vcvtps_u32_f32(simde_float32 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtps_u32_f32(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint32_t, simde_math_ceilf(a)); + #else + if (HEDLEY_UNLIKELY(a <= SIMDE_FLOAT32_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float32, UINT32_MAX))) { + return UINT32_MAX; + } else if (HEDLEY_UNLIKELY(simde_math_isnanf(a))) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint32_t, simde_math_ceilf(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtps_u32_f32 + #define vcvtps_u32_f32(a) simde_vcvtps_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vcvtpd_u64_f64(simde_float64 a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtpd_u64_f64(a); + #elif defined(SIMDE_FAST_CONVERSION_RANGE) + return HEDLEY_STATIC_CAST(uint64_t, simde_math_ceil(a)); + #else + if (HEDLEY_UNLIKELY(a <= SIMDE_FLOAT64_C(0.0))) { + return 0; + } else if (HEDLEY_UNLIKELY(a >= HEDLEY_STATIC_CAST(simde_float64, UINT64_MAX))) { + return UINT64_MAX; + } else if (simde_math_isnan(a)) { + return 0; + } else { + return HEDLEY_STATIC_CAST(uint64_t, simde_math_ceil(a)); + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtpd_u64_f64 + #define vcvtpd_u64_f64(a) simde_vcvtpd_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vcvtpq_u16_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtpq_u16_f16(a); + #else + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtph_u16_f16(a_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtpq_u16_f16 + #define vcvtpq_u16_f16(a) simde_vcvtpq_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vcvtpq_u32_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_46844) + return vcvtpq_u32_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_uint32x4_private r_; + + #if 0 && defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + // Hmm.. 
this doesn't work, unlike the signed versions + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtps_epu32(a_.m128); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtps_epu32(a_.m128); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtps_u32_f32(a_.values[i]); + } + #endif + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtpq_u32_f32 + #define vcvtpq_u32_f32(a) simde_vcvtpq_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vcvtpq_u64_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtpq_u64_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_uint64x2_private r_; + + #if 0 && defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + // Hmm.. this doesn't work, unlike the signed versions + if (HEDLEY_UNLIKELY(_MM_GET_ROUNDING_MODE() != _MM_ROUND_NEAREST)) { + unsigned int rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + r_.m128i = _mm_cvtpd_epu64(a_.m128d); + _MM_SET_ROUNDING_MODE(rounding_mode); + } else { + r_.m128i = _mm_cvtpd_epu64(a_.m128d); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtpd_u64_f64(a_.values[i]); + } + #endif + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtpq_u64_f64 + #define vcvtpq_u64_f64(a) simde_vcvtpq_u64_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vcvtp_u16_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vcvtp_u16_f16(a); + #else + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_uint16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtph_u16_f16(a_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vcvtp_u16_f16 + #define vcvtp_u16_f16(a) simde_vcvtp_u16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vcvtp_u32_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtp_u32_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_uint32x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtps_u32_f32(a_.values[i]); + } + + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtp_u32_f32 + #define vcvtp_u32_f32(a) simde_vcvtp_u32_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vcvtp_u64_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vcvtp_u64_f64(a); + #else + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_uint64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vcvtpd_u64_f64(a_.values[i]); + } + + return simde_uint64x1_from_private(r_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vcvtp_u64_f64 + #define vcvtp_u64_f64(a) simde_vcvtp_u64_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* SIMDE_ARM_NEON_CVTP_H */ diff --git a/lib/simd_wrapper/simde/arm/neon/div.h b/lib/simd_wrapper/simde/arm/neon/div.h new file mode 100644 index 00000000000..05a59084b76 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/div.h @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_DIV_H) +#define SIMDE_ARM_NEON_DIV_H + +#include "types.h" + +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vdivh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vdivh_f16(a, b); + #else + return simde_float16_from_float32(simde_float16_to_float32(a) / simde_float16_to_float32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdivh_f16 + #define vdivh_f16(a, b) simde_vdivh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vdiv_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vdiv_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vdivh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdiv_f16 + #define vdiv_f16(a, b) simde_vdiv_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vdivq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vdivq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vdivh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdivq_f16 + #define vdivq_f16(a, b) simde_vdivq_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vdiv_f32(simde_float32x2_t a, simde_float32x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vdiv_f32(a, b); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] / b_.values[i]; + } + + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdiv_f32 + #define vdiv_f32(a, b) simde_vdiv_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vdivq_f32(simde_float32x4_t a, simde_float32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vdivq_f32(a, b); + #else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] / b_.values[i]; + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdivq_f32 + #define vdivq_f32(a, b) simde_vdivq_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vdiv_f64(simde_float64x1_t a, simde_float64x1_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vdiv_f64(a, b); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] / b_.values[i]; + } + + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdiv_f64 + #define vdiv_f64(a, b) simde_vdiv_f64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vdivq_f64(simde_float64x2_t a, simde_float64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vdivq_f64(a, b); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a), + b_ = simde_float64x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] / b_.values[i]; + } + + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdivq_f64 + #define vdivq_f64(a, b) simde_vdivq_f64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MUL_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/dot.h b/lib/simd_wrapper/simde/arm/neon/dot.h index fa7febe0364..a05d32d47e8 100644 --- a/lib/simd_wrapper/simde/arm/neon/dot.h +++ b/lib/simd_wrapper/simde/arm/neon/dot.h @@ -46,7 +46,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_int32x2_t simde_vdot_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) return vdot_s32(r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return simde_vadd_s32(r, simde_vmovn_s64(simde_vpaddlq_s32(simde_vpaddlq_s16(simde_vmull_s8(a, b))))); @@ -67,7 +67,7 @@ simde_vdot_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b) { return 
simde_vadd_s32(r, simde_int32x2_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vdot_s32 #define vdot_s32(r, a, b) simde_vdot_s32((r), (a), (b)) #endif @@ -75,7 +75,7 @@ simde_vdot_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b) { SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vdot_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) return vdot_u32(r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return simde_vadd_u32(r, simde_vmovn_u64(simde_vpaddlq_u32(simde_vpaddlq_u16(simde_vmull_u8(a, b))))); @@ -97,7 +97,7 @@ simde_vdot_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b) { return simde_vadd_u32(r, simde_uint32x2_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vdot_u32 #define vdot_u32(r, a, b) simde_vdot_u32((r), (a), (b)) #endif @@ -105,7 +105,7 @@ simde_vdot_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b) { SIMDE_FUNCTION_ATTRIBUTES simde_int32x4_t simde_vdotq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) return vdotq_s32(r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return simde_vaddq_s32(r, @@ -128,7 +128,7 @@ simde_vdotq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b) { return simde_vaddq_s32(r, simde_int32x4_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vdotq_s32 #define vdotq_s32(r, a, b) simde_vdotq_s32((r), (a), (b)) #endif @@ -136,7 +136,7 @@ simde_vdotq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b) { SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vdotq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b) { - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) return vdotq_u32(r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return simde_vaddq_u32(r, @@ -159,11 +159,64 @@ simde_vdotq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b) { return simde_vaddq_u32(r, simde_uint32x4_from_private(r_)); #endif } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vdotq_u32 #define vdotq_u32(r, a, b) simde_vdotq_u32((r), (a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vbfdot_f32(simde_float32x2_t r, simde_bfloat16x4_t a, simde_bfloat16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + defined(SIMDE_ARM_NEON_BF16) + return vbfdot_f32(r, a, b); + #else + simde_float32x2_private r_ = simde_float32x2_to_private(r); + simde_bfloat16x4_private + a_ = simde_bfloat16x4_to_private(a), + b_ = simde_bfloat16x4_to_private(b); + + for (size_t i = 0 ; i < 
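        /* Illustrative note, not part of the patch itself: in this portable
         * fallback each 32-bit result lane i accumulates a two-element dot
         * product of adjacent bfloat16 inputs, i.e. r[i] += a[2i]*b[2i] +
         * a[2i+1]*b[2i+1], with every bfloat16 widened to float32 before the
         * multiply, which matches the documented BFDOT behaviour. */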
(sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]); + simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]); + simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * i + 0]); + simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * i + 1]); + r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b; + } + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbfdot_f32 + #define vbfdot_f32(r, a, b) simde_vbfdot_f32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfdotq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16) + return vbfdotq_f32(r, a, b); + #else + simde_float32x4_private r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private + a_ = simde_bfloat16x8_to_private(a), + b_ = simde_bfloat16x8_to_private(b); + + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]); + simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]); + simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * i + 0]); + simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * i + 1]); + r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b; + } + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbfdotq_f32 + #define vbfdotq_f32(r, a, b) simde_vbfdotq_f32((r), (a), (b)) +#endif SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/dot_lane.h b/lib/simd_wrapper/simde/arm/neon/dot_lane.h index 84f706948bd..a7d570b4ab9 100644 --- a/lib/simd_wrapper/simde/arm/neon/dot_lane.h +++ b/lib/simd_wrapper/simde/arm/neon/dot_lane.h @@ -45,7 +45,7 @@ simde_int32x2_t simde_vdot_lane_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { simde_int32x2_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_2_(vdot_lane_s32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_int32x2_t @@ -86,7 +86,7 @@ simde_vdot_lane_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x8_t b, const return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vdot_lane_s32 #define vdot_lane_s32(r, a, b, lane) simde_vdot_lane_s32((r), (a), (b), (lane)) #endif @@ -96,7 +96,7 @@ simde_uint32x2_t simde_vdot_lane_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { simde_uint32x2_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_2_(vdot_lane_u32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_uint32x2_t @@ -137,7 +137,7 @@ simde_vdot_lane_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x8_t b, co return result; } -#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vdot_lane_u32 #define vdot_lane_u32(r, a, b, lane) simde_vdot_lane_u32((r), (a), (b), (lane)) #endif @@ -147,7 +147,7 @@ simde_int32x2_t simde_vdot_laneq_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x16_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { simde_int32x2_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_4_(vdot_laneq_s32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_int32x2_t b_lane; @@ -186,7 +186,7 @@ simde_vdot_laneq_s32(simde_int32x2_t r, simde_int8x8_t a, simde_int8x16_t b, con return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vdot_laneq_s32 #define vdot_laneq_s32(r, a, b, lane) simde_vdot_laneq_s32((r), (a), (b), (lane)) #endif @@ -196,7 +196,7 @@ simde_uint32x2_t simde_vdot_laneq_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x16_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { simde_uint32x2_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_4_(vdot_laneq_u32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_uint32x2_t b_lane; @@ -234,7 +234,7 @@ simde_vdot_laneq_u32(simde_uint32x2_t r, simde_uint8x8_t a, simde_uint8x16_t b, #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vdot_laneq_u32 #define vdot_laneq_u32(r, a, b, lane) simde_vdot_laneq_u32((r), (a), (b), (lane)) #endif @@ -244,7 +244,7 @@ simde_uint32x4_t simde_vdotq_laneq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { simde_uint32x4_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_4_(vdotq_laneq_u32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_uint32x4_t @@ -296,7 +296,7 @@ simde_vdotq_laneq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vdotq_laneq_u32 #define vdotq_laneq_u32(r, a, b, lane) simde_vdotq_laneq_u32((r), (a), (b), (lane)) #endif @@ -306,7 +306,7 @@ simde_int32x4_t simde_vdotq_laneq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { simde_int32x4_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_4_(vdotq_laneq_s32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_int32x4_t @@ -358,7 +358,7 @@ simde_vdotq_laneq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b, c #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vdotq_laneq_s32 #define vdotq_laneq_s32(r, a, b, lane) simde_vdotq_laneq_s32((r), (a), (b), (lane)) #endif @@ -368,7 +368,7 @@ simde_uint32x4_t simde_vdotq_lane_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { simde_uint32x4_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_2_(vdotq_lane_u32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_uint32x2_t @@ -419,7 +419,7 @@ simde_vdotq_lane_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x8_t b, #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vdotq_lane_u32 #define vdotq_lane_u32(r, a, b, lane) simde_vdotq_lane_u32((r), (a), (b), (lane)) #endif @@ -429,7 +429,7 @@ simde_int32x4_t simde_vdotq_lane_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x8_t b, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { simde_int32x4_t result; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_DOTPROD) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) SIMDE_CONSTIFY_2_(vdotq_lane_s32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) simde_int32x2_t @@ -480,11 +480,137 @@ simde_vdotq_lane_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x8_t b, con #endif return result; } -#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_DOTPROD)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) #undef vdotq_lane_s32 #define vdotq_lane_s32(r, a, b, lane) simde_vdotq_lane_s32((r), (a), (b), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vbfdot_lane_f32(simde_float32x2_t r, simde_bfloat16x4_t a, simde_bfloat16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float32x2_t result; + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_2_(vbfdot_lane_f32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); + #else + simde_float32x2_private r_ = simde_float32x2_to_private(r); + simde_bfloat16x4_private + a_ = simde_bfloat16x4_to_private(a), + b_ = simde_bfloat16x4_to_private(b); + + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]); + simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]); + simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * lane + 0]); + simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * lane + 1]); + r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b; + } + + result = simde_float32x2_from_private(r_); + #endif + + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef 
vbfdot_lane_f32 + #define vbfdot_lane_f32(r, a, b, lane) simde_vbfdot_lane_f32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfdotq_lane_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float32x4_t result; + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_2_(vbfdotq_lane_f32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); + #else + simde_float32x4_private r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_bfloat16x4_private b_ = simde_bfloat16x4_to_private(b); + + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]); + simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]); + simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * lane + 0]); + simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * lane + 1]); + r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b; + } + + result = simde_float32x4_from_private(r_); + #endif + + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbfdotq_lane_f32 + #define vbfdotq_lane_f32(r, a, b, lane) simde_vbfdotq_lane_f32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vbfdot_laneq_f32(simde_float32x2_t r, simde_bfloat16x4_t a, simde_bfloat16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x2_t result; + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_(vbfdot_laneq_f32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); + #else + simde_float32x2_private r_ = simde_float32x2_to_private(r); + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_bfloat16x8_private b_ = simde_bfloat16x8_to_private(b); + + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t elt1_a = simde_bfloat16_to_float32(a_.values[2 * i + 0]); + simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]); + simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * lane + 0]); + simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * lane + 1]); + r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b; + } + + result = simde_float32x2_from_private(r_); + #endif + + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbfdot_laneq_f32 + #define vbfdot_laneq_f32(r, a, b, lane) simde_vbfdot_laneq_f32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfdotq_laneq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_t result; + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_DOTPROD) && \ + defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_(vbfdotq_laneq_f32, result, (HEDLEY_UNREACHABLE(), result), lane, r, a, b); + #else + simde_float32x4_private r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private + a_ = simde_bfloat16x8_to_private(a), + b_ = simde_bfloat16x8_to_private(b); + + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t elt1_a = 
simde_bfloat16_to_float32(a_.values[2 * i + 0]); + simde_float32_t elt1_b = simde_bfloat16_to_float32(a_.values[2 * i + 1]); + simde_float32_t elt2_a = simde_bfloat16_to_float32(b_.values[2 * lane + 0]); + simde_float32_t elt2_b = simde_bfloat16_to_float32(b_.values[2 * lane + 1]); + r_.values[i] = r_.values[i] + elt1_a * elt2_a + elt1_b * elt2_b; + } + + result = simde_float32x4_from_private(r_); + #endif + + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbfdotq_laneq_f32 + #define vbfdotq_laneq_f32(r, a, b, lane) simde_vbfdotq_laneq_f32((r), (a), (b), (lane)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/dup_lane.h b/lib/simd_wrapper/simde/arm/neon/dup_lane.h index bc1720518a4..44db662be64 100644 --- a/lib/simd_wrapper/simde/arm/neon/dup_lane.h +++ b/lib/simd_wrapper/simde/arm/neon/dup_lane.h @@ -22,6 +22,7 @@ * * Copyright: * 2020-2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_DUP_LANE_H) @@ -146,6 +147,59 @@ simde_vdupd_lane_u64(simde_uint64x1_t vec, const int lane) #define vdupd_lane_u64(vec, lane) simde_vdupd_lane_u64((vec), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vduph_lane_f16(simde_float16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_float16x4_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vduph_lane_f16(vec, lane) vduph_lane_f16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vduph_lane_f16 + #define vduph_lane_f16(vec, lane) simde_vduph_lane_f16((vec), (lane)) +#endif + +// simde_vdup_lane_f16 +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vdup_lane_f16(vec, lane) vdup_lane_f16(vec, lane) +#else + #define simde_vdup_lane_f16(vec, lane) simde_vdup_n_f16(simde_vduph_lane_f16(vec, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdup_lane_f16 + #define vdup_lane_f16(vec, lane) simde_vdup_lane_f16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vdup_laneq_f16(simde_float16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdup_n_f16(simde_float16x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vdup_laneq_f16(vec, lane) vdup_laneq_f16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdup_laneq_f16 + #define vdup_laneq_f16(vec, lane) simde_vdup_laneq_f16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vdupq_lane_f16(simde_float16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vdupq_n_f16(simde_float16x4_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vdupq_lane_f16(vec, lane) vdupq_lane_f16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdupq_lane_f16 + #define vdupq_lane_f16(vec, lane) simde_vdupq_lane_f16((vec), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float64_t simde_vdupd_lane_f64(simde_float64x1_t vec, const int lane) @@ -924,6 +978,20 @@ simde_vdupq_lane_u64(simde_uint64x1_t vec, const int lane) #define vdupq_lane_u64(vec, lane) simde_vdupq_lane_u64((vec), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t 
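/* Illustrative note, not part of the patch itself: vdupq_laneq_f16 broadcasts
 * one half-precision lane of a 128-bit vector into all eight lanes; the
 * portable path below simply reads values[lane] from the private
 * representation and feeds it to simde_vdupq_n_f16. */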
+simde_vdupq_laneq_f16(simde_float16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdupq_n_f16(simde_float16x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vdupq_laneq_f16(vec, lane) vdupq_laneq_f16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupq_laneq_f16 + #define vdupq_laneq_f16(vec, lane) simde_vdupq_laneq_f16((vec), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vdupq_laneq_f32(simde_float32x4_t vec, const int lane) @@ -1194,6 +1262,437 @@ simde_vdupq_laneq_u64(simde_uint64x2_t vec, const int lane) #define vdupq_laneq_u64(vec, lane) simde_vdupq_laneq_u64((vec), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +int8_t +simde_vdupb_lane_s8(simde_int8x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_int8x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdupb_lane_s8(vec, lane) vdupb_lane_s8(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupb_lane_s8 + #define vdupb_lane_s8(vec, lane) simde_vdupb_lane_s8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint8_t +simde_vdupb_lane_u8(simde_uint8x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_uint8x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdupb_lane_u8(vec, lane) vdupb_lane_u8(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupb_lane_u8 + #define vdupb_lane_u8(vec, lane) simde_vdupb_lane_u8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int8_t +simde_vdupb_laneq_s8(simde_int8x16_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + return simde_int8x16_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdupb_laneq_s8(vec, lane) vdupb_laneq_s8(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupb_laneq_s8 + #define vdupb_laneq_s8(vec, lane) simde_vdupb_laneq_s8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint8_t +simde_vdupb_laneq_u8(simde_uint8x16_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + return simde_uint8x16_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdupb_laneq_u8(vec, lane) vdupb_laneq_u8(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupb_laneq_u8 + #define vdupb_laneq_u8(vec, lane) simde_vdupb_laneq_u8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_vduph_lane_s16(simde_int16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_int16x4_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vduph_lane_s16(vec, lane) vduph_lane_s16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vduph_lane_s16 + #define vduph_lane_s16(vec, lane) simde_vduph_lane_s16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vduph_lane_u16(simde_uint16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_uint16x4_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vduph_lane_u16(vec, lane) vduph_lane_u16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vduph_lane_u16 + #define 
vduph_lane_u16(vec, lane) simde_vduph_lane_u16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_vduph_laneq_s16(simde_int16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_int16x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vduph_laneq_s16(vec, lane) vduph_laneq_s16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vduph_laneq_s16 + #define vduph_laneq_s16(vec, lane) simde_vduph_laneq_s16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vduph_laneq_u16(simde_uint16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_uint16x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vduph_laneq_u16(vec, lane) vduph_laneq_u16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vduph_laneq_u16 + #define vduph_laneq_u16(vec, lane) simde_vduph_laneq_u16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vduph_laneq_f16(simde_float16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_float16x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vduph_laneq_f16(vec, lane) vduph_laneq_f16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vduph_laneq_f16 + #define vduph_laneq_f16(vec, lane) simde_vduph_laneq_f16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vdup_lane_p8(simde_poly8x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdup_n_p8(simde_poly8x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vdup_lane_p8(vec, lane) vdup_lane_p8((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdup_lane_p8 + #define vdup_lane_p8(vec, lane) simde_vdup_lane_p8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vdup_lane_p16(simde_poly16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vdup_n_p16(simde_poly16x4_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vdup_lane_p16(vec, lane) vdup_lane_p16((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdup_lane_p16 + #define vdup_lane_p16(vec, lane) simde_vdup_lane_p16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vdup_lane_p64(simde_poly64x1_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + return simde_vdup_n_p64(simde_poly64x1_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vdup_lane_p64(vec, lane) vdup_lane_p64((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdup_lane_p64 + #define vdup_lane_p64(vec, lane) simde_vdup_lane_p64((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vdup_laneq_p8(simde_poly8x16_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + return simde_vdup_n_p8(simde_poly8x16_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdup_laneq_p8(vec, lane) vdup_laneq_p8((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdup_laneq_p8 + #define vdup_laneq_p8(vec, lane) simde_vdup_laneq_p8((vec), 
(lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vdup_laneq_p16(simde_poly16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdup_n_p16(simde_poly16x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdup_laneq_p16(vec, lane) vdup_laneq_p16((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdup_laneq_p16 + #define vdup_laneq_p16(vec, lane) simde_vdup_laneq_p16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vdup_laneq_p64(simde_poly64x2_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return simde_vdup_n_p64(simde_poly64x2_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdup_laneq_p64(vec, lane) vdup_laneq_p64((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdup_laneq_p64 + #define vdup_laneq_p64(vec, lane) simde_vdup_laneq_p64((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vdupq_lane_p8(simde_poly8x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdupq_n_p8(simde_poly8x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vdupq_lane_p8(vec, lane) vdupq_lane_p8((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdupq_lane_p8 + #define vdupq_lane_p8(vec, lane) simde_vdupq_lane_p8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vdupq_lane_p16(simde_poly16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vdupq_n_p16(simde_poly16x4_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vdupq_lane_p16(vec, lane) vdupq_lane_p16((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdupq_lane_p16 + #define vdupq_lane_p16(vec, lane) simde_vdupq_lane_p16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vdupq_lane_p64(simde_poly64x1_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + return simde_vdupq_n_p64(simde_poly64x1_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vdupq_lane_p64(vec, lane) vdupq_lane_p64((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdupq_lane_p64 + #define vdupq_lane_p64(vec, lane) simde_vdupq_lane_p64((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vdupq_laneq_p8(simde_poly8x16_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + return simde_vdupq_n_p8(simde_poly8x16_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdupq_laneq_p8(vec, lane) vdupq_laneq_p8((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupq_laneq_p8 + #define vdupq_laneq_p8(vec, lane) simde_vdupq_laneq_p8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vdupq_laneq_p16(simde_poly16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdupq_n_p16(simde_poly16x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdupq_laneq_p16(vec, lane) vdupq_laneq_p16((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupq_laneq_p16 + #define vdupq_laneq_p16(vec, lane) 
simde_vdupq_laneq_p16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vdupq_laneq_p64(simde_poly64x2_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return simde_vdupq_n_p64(simde_poly64x2_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vdupq_laneq_p64(vec, lane) vdupq_laneq_p64((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupq_laneq_p64 + #define vdupq_laneq_p64(vec, lane) simde_vdupq_laneq_p64((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8_t +simde_vdupb_lane_p8(simde_poly8x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_poly8x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vdupb_lane_p8(vec, lane) vdupb_lane_p8((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupb_lane_p8 + #define vdupb_lane_p8(vec, lane) simde_vdupb_lane_p8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8_t +simde_vdupb_laneq_p8(simde_poly8x16_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + return simde_poly8x16_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vdupb_laneq_p8(vec, lane) vdupb_laneq_p8((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vdupb_laneq_p8 + #define vdupb_laneq_p8(vec, lane) simde_vdupb_laneq_p8((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16_t +simde_vduph_lane_p16(simde_poly16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_poly16x4_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vduph_lane_p16(vec, lane) vduph_lane_p16((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vduph_lane_p16 + #define vduph_lane_p16(vec, lane) simde_vduph_lane_p16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16_t +simde_vduph_laneq_p16(simde_poly16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_poly16x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vduph_laneq_p16(vec, lane) vduph_laneq_p16((vec), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vduph_laneq_p16 + #define vduph_laneq_p16(vec, lane) simde_vduph_laneq_p16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16_t +simde_vduph_lane_bf16(simde_bfloat16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_bfloat16x4_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vduph_lane_bf16(vec, lane) vduph_lane_bf16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vduph_lane_bf16 + #define vduph_lane_bf16(vec, lane) simde_vduph_lane_bf16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16_t +simde_vduph_laneq_bf16(simde_bfloat16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_bfloat16x8_to_private(vec).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vduph_laneq_bf16(vec, lane) vduph_laneq_bf16(vec, lane) 
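/* Illustrative usage sketch, not part of the patch itself. The scalar
 * vdup*_lane helpers added above just index the private representation, and
 * the vector forms broadcast that scalar. Assuming the SIMDe NEON headers
 * bundled by this diff are included, for example:
 *
 *   int16_t vals[4] = { 1, 2, 3, 4 };
 *   simde_int16x4_t v   = simde_vld1_s16(vals);
 *   int16_t lane2       = simde_vduph_lane_s16(v, 2);   // 3
 *   simde_int16x4_t b   = simde_vdup_lane_s16(v, 2);    // { 3, 3, 3, 3 }
 */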
+#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vduph_laneq_bf16 + #define vduph_laneq_bf16(vec, lane) simde_vduph_laneq_bf16((vec), (lane)) +#endif + +// simde_vdup_lane_bf16 +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vdup_lane_bf16(vec, lane) vdup_lane_bf16(vec, lane) +#else + #define simde_vdup_lane_bf16(vec, lane) simde_vdup_n_bf16(simde_vduph_lane_bf16(vec, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdup_lane_bf16 + #define vdup_lane_bf16(vec, lane) simde_vdup_lane_bf16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vdup_laneq_bf16(simde_bfloat16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdup_n_bf16(simde_bfloat16x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) +#define simde_vdup_laneq_bf16(vec, lane) vdup_laneq_bf16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdup_laneq_bf16 + #define vdup_laneq_bf16(vec, lane) simde_vdup_laneq_bf16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vdupq_lane_bf16(simde_bfloat16x4_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vdupq_n_bf16(simde_bfloat16x4_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) +#define simde_vdupq_lane_bf16(vec, lane) vdupq_lane_bf16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdupq_lane_bf16 + #define vdupq_lane_bf16(vec, lane) simde_vdupq_lane_bf16((vec), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vdupq_laneq_bf16(simde_bfloat16x8_t vec, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vdupq_n_bf16(simde_bfloat16x8_to_private(vec).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vdupq_laneq_bf16(vec, lane) vdupq_laneq_bf16(vec, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdupq_laneq_bf16 + #define vdupq_laneq_bf16(vec, lane) simde_vdupq_laneq_bf16((vec), (lane)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/dup_n.h b/lib/simd_wrapper/simde/arm/neon/dup_n.h index e945e99c902..365293edf87 100644 --- a/lib/simd_wrapper/simde/arm/neon/dup_n.h +++ b/lib/simd_wrapper/simde/arm/neon/dup_n.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Sean Maher (Copyright owned by Google, LLC) * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_DUP_N_H) @@ -36,7 +37,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t -simde_vdup_n_f16(simde_float16 value) { +simde_vdup_n_f16(simde_float16_t value) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vdup_n_f16(value); #else @@ -324,7 +325,7 @@ simde_vdup_n_u64(uint64_t value) { SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t -simde_vdupq_n_f16(simde_float16 value) { +simde_vdupq_n_f16(simde_float16_t value) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vdupq_n_f16(value); #else @@ -338,7 +339,7 @@ simde_vdupq_n_f16(simde_float16 value) { return simde_float16x8_from_private(r_); #endif } -#define simde_vmovq_n_f32 simde_vdupq_n_f32 +#define simde_vmovq_n_f16 simde_vdupq_n_f16 #if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vdupq_n_f16 #define vdupq_n_f16(value) simde_vdupq_n_f16((value)) @@ -668,6 +669,186 @@ simde_vdupq_n_u64(uint64_t value) { #define vmovq_n_u64(value) simde_vmovq_n_u64((value)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vdup_n_p8(simde_poly8_t value) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vdup_n_p8(value); + #else + simde_poly8x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#define simde_vmov_n_p8 simde_vdup_n_p8 +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdup_n_p8 + #define vdup_n_p8(value) simde_vdup_n_p8((value)) + #undef vmov_n_p8 + #define vmov_n_p8(value) simde_vmov_n_p8((value)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vdup_n_p16(simde_poly16_t value) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vdup_n_p16(value); + #else + simde_poly16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#define simde_vmov_n_p16 simde_vdup_n_p16 +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdup_n_p16 + #define vdup_n_p16(value) simde_vdup_n_p16((value)) + #undef vmov_n_p16 + #define vmov_n_p16(value) simde_vmov_n_p16((value)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vdup_n_p64(simde_poly64_t value) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vdup_n_p64(value); + #else + simde_poly64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdup_n_p64 + #define vdup_n_p64(value) simde_vdup_n_p64((value)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vdupq_n_p8(simde_poly8_t value) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vdupq_n_p8(value); + #else + simde_poly8x16_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_poly8x16_from_private(r_); + #endif +} +#define simde_vmovq_n_p8 simde_vdupq_n_p8 +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdupq_n_p8 + #define vdupq_n_p8(value) simde_vdupq_n_p8((value)) + #undef vmovq_n_p8 + #define vmovq_n_p8(value) simde_vmovq_n_p8((value)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vdupq_n_p16(simde_poly16_t value) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vdupq_n_p16(value); + #else + simde_poly16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#define simde_vmovq_n_p16 simde_vdupq_n_p16 +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vdupq_n_p16 + #define vdupq_n_p16(value) simde_vdupq_n_p16((value)) + #undef vmovq_n_p16 + #define vmovq_n_p16(value) simde_vmovq_n_p16((value)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vdupq_n_p64(simde_poly64_t value) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vdupq_n_p64(value); + #else + simde_poly64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < 
(sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdupq_n_p64 + #define vdupq_n_p64(value) simde_vdupq_n_p64((value)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vdup_n_bf16(simde_bfloat16_t value) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vdup_n_bf16(value); + #else + simde_bfloat16x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdup_n_bf16 + #define vdup_n_bf16(value) simde_vdup_n_bf16((value)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vdupq_n_bf16(simde_bfloat16_t value) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vdupq_n_bf16(value); + #else + simde_bfloat16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = value; + } + + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vdupq_n_bf16 + #define vdupq_n_bf16(value) simde_vdupq_n_bf16((value)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/eor.h b/lib/simd_wrapper/simde/arm/neon/eor.h index bf5a66d3b6a..9514760251c 100644 --- a/lib/simd_wrapper/simde/arm/neon/eor.h +++ b/lib/simd_wrapper/simde/arm/neon/eor.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_EOR_H) @@ -546,6 +547,207 @@ simde_veorq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define veorq_u64(a, b) simde_veorq_u64((a), (b)) #endif +// Note: EOR3 instructions are implemented only when FEAT_SHA3 is implemented. 
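/* Illustrative note, not part of the patch itself: veor3q_* is a three-way
 * exclusive OR per lane (a ^ b ^ c). Without FEAT_SHA3 the fallbacks below
 * compute it with a plain element-wise loop, so the result always matches
 * chaining the two-operand form, e.g.:
 *
 *   simde_uint32x4_t r1 = simde_veor3q_u32(a, b, c);
 *   simde_uint32x4_t r2 = simde_veorq_u32(simde_veorq_u32(a, b), c);
 *   // r1 and r2 contain identical lanes
 */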
+SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_veor3q_s8(simde_int8x16_t a, simde_int8x16_t b, simde_int8x16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_s8(a, b, c); + #else + simde_int8x16_private + r_, + a_ = simde_int8x16_to_private(a), + b_ = simde_int8x16_to_private(b), + c_ = simde_int8x16_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + + return simde_int8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef veor3q_s8 + #define veor3q_s8(a, b, c) simde_veor3q_s8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_veor3q_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_s16(a, b, c); + #else + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b), + c_ = simde_int16x8_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef veor3q_s16 + #define veor3q_s16(a, b, c) simde_veor3q_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_veor3q_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_s32(a, b, c); + #else + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b), + c_ = simde_int32x4_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef veor3q_s32 + #define veor3q_s32(a, b, c) simde_veor3q_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_veor3q_s64(simde_int64x2_t a, simde_int64x2_t b, simde_int64x2_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_s64(a, b, c); + #else + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a), + b_ = simde_int64x2_to_private(b), + c_ = simde_int64x2_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef veor3q_s64 + #define veor3q_s64(a, b, c) simde_veor3q_s64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_veor3q_u8(simde_uint8x16_t a, simde_uint8x16_t b, simde_uint8x16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_u8(a, b, c); + #else + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a), + b_ = simde_uint8x16_to_private(b), + c_ = simde_uint8x16_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + + return simde_uint8x16_from_private(r_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef veor3q_u8 + #define veor3q_u8(a, b, c) simde_veor3q_u8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_veor3q_u16(simde_uint16x8_t a, simde_uint16x8_t b, simde_uint16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_u16(a, b, c); + #else + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b), + c_ = simde_uint16x8_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef veor3q_u16 + #define veor3q_u16(a, b, c) simde_veor3q_u16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_veor3q_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_u32(a, b, c); + #else + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef veor3q_u32 + #define veor3q_u32(a, b, c) simde_veor3q_u32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_veor3q_u64(simde_uint64x2_t a, simde_uint64x2_t b, simde_uint64x2_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return veor3q_u64(a, b, c); + #else + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a), + b_ = simde_uint64x2_to_private(b), + c_ = simde_uint64x2_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] ^ b_.values[i] ^ c_.values[i]; + } + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef veor3q_u64 + #define veor3q_u64(a, b, c) simde_veor3q_u64((a), (b), (c)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/ext.h b/lib/simd_wrapper/simde/arm/neon/ext.h index 0768e9d1a77..45c5aa0f009 100644 --- a/lib/simd_wrapper/simde/arm/neon/ext.h +++ b/lib/simd_wrapper/simde/arm/neon/ext.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_EXT_H) @@ -33,6 +34,32 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vext_f16(simde_float16x4_t a, simde_float16x4_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + simde_float16x4_t r; + SIMDE_CONSTIFY_4_(vext_f16, r, (HEDLEY_UNREACHABLE(), a), n, a, b); + return r; + #else + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b), + r_ = a_; + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + 
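      /* Illustrative note, not part of the patch itself: for lane i the source
       * index src = i + n takes the tail of a while src is still in range and
       * otherwise wraps into the head of b (src & 3 for this 4-lane type),
       * reproducing the EXT/VEXT "concatenate and extract" behaviour. */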
r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 3]; + } + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vext_f16 + #define vext_f16(a, b, n) simde_vext_f16((a), (b), (n)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vext_f32(simde_float32x2_t a, simde_float32x2_t b, const int n) @@ -54,7 +81,7 @@ simde_vext_f32(simde_float32x2_t a, simde_float32x2_t b, const int n) return simde_float32x2_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_f32(a, b, n) simde_float32x2_from_m64(_mm_alignr_pi8(simde_float32x2_to_m64(b), simde_float32x2_to_m64(a), n * sizeof(simde_float32))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760) #define simde_vext_f32(a, b, n) (__extension__ ({ \ @@ -89,7 +116,7 @@ simde_vext_f64(simde_float64x1_t a, simde_float64x1_t b, const int n) return simde_float64x1_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_f64(a, b, n) simde_float64x1_from_m64(_mm_alignr_pi8(simde_float64x1_to_m64(b), simde_float64x1_to_m64(a), n * sizeof(simde_float64))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vext_f64(a, b, n) (__extension__ ({ \ @@ -125,7 +152,7 @@ simde_vext_s8(simde_int8x8_t a, simde_int8x8_t b, const int n) return simde_int8x8_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_s8(a, b, n) simde_int8x8_from_m64(_mm_alignr_pi8(simde_int8x8_to_m64(b), simde_int8x8_to_m64(a), n * sizeof(int8_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760) #define simde_vext_s8(a, b, n) (__extension__ ({ \ @@ -164,7 +191,7 @@ simde_vext_s16(simde_int16x4_t a, simde_int16x4_t b, const int n) return simde_int16x4_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_s16(a, b, n) simde_int16x4_from_m64(_mm_alignr_pi8(simde_int16x4_to_m64(b), simde_int16x4_to_m64(a), n * sizeof(int16_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760) #define simde_vext_s16(a, b, n) (__extension__ ({ \ @@ -201,7 +228,7 @@ simde_vext_s32(simde_int32x2_t a, simde_int32x2_t b, const int n) return simde_int32x2_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_s32(a, b, n) simde_int32x2_from_m64(_mm_alignr_pi8(simde_int32x2_to_m64(b), 
simde_int32x2_to_m64(a), n * sizeof(int32_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760) #define simde_vext_s32(a, b, n) (__extension__ ({ \ @@ -236,7 +263,7 @@ simde_vext_s64(simde_int64x1_t a, simde_int64x1_t b, const int n) return simde_int64x1_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_s64(a, b, n) simde_int64x1_from_m64(_mm_alignr_pi8(simde_int64x1_to_m64(b), simde_int64x1_to_m64(a), n * sizeof(int64_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vext_s64(a, b, n) (__extension__ ({ \ @@ -272,7 +299,7 @@ simde_vext_u8(simde_uint8x8_t a, simde_uint8x8_t b, const int n) return simde_uint8x8_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_u8(a, b, n) simde_uint8x8_from_m64(_mm_alignr_pi8(simde_uint8x8_to_m64(b), simde_uint8x8_to_m64(a), n * sizeof(uint8_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760) #define simde_vext_u8(a, b, n) (__extension__ ({ \ @@ -311,7 +338,7 @@ simde_vext_u16(simde_uint16x4_t a, simde_uint16x4_t b, const int n) return simde_uint16x4_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_u16(a, b, n) simde_uint16x4_from_m64(_mm_alignr_pi8(simde_uint16x4_to_m64(b), simde_uint16x4_to_m64(a), n * sizeof(uint16_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760) #define simde_vext_u16(a, b, n) (__extension__ ({ \ @@ -348,7 +375,7 @@ simde_vext_u32(simde_uint32x2_t a, simde_uint32x2_t b, const int n) return simde_uint32x2_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_u32(a, b, n) simde_uint32x2_from_m64(_mm_alignr_pi8(simde_uint32x2_to_m64(b), simde_uint32x2_to_m64(a), n * sizeof(uint32_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) && !defined(SIMDE_BUG_GCC_100760) #define simde_vext_u32(a, b, n) (__extension__ ({ \ @@ -383,7 +410,7 @@ simde_vext_u64(simde_uint64x1_t a, simde_uint64x1_t b, const int n) return simde_uint64x1_from_private(r_); #endif } -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) +#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vext_u64(a, b, n) simde_uint64x1_from_m64(_mm_alignr_pi8(simde_uint64x1_to_m64(b), simde_uint64x1_to_m64(a), n * sizeof(uint64_t))) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define 
simde_vext_u64(a, b, n) (__extension__ ({ \ @@ -398,6 +425,32 @@ simde_vext_u64(simde_uint64x1_t a, simde_uint64x1_t b, const int n) #define vext_u64(a, b, n) simde_vext_u64((a), (b), (n)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vextq_f16(simde_float16x8_t a, simde_float16x8_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + simde_float16x8_t r; + SIMDE_CONSTIFY_8_(vextq_f16, r, (HEDLEY_UNREACHABLE(), a), n, a, b); + return r; + #else + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b), + r_ = a_; + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; + } + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vextq_f16 + #define vextq_f16(a, b, n) simde_vextq_f16((a), (b), (n)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vextq_f32(simde_float32x4_t a, simde_float32x4_t b, const int n) @@ -420,7 +473,15 @@ simde_vextq_f32(simde_float32x4_t a, simde_float32x4_t b, const int n) #endif } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) - #define simde_vextq_f32(a, b, n) simde_float32x4_from_m128(_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(simde_float32x4_to_m128(b)), _mm_castps_si128(simde_float32x4_to_m128(a)), n * sizeof(simde_float32)))) + #define simde_vextq_f32(a, b, n) simde_float32x4_from_m128(_mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(simde_float32x4_to_m128(b)), _mm_castps_si128(simde_float32x4_to_m128(a)), (n) * sizeof(simde_float32)))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_vextq_f32(a, b, n) (__extension__ ({ \ + simde_float32x4_private simde_vextq_f32_r_; \ + simde_vextq_f32_r_.v128 = wasm_i32x4_shuffle(simde_float32x4_to_private(a).v128, simde_float32x4_to_private(b).v128, \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3))); \ + simde_float32x4_from_private(simde_vextq_f32_r_); \ + })) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vextq_f32(a, b, n) (__extension__ ({ \ simde_float32x4_private simde_vextq_f32_r_; \ @@ -457,7 +518,14 @@ simde_vextq_f64(simde_float64x2_t a, simde_float64x2_t b, const int n) #endif } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) - #define simde_vextq_f64(a, b, n) simde_float64x2_from_m128d(_mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(simde_float64x2_to_m128d(b)), _mm_castpd_si128(simde_float64x2_to_m128d(a)), n * sizeof(simde_float64)))) + #define simde_vextq_f64(a, b, n) simde_float64x2_from_m128d(_mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(simde_float64x2_to_m128d(b)), _mm_castpd_si128(simde_float64x2_to_m128d(a)), (n) * sizeof(simde_float64)))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_vextq_f64(a, b, n) (__extension__ ({ \ + simde_float64x2_private simde_vextq_f64_r_; \ + simde_vextq_f64_r_.v128 = wasm_i64x2_shuffle(simde_float64x2_to_private(a).v128, simde_float64x2_to_private(b).v128, \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1))); \ + 
simde_float64x2_from_private(simde_vextq_f64_r_); \ + })) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vextq_f64(a, b, n) (__extension__ ({ \ simde_float64x2_private simde_vextq_f64_r_; \ @@ -494,6 +562,20 @@ simde_vextq_s8(simde_int8x16_t a, simde_int8x16_t b, const int n) } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_s8(a, b, n) simde_int8x16_from_m128i(_mm_alignr_epi8(simde_int8x16_to_m128i(b), simde_int8x16_to_m128i(a), n * sizeof(int8_t))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_vextq_s8(a, b, n) (__extension__ ({ \ + simde_int8x16_private simde_vextq_s8_r_; \ + simde_vextq_s8_r_.v128 = wasm_i8x16_shuffle(simde_int8x16_to_private(a).v128, simde_int8x16_to_private(b).v128, \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 4)), HEDLEY_STATIC_CAST(int8_t, ((n) + 5)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 6)), HEDLEY_STATIC_CAST(int8_t, ((n) + 7)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 8)), HEDLEY_STATIC_CAST(int8_t, ((n) + 9)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 10)), HEDLEY_STATIC_CAST(int8_t, ((n) + 11)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 12)), HEDLEY_STATIC_CAST(int8_t, ((n) + 13)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 14)), HEDLEY_STATIC_CAST(int8_t, ((n) + 15))); \ + simde_int8x16_from_private(simde_vextq_s8_r_); \ + })) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vextq_s8(a, b, n) (__extension__ ({ \ simde_int8x16_private simde_vextq_s8_r_; \ @@ -537,6 +619,16 @@ simde_vextq_s16(simde_int16x8_t a, simde_int16x8_t b, const int n) } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_s16(a, b, n) simde_int16x8_from_m128i(_mm_alignr_epi8(simde_int16x8_to_m128i(b), simde_int16x8_to_m128i(a), n * sizeof(int16_t))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_vextq_s16(a, b, n) (__extension__ ({ \ + simde_int16x8_private simde_vextq_s16_r_; \ + simde_vextq_s16_r_.v128 = wasm_i16x8_shuffle(simde_int16x8_to_private(a).v128, simde_int16x8_to_private(b).v128, \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 4)), HEDLEY_STATIC_CAST(int8_t, ((n) + 5)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 6)), HEDLEY_STATIC_CAST(int8_t, ((n) + 7))); \ + simde_int16x8_from_private(simde_vextq_s16_r_); \ + })) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vextq_s16(a, b, n) (__extension__ ({ \ simde_int16x8_private simde_vextq_s16_r_; \ @@ -576,6 +668,14 @@ simde_vextq_s32(simde_int32x4_t a, simde_int32x4_t b, const int n) } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_s32(a, b, n) simde_int32x4_from_m128i(_mm_alignr_epi8(simde_int32x4_to_m128i(b), simde_int32x4_to_m128i(a), n * sizeof(int32_t))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_vextq_s32(a, b, n) (__extension__ ({ \ + simde_int32x4_private simde_vextq_s32_r_; \ + simde_vextq_s32_r_.v128 = wasm_i32x4_shuffle(simde_int32x4_to_private(a).v128, simde_int32x4_to_private(b).v128, \ + 
HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1)), \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 2)), HEDLEY_STATIC_CAST(int8_t, ((n) + 3))); \ + simde_int32x4_from_private(simde_vextq_s32_r_); \ + })) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vextq_s32(a, b, n) (__extension__ ({ \ simde_int32x4_private simde_vextq_s32_r_; \ @@ -613,6 +713,13 @@ simde_vextq_s64(simde_int64x2_t a, simde_int64x2_t b, const int n) } #if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_BUG_GCC_SIZEOF_IMMEDIATE) #define simde_vextq_s64(a, b, n) simde_int64x2_from_m128i(_mm_alignr_epi8(simde_int64x2_to_m128i(b), simde_int64x2_to_m128i(a), n * sizeof(int64_t))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_vextq_s64(a, b, n) (__extension__ ({ \ + simde_int64x2_private simde_vextq_s64_r_; \ + simde_vextq_s64_r_.v128 = wasm_i64x2_shuffle(simde_int64x2_to_private(a).v128, simde_int64x2_to_private(b).v128, \ + HEDLEY_STATIC_CAST(int8_t, ((n) + 0)), HEDLEY_STATIC_CAST(int8_t, ((n) + 1))); \ + simde_int64x2_from_private(simde_vextq_s64_r_); \ + })) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_VEXT_REV32) #define simde_vextq_s64(a, b, n) (__extension__ ({ \ simde_int64x2_private simde_vextq_s64_r_; \ @@ -790,6 +897,161 @@ simde_vextq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int n) #define vextq_u64(a, b, n) simde_vextq_u64((a), (b), (n)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vext_p8(simde_poly8x8_t a, simde_poly8x8_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly8x8_t r; + SIMDE_CONSTIFY_8_(vext_p8, r, (HEDLEY_UNREACHABLE(), a), n, a, b); + return r; + #else + simde_poly8x8_private + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b), + r_ = a_; + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; + } + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vext_p8 + #define vext_p8(a, b, n) simde_vext_p8((a), (b), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vext_p16(simde_poly16x4_t a, simde_poly16x4_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly16x4_t r; + SIMDE_CONSTIFY_4_(vext_p16, r, (HEDLEY_UNREACHABLE(), a), n, a, b); + return r; + #else + simde_poly16x4_private + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b), + r_ = a_; + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 3]; + } + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vext_p16 + #define vext_p16(a, b, n) simde_vext_p16((a), (b), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vext_p64(simde_poly64x1_t a, simde_poly64x1_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 0) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + (void) n; + return vext_p64(a, b, 0); + #else + simde_poly64x1_private + a_ = simde_poly64x1_to_private(a), + b_ = simde_poly64x1_to_private(b), + r_ = a_; + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 0]; + } + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vext_p64 + #define vext_p64(a, b, n) simde_vext_p64((a), (b), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vextq_p8(simde_poly8x16_t a, simde_poly8x16_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly8x16_t r; + SIMDE_CONSTIFY_16_(vextq_p8, r, (HEDLEY_UNREACHABLE(), a), n, a, b); + return r; + #else + simde_poly8x16_private + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b), + r_ = a_; + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 15]; + } + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vextq_p8 + #define vextq_p8(a, b, n) simde_vextq_p8((a), (b), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vextq_p16(simde_poly16x8_t a, simde_poly16x8_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly16x8_t r; + SIMDE_CONSTIFY_8_(vextq_p16, r, (HEDLEY_UNREACHABLE(), a), n, a, b); + return r; + #else + simde_poly16x8_private + a_ = simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b), + r_ = a_; + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? a_.values[src] : b_.values[src & 7]; + } + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vextq_p16 + #define vextq_p16(a, b, n) simde_vextq_p16((a), (b), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vextq_p64(simde_poly64x2_t a, simde_poly64x2_t b, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 1) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + simde_poly64x2_t r; + SIMDE_CONSTIFY_2_(vextq_p64, r, (HEDLEY_UNREACHABLE(), a), n, a, b); + return r; + #else + simde_poly64x2_private + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b), + r_ = a_; + const size_t n_ = HEDLEY_STATIC_CAST(size_t, n); + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + size_t src = i + n_; + r_.values[i] = (src < (sizeof(r_.values) / sizeof(r_.values[0]))) ? 
a_.values[src] : b_.values[src & 1]; + } + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vextq_p64 + #define vextq_p64(a, b, n) simde_vextq_p64((a), (b), (n)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/fma.h b/lib/simd_wrapper/simde/arm/neon/fma.h index 4ee30d1d677..aaf9e04e056 100644 --- a/lib/simd_wrapper/simde/arm/neon/fma.h +++ b/lib/simd_wrapper/simde/arm/neon/fma.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_FMA_H) @@ -34,10 +35,24 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vfmah_f16(simde_float16_t a, simde_float16_t b, simde_float16_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + return vfmah_f16(a, b, c); + #else + return simde_vaddh_f16(a, simde_vmulh_f16(b, c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmah_f16 + #define vfmah_f16(a, b, c) simde_vfmah_f16(a, b, c) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vfma_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfma_f32(a, b, c); #else return simde_vadd_f32(a, simde_vmul_f32(b, c)); @@ -51,7 +66,7 @@ simde_vfma_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x1_t simde_vfma_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfma_f64(a, b, c); #else return simde_vadd_f64(a, simde_vmul_f64(b, c)); @@ -62,10 +77,38 @@ simde_vfma_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) { #define vfma_f64(a, b, c) simde_vfma_f64(a, b, c) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vfma_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + return vfma_f16(a, b, c); + #else + return simde_vadd_f16(a, simde_vmul_f16(b, c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfma_f16 + #define vfma_f16(a, b, c) simde_vfma_f16(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vfmaq_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + return vfmaq_f16(a, b, c); + #else + return simde_vaddq_f16(a, simde_vmulq_f16(b, c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmaq_f16 + #define vfmaq_f16(a, b, c) simde_vfmaq_f16(a, b, c) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vfmaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfmaq_f32(a, b, c); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_madd(b, c, a); 
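/*
 * Editor's aside (illustration only, not part of the patch): the portable
 * fallbacks in fma.h compute vfma*(a, b, c) as a + b * c, i.e. an unfused
 * multiply-add, whereas the native vfma* intrinsics fuse the multiply and
 * the add. A minimal standalone sketch of that difference, assuming only
 * <math.h>; fma_fallback_f32 is a hypothetical name, not a SIMDe identifier.
 */
#include <math.h>

static float fma_fallback_f32(float a, float b, float c) {
  return a + b * c;            /* product is rounded before the addition */
}

/* Usage sketch: fmaf(b, c, a) is the fused form of the same expression, so
 * the two results can differ when rounding the intermediate product loses
 * precision. */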
@@ -94,7 +137,7 @@ simde_vfmaq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vfmaq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfmaq_f64(a, b, c); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) return vec_madd(b, c, a); diff --git a/lib/simd_wrapper/simde/arm/neon/fma_lane.h b/lib/simd_wrapper/simde/arm/neon/fma_lane.h index 6100ed78ca0..e937f715cb3 100644 --- a/lib/simd_wrapper/simde/arm/neon/fma_lane.h +++ b/lib/simd_wrapper/simde/arm/neon/fma_lane.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_FMA_LANE_H) @@ -38,7 +39,7 @@ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ /* simde_vfmad_lane_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) #define simde_vfmad_lane_f64(a, b, v, lane) \ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmad_lane_f64(a, b, v, lane)) @@ -61,7 +62,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfmad_laneq_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) #define simde_vfmad_laneq_f64(a, b, v, lane) \ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmad_laneq_f64(a, b, v, lane)) @@ -83,8 +84,54 @@ SIMDE_BEGIN_DECLS_ #define vfmad_laneq_f64(a, b, v, lane) simde_vfmad_laneq_f64(a, b, v, lane) #endif +/* simde_vfmah_lane_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmah_lane_f16(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmah_lane_f16(a, b, v, lane)) + #else + #define simde_vfmah_lane_f16(a, b, v, lane) vfmah_lane_f16((a), (b), (v), (lane)) + #endif +#else + #define simde_vfmah_lane_f16(a, b, v, lane) \ + simde_vget_lane_f16( \ + simde_vadd_f16( \ + simde_vdup_n_f16(a), \ + simde_vdup_n_f16(simde_vmulh_lane_f16(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmah_lane_f16 + #define vfmah_lane_f16(a, b, v, lane) simde_vfmah_lane_f16(a, b, v, lane) +#endif + +/* simde_vfmah_laneq_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmah_laneq_f16(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmah_laneq_f16(a, b, v, lane)) + #else + #define simde_vfmah_laneq_f16(a, b, v, lane) vfmah_laneq_f16((a), (b), (v), (lane)) + #endif +#else + #define simde_vfmah_laneq_f16(a, b, v, lane) \ + simde_vget_lane_f16( \ + simde_vadd_f16( \ + simde_vdup_n_f16(a), \ + simde_vdup_n_f16(simde_vmulh_laneq_f16(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmah_laneq_f16 + #define vfmah_laneq_f16(a, b, v, lane) simde_vfmah_laneq_f16(a, b, v, lane) +#endif + /* simde_vfmas_lane_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) #define simde_vfmas_lane_f32(a, b, v, lane) \ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmas_lane_f32(a, b, v, lane)) @@ -107,7 +154,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfmas_laneq_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) #define simde_vfmas_laneq_f32(a, b, v, lane) \ SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmas_laneq_f32(a, b, v, lane)) @@ -129,8 +176,19 @@ SIMDE_BEGIN_DECLS_ #define vfmas_laneq_f32(a, b, v, lane) simde_vfmas_laneq_f32(a, b, v, lane) #endif +/* simde_vfma_lane_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfma_lane_f16(a, b, v, lane) vfma_lane_f16(a, b, v, lane) +#else + #define simde_vfma_lane_f16(a, b, v, lane) simde_vadd_f16(a, simde_vmul_lane_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfma_lane_f16 + #define vfma_lane_f16(a, b, v, lane) simde_vfma_lane_f16(a, b, v, lane) +#endif + /* simde_vfma_lane_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfma_lane_f32(a, b, v, lane) vfma_lane_f32(a, b, v, lane) #else #define simde_vfma_lane_f32(a, b, v, lane) simde_vadd_f32(a, simde_vmul_lane_f32(b, v, lane)) @@ -141,7 +199,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfma_lane_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfma_lane_f64(a, b, v, lane) vfma_lane_f64((a), (b), (v), (lane)) #else #define simde_vfma_lane_f64(a, b, v, lane) simde_vadd_f64(a, simde_vmul_lane_f64(b, v, lane)) @@ -151,8 +209,19 @@ SIMDE_BEGIN_DECLS_ #define vfma_lane_f64(a, b, v, lane) simde_vfma_lane_f64(a, b, v, lane) #endif +/* simde_vfma_laneq_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfma_laneq_f16(a, b, v, lane) vfma_laneq_f16((a), (b), (v), (lane)) +#else + #define simde_vfma_laneq_f16(a, b, v, lane) simde_vadd_f16(a, simde_vmul_laneq_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfma_laneq_f16 + #define vfma_laneq_f16(a, b, v, lane) simde_vfma_laneq_f16(a, b, v, lane) +#endif + /* simde_vfma_laneq_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfma_laneq_f32(a, b, v, lane) vfma_laneq_f32((a), (b), (v), (lane)) #else #define simde_vfma_laneq_f32(a, b, v, lane) simde_vadd_f32(a, simde_vmul_laneq_f32(b, v, lane)) @@ -163,7 +232,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfma_laneq_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 
(defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfma_laneq_f64(a, b, v, lane) vfma_laneq_f64((a), (b), (v), (lane)) #else #define simde_vfma_laneq_f64(a, b, v, lane) simde_vadd_f64(a, simde_vmul_laneq_f64(b, v, lane)) @@ -174,7 +243,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfmaq_lane_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfmaq_lane_f64(a, b, v, lane) vfmaq_lane_f64((a), (b), (v), (lane)) #else #define simde_vfmaq_lane_f64(a, b, v, lane) simde_vaddq_f64(a, simde_vmulq_lane_f64(b, v, lane)) @@ -184,8 +253,19 @@ SIMDE_BEGIN_DECLS_ #define vfmaq_lane_f64(a, b, v, lane) simde_vfmaq_lane_f64(a, b, v, lane) #endif +/* simde_vfmaq_lane_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfmaq_lane_f16(a, b, v, lane) vfmaq_lane_f16((a), (b), (v), (lane)) +#else + #define simde_vfmaq_lane_f16(a, b, v, lane) simde_vaddq_f16(a, simde_vmulq_lane_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmaq_lane_f16 + #define vfmaq_lane_f16(a, b, v, lane) simde_vfmaq_lane_f16(a, b, v, lane) +#endif + /* simde_vfmaq_lane_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfmaq_lane_f32(a, b, v, lane) vfmaq_lane_f32((a), (b), (v), (lane)) #else #define simde_vfmaq_lane_f32(a, b, v, lane) simde_vaddq_f32(a, simde_vmulq_lane_f32(b, v, lane)) @@ -195,8 +275,20 @@ SIMDE_BEGIN_DECLS_ #define vfmaq_lane_f32(a, b, v, lane) simde_vfmaq_lane_f32(a, b, v, lane) #endif +/* simde_vfmaq_laneq_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfmaq_laneq_f16(a, b, v, lane) vfmaq_laneq_f16((a), (b), (v), (lane)) +#else + #define simde_vfmaq_laneq_f16(a, b, v, lane) \ + simde_vaddq_f16(a, simde_vmulq_laneq_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmaq_laneq_f16 + #define vfmaq_laneq_f16(a, b, v, lane) simde_vfmaq_laneq_f16(a, b, v, lane) +#endif + /* simde_vfmaq_laneq_f32 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfmaq_laneq_f32(a, b, v, lane) vfmaq_laneq_f32((a), (b), (v), (lane)) #else #define simde_vfmaq_laneq_f32(a, b, v, lane) \ @@ -208,7 +300,7 @@ SIMDE_BEGIN_DECLS_ #endif /* simde_vfmaq_laneq_f64 */ -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) #define simde_vfmaq_laneq_f64(a, b, v, lane) vfmaq_laneq_f64((a), (b), (v), (lane)) #else #define simde_vfmaq_laneq_f64(a, b, v, lane) \ diff --git a/lib/simd_wrapper/simde/arm/neon/fma_n.h b/lib/simd_wrapper/simde/arm/neon/fma_n.h index 6cf58259c06..0a23407c6cb 100644 --- a/lib/simd_wrapper/simde/arm/neon/fma_n.h +++ b/lib/simd_wrapper/simde/arm/neon/fma_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Evan Nemerson +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_FMA_N_H) @@ -35,10 +36,38 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ 
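/*
 * Editor's aside (illustration only, not part of the patch): the *_n_* forms
 * added to fma_n.h below broadcast one scalar into every lane before the
 * multiply-accumulate, so the lane-wise behaviour of the polyfill path is
 * r[i] = a[i] + b[i] * c. A minimal scalar model under that assumption;
 * vfma_n_model is a hypothetical name, not a SIMDe function.
 */
#include <stddef.h>

static void vfma_n_model(float *r, const float *a, const float *b,
                         float c, size_t lanes) {
  for (size_t i = 0; i < lanes; i++) {
    r[i] = a[i] + b[i] * c;    /* scalar c reused for every lane */
  }
}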
+SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vfma_n_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) + return vfma_n_f16(a, b, c); + #else + return simde_vfma_f16(a, b, simde_vdup_n_f16(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfma_n_f16 + #define vfma_n_f16(a, b, c) simde_vfma_n_f16(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vfmaq_n_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) + return vfmaq_n_f16(a, b, c); + #else + return simde_vfmaq_f16(a, b, simde_vdupq_n_f16(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmaq_n_f16 + #define vfmaq_n_f16(a, b, c) simde_vfmaq_n_f16(a, b, c) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vfma_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32_t c) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) return vfma_n_f32(a, b, c); #else return simde_vfma_f32(a, b, simde_vdup_n_f32(c)); @@ -52,7 +81,7 @@ simde_vfma_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x1_t simde_vfma_n_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) return vfma_n_f64(a, b, c); #else return simde_vfma_f64(a, b, simde_vdup_n_f64(c)); @@ -66,7 +95,7 @@ simde_vfma_n_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vfmaq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32_t c) { - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) return vfmaq_n_f32(a, b, c); #else return simde_vfmaq_f32(a, b, simde_vdupq_n_f32(c)); @@ -80,7 +109,7 @@ simde_vfmaq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32_t c) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x2_t simde_vfmaq_n_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64_t c) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && (defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || 
SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) return vfmaq_n_f64(a, b, c); #else return simde_vfmaq_f64(a, b, simde_vdupq_n_f64(c)); diff --git a/lib/simd_wrapper/simde/arm/neon/fmlal.h b/lib/simd_wrapper/simde/arm/neon/fmlal.h new file mode 100644 index 00000000000..f71d3019c8f --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/fmlal.h @@ -0,0 +1,527 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_FMLAL_H) +#define SIMDE_ARM_NEON_FMLAL_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlal_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlal_low_f16(r, a, b); + #else + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[i]); + } + return simde_float32x2_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlal_low_f16 + #define vfmlal_low_f16(r, a, b) simde_vfmlal_low_f16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlalq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlalq_low_f16(r, a, b); + #else + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[i]); + } + return simde_float32x4_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlalq_low_f16 + #define vfmlalq_low_f16(r, a, b) simde_vfmlalq_low_f16((r), (a), (b)) 
+#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlal_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlal_high_f16(r, a, b); + #else + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[i+high_offset]); + } + return simde_float32x2_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlal_high_f16 + #define vfmlal_high_f16(r, a, b) simde_vfmlal_high_f16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlalq_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlalq_high_f16(r, a, b); + #else + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[i+high_offset]); + } + return simde_float32x4_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlalq_high_f16 + #define vfmlalq_high_f16(r, a, b) simde_vfmlalq_high_f16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlal_lane_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlal_lane_low_f16(r, a, b, lane) vfmlal_lane_low_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlal_lane_low_f16 + #define vfmlal_lane_low_f16(r, a, b, lane) simde_vfmlal_lane_low_f16((r), (a), (b), (lane)); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlal_laneq_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a); + simde_float16x8_private + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / 
sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlal_laneq_low_f16(r, a, b, lane) vfmlal_laneq_low_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlal_laneq_low_f16 + #define vfmlal_laneq_low_f16(r, a, b, lane) simde_vfmlal_laneq_low_f16((r), (a), (b), (lane)); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlalq_lane_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x4_private + b_ = simde_float16x4_to_private(b); + simde_float16x8_private + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlalq_lane_low_f16(r, a, b, lane) vfmlalq_lane_low_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlalq_lane_low_f16 + #define vfmlalq_lane_low_f16(r, a, b, lane) simde_vfmlalq_lane_low_f16((r), (a), (b), (lane)); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlalq_laneq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlalq_laneq_low_f16(r, a, b, lane) vfmlalq_laneq_low_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlalq_laneq_low_f16 + #define vfmlalq_laneq_low_f16(r, a, b, lane) simde_vfmlalq_laneq_low_f16((r), (a), (b), (lane)); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlal_lane_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && 
defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlal_lane_high_f16(r, a, b, lane) vfmlal_lane_high_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlal_lane_high_f16 + #define vfmlal_lane_high_f16(r, a, b, lane) simde_vfmlal_lane_high_f16((r), (a), (b), (lane)); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlal_laneq_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a); + simde_float16x8_private + b_ = simde_float16x8_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlal_laneq_high_f16(r, a, b, lane) vfmlal_laneq_high_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlal_laneq_high_f16 + #define vfmlal_laneq_high_f16(r, a, b, lane) simde_vfmlal_laneq_high_f16((r), (a), (b), (lane)); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlalq_lane_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x4_private + b_ = simde_float16x4_to_private(b); + simde_float16x8_private + a_ = simde_float16x8_to_private(a); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlalq_lane_high_f16(r, a, b, lane) vfmlalq_lane_high_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlalq_lane_high_f16 + #define vfmlalq_lane_high_f16(r, a, b, lane) simde_vfmlalq_lane_high_f16((r), (a), (b), (lane)); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlalq_laneq_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] + + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) 
+ #define simde_vfmlalq_laneq_high_f16(r, a, b, lane) vfmlalq_laneq_high_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlalq_laneq_high_f16 + #define vfmlalq_laneq_high_f16(r, a, b, lane) simde_vfmlalq_laneq_high_f16((r), (a), (b), (lane)); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfmlalbq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vbfmlalbq_f32(r, a, b); + #else + simde_float32x4_private + ret, + r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private + a_ = simde_bfloat16x8_to_private(a), + b_ = simde_bfloat16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) { + ret.values[i] = r_.values[i] + + simde_bfloat16_to_float32(a_.values[i * 2]) * simde_bfloat16_to_float32(b_.values[i * 2]); + } + return simde_float32x4_from_private(ret); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbfmlalbq_f32 + #define vbfmlalbq_f32(r, a, b) simde_vbfmlalbq_f32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfmlaltq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vbfmlaltq_f32(r, a, b); + #else + simde_float32x4_private + ret, + r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private + a_ = simde_bfloat16x8_to_private(a), + b_ = simde_bfloat16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) { + ret.values[i] = r_.values[i] + + simde_bfloat16_to_float32(a_.values[i * 2 + 1]) * simde_bfloat16_to_float32(b_.values[i * 2 + 1]); + } + return simde_float32x4_from_private(ret); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbfmlaltq_f32 + #define vbfmlaltq_f32(r, a, b) simde_vbfmlaltq_f32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfmlalbq_lane_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_private + ret, + r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_bfloat16x4_private b_ = simde_bfloat16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) { + ret.values[i] = r_.values[i] + + simde_bfloat16_to_float32(a_.values[i * 2]) * simde_bfloat16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vbfmlalbq_lane_f32(r, a, b, lane) vbfmlalbq_lane_f32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbfmlalbq_lane_f32 + #define vbfmlalbq_lane_f32(r, a, b, lane) simde_vbfmlalbq_lane_f32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfmlalbq_laneq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x4_private + ret, + r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private + a_ = simde_bfloat16x8_to_private(a), + b_ = simde_bfloat16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret.values) / 
sizeof(ret.values[0])) ; i++) { + ret.values[i] = r_.values[i] + + simde_bfloat16_to_float32(a_.values[i * 2]) * simde_bfloat16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vbfmlalbq_laneq_f32(r, a, b, lane) vbfmlalbq_laneq_f32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbfmlalbq_laneq_f32 + #define vbfmlalbq_laneq_f32(r, a, b, lane) simde_vbfmlalbq_laneq_f32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfmlaltq_lane_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_private + ret, + r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_bfloat16x4_private b_ = simde_bfloat16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) { + ret.values[i] = r_.values[i] + + simde_bfloat16_to_float32(a_.values[i * 2 + 1]) * simde_bfloat16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vbfmlaltq_lane_f32(r, a, b, lane) vbfmlaltq_lane_f32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbfmlaltq_lane_f32 + #define vbfmlaltq_lane_f32(r, a, b, lane) simde_vbfmlaltq_lane_f32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfmlaltq_laneq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x4_private + ret, + r_ = simde_float32x4_to_private(r); + simde_bfloat16x8_private + a_ = simde_bfloat16x8_to_private(a), + b_ = simde_bfloat16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret.values) / sizeof(ret.values[0])) ; i++) { + ret.values[i] = r_.values[i] + + simde_bfloat16_to_float32(a_.values[i * 2 + 1]) * simde_bfloat16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vbfmlaltq_laneq_f32(r, a, b, lane) vbfmlaltq_laneq_f32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbfmlaltq_laneq_f32 + #define vbfmlaltq_laneq_f32(r, a, b, lane) simde_vbfmlaltq_laneq_f32((r), (a), (b), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_FMLAL_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/fmlsl.h b/lib/simd_wrapper/simde/arm/neon/fmlsl.h new file mode 100644 index 00000000000..8a5be5461c3 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/fmlsl.h @@ -0,0 +1,373 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or 
substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_FMLSL_H) +#define SIMDE_ARM_NEON_FMLSL_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlsl_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlsl_low_f16(r, a, b); + #else + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[i]); + } + return simde_float32x2_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlsl_low_f16 + #define vfmlsl_low_f16(r, a, b) simde_vfmlsl_low_f16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlslq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlslq_low_f16(r, a, b); + #else + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[i]); + } + return simde_float32x4_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlslq_low_f16 + #define vfmlslq_low_f16(r, a, b) simde_vfmlslq_low_f16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlsl_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlsl_high_f16(r, a, b); + #else + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[i+high_offset]); + } + return simde_float32x2_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlsl_high_f16 + #define 
vfmlsl_high_f16(r, a, b) simde_vfmlsl_high_f16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlslq_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + return vfmlslq_high_f16(r, a, b); + #else + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[i+high_offset]); + } + return simde_float32x4_from_private(ret_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlslq_high_f16 + #define vfmlslq_high_f16(r, a, b) simde_vfmlslq_high_f16((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlsl_lane_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlsl_lane_low_f16(r, a, b, lane) vfmlsl_lane_low_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlsl_lane_low_f16 + #define vfmlsl_lane_low_f16(r, a, b, lane) simde_vfmlsl_lane_low_f16((r), (a), (b), (lane)); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlsl_laneq_low_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a); + simde_float16x8_private + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlsl_laneq_low_f16(r, a, b, lane) vfmlsl_laneq_low_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlsl_laneq_low_f16 + #define vfmlsl_laneq_low_f16(r, a, b, lane) simde_vfmlsl_laneq_low_f16((r), (a), (b), (lane)); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlslq_lane_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x4_private + b_ = simde_float16x4_to_private(b); + 
simde_float16x8_private + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlslq_lane_low_f16(r, a, b, lane) vfmlslq_lane_low_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlslq_lane_low_f16 + #define vfmlslq_lane_low_f16(r, a, b, lane) simde_vfmlslq_lane_low_f16((r), (a), (b), (lane)); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlslq_laneq_low_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlslq_laneq_low_f16(r, a, b, lane) vfmlslq_laneq_low_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlslq_laneq_low_f16 + #define vfmlslq_laneq_low_f16(r, a, b, lane) simde_vfmlslq_laneq_low_f16((r), (a), (b), (lane)); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlsl_lane_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlsl_lane_high_f16(r, a, b, lane) vfmlsl_lane_high_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlsl_lane_high_f16 + #define vfmlsl_lane_high_f16(r, a, b, lane) simde_vfmlsl_lane_high_f16((r), (a), (b), (lane)); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfmlsl_laneq_high_f16(simde_float32x2_t r, simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x2_private + ret_, + r_ = simde_float32x2_to_private(r); + simde_float16x4_private + a_ = simde_float16x4_to_private(a); + simde_float16x8_private + b_ = simde_float16x8_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = 
r_.values[i] - + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x2_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlsl_laneq_high_f16(r, a, b, lane) vfmlsl_laneq_high_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlsl_laneq_high_f16 + #define vfmlsl_laneq_high_f16(r, a, b, lane) simde_vfmlsl_laneq_high_f16((r), (a), (b), (lane)); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlslq_lane_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x4_private + b_ = simde_float16x4_to_private(b); + simde_float16x8_private + a_ = simde_float16x8_to_private(a); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlslq_lane_high_f16(r, a, b, lane) vfmlslq_lane_high_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlslq_lane_high_f16 + #define vfmlslq_lane_high_f16(r, a, b, lane) simde_vfmlslq_lane_high_f16((r), (a), (b), (lane)); +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmlslq_laneq_high_f16(simde_float32x4_t r, simde_float16x8_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float32x4_private + ret_, + r_ = simde_float32x4_to_private(r); + simde_float16x8_private + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + size_t high_offset = sizeof(a_.values) / sizeof(a_.values[0]) / 2; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(ret_.values) / sizeof(ret_.values[0])) ; i++) { + ret_.values[i] = r_.values[i] - + simde_float16_to_float32(a_.values[i+high_offset]) * simde_float16_to_float32(b_.values[lane]); + } + return simde_float32x4_from_private(ret_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + defined(SIMDE_ARCH_ARM_FP16_FML) + #define simde_vfmlslq_laneq_high_f16(r, a, b, lane) vfmlslq_laneq_high_f16((r), (a), (b), (lane)); +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmlslq_laneq_high_f16 + #define vfmlslq_laneq_high_f16(r, a, b, lane) simde_vfmlslq_laneq_high_f16((r), (a), (b), (lane)); +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_FMLSL_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/fms.h b/lib/simd_wrapper/simde/arm/neon/fms.h new file mode 100644 index 00000000000..0ad265c3d09 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/fms.h @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: MIT +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, copy, +* modify, merge, publish, distribute, 
sublicense, and/or sell copies +* of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +* Copyright: +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) +*/ + +#if !defined(SIMDE_ARM_NEON_FMS_H) +#define SIMDE_ARM_NEON_FMS_H + +#include "add.h" +#include "mul.h" +#include "neg.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vfmsh_f16(simde_float16_t a, simde_float16_t b, simde_float16_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + return vfmsh_f16(a, b, c); + #else + return simde_vaddh_f16(a, simde_vnegh_f16(simde_vmulh_f16(b, c))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmsh_f16 + #define vfmsh_f16(a, b, c) simde_vfmsh_f16(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfms_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32x2_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + return vfms_f32(a, b, c); + #else + return simde_vadd_f32(a, simde_vneg_f32(simde_vmul_f32(b, c))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vfms_f32 + #define vfms_f32(a, b, c) simde_vfms_f32(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vfms_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64x1_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + return vfms_f64(a, b, c); + #else + return simde_vadd_f64(a, simde_vneg_f64(simde_vmul_f64(b, c))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vfms_f64 + #define vfms_f64(a, b, c) simde_vfms_f64(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vfms_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + return vfms_f16(a, b, c); + #else + return simde_vadd_f16(a, simde_vneg_f16(simde_vmul_f16(b, c))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfms_f16 + #define vfms_f16(a, b, c) simde_vfms_f16(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vfmsq_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + return vfmsq_f16(a, b, c); + #else + return simde_vaddq_f16(a, simde_vnegq_f16(simde_vmulq_f16(b, c))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vfmsq_f16 + #define vfmsq_f16(a, b, c) simde_vfmsq_f16(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmsq_f32(simde_float32x4_t a, simde_float32x4_t b, 
simde_float32x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + return vfmsq_f32(a, b, c); + #else + return simde_vaddq_f32(a, simde_vnegq_f32(simde_vmulq_f32(b, c))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vfmsq_f32 + #define vfmsq_f32(a, b, c) simde_vfmsq_f32(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vfmsq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + return vfmsq_f64(a, b, c); + #else + return simde_vaddq_f64(a, simde_vnegq_f64(simde_vmulq_f64(b, c))); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmsq_f64 + #define vfmsq_f64(a, b, c) simde_vfmsq_f64(a, b, c) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_FMS_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/fms_lane.h b/lib/simd_wrapper/simde/arm/neon/fms_lane.h new file mode 100644 index 00000000000..05ef96ae3d0 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/fms_lane.h @@ -0,0 +1,316 @@ +/* SPDX-License-Identifier: MIT +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, copy, +* modify, merge, publish, distribute, sublicense, and/or sell copies +* of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. 
+* +* Copyright: +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) +*/ + +#if !defined(SIMDE_ARM_NEON_FMS_LANE_H) +#define SIMDE_ARM_NEON_FMS_LANE_H + +#include "sub.h" +#include "dup_n.h" +#include "get_lane.h" +#include "mul.h" +#include "mul_lane.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +/* simde_vfmsd_lane_f64 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmsd_lane_f64(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmsd_lane_f64(a, b, v, lane)) + #else + #define simde_vfmsd_lane_f64(a, b, v, lane) vfmsd_lane_f64((a), (b), (v), (lane)) + #endif +#else + #define simde_vfmsd_lane_f64(a, b, v, lane) \ + simde_vget_lane_f64( \ + simde_vsub_f64( \ + simde_vdup_n_f64(a), \ + simde_vdup_n_f64(simde_vmuld_lane_f64(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmsd_lane_f64 + #define vfmsd_lane_f64(a, b, v, lane) simde_vfmsd_lane_f64(a, b, v, lane) +#endif + +/* simde_vfmsd_laneq_f64 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmsd_laneq_f64(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmsd_laneq_f64(a, b, v, lane)) + #else + #define simde_vfmsd_laneq_f64(a, b, v, lane) vfmsd_laneq_f64((a), (b), (v), (lane)) + #endif +#else + #define simde_vfmsd_laneq_f64(a, b, v, lane) \ + simde_vget_lane_f64( \ + simde_vsub_f64( \ + simde_vdup_n_f64(a), \ + simde_vdup_n_f64(simde_vmuld_laneq_f64(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmsd_laneq_f64 + #define vfmsd_laneq_f64(a, b, v, lane) simde_vfmsd_laneq_f64(a, b, v, lane) +#endif + +/* simde_vfmsh_lane_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmsh_lane_f16(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmsh_lane_f16(a, b, v, lane)) + #else + #define simde_vfmsh_lane_f16(a, b, v, lane) vfmsh_lane_f16((a), (b), (v), (lane)) + #endif +#else + #define simde_vfmsh_lane_f16(a, b, v, lane) \ + simde_vget_lane_f16( \ + simde_vsub_f16( \ + simde_vdup_n_f16(a), \ + simde_vdup_n_f16(simde_vmulh_lane_f16(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmsh_lane_f16 + #define vfmsh_lane_f16(a, b, v, lane) simde_vfmsh_lane_f16(a, b, v, lane) +#endif + +/* simde_vfmsh_laneq_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmsh_laneq_f16(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmsh_laneq_f16(a, b, v, lane)) + #else + #define simde_vfmsh_laneq_f16(a, b, v, lane) vfmsh_laneq_f16((a), (b), (v), (lane)) + #endif +#else + #define simde_vfmsh_laneq_f16(a, b, v, lane) \ + simde_vget_lane_f16( \ + simde_vsub_f16( \ + simde_vdup_n_f16(a), \ + simde_vdup_n_f16(simde_vmulh_laneq_f16(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmsh_laneq_f16 + 
#define vfmsh_laneq_f16(a, b, v, lane) simde_vfmsh_laneq_f16(a, b, v, lane) +#endif + +/* simde_vfmss_lane_f32 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmss_lane_f32(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmss_lane_f32(a, b, v, lane)) + #else + #define simde_vfmss_lane_f32(a, b, v, lane) vfmss_lane_f32((a), (b), (v), (lane)) + #endif +#else + #define simde_vfmss_lane_f32(a, b, v, lane) \ + simde_vget_lane_f32( \ + simde_vsub_f32( \ + simde_vdup_n_f32(a), \ + simde_vdup_n_f32(simde_vmuls_lane_f32(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmss_lane_f32 + #define vfmss_lane_f32(a, b, v, lane) simde_vfmss_lane_f32(a, b, v, lane) +#endif + +/* simde_vfmss_laneq_f32 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vfmss_laneq_f32(a, b, v, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vfmss_laneq_f32(a, b, v, lane)) + #else + #define simde_vfmss_laneq_f32(a, b, v, lane) vfmss_laneq_f32((a), (b), (v), (lane)) + #endif +#else + #define simde_vfmss_laneq_f32(a, b, v, lane) \ + simde_vget_lane_f32( \ + simde_vsub_f32( \ + simde_vdup_n_f32(a), \ + simde_vdup_n_f32(simde_vmuls_laneq_f32(b, v, lane)) \ + ), \ + 0 \ + ) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmss_laneq_f32 + #define vfmss_laneq_f32(a, b, v, lane) simde_vfmss_laneq_f32(a, b, v, lane) +#endif + +/* simde_vfms_lane_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfms_lane_f16(a, b, v, lane) vfms_lane_f16(a, b, v, lane) +#else + #define simde_vfms_lane_f16(a, b, v, lane) simde_vsub_f16(a, simde_vmul_lane_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfms_lane_f16 + #define vfms_lane_f16(a, b, v, lane) simde_vfms_lane_f16(a, b, v, lane) +#endif + +/* simde_vfms_lane_f32 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfms_lane_f32(a, b, v, lane) vfms_lane_f32(a, b, v, lane) +#else + #define simde_vfms_lane_f32(a, b, v, lane) simde_vsub_f32(a, simde_vmul_lane_f32(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfms_lane_f32 + #define vfms_lane_f32(a, b, v, lane) simde_vfms_lane_f32(a, b, v, lane) +#endif + +/* simde_vfms_lane_f64 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfms_lane_f64(a, b, v, lane) vfms_lane_f64((a), (b), (v), (lane)) +#else + #define simde_vfms_lane_f64(a, b, v, lane) simde_vsub_f64(a, simde_vmul_lane_f64(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfms_lane_f64 + #define vfms_lane_f64(a, b, v, lane) simde_vfms_lane_f64(a, b, v, lane) +#endif + +/* simde_vfms_laneq_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfms_laneq_f16(a, b, v, lane) vfms_laneq_f16((a), (b), (v), (lane)) +#else + #define simde_vfms_laneq_f16(a, b, v, lane) simde_vsub_f16(a, simde_vmul_laneq_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfms_laneq_f16 + #define vfms_laneq_f16(a, b, v, lane) 
simde_vfms_laneq_f16(a, b, v, lane) +#endif + +/* simde_vfms_laneq_f32 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfms_laneq_f32(a, b, v, lane) vfms_laneq_f32((a), (b), (v), (lane)) +#else + #define simde_vfms_laneq_f32(a, b, v, lane) simde_vsub_f32(a, simde_vmul_laneq_f32(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfms_laneq_f32 + #define vfms_laneq_f32(a, b, v, lane) simde_vfms_laneq_f32(a, b, v, lane) +#endif + +/* simde_vfms_laneq_f64 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfms_laneq_f64(a, b, v, lane) vfms_laneq_f64((a), (b), (v), (lane)) +#else + #define simde_vfms_laneq_f64(a, b, v, lane) simde_vsub_f64(a, simde_vmul_laneq_f64(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfms_laneq_f64 + #define vfms_laneq_f64(a, b, v, lane) simde_vfms_laneq_f64(a, b, v, lane) +#endif + +/* simde_vfmsq_lane_f64 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfmsq_lane_f64(a, b, v, lane) vfmsq_lane_f64((a), (b), (v), (lane)) +#else + #define simde_vfmsq_lane_f64(a, b, v, lane) simde_vsubq_f64(a, simde_vmulq_lane_f64(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmsq_lane_f64 + #define vfmsq_lane_f64(a, b, v, lane) simde_vfmsq_lane_f64(a, b, v, lane) +#endif + +/* simde_vfmsq_lane_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfmsq_lane_f16(a, b, v, lane) vfmsq_lane_f16((a), (b), (v), (lane)) +#else + #define simde_vfmsq_lane_f16(a, b, v, lane) simde_vsubq_f16(a, simde_vmulq_lane_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmsq_lane_f16 + #define vfmsq_lane_f16(a, b, v, lane) simde_vfmsq_lane_f16(a, b, v, lane) +#endif + +/* simde_vfmsq_lane_f32 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfmsq_lane_f32(a, b, v, lane) vfmsq_lane_f32((a), (b), (v), (lane)) +#else + #define simde_vfmsq_lane_f32(a, b, v, lane) simde_vsubq_f32(a, simde_vmulq_lane_f32(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmsq_lane_f32 + #define vfmsq_lane_f32(a, b, v, lane) simde_vfmsq_lane_f32(a, b, v, lane) +#endif + +/* simde_vfmsq_laneq_f16 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vfmsq_laneq_f16(a, b, v, lane) vfmsq_laneq_f16((a), (b), (v), (lane)) +#else + #define simde_vfmsq_laneq_f16(a, b, v, lane) \ + simde_vsubq_f16(a, simde_vmulq_laneq_f16(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmsq_laneq_f16 + #define vfmsq_laneq_f16(a, b, v, lane) simde_vfmsq_laneq_f16(a, b, v, lane) +#endif + +/* simde_vfmsq_laneq_f32 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfmsq_laneq_f32(a, b, v, lane) vfmsq_laneq_f32((a), (b), (v), (lane)) +#else + #define simde_vfmsq_laneq_f32(a, b, v, lane) \ + simde_vsubq_f32(a, simde_vmulq_laneq_f32(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmsq_laneq_f32 + #define vfmsq_laneq_f32(a, b, v, lane) simde_vfmsq_laneq_f32(a, b, v, lane) +#endif + +/* simde_vfmsq_laneq_f64 */ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + #define simde_vfmsq_laneq_f64(a, b, v, lane) 
vfmsq_laneq_f64((a), (b), (v), (lane)) +#else + #define simde_vfmsq_laneq_f64(a, b, v, lane) \ + simde_vsubq_f64(a, simde_vmulq_laneq_f64(b, v, lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmsq_laneq_f64 + #define vfmsq_laneq_f64(a, b, v, lane) simde_vfmsq_laneq_f64(a, b, v, lane) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_FMS_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/fms_n.h b/lib/simd_wrapper/simde/arm/neon/fms_n.h new file mode 100644 index 00000000000..6011ae41539 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/fms_n.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: MIT +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, copy, +* modify, merge, publish, distribute, sublicense, and/or sell copies +* of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +* +* Copyright: +* 2023 Yi-Yen Chung (Copyright owned by Andes Technology) +*/ + +#if !defined(SIMDE_ARM_NEON_FMS_N_H) +#define SIMDE_ARM_NEON_FMS_N_H + +#include "types.h" +#include "dup_n.h" +#include "fms.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vfms_n_f16(simde_float16x4_t a, simde_float16x4_t b, simde_float16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) + return vfms_n_f16(a, b, c); + #else + return simde_vfms_f16(a, b, simde_vdup_n_f16(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfms_n_f16 + #define vfms_n_f16(a, b, c) simde_vfms_n_f16(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vfmsq_n_f16(simde_float16x8_t a, simde_float16x8_t b, simde_float16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) + return vfmsq_n_f16(a, b, c); + #else + return simde_vfmsq_f16(a, b, simde_vdupq_n_f16(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmsq_n_f16 + #define vfmsq_n_f16(a, b, c) simde_vfmsq_n_f16(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vfms_n_f32(simde_float32x2_t a, simde_float32x2_t b, simde_float32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) + return vfms_n_f32(a, b, c); 
+ #else + return simde_vfms_f32(a, b, simde_vdup_n_f32(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfms_n_f32 + #define vfms_n_f32(a, b, c) simde_vfms_n_f32(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vfms_n_f64(simde_float64x1_t a, simde_float64x1_t b, simde_float64_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vfms_n_f64(a, b, c); + #else + return simde_vfms_f64(a, b, simde_vdup_n_f64(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vfms_n_f64 + #define vfms_n_f64(a, b, c) simde_vfms_n_f64(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vfmsq_n_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_GCC_95399) + return vfmsq_n_f32(a, b, c); + #else + return simde_vfmsq_f32(a, b, simde_vdupq_n_f32(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vfmsq_n_f32 + #define vfmsq_n_f32(a, b, c) simde_vfmsq_n_f32(a, b, c) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vfmsq_n_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vfmsq_n_f64(a, b, c); + #else + return simde_vfmsq_f64(a, b, simde_vdupq_n_f64(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vfmsq_n_f64 + #define vfmsq_n_f64(a, b, c) simde_vfmsq_n_f64(a, b, c) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_FMS_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/get_high.h b/lib/simd_wrapper/simde/arm/neon/get_high.h index 654c63bd609..df37ccccae0 100644 --- a/lib/simd_wrapper/simde/arm/neon/get_high.h +++ b/lib/simd_wrapper/simde/arm/neon/get_high.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_GET_HIGH_H) @@ -34,6 +35,28 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vget_high_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vget_high_f16(a); + #else + simde_float16x4_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))]; + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vget_high_f16 + #define vget_high_f16(a) simde_vget_high_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vget_high_f32(simde_float32x4_t a) { @@ -294,6 +317,94 @@ simde_vget_high_u64(simde_uint64x2_t a) { #define vget_high_u64(a) simde_vget_high_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vget_high_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vget_high_p8(a); + #else + simde_poly8x8_private r_; + simde_poly8x16_private a_ = 
simde_poly8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vget_high_p8 + #define vget_high_p8(a) simde_vget_high_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vget_high_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vget_high_p16(a); + #else + simde_poly16x4_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vget_high_p16 + #define vget_high_p16(a) simde_vget_high_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vget_high_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vget_high_p64(a); + #else + simde_poly64x1_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))]; + } + + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vget_high_p64 + #define vget_high_p64(a) simde_vget_high_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vget_high_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vget_high_bf16(a); + #else + simde_bfloat16x4_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i + (sizeof(r_.values) / sizeof(r_.values[0]))]; + } + + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vget_high_bf16 + #define vget_high_bf16(a) simde_vget_high_bf16((a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/get_lane.h b/lib/simd_wrapper/simde/arm/neon/get_lane.h index 2dbeb55c6e0..06040eb2c32 100644 --- a/lib/simd_wrapper/simde/arm/neon/get_lane.h +++ b/lib/simd_wrapper/simde/arm/neon/get_lane.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_GET_LANE_H) @@ -34,6 +35,27 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vget_lane_f16(simde_float16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16_t r; + + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_4_(vget_lane_f16, r, (HEDLEY_UNREACHABLE(), SIMDE_FLOAT16_VALUE(0.0)), lane, v); + #else + simde_float16x4_private v_ = simde_float16x4_to_private(v); + + r = v_.values[lane]; + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vget_lane_f16 + #define vget_lane_f16(v, lane) simde_vget_lane_f16((v), (lane)) +#endif + 
SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vget_lane_f32(simde_float32x2_t v, const int lane) @@ -247,6 +269,27 @@ simde_vget_lane_u64(simde_uint64x1_t v, const int lane) #define vget_lane_u64(v, lane) simde_vget_lane_u64((v), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vgetq_lane_f16(simde_float16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16_t r; + + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_8_(vgetq_lane_f16, r, (HEDLEY_UNREACHABLE(), SIMDE_FLOAT16_VALUE(0.0)), lane, v); + #else + simde_float16x8_private v_ = simde_float16x8_to_private(v); + + r = v_.values[lane]; + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vgetq_lane_f16 + #define vgetq_lane_f16(v, lane) simde_vgetq_lane_f16((v), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vgetq_lane_f32(simde_float32x4_t v, const int lane) @@ -513,6 +556,161 @@ simde_vgetq_lane_u64(simde_uint64x2_t v, const int lane) #define vgetq_lane_u64(v, lane) simde_vgetq_lane_u64((v), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8_t +simde_vget_lane_p8(simde_poly8x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly8_t r; + simde_poly8x8_private v_ = simde_poly8x8_to_private(v); + r = v_.values[lane]; + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vget_lane_p8(v, lane) vget_lane_p8((v), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vget_lane_p8 + #define vget_lane_p8(v, lane) simde_vget_lane_p8((v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16_t +simde_vget_lane_p16(simde_poly16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_poly16_t r; + simde_poly16x4_private v_ = simde_poly16x4_to_private(v); + + r = v_.values[lane]; + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vget_lane_p16(v, lane) vget_lane_p16((v), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vget_lane_p16 + #define vget_lane_p16(v, lane) simde_vget_lane_p16((v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64_t +simde_vget_lane_p64(simde_poly64x1_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_poly64_t r; + simde_poly64x1_private v_ = simde_poly64x1_to_private(v); + + r = v_.values[lane]; + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vget_lane_p64(v, lane) vget_lane_p64((v), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vget_lane_p64 + #define vget_lane_p64(v, lane) simde_vget_lane_p64((v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8_t +simde_vgetq_lane_p8(simde_poly8x16_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_poly8_t r; + simde_poly8x16_private v_ = simde_poly8x16_to_private(v); + + r = v_.values[lane]; + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vgetq_lane_p8(v, lane) vgetq_lane_p8((v), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vgetq_lane_p8 + #define vgetq_lane_p8(v, lane) simde_vgetq_lane_p8((v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16_t +simde_vgetq_lane_p16(simde_poly16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + 
simde_poly16_t r; + simde_poly16x8_private v_ = simde_poly16x8_to_private(v); + + r = v_.values[lane]; + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vgetq_lane_p16(v, lane) vgetq_lane_p16((v), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vgetq_lane_p16 + #define vgetq_lane_p16(v, lane) simde_vgetq_lane_p16((v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64_t +simde_vgetq_lane_p64(simde_poly64x2_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_poly64_t r; + simde_poly64x2_private v_ = simde_poly64x2_to_private(v); + + r = v_.values[lane]; + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vgetq_lane_p64(v, lane) vgetq_lane_p64((v), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vgetq_lane_p64 + #define vgetq_lane_p64(v, lane) simde_vgetq_lane_p64((v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16_t +simde_vget_lane_bf16(simde_bfloat16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_bfloat16_t r; + + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_(vget_lane_bf16, r, (HEDLEY_UNREACHABLE(), SIMDE_BFLOAT16_VALUE(0.0)), lane, v); + #else + simde_bfloat16x4_private v_ = simde_bfloat16x4_to_private(v); + + r = v_.values[lane]; + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vget_lane_bf16 + #define vget_lane_bf16(v, lane) simde_vget_lane_bf16((v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16_t +simde_vgetq_lane_bf16(simde_bfloat16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_bfloat16_t r; + + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_8_(vgetq_lane_bf16, r, (HEDLEY_UNREACHABLE(), SIMDE_BFLOAT16_VALUE(0.0)), lane, v); + #else + simde_bfloat16x8_private v_ = simde_bfloat16x8_to_private(v); + + r = v_.values[lane]; + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vgetq_lane_bf16 + #define vgetq_lane_bf16(v, lane) simde_vgetq_lane_bf16((v), (lane)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/get_low.h b/lib/simd_wrapper/simde/arm/neon/get_low.h index 84e17783c3a..4594a3064ef 100644 --- a/lib/simd_wrapper/simde/arm/neon/get_low.h +++ b/lib/simd_wrapper/simde/arm/neon/get_low.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_GET_LOW_H) @@ -34,6 +35,28 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vget_low_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vget_low_f16(a); + #else + simde_float16x4_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i]; + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vget_low_f16 + #define vget_low_f16(a) simde_vget_low_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vget_low_f32(simde_float32x4_t a) 
{ @@ -326,6 +349,94 @@ simde_vget_low_u64(simde_uint64x2_t a) { #define vget_low_u64(a) simde_vget_low_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vget_low_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vget_low_p8(a); + #else + simde_poly8x8_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vget_low_p8 + #define vget_low_p8(a) simde_vget_low_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vget_low_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vget_low_p16(a); + #else + simde_poly16x4_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vget_low_p16 + #define vget_low_p16(a) simde_vget_low_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vget_low_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vget_low_p64(a); + #else + simde_poly64x1_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i]; + } + + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vget_low_p64 + #define vget_low_p64(a) simde_vget_low_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vget_low_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vget_low_bf16(a); + #else + simde_bfloat16x4_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i]; + } + + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vget_low_bf16 + #define vget_low_bf16(a) simde_vget_low_bf16((a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/hadd.h b/lib/simd_wrapper/simde/arm/neon/hadd.h index 53e26d71698..7e72ba3f794 100644 --- a/lib/simd_wrapper/simde/arm/neon/hadd.h +++ b/lib/simd_wrapper/simde/arm/neon/hadd.h @@ -46,6 +46,14 @@ simde_int8x8_t simde_vhadd_s8(simde_int8x8_t a, simde_int8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhadd_s8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x8_private + r_, + a_ = simde_int8x8_to_private(a), + b_ = simde_int8x8_to_private(b); + + r_.sv64 = __riscv_vaadd_vv_i8m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 8); + return simde_int8x8_from_private(r_); #else return simde_vmovn_s16(simde_vshrq_n_s16(simde_vaddl_s8(a, b), 1)); #endif @@ -60,6 +68,14 @@ simde_int16x4_t simde_vhadd_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhadd_s16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b); + + r_.sv64 = 
__riscv_vaadd_vv_i16m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 4); + return simde_int16x4_from_private(r_); #else return simde_vmovn_s32(simde_vshrq_n_s32(simde_vaddl_s16(a, b), 1)); #endif @@ -74,6 +90,14 @@ simde_int32x2_t simde_vhadd_s32(simde_int32x2_t a, simde_int32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhadd_s32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b); + + r_.sv64 = __riscv_vaadd_vv_i32m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 2); + return simde_int32x2_from_private(r_); #else return simde_vmovn_s64(simde_vshrq_n_s64(simde_vaddl_s32(a, b), 1)); #endif @@ -88,6 +112,14 @@ simde_uint8x8_t simde_vhadd_u8(simde_uint8x8_t a, simde_uint8x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhadd_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x8_private + r_, + a_ = simde_uint8x8_to_private(a), + b_ = simde_uint8x8_to_private(b); + + r_.sv64 = __riscv_vaaddu_vv_u8m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 8); + return simde_uint8x8_from_private(r_); #else return simde_vmovn_u16(simde_vshrq_n_u16(simde_vaddl_u8(a, b), 1)); #endif @@ -102,6 +134,14 @@ simde_uint16x4_t simde_vhadd_u16(simde_uint16x4_t a, simde_uint16x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhadd_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x4_private + r_, + a_ = simde_uint16x4_to_private(a), + b_ = simde_uint16x4_to_private(b); + + r_.sv64 = __riscv_vaaddu_vv_u16m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 4); + return simde_uint16x4_from_private(r_); #else return simde_vmovn_u32(simde_vshrq_n_u32(simde_vaddl_u16(a, b), 1)); #endif @@ -116,6 +156,14 @@ simde_uint32x2_t simde_vhadd_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vhadd_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private + r_, + a_ = simde_uint32x2_to_private(a), + b_ = simde_uint32x2_to_private(b); + + r_.sv64 = __riscv_vaaddu_vv_u32m1(a_.sv64, b_.sv64, __RISCV_VXRM_RDN, 2); + return simde_uint32x2_from_private(r_); #else return simde_vmovn_u64(simde_vshrq_n_u64(simde_vaddl_u32(a, b), 1)); #endif @@ -138,6 +186,8 @@ simde_vhaddq_s8(simde_int8x16_t a, simde_int8x16_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) r_.m128i = _mm256_cvtepi16_epi8(_mm256_srai_epi16(_mm256_add_epi16(_mm256_cvtepi8_epi16(a_.m128i), _mm256_cvtepi8_epi16(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vaadd_vv_i8m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -166,6 +216,8 @@ simde_vhaddq_s16(simde_int16x8_t a, simde_int16x8_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi32_epi16(_mm256_srai_epi32(_mm256_add_epi32(_mm256_cvtepi16_epi32(a_.m128i), _mm256_cvtepi16_epi32(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vaadd_vv_i16m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -194,6 +246,8 @@ simde_vhaddq_s32(simde_int32x4_t a, simde_int32x4_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi64_epi32(_mm256_srai_epi64(_mm256_add_epi64(_mm256_cvtepi32_epi64(a_.m128i), _mm256_cvtepi32_epi64(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vaadd_vv_i32m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 4); #else 
SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -233,6 +287,8 @@ simde_vhaddq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { 1); r_.v128 = wasm_i8x16_shuffle(lo, hi, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vaaddu_vv_u8m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -261,6 +317,8 @@ simde_vhaddq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi32_epi16(_mm256_srli_epi32(_mm256_add_epi32(_mm256_cvtepu16_epi32(a_.m128i), _mm256_cvtepu16_epi32(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vaaddu_vv_u16m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -289,6 +347,8 @@ simde_vhaddq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #if defined(SIMDE_X86_AVX512VL_NATIVE) r_.m128i = _mm256_cvtepi64_epi32(_mm256_srli_epi64(_mm256_add_epi64(_mm256_cvtepu32_epi64(a_.m128i), _mm256_cvtepu32_epi64(b_.m128i)), 1)); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vaaddu_vv_u32m1(a_.sv128, b_.sv128, __RISCV_VXRM_RDN, 4); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/lib/simd_wrapper/simde/arm/neon/ld1.h b/lib/simd_wrapper/simde/arm/neon/ld1.h index 2fa8d1f5608..5dd2d17c6e5 100644 --- a/lib/simd_wrapper/simde/arm/neon/ld1.h +++ b/lib/simd_wrapper/simde/arm/neon/ld1.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD1_H) @@ -36,12 +38,16 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_float16x4_t -simde_vld1_f16(simde_float16 const ptr[HEDLEY_ARRAY_PARAM(4)]) { +simde_vld1_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vld1_f16(ptr); #else simde_float16x4_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_float16x4_from_private(r_); #endif } @@ -57,7 +63,11 @@ simde_vld1_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(2)]) { return vld1_f32(ptr); #else simde_float32x2_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle32_v_f32m1(ptr , 2); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_float32x2_from_private(r_); #endif } @@ -73,7 +83,11 @@ simde_vld1_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(1)]) { return vld1_f64(ptr); #else simde_float64x1_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle64_v_f64m1(ptr , 1); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_float64x1_from_private(r_); #endif } @@ -89,7 +103,11 @@ simde_vld1_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1_s8(ptr); #else simde_int8x8_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle8_v_i8m1(ptr , 8); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return 
simde_int8x8_from_private(r_); #endif } @@ -105,7 +123,11 @@ simde_vld1_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1_s16(ptr); #else simde_int16x4_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle16_v_i16m1(ptr , 4); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_int16x4_from_private(r_); #endif } @@ -121,7 +143,11 @@ simde_vld1_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { return vld1_s32(ptr); #else simde_int32x2_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle32_v_i32m1(ptr , 2); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_int32x2_from_private(r_); #endif } @@ -137,7 +163,11 @@ simde_vld1_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(1)]) { return vld1_s64(ptr); #else simde_int64x1_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle64_v_i64m1(ptr , 1); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_int64x1_from_private(r_); #endif } @@ -153,7 +183,11 @@ simde_vld1_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld1_u8(ptr); #else simde_uint8x8_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle8_v_u8m1(ptr , 8); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_uint8x8_from_private(r_); #endif } @@ -169,7 +203,11 @@ simde_vld1_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld1_u16(ptr); #else simde_uint16x4_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle16_v_u16m1(ptr , 4); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_uint16x4_from_private(r_); #endif } @@ -185,7 +223,11 @@ simde_vld1_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { return vld1_u32(ptr); #else simde_uint32x2_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle32_v_u32m1(ptr , 2); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_uint32x2_from_private(r_); #endif } @@ -201,7 +243,11 @@ simde_vld1_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(1)]) { return vld1_u64(ptr); #else simde_uint64x1_private r_; - simde_memcpy(&r_, ptr, sizeof(r_)); + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle64_v_u64m1(ptr , 1); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif return simde_uint64x1_from_private(r_); #endif } @@ -212,13 +258,15 @@ simde_vld1_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(1)]) { SIMDE_FUNCTION_ATTRIBUTES simde_float16x8_t -simde_vld1q_f16(simde_float16 const ptr[HEDLEY_ARRAY_PARAM(8)]) { +simde_vld1q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) return vld1q_f16(ptr); #else simde_float16x8_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + r_.sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -239,6 +287,8 @@ simde_vld1q_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(4)]) { simde_float32x4_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle32_v_f32m1(ptr , 4); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -259,6 +309,8 @@ simde_vld1q_f64(simde_float64 const 
ptr[HEDLEY_ARRAY_PARAM(2)]) { simde_float64x2_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle64_v_f64m1(ptr , 2); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -279,6 +331,8 @@ simde_vld1q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_int8x16_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle8_v_i8m1(ptr , 16); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -299,6 +353,8 @@ simde_vld1q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_int16x8_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle16_v_i16m1(ptr , 8); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -319,6 +375,8 @@ simde_vld1q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { simde_int32x4_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle32_v_i32m1(ptr , 4); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -339,6 +397,8 @@ simde_vld1q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { simde_int64x2_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle64_v_i64m1(ptr , 2); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -359,6 +419,8 @@ simde_vld1q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_uint8x16_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle8_v_u8m1(ptr , 16); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -370,82 +432,6 @@ simde_vld1q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { #define vld1q_u8(a) simde_vld1q_u8((a)) #endif -#if !defined(SIMDE_BUG_INTEL_857088) - -SIMDE_FUNCTION_ATTRIBUTES -simde_uint8x16x2_t -simde_vld1q_u8_x2(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { - #if \ - defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) - return vld1q_u8_x2(ptr); - #else - simde_uint8x16_private a_[2]; - for (size_t i = 0; i < 32; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } - simde_uint8x16x2_t s_ = { { simde_uint8x16_from_private(a_[0]), - simde_uint8x16_from_private(a_[1]) } }; - return s_; - #endif -} -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) - #undef vld1q_u8_x2 - #define vld1q_u8_x2(a) simde_vld1q_u8_x2((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_uint8x16x3_t -simde_vld1q_u8_x3(uint8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) { - #if \ - defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) - return vld1q_u8_x3(ptr); - #else - simde_uint8x16_private a_[3]; - for (size_t i = 0; i < 48; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } - simde_uint8x16x3_t s_ = { { simde_uint8x16_from_private(a_[0]), - simde_uint8x16_from_private(a_[1]), - simde_uint8x16_from_private(a_[2]) } }; - return s_; - #endif -} -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) - #undef vld1q_u8_x3 - #define vld1q_u8_x3(a) simde_vld1q_u8_x3((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES 
-simde_uint8x16x4_t -simde_vld1q_u8_x4(uint8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { - #if \ - defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ - (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) - return vld1q_u8_x4(ptr); - #else - simde_uint8x16_private a_[4]; - for (size_t i = 0; i < 64; i++) { - a_[i / 16].values[i % 16] = ptr[i]; - } - simde_uint8x16x4_t s_ = { { simde_uint8x16_from_private(a_[0]), - simde_uint8x16_from_private(a_[1]), - simde_uint8x16_from_private(a_[2]), - simde_uint8x16_from_private(a_[3]) } }; - return s_; - #endif -} -#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) - #undef vld1q_u8_x4 - #define vld1q_u8_x4(a) simde_vld1q_u8_x4((a)) -#endif - -#endif /* !defined(SIMDE_BUG_INTEL_857088) */ - SIMDE_FUNCTION_ATTRIBUTES simde_uint16x8_t simde_vld1q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { @@ -455,6 +441,8 @@ simde_vld1q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_uint16x8_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle16_v_u16m1(ptr , 8); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -475,6 +463,8 @@ simde_vld1q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { simde_uint32x4_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle32_v_u32m1(ptr , 4); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -495,6 +485,8 @@ simde_vld1q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { simde_uint64x2_private r_; #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_v128_load(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle64_v_u64m1(ptr , 2); #else simde_memcpy(&r_, ptr, sizeof(r_)); #endif @@ -506,6 +498,177 @@ simde_vld1q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { #define vld1q_u64(a) simde_vld1q_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vld1_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1_p8(ptr); + #else + simde_poly8x8_private r_; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle8_v_u8m1(ptr , 8); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_p8 + #define vld1_p8(a) simde_vld1_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vld1_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1_p16(ptr); + #else + simde_poly16x4_private r_; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle16_v_u16m1(ptr , 4); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_p16 + #define vld1_p16(a) simde_vld1_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vld1_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(1)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld1_p64(ptr); + #else + simde_poly64x1_private r_; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vle64_v_u64m1(ptr , 1); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef 
vld1_p64 + #define vld1_p64(a) simde_vld1_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vld1q_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld1q_p8(ptr); + #else + simde_poly8x16_private r_; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle8_v_u8m1(ptr , 16); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_p8 + #define vld1q_p8(a) simde_vld1q_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vld1q_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1q_p16(ptr); + #else + simde_poly16x8_private r_; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle16_v_u16m1(ptr , 8); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_p16 + #define vld1q_p16(a) simde_vld1q_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vld1q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld1q_p64(ptr); + #else + simde_poly64x2_private r_; + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vle64_v_u64m1(ptr , 2); + #else + simde_memcpy(&r_, ptr, sizeof(r_)); + #endif + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_p64 + #define vld1q_p64(a) simde_vld1q_p64((a)) +#endif + +#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vldrq_p128(simde_poly128_t const ptr[HEDLEY_ARRAY_PARAM(1)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vldrq_p128(ptr); + #else + simde_poly128_t r_; + simde_memcpy(&r_, ptr, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vldrq_p128 + #define vldrq_p128(a) simde_vldrq_p128((a)) +#endif + +#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */ + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vld1_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1_bf16(ptr); + #else + simde_bfloat16x4_private r_; + simde_memcpy(&r_, ptr, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1_bf16 + #define vld1_bf16(a) simde_vld1_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vld1q_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1q_bf16(ptr); + #else + simde_bfloat16x8_private r_; + simde_memcpy(&r_, ptr, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_bf16 + #define vld1q_bf16(a) simde_vld1q_bf16((a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/ld1_dup.h b/lib/simd_wrapper/simde/arm/neon/ld1_dup.h index 9df7477b7de..cc15cf98230 100644 --- a/lib/simd_wrapper/simde/arm/neon/ld1_dup.h +++ b/lib/simd_wrapper/simde/arm/neon/ld1_dup.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2021 
Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_LD1_DUP_H) @@ -35,6 +36,20 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vld1_dup_f16(simde_float16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld1_dup_f16(ptr); + #else + return simde_vdup_n_f16(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_dup_f16 + #define vld1_dup_f16(a) simde_vld1_dup_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vld1_dup_f32(simde_float32 const * ptr) { @@ -177,6 +192,20 @@ simde_vld1_dup_u64(uint64_t const * ptr) { #define vld1_dup_u64(a) simde_vld1_dup_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vld1q_dup_f16(simde_float16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld1q_dup_f16(ptr); + #else + return simde_vdupq_n_f16(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_dup_f16 + #define vld1q_dup_f16(a) simde_vld1q_dup_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vld1q_dup_f32(simde_float32 const * ptr) { @@ -401,6 +430,118 @@ simde_vld1q_dup_u64(uint64_t const * ptr) { #define vld1q_dup_u64(a) simde_vld1q_dup_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vld1_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1_dup_p8(ptr); + #else + return simde_vdup_n_p8(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_dup_p8 + #define vld1_dup_p8(a) simde_vld1_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vld1_dup_p16(simde_poly16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1_dup_p16(ptr); + #else + return simde_vdup_n_p16(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_dup_p16 + #define vld1_dup_p16(a) simde_vld1_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vld1_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld1_dup_p64(ptr); + #else + return simde_vdup_n_p64(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1_dup_p64 + #define vld1_dup_p64(a) simde_vld1_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vld1q_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1q_dup_p8(ptr); + #else + return simde_vdupq_n_p8(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_dup_p8 + #define vld1q_dup_p8(a) simde_vld1q_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vld1q_dup_p16(simde_poly16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1q_dup_p16(ptr); + #else + return simde_vdupq_n_p16(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_dup_p16 + #define vld1q_dup_p16(a) simde_vld1q_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vld1q_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld1q_dup_p64(ptr); + #else + return simde_vdupq_n_p64(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_dup_p64 + #define vld1q_dup_p64(a) 
simde_vld1q_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vld1_dup_bf16(simde_bfloat16 const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1_dup_bf16(ptr); + #else + return simde_vdup_n_bf16(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1_dup_bf16 + #define vld1_dup_bf16(a) simde_vld1_dup_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vld1q_dup_bf16(simde_bfloat16 const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1q_dup_bf16(ptr); + #else + return simde_vdupq_n_bf16(*ptr); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_dup_bf16 + #define vld1q_dup_bf16(a) simde_vld1q_dup_bf16((a)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/ld1_lane.h b/lib/simd_wrapper/simde/arm/neon/ld1_lane.h index 4e36caf5249..5818ead64df 100644 --- a/lib/simd_wrapper/simde/arm/neon/ld1_lane.h +++ b/lib/simd_wrapper/simde/arm/neon/ld1_lane.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_LD1_LANE_H) @@ -161,6 +162,22 @@ simde_uint64x1_t simde_vld1_lane_u64(uint64_t const *ptr, simde_uint64x1_t src, #define vld1_lane_u64(ptr, src, lane) simde_vld1_lane_u64((ptr), (src), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t simde_vld1_lane_f16(simde_float16_t const *ptr, simde_float16x4_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4_private r = simde_float16x4_to_private(src); + r.values[lane] = *ptr; + return simde_float16x4_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vld1_lane_f16(ptr, src, lane) vld1_lane_f16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_lane_f16 + #define vld1_lane_f16(ptr, src, lane) simde_vld1_lane_f16((ptr), (src), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vld1_lane_f32(simde_float32_t const *ptr, simde_float32x2_t src, const int lane) @@ -321,6 +338,22 @@ simde_uint64x2_t simde_vld1q_lane_u64(uint64_t const *ptr, simde_uint64x2_t src, #define vld1q_lane_u64(ptr, src, lane) simde_vld1q_lane_u64((ptr), (src), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t simde_vld1q_lane_f16(simde_float16_t const *ptr, simde_float16x8_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x8_private r = simde_float16x8_to_private(src); + r.values[lane] = *ptr; + return simde_float16x8_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vld1q_lane_f16(ptr, src, lane) vld1q_lane_f16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_lane_f16 + #define vld1q_lane_f16(ptr, src, lane) simde_vld1q_lane_f16((ptr), (src), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vld1q_lane_f32(simde_float32_t const *ptr, simde_float32x4_t src, const int lane) @@ -353,6 +386,139 @@ simde_float64x2_t simde_vld1q_lane_f64(simde_float64_t const *ptr, simde_float64 #define vld1q_lane_f64(ptr, src, lane) simde_vld1q_lane_f64((ptr), (src), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vld1_lane_p8(simde_poly8_t const *ptr, simde_poly8x8_t src, + const int lane) 
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly8x8_private r = simde_poly8x8_to_private(src); + r.values[lane] = *ptr; + return simde_poly8x8_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld1_lane_p8(ptr, src, lane) vld1_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_lane_p8 + #define vld1_lane_p8(ptr, src, lane) simde_vld1_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vld1_lane_p16(simde_poly16_t const *ptr, simde_poly16x4_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_poly16x4_private r = simde_poly16x4_to_private(src); + r.values[lane] = *ptr; + return simde_poly16x4_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld1_lane_p16(ptr, src, lane) vld1_lane_p16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_lane_p16 + #define vld1_lane_p16(ptr, src, lane) simde_vld1_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vld1_lane_p64(simde_poly64_t const *ptr, simde_poly64x1_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_poly64x1_private r = simde_poly64x1_to_private(src); + r.values[lane] = *ptr; + return simde_poly64x1_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vld1_lane_p64(ptr, src, lane) vld1_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1_lane_p64 + #define vld1_lane_p64(ptr, src, lane) simde_vld1_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vld1q_lane_p8(simde_poly8_t const *ptr, simde_poly8x16_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_poly8x16_private r = simde_poly8x16_to_private(src); + r.values[lane] = *ptr; + return simde_poly8x16_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld1q_lane_p8(ptr, src, lane) vld1q_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_lane_p8 + #define vld1q_lane_p8(ptr, src, lane) simde_vld1q_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vld1q_lane_p16(simde_poly16_t const *ptr, simde_poly16x8_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly16x8_private r = simde_poly16x8_to_private(src); + r.values[lane] = *ptr; + return simde_poly16x8_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld1q_lane_p16(ptr, src, lane) vld1q_lane_p16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_lane_p16 + #define vld1q_lane_p16(ptr, src, lane) simde_vld1q_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vld1q_lane_p64(simde_poly64_t const *ptr, simde_poly64x2_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_poly64x2_private r = simde_poly64x2_to_private(src); + r.values[lane] = *ptr; + return simde_poly64x2_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vld1q_lane_p64(ptr, src, lane) vld1q_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_lane_p64 + #define vld1q_lane_p64(ptr, src, lane) simde_vld1q_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t 
simde_vld1_lane_bf16(simde_bfloat16_t const *ptr, simde_bfloat16x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_bfloat16x4_private r = simde_bfloat16x4_to_private(src); + r.values[lane] = *ptr; + return simde_bfloat16x4_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld1_lane_bf16(ptr, src, lane) vld1_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1_lane_bf16 + #define vld1_lane_bf16(ptr, src, lane) simde_vld1_lane_bf16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t simde_vld1q_lane_bf16(simde_bfloat16_t const *ptr, simde_bfloat16x8_t src, + const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_bfloat16x8_private r = simde_bfloat16x8_to_private(src); + r.values[lane] = *ptr; + return simde_bfloat16x8_from_private(r); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld1q_lane_bf16(ptr, src, lane) vld1q_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_lane_bf16 + #define vld1q_lane_bf16(ptr, src, lane) simde_vld1q_lane_bf16((ptr), (src), (lane)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/ld1_x2.h b/lib/simd_wrapper/simde/arm/neon/ld1_x2.h new file mode 100644 index 00000000000..75ce61d10b3 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/ld1_x2.h @@ -0,0 +1,456 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2020 Evan Nemerson + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_LD1_X2_H) +#define SIMDE_ARM_NEON_LD1_X2_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +#if HEDLEY_GCC_VERSION_CHECK(7,0,0) + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +#endif +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x2_t +simde_vld1_f16_x2(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_f16_x2(ptr); + #else + simde_float16x4_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4); + a_[1].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+4) , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_float16x4x2_t s_ = { { simde_float16x4_from_private(a_[0]), + simde_float16x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_f16_x2 + #define vld1_f16_x2(a) simde_vld1_f16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x2_t +simde_vld1_f32_x2(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_f32_x2(ptr); + #else + simde_float32x2_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_f32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_f32m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_float32x2x2_t s_ = { { simde_float32x2_from_private(a_[0]), + simde_float32x2_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_f32_x2 + #define vld1_f32_x2(a) simde_vld1_f32_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x2_t +simde_vld1_f64_x2(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if \ + defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vld1_f64_x2(ptr); + #else + simde_float64x1_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_f64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_f64m1(ptr+1 , 1); + #else + for (size_t i = 0; i < 2; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_float64x1x2_t s_ = { { simde_float64x1_from_private(a_[0]), + simde_float64x1_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld1_f64_x2 + #define vld1_f64_x2(a) simde_vld1_f64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x2_t +simde_vld1_s8_x2(int8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s8_x2(ptr); + #else + simde_int8x8_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_i8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_i8m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_int8x8x2_t s_ = { { simde_int8x8_from_private(a_[0]), + simde_int8x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_s8_x2 + #define vld1_s8_x2(a) simde_vld1_s8_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x2_t +simde_vld1_s16_x2(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s16_x2(ptr); + #else + simde_int16x4_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_i16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_i16m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_int16x4x2_t s_ = { { simde_int16x4_from_private(a_[0]), + simde_int16x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_s16_x2 + #define vld1_s16_x2(a) simde_vld1_s16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x2_t +simde_vld1_s32_x2(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s32_x2(ptr); + #else + simde_int32x2_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_i32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_i32m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_int32x2x2_t s_ = { { simde_int32x2_from_private(a_[0]), + simde_int32x2_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_s32_x2 + #define vld1_s32_x2(a) simde_vld1_s32_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x2_t +simde_vld1_s64_x2(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s64_x2(ptr); + #else + simde_int64x1_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_i64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_i64m1(ptr+1 , 1); + #else + for (size_t i = 0; i < 2; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_int64x1x2_t s_ = { { simde_int64x1_from_private(a_[0]), + simde_int64x1_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + 
#undef vld1_s64_x2 + #define vld1_s64_x2(a) simde_vld1_s64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x2_t +simde_vld1_u8_x2(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u8_x2(ptr); + #else + simde_uint8x8_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_uint8x8x2_t s_ = { { simde_uint8x8_from_private(a_[0]), + simde_uint8x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_u8_x2 + #define vld1_u8_x2(a) simde_vld1_u8_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x2_t +simde_vld1_u16_x2(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u16_x2(ptr); + #else + simde_uint16x4_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_uint16x4x2_t s_ = { { simde_uint16x4_from_private(a_[0]), + simde_uint16x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_u16_x2 + #define vld1_u16_x2(a) simde_vld1_u16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x2_t +simde_vld1_u32_x2(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u32_x2(ptr); + #else + simde_uint32x2_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_u32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_u32m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_uint32x2x2_t s_ = { { simde_uint32x2_from_private(a_[0]), + simde_uint32x2_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_u32_x2 + #define vld1_u32_x2(a) simde_vld1_u32_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x2_t +simde_vld1_u64_x2(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u64_x2(ptr); + #else + simde_uint64x1_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + #else + for (size_t i = 0; i < 2; i++) { + a_[i].values[0] = 
ptr[i]; + } + #endif + simde_uint64x1x2_t s_ = { { simde_uint64x1_from_private(a_[0]), + simde_uint64x1_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_u64_x2 + #define vld1_u64_x2(a) simde_vld1_u64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x2_t +simde_vld1_p8_x2(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld1_p8_x2(ptr); + #else + simde_poly8x8_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_poly8x8x2_t s_ = { { simde_poly8x8_from_private(a_[0]), + simde_poly8x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_p8_x2 + #define vld1_p8_x2(a) simde_vld1_p8_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x2_t +simde_vld1_p16_x2(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld1_p16_x2(ptr); + #else + simde_poly16x4_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_poly16x4x2_t s_ = { { simde_poly16x4_from_private(a_[0]), + simde_poly16x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_p16_x2 + #define vld1_p16_x2(a) simde_vld1_p16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x2_t +simde_vld1_p64_x2(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_p64_x2(ptr); + #else + simde_poly64x1_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + #else + for (size_t i = 0; i < 2; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_poly64x1x2_t s_ = { { simde_poly64x1_from_private(a_[0]), + simde_poly64x1_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1_p64_x2 + #define vld1_p64_x2(a) simde_vld1_p64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x2_t +simde_vld1_bf16_x2(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1_bf16_x2(ptr); + #else + simde_bfloat16x4_private a_[2]; + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + simde_bfloat16x4x2_t s_ = { { simde_bfloat16x4_from_private(a_[0]), + simde_bfloat16x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1_bf16_x2 + #define vld1_bf16_x2(a) simde_vld1_bf16_x2((a)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD1_X2_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/ld1_x3.h 
b/lib/simd_wrapper/simde/arm/neon/ld1_x3.h new file mode 100644 index 00000000000..bdaf8e527a2 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/ld1_x3.h @@ -0,0 +1,486 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2020 Evan Nemerson + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_LD1_X3_H) +#define SIMDE_ARM_NEON_LD1_X3_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +#if HEDLEY_GCC_VERSION_CHECK(7,0,0) + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +#endif +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x3_t +simde_vld1_f16_x3(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_f16_x3(ptr); + #else + simde_float16x4_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4); + a_[1].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+4) , 4); + a_[2].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_float16x4x3_t s_ = { { simde_float16x4_from_private(a_[0]), + simde_float16x4_from_private(a_[1]), + simde_float16x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_f16_x3 + #define vld1_f16_x3(a) simde_vld1_f16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x3_t +simde_vld1_f32_x3(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(6)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_f32_x3(ptr); + #else + simde_float32x2_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_f32m1(ptr , 2); + a_[1].sv64 = 
__riscv_vle32_v_f32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_f32m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_float32x2x3_t s_ = { { simde_float32x2_from_private(a_[0]), + simde_float32x2_from_private(a_[1]), + simde_float32x2_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_f32_x3 + #define vld1_f32_x3(a) simde_vld1_f32_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x3_t +simde_vld1_f64_x3(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if \ + defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vld1_f64_x3(ptr); + #else + simde_float64x1_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_f64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_f64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_f64m1(ptr+2 , 1); + #else + for (size_t i = 0; i < 3; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_float64x1x3_t s_ = { { simde_float64x1_from_private(a_[0]), + simde_float64x1_from_private(a_[1]), + simde_float64x1_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld1_f64_x3 + #define vld1_f64_x3(a) simde_vld1_f64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x3_t +simde_vld1_s8_x3(int8_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s8_x3(ptr); + #else + simde_int8x8_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_i8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_i8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_i8m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_int8x8x3_t s_ = { { simde_int8x8_from_private(a_[0]), + simde_int8x8_from_private(a_[1]), + simde_int8x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_s8_x3 + #define vld1_s8_x3(a) simde_vld1_s8_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x3_t +simde_vld1_s16_x3(int16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s16_x3(ptr); + #else + simde_int16x4_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_i16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_i16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_i16m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_int16x4x3_t s_ = { { simde_int16x4_from_private(a_[0]), + simde_int16x4_from_private(a_[1]), + simde_int16x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_s16_x3 + #define vld1_s16_x3(a) simde_vld1_s16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x3_t +simde_vld1_s32_x3(int32_t 
const ptr[HEDLEY_ARRAY_PARAM(6)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s32_x3(ptr); + #else + simde_int32x2_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_i32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_i32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_i32m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_int32x2x3_t s_ = { { simde_int32x2_from_private(a_[0]), + simde_int32x2_from_private(a_[1]), + simde_int32x2_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_s32_x3 + #define vld1_s32_x3(a) simde_vld1_s32_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x3_t +simde_vld1_s64_x3(int64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s64_x3(ptr); + #else + simde_int64x1_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_i64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_i64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_i64m1(ptr+2 , 1); + #else + for (size_t i = 0; i < 3; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_int64x1x3_t s_ = { { simde_int64x1_from_private(a_[0]), + simde_int64x1_from_private(a_[1]), + simde_int64x1_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_s64_x3 + #define vld1_s64_x3(a) simde_vld1_s64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x3_t +simde_vld1_u8_x3(uint8_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u8_x3(ptr); + #else + simde_uint8x8_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_uint8x8x3_t s_ = { { simde_uint8x8_from_private(a_[0]), + simde_uint8x8_from_private(a_[1]), + simde_uint8x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_u8_x3 + #define vld1_u8_x3(a) simde_vld1_u8_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x3_t +simde_vld1_u16_x3(uint16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u16_x3(ptr); + #else + simde_uint16x4_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = 
__riscv_vle16_v_u16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_uint16x4x3_t s_ = { { simde_uint16x4_from_private(a_[0]), + simde_uint16x4_from_private(a_[1]), + simde_uint16x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_u16_x3 + #define vld1_u16_x3(a) simde_vld1_u16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x3_t +simde_vld1_u32_x3(uint32_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u32_x3(ptr); + #else + simde_uint32x2_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_u32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_u32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_u32m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_uint32x2x3_t s_ = { { simde_uint32x2_from_private(a_[0]), + simde_uint32x2_from_private(a_[1]), + simde_uint32x2_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_u32_x3 + #define vld1_u32_x3(a) simde_vld1_u32_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x3_t +simde_vld1_u64_x3(uint64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u64_x3(ptr); + #else + simde_uint64x1_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1); + #else + for (size_t i = 0; i < 3; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_uint64x1x3_t s_ = { { simde_uint64x1_from_private(a_[0]), + simde_uint64x1_from_private(a_[1]), + simde_uint64x1_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_u64_x3 + #define vld1_u64_x3(a) simde_vld1_u64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x3_t +simde_vld1_p8_x3(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_p8_x3(ptr); + #else + simde_poly8x8_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_poly8x8x3_t s_ = { { simde_poly8x8_from_private(a_[0]), + simde_poly8x8_from_private(a_[1]), + simde_poly8x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_p8_x3 + #define vld1_p8_x3(a) simde_vld1_p8_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x3_t +simde_vld1_p16_x3(simde_poly16_t const 
ptr[HEDLEY_ARRAY_PARAM(12)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_p16_x3(ptr); + #else + simde_poly16x4_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_poly16x4x3_t s_ = { { simde_poly16x4_from_private(a_[0]), + simde_poly16x4_from_private(a_[1]), + simde_poly16x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_p16_x3 + #define vld1_p16_x3(a) simde_vld1_p16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x3_t +simde_vld1_p64_x3(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_p64_x3(ptr); + #else + simde_poly64x1_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1); + #else + for (size_t i = 0; i < 3; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_poly64x1x3_t s_ = { { simde_poly64x1_from_private(a_[0]), + simde_poly64x1_from_private(a_[1]), + simde_poly64x1_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1_p64_x3 + #define vld1_p64_x3(a) simde_vld1_p64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x3_t +simde_vld1_bf16_x3(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(12)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1_bf16_x3(ptr); + #else + simde_bfloat16x4_private a_[3]; + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + simde_bfloat16x4x3_t s_ = { { simde_bfloat16x4_from_private(a_[0]), + simde_bfloat16x4_from_private(a_[1]), + simde_bfloat16x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1_bf16_x3 + #define vld1_bf16_x3(a) simde_vld1_bf16_x3((a)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD1_X3_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/ld1_x4.h b/lib/simd_wrapper/simde/arm/neon/ld1_x4.h new file mode 100644 index 00000000000..1d797364b60 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/ld1_x4.h @@ -0,0 +1,516 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2020 Evan Nemerson + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_LD1_X4_H) +#define SIMDE_ARM_NEON_LD1_X4_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +#if HEDLEY_GCC_VERSION_CHECK(7,0,0) + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +#endif +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x4_t +simde_vld1_f16_x4(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_f16_x4(ptr); + #else + simde_float16x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv64 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 4); + a_[1].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+4) , 4); + a_[2].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 4); + a_[3].sv64 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+12) , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_float16x4x4_t s_ = { { simde_float16x4_from_private(a_[0]), + simde_float16x4_from_private(a_[1]), + simde_float16x4_from_private(a_[2]), + simde_float16x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_f16_x4 + #define vld1_f16_x4(a) simde_vld1_f16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x4_t +simde_vld1_f32_x4(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_f32_x4(ptr); + #else + simde_float32x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_f32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_f32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_f32m1(ptr+4 , 2); + a_[3].sv64 = __riscv_vle32_v_f32m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_float32x2x4_t s_ = { { simde_float32x2_from_private(a_[0]), + simde_float32x2_from_private(a_[1]), + simde_float32x2_from_private(a_[2]), + simde_float32x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_f32_x4 + #define vld1_f32_x4(a) simde_vld1_f32_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x4_t +simde_vld1_f64_x4(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(4)]) { 
+ #if \ + defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vld1_f64_x4(ptr); + #else + simde_float64x1_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_f64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_f64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_f64m1(ptr+2 , 1); + a_[3].sv64 = __riscv_vle64_v_f64m1(ptr+3 , 1); + #else + for (size_t i = 0; i < 4; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_float64x1x4_t s_ = { { simde_float64x1_from_private(a_[0]), + simde_float64x1_from_private(a_[1]), + simde_float64x1_from_private(a_[2]), + simde_float64x1_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld1_f64_x4 + #define vld1_f64_x4(a) simde_vld1_f64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x4_t +simde_vld1_s8_x4(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s8_x4(ptr); + #else + simde_int8x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_i8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_i8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_i8m1(ptr+16 , 8); + a_[3].sv64 = __riscv_vle8_v_i8m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_int8x8x4_t s_ = { { simde_int8x8_from_private(a_[0]), + simde_int8x8_from_private(a_[1]), + simde_int8x8_from_private(a_[2]), + simde_int8x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_s8_x4 + #define vld1_s8_x4(a) simde_vld1_s8_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x4_t +simde_vld1_s16_x4(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s16_x4(ptr); + #else + simde_int16x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_i16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_i16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_i16m1(ptr+8 , 4); + a_[3].sv64 = __riscv_vle16_v_i16m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_int16x4x4_t s_ = { { simde_int16x4_from_private(a_[0]), + simde_int16x4_from_private(a_[1]), + simde_int16x4_from_private(a_[2]), + simde_int16x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_s16_x4 + #define vld1_s16_x4(a) simde_vld1_s16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x4_t +simde_vld1_s32_x4(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s32_x4(ptr); + 
#else + simde_int32x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_i32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_i32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_i32m1(ptr+4 , 2); + a_[3].sv64 = __riscv_vle32_v_i32m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_int32x2x4_t s_ = { { simde_int32x2_from_private(a_[0]), + simde_int32x2_from_private(a_[1]), + simde_int32x2_from_private(a_[2]), + simde_int32x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_s32_x4 + #define vld1_s32_x4(a) simde_vld1_s32_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x4_t +simde_vld1_s64_x4(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_s64_x4(ptr); + #else + simde_int64x1_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_i64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_i64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_i64m1(ptr+2 , 1); + a_[3].sv64 = __riscv_vle64_v_i64m1(ptr+3 , 1); + #else + for (size_t i = 0; i < 4; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_int64x1x4_t s_ = { { simde_int64x1_from_private(a_[0]), + simde_int64x1_from_private(a_[1]), + simde_int64x1_from_private(a_[2]), + simde_int64x1_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_s64_x4 + #define vld1_s64_x4(a) simde_vld1_s64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x4_t +simde_vld1_u8_x4(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u8_x4(ptr); + #else + simde_uint8x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8); + a_[3].sv64 = __riscv_vle8_v_u8m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_uint8x8x4_t s_ = { { simde_uint8x8_from_private(a_[0]), + simde_uint8x8_from_private(a_[1]), + simde_uint8x8_from_private(a_[2]), + simde_uint8x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_u8_x4 + #define vld1_u8_x4(a) simde_vld1_u8_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x4_t +simde_vld1_u16_x4(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u16_x4(ptr); + #else + simde_uint16x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4); + 
a_[3].sv64 = __riscv_vle16_v_u16m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_uint16x4x4_t s_ = { { simde_uint16x4_from_private(a_[0]), + simde_uint16x4_from_private(a_[1]), + simde_uint16x4_from_private(a_[2]), + simde_uint16x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_u16_x4 + #define vld1_u16_x4(a) simde_vld1_u16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x4_t +simde_vld1_u32_x4(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u32_x4(ptr); + #else + simde_uint32x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle32_v_u32m1(ptr , 2); + a_[1].sv64 = __riscv_vle32_v_u32m1(ptr+2 , 2); + a_[2].sv64 = __riscv_vle32_v_u32m1(ptr+4 , 2); + a_[3].sv64 = __riscv_vle32_v_u32m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_uint32x2x4_t s_ = { { simde_uint32x2_from_private(a_[0]), + simde_uint32x2_from_private(a_[1]), + simde_uint32x2_from_private(a_[2]), + simde_uint32x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_u32_x4 + #define vld1_u32_x4(a) simde_vld1_u32_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x4_t +simde_vld1_u64_x4(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_u64_x4(ptr); + #else + simde_uint64x1_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1); + a_[3].sv64 = __riscv_vle64_v_u64m1(ptr+3 , 1); + #else + for (size_t i = 0; i < 4; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_uint64x1x4_t s_ = { { simde_uint64x1_from_private(a_[0]), + simde_uint64x1_from_private(a_[1]), + simde_uint64x1_from_private(a_[2]), + simde_uint64x1_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_u64_x4 + #define vld1_u64_x4(a) simde_vld1_u64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x4_t +simde_vld1_p8_x4(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_p8_x4(ptr); + #else + simde_poly8x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle8_v_u8m1(ptr , 8); + a_[1].sv64 = __riscv_vle8_v_u8m1(ptr+8 , 8); + a_[2].sv64 = __riscv_vle8_v_u8m1(ptr+16 , 8); + a_[3].sv64 = __riscv_vle8_v_u8m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_poly8x8x4_t s_ = { { simde_poly8x8_from_private(a_[0]), + simde_poly8x8_from_private(a_[1]), + simde_poly8x8_from_private(a_[2]), + simde_poly8x8_from_private(a_[3]) } }; 
+ return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_p8_x4 + #define vld1_p8_x4(a) simde_vld1_p8_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x4_t +simde_vld1_p16_x4(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_p16_x4(ptr); + #else + simde_poly16x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle16_v_u16m1(ptr , 4); + a_[1].sv64 = __riscv_vle16_v_u16m1(ptr+4 , 4); + a_[2].sv64 = __riscv_vle16_v_u16m1(ptr+8 , 4); + a_[3].sv64 = __riscv_vle16_v_u16m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_poly16x4x4_t s_ = { { simde_poly16x4_from_private(a_[0]), + simde_poly16x4_from_private(a_[1]), + simde_poly16x4_from_private(a_[2]), + simde_poly16x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1_p16_x4 + #define vld1_p16_x4(a) simde_vld1_p16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x4_t +simde_vld1_p64_x4(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1_p64_x4(ptr); + #else + simde_poly64x1_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv64 = __riscv_vle64_v_u64m1(ptr , 1); + a_[1].sv64 = __riscv_vle64_v_u64m1(ptr+1 , 1); + a_[2].sv64 = __riscv_vle64_v_u64m1(ptr+2 , 1); + a_[3].sv64 = __riscv_vle64_v_u64m1(ptr+3 , 1); + #else + for (size_t i = 0; i < 4; i++) { + a_[i].values[0] = ptr[i]; + } + #endif + simde_poly64x1x4_t s_ = { { simde_poly64x1_from_private(a_[0]), + simde_poly64x1_from_private(a_[1]), + simde_poly64x1_from_private(a_[2]), + simde_poly64x1_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1_p64_x4 + #define vld1_p64_x4(a) simde_vld1_p64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x4_t +simde_vld1_bf16_x4(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1_bf16_x4(ptr); + #else + simde_bfloat16x4_private a_[4]; + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + simde_bfloat16x4x4_t s_ = { { simde_bfloat16x4_from_private(a_[0]), + simde_bfloat16x4_from_private(a_[1]), + simde_bfloat16x4_from_private(a_[2]), + simde_bfloat16x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1_bf16_x4 + #define vld1_bf16_x4(a) simde_vld1_bf16_x4((a)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD1_X4_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/ld1q_x2.h b/lib/simd_wrapper/simde/arm/neon/ld1q_x2.h new file mode 100644 index 00000000000..da1da866af7 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/ld1q_x2.h @@ -0,0 +1,461 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the 
"Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2020 Evan Nemerson + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_LD1Q_X2_H) +#define SIMDE_ARM_NEON_LD1Q_X2_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +#if HEDLEY_GCC_VERSION_CHECK(7,0,0) + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +#endif +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x2_t +simde_vld1q_f16_x2(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + defined(SIMDE_ARM_NEON_FP16) + return vld1q_f16_x2(ptr); + #else + simde_float16x8_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); + a_[1].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_float16x8x2_t s_ = { { simde_float16x8_from_private(a_[0]), + simde_float16x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_f16_x2 + #define vld1q_f16_x2(a) simde_vld1q_f16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x2_t +simde_vld1q_f32_x2(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_f32_x2(ptr); + #else + simde_float32x4_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_f32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_f32m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_float32x4x2_t s_ = { { simde_float32x4_from_private(a_[0]), + simde_float32x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_f32_x2 + #define vld1q_f32_x2(a) simde_vld1q_f32_x2((a)) 
+#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x2_t +simde_vld1q_f64_x2(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vld1q_f64_x2(ptr); + #else + simde_float64x2_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_f64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_f64m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_float64x2x2_t s_ = { { simde_float64x2_from_private(a_[0]), + simde_float64x2_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_f64_x2 + #define vld1q_f64_x2(a) simde_vld1q_f64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x2_t +simde_vld1q_s8_x2(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s8_x2(ptr); + #else + simde_int8x16_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_i8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_i8m1(ptr+16 , 16); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_int8x16x2_t s_ = { { simde_int8x16_from_private(a_[0]), + simde_int8x16_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_s8_x2 + #define vld1q_s8_x2(a) simde_vld1q_s8_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x2_t +simde_vld1q_s16_x2(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s16_x2(ptr); + #else + simde_int16x8_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_i16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_i16m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_int16x8x2_t s_ = { { simde_int16x8_from_private(a_[0]), + simde_int16x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_s16_x2 + #define vld1q_s16_x2(a) simde_vld1q_s16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x2_t +simde_vld1q_s32_x2(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s32_x2(ptr); + #else + simde_int32x4_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_i32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_i32m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_int32x4x2_t s_ = { { simde_int32x4_from_private(a_[0]), + 
simde_int32x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_s32_x2 + #define vld1q_s32_x2(a) simde_vld1q_s32_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x2_t +simde_vld1q_s64_x2(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s64_x2(ptr); + #else + simde_int64x2_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_i64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_i64m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_int64x2x2_t s_ = { { simde_int64x2_from_private(a_[0]), + simde_int64x2_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_s64_x2 + #define vld1q_s64_x2(a) simde_vld1q_s64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x2_t +simde_vld1q_u8_x2(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u8_x2(ptr); + #else + simde_uint8x16_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_uint8x16x2_t s_ = { { simde_uint8x16_from_private(a_[0]), + simde_uint8x16_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_u8_x2 + #define vld1q_u8_x2(a) simde_vld1q_u8_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x2_t +simde_vld1q_u16_x2(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u16_x2(ptr); + #else + simde_uint16x8_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_uint16x8x2_t s_ = { { simde_uint16x8_from_private(a_[0]), + simde_uint16x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_u16_x2 + #define vld1q_u16_x2(a) simde_vld1q_u16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x2_t +simde_vld1q_u32_x2(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u32_x2(ptr); + #else + simde_uint32x4_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + 
a_[0].sv128 = __riscv_vle32_v_u32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_u32m1(ptr+4 , 4); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_uint32x4x2_t s_ = { { simde_uint32x4_from_private(a_[0]), + simde_uint32x4_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_u32_x2 + #define vld1q_u32_x2(a) simde_vld1q_u32_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x2_t +simde_vld1q_u64_x2(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u64_x2(ptr); + #else + simde_uint64x2_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_uint64x2x2_t s_ = { { simde_uint64x2_from_private(a_[0]), + simde_uint64x2_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_u64_x2 + #define vld1q_u64_x2(a) simde_vld1q_u64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x2_t +simde_vld1q_p8_x2(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p8_x2(ptr); + #else + simde_poly8x16_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_poly8x16x2_t s_ = { { simde_poly8x16_from_private(a_[0]), + simde_poly8x16_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_p8_x2 + #define vld1q_p8_x2(a) simde_vld1q_p8_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x2_t +simde_vld1q_p16_x2(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p16_x2(ptr); + #else + simde_poly16x8_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_poly16x8x2_t s_ = { { simde_poly16x8_from_private(a_[0]), + simde_poly16x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_p16_x2 + #define vld1q_p16_x2(a) simde_vld1q_p16_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x2_t +simde_vld1q_p64_x2(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p64_x2(ptr); + #else + simde_poly64x2_private a_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = 
__riscv_vle64_v_u64m1(ptr+2 , 2); + #else + for (size_t i = 0; i < 4; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_poly64x2x2_t s_ = { { simde_poly64x2_from_private(a_[0]), + simde_poly64x2_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_p64_x2 + #define vld1q_p64_x2(a) simde_vld1q_p64_x2((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x2_t +simde_vld1q_bf16_x2(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1q_bf16_x2(ptr); + #else + simde_bfloat16x8_private a_[2]; + for (size_t i = 0; i < 16; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + simde_bfloat16x8x2_t s_ = { { simde_bfloat16x8_from_private(a_[0]), + simde_bfloat16x8_from_private(a_[1]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_bf16_x2 + #define vld1q_bf16_x2(a) simde_vld1q_bf16_x2((a)) +#endif + + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD1Q_X2_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/ld1q_x3.h b/lib/simd_wrapper/simde/arm/neon/ld1q_x3.h new file mode 100644 index 00000000000..ec82989e74c --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/ld1q_x3.h @@ -0,0 +1,487 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2020 Evan Nemerson + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_LD1Q_X3_H) +#define SIMDE_ARM_NEON_LD1Q_X3_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +#if HEDLEY_GCC_VERSION_CHECK(7,0,0) + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +#endif +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x3_t +simde_vld1q_f16_x3(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_f16_x3(ptr); + #else + simde_float16x8_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); + a_[1].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 8); + a_[2].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+16) , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_float16x8x3_t s_ = { { simde_float16x8_from_private(a_[0]), + simde_float16x8_from_private(a_[1]), + simde_float16x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_f16_x3 + #define vld1q_f16_x3(a) simde_vld1q_f16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x3_t +simde_vld1q_f32_x3(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(12)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_f32_x3(ptr); + #else + simde_float32x4_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_f32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_f32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_f32m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_float32x4x3_t s_ = { { simde_float32x4_from_private(a_[0]), + simde_float32x4_from_private(a_[1]), + simde_float32x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_f32_x3 + #define vld1q_f32_x3(a) simde_vld1q_f32_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x3_t +simde_vld1q_f64_x3(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(6)]) { + #if \ + defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vld1q_f64_x3(ptr); + #else + simde_float64x2_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_f64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_f64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_f64m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_float64x2x3_t s_ = { { simde_float64x2_from_private(a_[0]), + simde_float64x2_from_private(a_[1]), + simde_float64x2_from_private(a_[2]) } }; + return s_; 
+ #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_f64_x3 + #define vld1q_f64_x3(a) simde_vld1q_f64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x3_t +simde_vld1q_s8_x3(int8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s8_x3(ptr); + #else + simde_int8x16_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_i8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_i8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_i8m1(ptr+32 , 16); + #else + for (size_t i = 0; i < 48; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_int8x16x3_t s_ = { { simde_int8x16_from_private(a_[0]), + simde_int8x16_from_private(a_[1]), + simde_int8x16_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_s8_x3 + #define vld1q_s8_x3(a) simde_vld1q_s8_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x3_t +simde_vld1q_s16_x3(int16_t const ptr[HEDLEY_ARRAY_PARAM(12)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s16_x3(ptr); + #else + simde_int16x8_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_i16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_i16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_i16m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_int16x8x3_t s_ = { { simde_int16x8_from_private(a_[0]), + simde_int16x8_from_private(a_[1]), + simde_int16x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_s16_x3 + #define vld1q_s16_x3(a) simde_vld1q_s16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x3_t +simde_vld1q_s32_x3(int32_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s32_x3(ptr); + #else + simde_int32x4_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_i32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_i32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_i32m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_int32x4x3_t s_ = { { simde_int32x4_from_private(a_[0]), + simde_int32x4_from_private(a_[1]), + simde_int32x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_s32_x3 + #define vld1q_s32_x3(a) simde_vld1q_s32_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x3_t +simde_vld1q_s64_x3(int64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + 
(!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s64_x3(ptr); + #else + simde_int64x2_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_i64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_i64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_i64m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_int64x2x3_t s_ = { { simde_int64x2_from_private(a_[0]), + simde_int64x2_from_private(a_[1]), + simde_int64x2_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_s64_x3 + #define vld1q_s64_x3(a) simde_vld1q_s64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x3_t +simde_vld1q_u8_x3(uint8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u8_x3(ptr); + #else + simde_uint8x16_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); + #else + for (size_t i = 0; i < 48; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_uint8x16x3_t s_ = { { simde_uint8x16_from_private(a_[0]), + simde_uint8x16_from_private(a_[1]), + simde_uint8x16_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_u8_x3 + #define vld1q_u8_x3(a) simde_vld1q_u8_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x3_t +simde_vld1q_u16_x3(uint16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u16_x3(ptr); + #else + simde_uint16x8_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_uint16x8x3_t s_ = { { simde_uint16x8_from_private(a_[0]), + simde_uint16x8_from_private(a_[1]), + simde_uint16x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_u16_x3 + #define vld1q_u16_x3(a) simde_vld1q_u16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x3_t +simde_vld1q_u32_x3(uint32_t const ptr[HEDLEY_ARRAY_PARAM(6)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u32_x3(ptr); + #else + simde_uint32x4_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_u32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_u32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_u32m1(ptr+8 , 4); + #else + for (size_t i = 0; i < 12; i++) { + a_[i / 4].values[i % 4] = 
ptr[i]; + } + #endif + simde_uint32x4x3_t s_ = { { simde_uint32x4_from_private(a_[0]), + simde_uint32x4_from_private(a_[1]), + simde_uint32x4_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_u32_x3 + #define vld1q_u32_x3(a) simde_vld1q_u32_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x3_t +simde_vld1q_u64_x3(uint64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u64_x3(ptr); + #else + simde_uint64x2_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_uint64x2x3_t s_ = { { simde_uint64x2_from_private(a_[0]), + simde_uint64x2_from_private(a_[1]), + simde_uint64x2_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_u64_x3 + #define vld1q_u64_x3(a) simde_vld1q_u64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x3_t +simde_vld1q_p8_x3(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(48)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p8_x3(ptr); + #else + simde_poly8x16_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); + #else + for (size_t i = 0; i < 48; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_poly8x16x3_t s_ = { { simde_poly8x16_from_private(a_[0]), + simde_poly8x16_from_private(a_[1]), + simde_poly8x16_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_p8_x3 + #define vld1q_p8_x3(a) simde_vld1q_p8_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x3_t +simde_vld1q_p16_x3(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(24)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p16_x3(ptr); + #else + simde_poly16x8_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); + #else + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_poly16x8x3_t s_ = { { simde_poly16x8_from_private(a_[0]), + simde_poly16x8_from_private(a_[1]), + simde_poly16x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_p16_x3 + #define vld1q_p16_x3(a) simde_vld1q_p16_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x3_t +simde_vld1q_p64_x3(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p64_x3(ptr); + 
#else + simde_poly64x2_private a_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); + #else + for (size_t i = 0; i < 6; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_poly64x2x3_t s_ = { { simde_poly64x2_from_private(a_[0]), + simde_poly64x2_from_private(a_[1]), + simde_poly64x2_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_p64_x3 + #define vld1q_p64_x3(a) simde_vld1q_p64_x3((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x3_t +simde_vld1q_bf16_x3(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(24)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1q_bf16_x3(ptr); + #else + simde_bfloat16x8_private a_[3]; + for (size_t i = 0; i < 24; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + simde_bfloat16x8x3_t s_ = { { simde_bfloat16x8_from_private(a_[0]), + simde_bfloat16x8_from_private(a_[1]), + simde_bfloat16x8_from_private(a_[2]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_bf16_x3 + #define vld1q_bf16_x3(a) simde_vld1q_bf16_x3((a)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD1Q_X3_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/ld1q_x4.h b/lib/simd_wrapper/simde/arm/neon/ld1q_x4.h new file mode 100644 index 00000000000..2fa4c1a6996 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/ld1q_x4.h @@ -0,0 +1,517 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2020 Evan Nemerson + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_LD1Q_X4_H) +#define SIMDE_ARM_NEON_LD1Q_X4_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +#if HEDLEY_GCC_VERSION_CHECK(7,0,0) + SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ +#endif +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x4_t +simde_vld1q_f16_x4(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_f16_x4(ptr); + #else + simde_float16x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + a_[0].sv128 = __riscv_vle16_v_f16m1((_Float16 *)ptr , 8); + a_[1].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+8) , 8); + a_[2].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+16) , 8); + a_[3].sv128 = __riscv_vle16_v_f16m1((_Float16 *)(ptr+24) , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_float16x8x4_t s_ = { { simde_float16x8_from_private(a_[0]), + simde_float16x8_from_private(a_[1]), + simde_float16x8_from_private(a_[2]), + simde_float16x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_f16_x4 + #define vld1q_f16_x4(a) simde_vld1q_f16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x4_t +simde_vld1q_f32_x4(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_f32_x4(ptr); + #else + simde_float32x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_f32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_f32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_f32m1(ptr+8 , 4); + a_[3].sv128 = __riscv_vle32_v_f32m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_float32x4x4_t s_ = { { simde_float32x4_from_private(a_[0]), + simde_float32x4_from_private(a_[1]), + simde_float32x4_from_private(a_[2]), + simde_float32x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_f32_x4 + #define vld1q_f32_x4(a) simde_vld1q_f32_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x4_t +simde_vld1q_f64_x4(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A64V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + return vld1q_f64_x4(ptr); + #else + simde_float64x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_f64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_f64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_f64m1(ptr+4 , 2); + a_[3].sv128 = 
__riscv_vle64_v_f64m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_float64x2x4_t s_ = { { simde_float64x2_from_private(a_[0]), + simde_float64x2_from_private(a_[1]), + simde_float64x2_from_private(a_[2]), + simde_float64x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_f64_x4 + #define vld1q_f64_x4(a) simde_vld1q_f64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x4_t +simde_vld1q_s8_x4(int8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s8_x4(ptr); + #else + simde_int8x16_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_i8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_i8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_i8m1(ptr+32 , 16); + a_[3].sv128 = __riscv_vle8_v_i8m1(ptr+48 , 16); + #else + for (size_t i = 0; i < 64; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_int8x16x4_t s_ = { { simde_int8x16_from_private(a_[0]), + simde_int8x16_from_private(a_[1]), + simde_int8x16_from_private(a_[2]), + simde_int8x16_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_s8_x4 + #define vld1q_s8_x4(a) simde_vld1q_s8_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x4_t +simde_vld1q_s16_x4(int16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s16_x4(ptr); + #else + simde_int16x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_i16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_i16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_i16m1(ptr+16 , 8); + a_[3].sv128 = __riscv_vle16_v_i16m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_int16x8x4_t s_ = { { simde_int16x8_from_private(a_[0]), + simde_int16x8_from_private(a_[1]), + simde_int16x8_from_private(a_[2]), + simde_int16x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_s16_x4 + #define vld1q_s16_x4(a) simde_vld1q_s16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x4_t +simde_vld1q_s32_x4(int32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s32_x4(ptr); + #else + simde_int32x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_i32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_i32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_i32m1(ptr+8 , 4); + a_[3].sv128 = __riscv_vle32_v_i32m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_int32x4x4_t s_ = { { 
simde_int32x4_from_private(a_[0]), + simde_int32x4_from_private(a_[1]), + simde_int32x4_from_private(a_[2]), + simde_int32x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_s32_x4 + #define vld1q_s32_x4(a) simde_vld1q_s32_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x4_t +simde_vld1q_s64_x4(int64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_s64_x4(ptr); + #else + simde_int64x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_i64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_i64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_i64m1(ptr+4 , 2); + a_[3].sv128 = __riscv_vle64_v_i64m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_int64x2x4_t s_ = { { simde_int64x2_from_private(a_[0]), + simde_int64x2_from_private(a_[1]), + simde_int64x2_from_private(a_[2]), + simde_int64x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_s64_x4 + #define vld1q_s64_x4(a) simde_vld1q_s64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x4_t +simde_vld1q_u8_x4(uint8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u8_x4(ptr); + #else + simde_uint8x16_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); + a_[3].sv128 = __riscv_vle8_v_u8m1(ptr+48 , 16); + #else + for (size_t i = 0; i < 64; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_uint8x16x4_t s_ = { { simde_uint8x16_from_private(a_[0]), + simde_uint8x16_from_private(a_[1]), + simde_uint8x16_from_private(a_[2]), + simde_uint8x16_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_u8_x4 + #define vld1q_u8_x4(a) simde_vld1q_u8_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x4_t +simde_vld1q_u16_x4(uint16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u16_x4(ptr); + #else + simde_uint16x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); + a_[3].sv128 = __riscv_vle16_v_u16m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_uint16x8x4_t s_ = { { simde_uint16x8_from_private(a_[0]), + simde_uint16x8_from_private(a_[1]), + simde_uint16x8_from_private(a_[2]), + simde_uint16x8_from_private(a_[3]) } }; + return s_; + #endif
+} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_u16_x4 + #define vld1q_u16_x4(a) simde_vld1q_u16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x4_t +simde_vld1q_u32_x4(uint32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u32_x4(ptr); + #else + simde_uint32x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle32_v_u32m1(ptr , 4); + a_[1].sv128 = __riscv_vle32_v_u32m1(ptr+4 , 4); + a_[2].sv128 = __riscv_vle32_v_u32m1(ptr+8 , 4); + a_[3].sv128 = __riscv_vle32_v_u32m1(ptr+12 , 4); + #else + for (size_t i = 0; i < 16; i++) { + a_[i / 4].values[i % 4] = ptr[i]; + } + #endif + simde_uint32x4x4_t s_ = { { simde_uint32x4_from_private(a_[0]), + simde_uint32x4_from_private(a_[1]), + simde_uint32x4_from_private(a_[2]), + simde_uint32x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_u32_x4 + #define vld1q_u32_x4(a) simde_vld1q_u32_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x4_t +simde_vld1q_u64_x4(uint64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) && \ + (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_u64_x4(ptr); + #else + simde_uint64x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); + a_[3].sv128 = __riscv_vle64_v_u64m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_uint64x2x4_t s_ = { { simde_uint64x2_from_private(a_[0]), + simde_uint64x2_from_private(a_[1]), + simde_uint64x2_from_private(a_[2]), + simde_uint64x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_u64_x4 + #define vld1q_u64_x4(a) simde_vld1q_u64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x4_t +simde_vld1q_p8_x4(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p8_x4(ptr); + #else + simde_poly8x16_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle8_v_u8m1(ptr , 16); + a_[1].sv128 = __riscv_vle8_v_u8m1(ptr+16 , 16); + a_[2].sv128 = __riscv_vle8_v_u8m1(ptr+32 , 16); + a_[3].sv128 = __riscv_vle8_v_u8m1(ptr+48 , 16); + #else + for (size_t i = 0; i < 64; i++) { + a_[i / 16].values[i % 16] = ptr[i]; + } + #endif + simde_poly8x16x4_t s_ = { { simde_poly8x16_from_private(a_[0]), + simde_poly8x16_from_private(a_[1]), + simde_poly8x16_from_private(a_[2]), + simde_poly8x16_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_p8_x4 + #define vld1q_p8_x4(a) simde_vld1q_p8_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x4_t +simde_vld1q_p16_x4(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if \ + 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p16_x4(ptr); + #else + simde_poly16x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle16_v_u16m1(ptr , 8); + a_[1].sv128 = __riscv_vle16_v_u16m1(ptr+8 , 8); + a_[2].sv128 = __riscv_vle16_v_u16m1(ptr+16 , 8); + a_[3].sv128 = __riscv_vle16_v_u16m1(ptr+24 , 8); + #else + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + #endif + simde_poly16x8x4_t s_ = { { simde_poly16x8_from_private(a_[0]), + simde_poly16x8_from_private(a_[1]), + simde_poly16x8_from_private(a_[2]), + simde_poly16x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld1q_p16_x4 + #define vld1q_p16_x4(a) simde_vld1q_p16_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x4_t +simde_vld1q_p64_x4(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if \ + defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + return vld1q_p64_x4(ptr); + #else + simde_poly64x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + a_[0].sv128 = __riscv_vle64_v_u64m1(ptr , 2); + a_[1].sv128 = __riscv_vle64_v_u64m1(ptr+2 , 2); + a_[2].sv128 = __riscv_vle64_v_u64m1(ptr+4 , 2); + a_[3].sv128 = __riscv_vle64_v_u64m1(ptr+6 , 2); + #else + for (size_t i = 0; i < 8; i++) { + a_[i / 2].values[i % 2] = ptr[i]; + } + #endif + simde_poly64x2x4_t s_ = { { simde_poly64x2_from_private(a_[0]), + simde_poly64x2_from_private(a_[1]), + simde_poly64x2_from_private(a_[2]), + simde_poly64x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_p64_x4 + #define vld1q_p64_x4(a) simde_vld1q_p64_x4((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x4_t +simde_vld1q_bf16_x4(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld1q_bf16_x4(ptr); + #else + simde_bfloat16x8_private a_[4]; + for (size_t i = 0; i < 32; i++) { + a_[i / 8].values[i % 8] = ptr[i]; + } + simde_bfloat16x8x4_t s_ = { { simde_bfloat16x8_from_private(a_[0]), + simde_bfloat16x8_from_private(a_[1]), + simde_bfloat16x8_from_private(a_[2]), + simde_bfloat16x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld1q_bf16_x4 + #define vld1q_bf16_x4(a) simde_vld1q_bf16_x4((a)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD1Q_X4_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/ld2.h b/lib/simd_wrapper/simde/arm/neon/ld2.h index 70cb39af7c8..5d0be9f33fe 100644 --- a/lib/simd_wrapper/simde/arm/neon/ld2.h +++ b/lib/simd_wrapper/simde/arm/neon/ld2.h @@ -22,6 +22,8 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD2_H) @@ -57,6 +59,16 @@ simde_vld2_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_vget_high_s8(q) }; return u; + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x8_private a_[2]; + vint8m1x2_t dest = __riscv_vlseg2e8_v_i8m1x2(&ptr[0], 8); + a_[0].sv64 = __riscv_vget_v_i8m1x2_i8m1(dest, 0); + a_[1].sv64 = 
__riscv_vget_v_i8m1x2_i8m1(dest, 1); + simde_int8x8x2_t r = { { + simde_int8x8_from_private(a_[0]), + simde_int8x8_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_int8x16_private a_ = simde_int8x16_to_private(simde_vld1q_s8(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.values, a_.values, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); @@ -90,6 +102,16 @@ simde_int16x4x2_t simde_vld2_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_s16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x4_private a_[2]; + vint16m1x2_t dest = __riscv_vlseg2e16_v_i16m1x2(&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_i16m1x2_i16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i16m1x2_i16m1(dest, 1); + simde_int16x4x2_t r = { { + simde_int16x4_from_private(a_[0]), + simde_int16x4_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_int16x8_private a_ = simde_int16x8_to_private(simde_vld1q_s16(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.values, a_.values, 0, 2, 4, 6, 1, 3, 5, 7); @@ -97,6 +119,10 @@ simde_vld2_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_memcpy(&r, &a_, sizeof(r)); return r; #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_int16x4_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -104,6 +130,9 @@ simde_vld2_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; } } + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_POP + #endif simde_int16x4x2_t r = { { simde_int16x4_from_private(r_[0]), @@ -123,6 +152,16 @@ simde_int32x2x2_t simde_vld2_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_s32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x2_private a_[2]; + vint32m1x2_t dest = __riscv_vlseg2e32_v_i32m1x2(&ptr[0], 2); + a_[0].sv64 = __riscv_vget_v_i32m1x2_i32m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i32m1x2_i32m1(dest, 1); + simde_int32x2x2_t r = { { + simde_int32x2_from_private(a_[0]), + simde_int32x2_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_int32x4_private a_ = simde_int32x4_to_private(simde_vld1q_s32(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 2, 1, 3); @@ -156,6 +195,16 @@ simde_int64x1x2_t simde_vld2_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_s64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x1_private a_[2]; + vint64m1x2_t dest = __riscv_vlseg2e64_v_i64m1x2(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_i64m1x2_i64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i64m1x2_i64m1(dest, 1); + simde_int64x1x2_t r = { { + simde_int64x1_from_private(a_[0]), + simde_int64x1_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_int64x2_private a_ = simde_int64x2_to_private(simde_vld1q_s64(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 1); @@ -200,6 +249,16 @@ simde_vld2_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_vget_high_u8(q) }; return u; + #elif defined(SIMDE_RISCV_V_NATIVE) + 
simde_uint8x8_private a_[2]; + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 8); + a_[0].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 1); + simde_uint8x8x2_t r = { { + simde_uint8x8_from_private(a_[0]), + simde_uint8x8_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_uint8x16_private a_ = simde_uint8x16_to_private(simde_vld1q_u8(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.values, a_.values, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); @@ -233,6 +292,16 @@ simde_uint16x4x2_t simde_vld2_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_u16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x4_private a_[2]; + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 1); + simde_uint16x4x2_t r = { { + simde_uint16x4_from_private(a_[0]), + simde_uint16x4_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_uint16x8_private a_ = simde_uint16x8_to_private(simde_vld1q_u16(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.values, a_.values, 0, 2, 4, 6, 1, 3, 5, 7); @@ -240,6 +309,10 @@ simde_vld2_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_memcpy(&r, &a_, sizeof(r)); return r; #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_uint16x4_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -247,6 +320,9 @@ simde_vld2_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; } } + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_POP + #endif simde_uint16x4x2_t r = { { simde_uint16x4_from_private(r_[0]), @@ -266,6 +342,16 @@ simde_uint32x2x2_t simde_vld2_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_u32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x2_private a_[2]; + vuint32m1x2_t dest = __riscv_vlseg2e32_v_u32m1x2(&ptr[0], 2); + a_[0].sv64 = __riscv_vget_v_u32m1x2_u32m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u32m1x2_u32m1(dest, 1); + simde_uint32x2x2_t r = { { + simde_uint32x2_from_private(a_[0]), + simde_uint32x2_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_uint32x4_private a_ = simde_uint32x4_to_private(simde_vld1q_u32(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 2, 1, 3); @@ -296,9 +382,19 @@ simde_vld2_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { SIMDE_FUNCTION_ATTRIBUTES simde_uint64x1x2_t -simde_vld2_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { +simde_vld2_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_u64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x1_private a_[2]; + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 1); + simde_uint64x1x2_t r = { { + simde_uint64x1_from_private(a_[0]), + simde_uint64x1_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && 
defined(SIMDE_SHUFFLE_VECTOR_) simde_uint64x2_private a_ = simde_uint64x2_to_private(simde_vld1q_u64(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 1); @@ -327,11 +423,58 @@ simde_vld2_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #define vld2_u64(a) simde_vld2_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x2_t +simde_vld2_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld2_f16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + simde_float16x4_private r_[2]; + vfloat16m1x2_t dest = __riscv_vlseg2e16_v_f16m1x2((_Float16 *)&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_f16m1x2_f16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f16m1x2_f16m1(dest, 1); + simde_float16x4x2_t r = { { + simde_float16x4_from_private(r_[0]), + simde_float16x4_from_private(r_[1]), + } }; + return r; + #else + simde_float16x4_private r_[2]; + + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + + simde_float16x4x2_t r = { { + simde_float16x4_from_private(r_[0]), + simde_float16x4_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_f16 + #define vld2_f16(a) simde_vld2_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2x2_t simde_vld2_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2_f32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x2_private r_[2]; + vfloat32m1x2_t dest = __riscv_vlseg2e32_v_f32m1x2(&ptr[0], 2); + r_[0].sv64 = __riscv_vget_v_f32m1x2_f32m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f32m1x2_f32m1(dest, 1); + simde_float32x2x2_t r = { { + simde_float32x2_from_private(r_[0]), + simde_float32x2_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_float32x4_private a_ = simde_float32x4_to_private(simde_vld1q_f32(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.values, a_.values, 0, 2, 1, 3); @@ -362,9 +505,19 @@ simde_vld2_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { SIMDE_FUNCTION_ATTRIBUTES simde_float64x1x2_t -simde_vld2_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { +simde_vld2_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld2_f64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x1_private r_[2]; + vfloat64m1x2_t dest = __riscv_vlseg2e64_v_f64m1x2(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_f64m1x2_f64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f64m1x2_f64m1(dest, 1); + simde_float64x1x2_t r = { { + simde_float64x1_from_private(r_[0]), + simde_float64x1_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_SHUFFLE_VECTOR_) simde_float64x2_private a_ = simde_float64x2_to_private(simde_vld1q_f64(ptr)); a_.values = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.values, a_.values, 0, 1); @@ -398,6 +551,16 @@ simde_int8x16x2_t simde_vld2q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_s8(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private a_[2]; + vint8m1x2_t dest = __riscv_vlseg2e8_v_i8m1x2(&ptr[0], 16); + a_[0].sv128 = __riscv_vget_v_i8m1x2_i8m1(dest, 0); + 
a_[1].sv128 = __riscv_vget_v_i8m1x2_i8m1(dest, 1); + simde_int8x16x2_t r = { { + simde_int8x16_from_private(a_[0]), + simde_int8x16_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_s8( @@ -405,6 +568,10 @@ simde_vld2q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { simde_vld1q_s8(&(ptr[16])) ); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_int8x16_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -419,6 +586,9 @@ simde_vld2q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { } }; return r; + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_POP + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -431,6 +601,16 @@ simde_int32x4x2_t simde_vld2q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_s32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private a_[2]; + vint32m1x2_t dest = __riscv_vlseg2e32_v_i32m1x2(&ptr[0], 4); + a_[0].sv128 = __riscv_vget_v_i32m1x2_i32m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i32m1x2_i32m1(dest, 1); + simde_int32x4x2_t r = { { + simde_int32x4_from_private(a_[0]), + simde_int32x4_from_private(a_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_s32( @@ -438,6 +618,10 @@ simde_vld2q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_vld1q_s32(&(ptr[4])) ); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_int32x4_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -445,6 +629,9 @@ simde_vld2q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; } } + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_POP + #endif simde_int32x4x2_t r = { { simde_int32x4_from_private(r_[0]), @@ -464,6 +651,16 @@ simde_int16x8x2_t simde_vld2q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_s16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_[2]; + vint16m1x2_t dest = __riscv_vlseg2e16_v_i16m1x2(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_i16m1x2_i16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i16m1x2_i16m1(dest, 1); + simde_int16x8x2_t r = { { + simde_int16x8_from_private(r_[0]), + simde_int16x8_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_s16( @@ -471,6 +668,10 @@ simde_vld2q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_vld1q_s16(&(ptr[8])) ); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_int16x8_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -485,6 +686,9 @@ simde_vld2q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { } }; return r; + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_POP + #endif #endif } #if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -497,6 +701,16 @@ simde_int64x2x2_t simde_vld2q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld2q_s64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_[2]; + vint64m1x2_t dest = __riscv_vlseg2e64_v_i64m1x2(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_i64m1x2_i64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i64m1x2_i64m1(dest, 1); + simde_int64x2x2_t r = { { + simde_int64x2_from_private(r_[0]), + simde_int64x2_from_private(r_[1]), + } }; + return r; #else simde_int64x2_private r_[2]; @@ -524,6 +738,16 @@ simde_uint8x16x2_t simde_vld2q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_u8(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private r_[2]; + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_u8m1x2_u8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u8m1x2_u8m1(dest, 1); + simde_uint8x16x2_t r = { { + simde_uint8x16_from_private(r_[0]), + simde_uint8x16_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_u8( @@ -531,6 +755,10 @@ simde_vld2q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { simde_vld1q_u8(&(ptr[16])) ); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_uint8x16_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -545,6 +773,9 @@ simde_vld2q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { } }; return r; + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_POP + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -557,6 +788,16 @@ simde_uint16x8x2_t simde_vld2q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_u16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_[2]; + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 1); + simde_uint16x8x2_t r = { { + simde_uint16x8_from_private(r_[0]), + simde_uint16x8_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_u16( @@ -564,6 +805,10 @@ simde_vld2q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { simde_vld1q_u16(&(ptr[8])) ); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_uint16x8_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -578,6 +823,9 @@ simde_vld2q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { } }; return r; + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_POP + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -590,6 +838,16 @@ simde_uint32x4x2_t simde_vld2q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_u32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_[2]; + vuint32m1x2_t dest = __riscv_vlseg2e32_v_u32m1x2(&ptr[0], 4); + 
r_[0].sv128 = __riscv_vget_v_u32m1x2_u32m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u32m1x2_u32m1(dest, 1); + simde_uint32x4x2_t r = { { + simde_uint32x4_from_private(r_[0]), + simde_uint32x4_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_u32( @@ -597,6 +855,10 @@ simde_vld2q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_vld1q_u32(&(ptr[4])) ); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_uint32x4_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { @@ -604,6 +866,9 @@ simde_vld2q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; } } + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_POP + #endif simde_uint32x4x2_t r = { { simde_uint32x4_from_private(r_[0]), @@ -623,6 +888,16 @@ simde_uint64x2x2_t simde_vld2q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld2q_u64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_[2]; + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 1); + simde_uint64x2x2_t r = { { + simde_uint64x2_from_private(r_[0]), + simde_uint64x2_from_private(r_[1]), + } }; + return r; #else simde_uint64x2_private r_[2]; @@ -645,11 +920,65 @@ simde_vld2q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #define vld2q_u64(a) simde_vld2q_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x2_t +simde_vld2q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld2q_f16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + simde_float16x8_private r_[2]; + vfloat16m1x2_t dest = __riscv_vlseg2e16_v_f16m1x2((_Float16 *)&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_f16m1x2_f16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f16m1x2_f16m1(dest, 1); + simde_float16x8x2_t r = { { + simde_float16x8_from_private(r_[0]), + simde_float16x8_from_private(r_[1]), + } }; + return r; + #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif + simde_float16x8_private r_[2]; + + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_POP + #endif + + simde_float16x8x2_t r = { { + simde_float16x8_from_private(r_[0]), + simde_float16x8_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_f16 + #define vld2q_f16(a) simde_vld2q_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4x2_t simde_vld2q_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld2q_f32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_[2]; + vfloat32m1x2_t dest = __riscv_vlseg2e32_v_f32m1x2(&ptr[0], 4); + r_[0].sv128 = 
__riscv_vget_v_f32m1x2_f32m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f32m1x2_f32m1(dest, 1); + simde_float32x4x2_t r = { { + simde_float32x4_from_private(r_[0]), + simde_float32x4_from_private(r_[1]), + } }; + return r; #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) return simde_vuzpq_f32( @@ -657,6 +986,10 @@ simde_vld2q_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { simde_vld1q_f32(&(ptr[4])) ); #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif simde_float32x4_private r_[2]; for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])); i++) { @@ -664,6 +997,9 @@ simde_vld2q_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; } } + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_POP + #endif simde_float32x4x2_t r = { { simde_float32x4_from_private(r_[0]), @@ -683,6 +1019,16 @@ simde_float64x2x2_t simde_vld2q_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld2q_f64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x2_private r_[2]; + vfloat64m1x2_t dest = __riscv_vlseg2e64_v_f64m1x2(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_f64m1x2_f64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f64m1x2_f64m1(dest, 1); + simde_float64x2x2_t r = { { + simde_float64x2_from_private(r_[0]), + simde_float64x2_from_private(r_[1]), + } }; + return r; #else simde_float64x2_private r_[2]; @@ -705,6 +1051,276 @@ simde_vld2q_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { #define vld2q_f64(a) simde_vld2q_f64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x2_t +simde_vld2_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_p8(ptr); + #else + simde_poly8x8_private r_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 8); + r_[0].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u8m1x2_u8m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + simde_poly8x8x2_t r = { { + simde_poly8x8_from_private(r_[0]), + simde_poly8x8_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_p8 + #define vld2_p8(a) simde_vld2_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x2_t +simde_vld2_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_p16(ptr); + #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif + simde_poly16x4_private r_[2]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u16m1x2_u16m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && 
HEDLEY_GCC_VERSION_CHECK(12,0,0) + HEDLEY_DIAGNOSTIC_POP + #endif + + simde_poly16x4x2_t r = { { + simde_poly16x4_from_private(r_[0]), + simde_poly16x4_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_p16 + #define vld2_p16(a) simde_vld2_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x2_t +simde_vld2_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld2_p64(ptr); + #else + simde_poly64x1_private r_[2]; + + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u64m1x2_u64m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly64x1x2_t r = { { + simde_poly64x1_from_private(r_[0]), + simde_poly64x1_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld2_p64 + #define vld2_p64(a) simde_vld2_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x2_t +simde_vld2q_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2q_p8(ptr); + #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif + simde_poly8x16_private r_[2]; + + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_u8m1x2_u8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u8m1x2_u8m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly8x16x2_t r = { { + simde_poly8x16_from_private(r_[0]), + simde_poly8x16_from_private(r_[1]), + } }; + + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_POP + #endif + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_p8 + #define vld2q_p8(a) simde_vld2q_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x2_t +simde_vld2q_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2q_p16(ptr); + #else + #if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ + #endif + simde_poly16x8_private r_[2]; + + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u16m1x2_u16m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly16x8x2_t r = { { + simde_poly16x8_from_private(r_[0]), + simde_poly16x8_from_private(r_[1]), + } }; + #if 
defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) && HEDLEY_GCC_VERSION_CHECK(12,0,0) && defined(SIMDE_ARCH_RISCV64) + HEDLEY_DIAGNOSTIC_POP + #endif + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_p16 + #define vld2q_p16(a) simde_vld2q_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x2_t +simde_vld2q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_p64(ptr); + #else + simde_poly64x2_private r_[2]; + + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u64m1x2_u64m1(dest, 1); + #else + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly64x2x2_t r = { { + simde_poly64x2_from_private(r_[0]), + simde_poly64x2_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_p64 + #define vld2q_p64(a) simde_vld2q_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x2_t +simde_vld2_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld2_bf16(ptr); + #else + simde_bfloat16x4_private r_[2]; + + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])) ; i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + + simde_bfloat16x4x2_t r = { { + simde_bfloat16x4_from_private(r_[0]), + simde_bfloat16x4_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld2_bf16 + #define vld2_bf16(a) simde_vld2_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x2_t +simde_vld2q_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld2q_bf16(ptr); + #else + simde_bfloat16x8_private r_[2]; + + for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + + simde_bfloat16x8x2_t r = { { + simde_bfloat16x8_from_private(r_[0]), + simde_bfloat16x8_from_private(r_[1]), + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_bf16 + #define vld2q_bf16(a) simde_vld2q_bf16((a)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/arm/neon/ld2_dup.h b/lib/simd_wrapper/simde/arm/neon/ld2_dup.h new file mode 100644 index 00000000000..238807ab743 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/ld2_dup.h @@ -0,0 +1,612 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following 
conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_LD2_DUP_H) +#define SIMDE_ARM_NEON_LD2_DUP_H + +#include "dup_n.h" +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x2_t +simde_vld2_dup_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld2_dup_f16(ptr); + #else + simde_float16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_f16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_f16 + #define vld2_dup_f16(a) simde_vld2_dup_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x2_t +simde_vld2_dup_f32(simde_float32 const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_f32(ptr); + #else + simde_float32x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_f32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_f32 + #define vld2_dup_f32(a) simde_vld2_dup_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x2_t +simde_vld2_dup_f64(simde_float64 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2_dup_f64(ptr); + #else + simde_float64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_f64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_f64 + #define vld2_dup_f64(a) simde_vld2_dup_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x2_t +simde_vld2_dup_s8(int8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_s8(ptr); + #else + simde_int8x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_s8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_s8 + #define vld2_dup_s8(a) simde_vld2_dup_s8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x2_t +simde_vld2_dup_s16(int16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_s16(ptr); + #else + simde_int16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_s16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_s16 + #define vld2_dup_s16(a) simde_vld2_dup_s16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x2_t +simde_vld2_dup_s32(int32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_s32(ptr); + #else + simde_int32x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_s32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + 
#undef vld2_dup_s32 + #define vld2_dup_s32(a) simde_vld2_dup_s32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x2_t +simde_vld2_dup_s64(int64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_s64(ptr); + #else + simde_int64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_s64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_s64 + #define vld2_dup_s64(a) simde_vld2_dup_s64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x2_t +simde_vld2_dup_u8(uint8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_u8(ptr); + #else + simde_uint8x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_u8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_u8 + #define vld2_dup_u8(a) simde_vld2_dup_u8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x2_t +simde_vld2_dup_u16(uint16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_u16(ptr); + #else + simde_uint16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_u16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_u16 + #define vld2_dup_u16(a) simde_vld2_dup_u16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x2_t +simde_vld2_dup_u32(uint32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_u32(ptr); + #else + simde_uint32x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_u32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_u32 + #define vld2_dup_u32(a) simde_vld2_dup_u32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x2_t +simde_vld2_dup_u64(uint64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_u64(ptr); + #else + simde_uint64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_u64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_u64 + #define vld2_dup_u64(a) simde_vld2_dup_u64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x2_t +simde_vld2q_dup_f16(simde_float16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld2q_dup_f16(ptr); + #else + simde_float16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_f16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_f16 + #define vld2q_dup_f16(a) simde_vld2q_dup_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x2_t +simde_vld2q_dup_f32(simde_float32 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_f32(ptr); + #else + simde_float32x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_f32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_f32 + #define vld2q_dup_f32(a) simde_vld2q_dup_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x2_t +simde_vld2q_dup_f64(simde_float64 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_f64(ptr); + #else + simde_float64x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_f64(ptr[i]); + } + return r; + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_f64 + #define vld2q_dup_f64(a) simde_vld2q_dup_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x2_t +simde_vld2q_dup_s8(int8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_s8(ptr); + #else + simde_int8x16x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_s8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_s8 + #define vld2q_dup_s8(a) simde_vld2q_dup_s8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x2_t +simde_vld2q_dup_s16(int16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_s16(ptr); + #else + simde_int16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_s16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_s16 + #define vld2q_dup_s16(a) simde_vld2q_dup_s16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x2_t +simde_vld2q_dup_s32(int32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_s32(ptr); + #else + simde_int32x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_s32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_s32 + #define vld2q_dup_s32(a) simde_vld2q_dup_s32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x2_t +simde_vld2q_dup_s64(int64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_s64(ptr); + #else + simde_int64x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_s64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_s64 + #define vld2q_dup_s64(a) simde_vld2q_dup_s64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x2_t +simde_vld2q_dup_u8(uint8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_u8(ptr); + #else + simde_uint8x16x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_u8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_u8 + #define vld2q_dup_u8(a) simde_vld2q_dup_u8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x2_t +simde_vld2q_dup_u16(uint16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_u16(ptr); + #else + simde_uint16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_u16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_u16 + #define vld2q_dup_u16(a) simde_vld2q_dup_u16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x2_t +simde_vld2q_dup_u32(uint32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_u32(ptr); + #else + simde_uint32x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_u32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_u32 + #define vld2q_dup_u32(a) simde_vld2q_dup_u32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x2_t +simde_vld2q_dup_u64(uint64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_u64(ptr); + #else + simde_uint64x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_u64(ptr[i]); + } + return r; + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_u64 + #define vld2q_dup_u64(a) simde_vld2q_dup_u64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x2_t +simde_vld2_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_p8(ptr); + #else + simde_poly8x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_p8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_p8 + #define vld2_dup_p8(a) simde_vld2_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x2_t +simde_vld2_dup_p16(simde_poly16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld2_dup_p16(ptr); + #else + simde_poly16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_p16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_p16 + #define vld2_dup_p16(a) simde_vld2_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x2_t +simde_vld2_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld2_dup_p64(ptr); + #else + simde_poly64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_p64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_p64 + #define vld2_dup_p64(a) simde_vld2_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x2_t +simde_vld2q_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) && \ + !defined(SIMDE_BUG_CLANG_71763) + return vld2q_dup_p8(ptr); + #else + simde_poly8x16x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_p8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_p8 + #define vld2q_dup_p8(a) simde_vld2q_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x2_t +simde_vld2q_dup_p16(simde_poly16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) && \ + !defined(SIMDE_BUG_CLANG_71763) + return vld2q_dup_p16(ptr); + #else + simde_poly16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_p16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_p16 + #define vld2q_dup_p16(a) simde_vld2q_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x2_t +simde_vld2q_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld2q_dup_p64(ptr); + #else + simde_poly64x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_p64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_p64 + #define vld2q_dup_p64(a) simde_vld2q_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x2_t +simde_vld2_dup_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld2_dup_bf16(ptr); + #else + simde_bfloat16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdup_n_bf16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld2_dup_bf16 + #define vld2_dup_bf16(a) simde_vld2_dup_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x2_t +simde_vld2q_dup_bf16(simde_bfloat16 const 
* ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld2q_dup_bf16(ptr); + #else + simde_bfloat16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + r.val[i] = simde_vdupq_n_bf16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_dup_bf16 + #define vld2q_dup_bf16(a) simde_vld2q_dup_bf16((a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD2_DUP_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/ld2_lane.h b/lib/simd_wrapper/simde/arm/neon/ld2_lane.h new file mode 100644 index 00000000000..81b29dd2005 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/ld2_lane.h @@ -0,0 +1,638 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_LD2_LANE_H) +#define SIMDE_ARM_NEON_LD2_LANE_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x2_t simde_vld2_lane_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int8x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_int8x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_int8x8_private tmp_ = simde_int8x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int8x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_s8(ptr, src, lane) vld2_lane_s8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_s8 + #define vld2_lane_s8(ptr, src, lane) simde_vld2_lane_s8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x2_t simde_vld2_lane_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int16x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_int16x4_private tmp_ = simde_int16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_s16(ptr, src, lane) vld2_lane_s16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_s16 + #define vld2_lane_s16(ptr, src, lane) simde_vld2_lane_s16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x2_t simde_vld2_lane_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int32x2x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_int32x2_private tmp_ = simde_int32x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int32x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_s32(ptr, src, lane) vld2_lane_s32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_s32 + #define vld2_lane_s32(ptr, src, lane) simde_vld2_lane_s32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x2_t simde_vld2_lane_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int64x1x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_int64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_int64x1_private tmp_ = simde_int64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2_lane_s64(ptr, src, lane) vld2_lane_s64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_s64 + #define vld2_lane_s64(ptr, src, lane) simde_vld2_lane_s64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x2_t simde_vld2_lane_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint8x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_uint8x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint8x8_private tmp_ = simde_uint8x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint8x8_from_private(tmp_); + } + return r; +} +#if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_u8(ptr, src, lane) vld2_lane_u8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_u8 + #define vld2_lane_u8(ptr, src, lane) simde_vld2_lane_u8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x2_t simde_vld2_lane_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint16x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_uint16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint16x4_private tmp_ = simde_uint16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_u16(ptr, src, lane) vld2_lane_u16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_u16 + #define vld2_lane_u16(ptr, src, lane) simde_vld2_lane_u16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x2_t simde_vld2_lane_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint32x2x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_uint32x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint32x2_private tmp_ = simde_uint32x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint32x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_u32(ptr, src, lane) vld2_lane_u32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_u32 + #define vld2_lane_u32(ptr, src, lane) simde_vld2_lane_u32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x2_t simde_vld2_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x1x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_uint64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint64x1_private tmp_ = simde_uint64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2_lane_u64(ptr, src, lane) vld2_lane_u64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_u64 + #define vld2_lane_u64(ptr, src, lane) simde_vld2_lane_u64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x2_t simde_vld2_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float16x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_float16x4_private tmp_ = simde_float16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vld2_lane_f16(ptr, src, lane) vld2_lane_f16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_f16 + #define vld2_lane_f16(ptr, src, lane) simde_vld2_lane_f16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x2_t simde_vld2_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float32x2x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float32x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_float32x2_private tmp_ = 
simde_float32x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float32x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_f32(ptr, src, lane) vld2_lane_f32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_f32 + #define vld2_lane_f32(ptr, src, lane) simde_vld2_lane_f32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x2_t simde_vld2_lane_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float64x1x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_float64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_float64x1_private tmp_ = simde_float64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2_lane_f64(ptr, src, lane) vld2_lane_f64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_f64 + #define vld2_lane_f64(ptr, src, lane) simde_vld2_lane_f64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x2_t simde_vld2q_lane_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int8x16x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_int8x16x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_int8x16_private tmp_ = simde_int8x16_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int8x16_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2q_lane_s8(ptr, src, lane) vld2q_lane_s8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_s8 + #define vld2q_lane_s8(ptr, src, lane) simde_vld2q_lane_s8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x2_t simde_vld2q_lane_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int16x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_int16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_int16x8_private tmp_ = simde_int16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_s16 + #define vld2q_lane_s16(ptr, src, lane) simde_vld2q_lane_s16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x2_t simde_vld2q_lane_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int32x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_int32x4_private tmp_ = simde_int32x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int32x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_s32 + #define vld2q_lane_s32(ptr, src, lane) simde_vld2q_lane_s32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x2_t simde_vld2q_lane_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_int64x2x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int64x2x2_t r; + 
+ for (size_t i = 0 ; i < 2 ; i++) { + simde_int64x2_private tmp_ = simde_int64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2q_lane_s64(ptr, src, lane) vld2q_lane_s64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_s64 + #define vld2q_lane_s64(ptr, src, lane) simde_vld2q_lane_s64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x2_t simde_vld2q_lane_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint8x16x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_uint8x16x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint8x16_private tmp_ = simde_uint8x16_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint8x16_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2q_lane_u8(ptr, src, lane) vld2q_lane_u8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_u8 + #define vld2q_lane_u8(ptr, src, lane) simde_vld2q_lane_u8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x2_t simde_vld2q_lane_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint16x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_uint16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint16x8_private tmp_ = simde_uint16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_u16 + #define vld2q_lane_u16(ptr, src, lane) simde_vld2q_lane_u16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x2_t simde_vld2q_lane_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint32x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_uint32x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint32x4_private tmp_ = simde_uint32x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint32x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_u32 + #define vld2q_lane_u32(ptr, src, lane) simde_vld2q_lane_u32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x2_t simde_vld2q_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x2x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_uint64x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_uint64x2_private tmp_ = simde_uint64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2q_lane_u64(ptr, src, lane) vld2q_lane_u64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_u64 + #define vld2q_lane_u64(ptr, src, lane) simde_vld2q_lane_u64((ptr), (src), (lane)) +#endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x2_t simde_vld2q_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(2)], 
simde_float16x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_float16x8_private tmp_ = simde_float16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vld2q_lane_f16(ptr, src, lane) vld2q_lane_f16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_f16 + #define vld2q_lane_f16(ptr, src, lane) simde_vld2q_lane_f16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x2_t simde_vld2q_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float32x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_float32x4_private tmp_ = simde_float32x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float32x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2q_lane_f32(ptr, src, lane) vld2q_lane_f32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_f32 + #define vld2q_lane_f32(ptr, src, lane) simde_vld2q_lane_f32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x2_t simde_vld2q_lane_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_float64x2x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float64x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_float64x2_private tmp_ = simde_float64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2q_lane_f64(ptr, src, lane) vld2q_lane_f64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_f64 + #define vld2q_lane_f64(ptr, src, lane) simde_vld2q_lane_f64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x2_t simde_vld2_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly8x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly8x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_poly8x8_private tmp_ = simde_poly8x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly8x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_p8(ptr, src, lane) vld2_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_p8 + #define vld2_lane_p8(ptr, src, lane) simde_vld2_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x2_t simde_vld2_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly16x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_poly16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_poly16x4_private tmp_ = simde_poly16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2_lane_p16(ptr, src, lane) vld2_lane_p16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_p16 + #define vld2_lane_p16(ptr, src, lane) 
simde_vld2_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x2_t simde_vld2_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x1x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_poly64x1x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_poly64x1_private tmp_ = simde_poly64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2_lane_p64(ptr, src, lane) vld2_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_p64 + #define vld2_lane_p64(ptr, src, lane) simde_vld2_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x2_t simde_vld2q_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly8x16x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_poly8x16x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_poly8x16_private tmp_ = simde_poly8x16_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly8x16_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2q_lane_p8(ptr, src, lane) vld2q_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_p8 + #define vld2q_lane_p8(ptr, src, lane) simde_vld2q_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x2_t simde_vld2q_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly16x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_poly16x8_private tmp_ = simde_poly16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld2q_lane_p16(ptr, src, lane) vld2q_lane_p16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_p16 + #define vld2q_lane_p16(ptr, src, lane) simde_vld2q_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x2_t simde_vld2q_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x2x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_poly64x2x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_poly64x2_private tmp_ = simde_poly64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld2q_lane_p64(ptr, src, lane) vld2q_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_p64 + #define vld2q_lane_p64(ptr, src, lane) simde_vld2q_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x2_t simde_vld2_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_bfloat16x4x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_bfloat16x4x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_bfloat16x4_private tmp_ = simde_bfloat16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_bfloat16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld2_lane_bf16(ptr, src, lane) 
vld2_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld2_lane_bf16 + #define vld2_lane_bf16(ptr, src, lane) simde_vld2_lane_bf16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x2_t simde_vld2q_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(2)], simde_bfloat16x8x2_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_bfloat16x8x2_t r; + + for (size_t i = 0 ; i < 2 ; i++) { + simde_bfloat16x8_private tmp_ = simde_bfloat16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_bfloat16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld2q_lane_bf16(ptr, src, lane) vld2q_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld2q_lane_bf16 + #define vld2q_lane_bf16(ptr, src, lane) simde_vld2q_lane_bf16((ptr), (src), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD2_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/ld3.h b/lib/simd_wrapper/simde/arm/neon/ld3.h index e13eff1dbc1..a60c2aa0d9d 100644 --- a/lib/simd_wrapper/simde/arm/neon/ld3.h +++ b/lib/simd_wrapper/simde/arm/neon/ld3.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD3_H) @@ -40,6 +42,39 @@ SIMDE_BEGIN_DECLS_ #if !defined(SIMDE_BUG_INTEL_857088) +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x3_t +simde_vld3_f16(simde_float16_t const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld3_f16(ptr); + #else + simde_float16x4_private r_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x3_t dest = __riscv_vlseg3e16_v_f16m1x3((_Float16 *)&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_f16m1x3_f16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f16m1x3_f16m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_f16m1x3_f16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + simde_float16x4x3_t r = { { + simde_float16x4_from_private(r_[0]), + simde_float16x4_from_private(r_[1]), + simde_float16x4_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_f16 + #define vld3_f16(a) simde_vld3_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2x3_t simde_vld3_f32(simde_float32 const *ptr) { @@ -47,13 +82,18 @@ simde_vld3_f32(simde_float32 const *ptr) { return vld3_f32(ptr); #else simde_float32x2_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x3_t dest = __riscv_vlseg3e32_v_f32m1x3(&ptr[0], 2); + r_[0].sv64 = __riscv_vget_v_f32m1x3_f32m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f32m1x3_f32m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_f32m1x3_f32m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) 
; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_float32x2x3_t r = { { simde_float32x2_from_private(r_[0]), simde_float32x2_from_private(r_[1]), @@ -75,13 +115,18 @@ simde_vld3_f64(simde_float64 const *ptr) { return vld3_f64(ptr); #else simde_float64x1_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x3_t dest = __riscv_vlseg3e64_v_f64m1x3(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_f64m1x3_f64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_f64m1x3_f64m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_f64m1x3_f64m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_float64x1x3_t r = { { simde_float64x1_from_private(r_[0]), simde_float64x1_from_private(r_[1]), @@ -103,13 +148,18 @@ simde_vld3_s8(int8_t const *ptr) { return vld3_s8(ptr); #else simde_int8x8_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x3_t dest = __riscv_vlseg3e8_v_i8m1x3(&ptr[0], 8); + r_[0].sv64 = __riscv_vget_v_i8m1x3_i8m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_i8m1x3_i8m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_i8m1x3_i8m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_int8x8x3_t r = { { simde_int8x8_from_private(r_[0]), simde_int8x8_from_private(r_[1]), @@ -131,13 +181,18 @@ simde_vld3_s16(int16_t const *ptr) { return vld3_s16(ptr); #else simde_int16x4_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x3_t dest = __riscv_vlseg3e16_v_i16m1x3(&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_i16m1x3_i16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_i16m1x3_i16m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_i16m1x3_i16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_int16x4x3_t r = { { simde_int16x4_from_private(r_[0]), simde_int16x4_from_private(r_[1]), @@ -159,13 +214,18 @@ simde_vld3_s32(int32_t const *ptr) { return vld3_s32(ptr); #else simde_int32x2_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x3_t dest = __riscv_vlseg3e32_v_i32m1x3(&ptr[0], 2); + r_[0].sv64 = __riscv_vget_v_i32m1x3_i32m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_i32m1x3_i32m1(dest, 1); + r_[2].sv64 = 
__riscv_vget_v_i32m1x3_i32m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_int32x2x3_t r = { { simde_int32x2_from_private(r_[0]), simde_int32x2_from_private(r_[1]), @@ -187,13 +247,18 @@ simde_vld3_s64(int64_t const *ptr) { return vld3_s64(ptr); #else simde_int64x1_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x3_t dest = __riscv_vlseg3e64_v_i64m1x3(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_i64m1x3_i64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_i64m1x3_i64m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_i64m1x3_i64m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_int64x1x3_t r = { { simde_int64x1_from_private(r_[0]), simde_int64x1_from_private(r_[1]), @@ -203,7 +268,7 @@ simde_vld3_s64(int64_t const *ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vld3_s64 #define vld3_s64(a) simde_vld3_s64((a)) #endif @@ -215,13 +280,18 @@ simde_vld3_u8(uint8_t const *ptr) { return vld3_u8(ptr); #else simde_uint8x8_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 8); + r_[0].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_uint8x8x3_t r = { { simde_uint8x8_from_private(r_[0]), simde_uint8x8_from_private(r_[1]), @@ -243,13 +313,18 @@ simde_vld3_u16(uint16_t const *ptr) { return vld3_u16(ptr); #else simde_uint16x4_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_uint16x4x3_t r = { { simde_uint16x4_from_private(r_[0]), simde_uint16x4_from_private(r_[1]), @@ -271,13 +346,18 @@ simde_vld3_u32(uint32_t const *ptr) { return vld3_u32(ptr); #else simde_uint32x2_private r_[3]; - - for (size_t i = 0; i < 
(sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1x3_t dest = __riscv_vlseg3e32_v_u32m1x3(&ptr[0], 2); + r_[0].sv64 = __riscv_vget_v_u32m1x3_u32m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u32m1x3_u32m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u32m1x3_u32m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_uint32x2x3_t r = { { simde_uint32x2_from_private(r_[0]), simde_uint32x2_from_private(r_[1]), @@ -299,13 +379,18 @@ simde_vld3_u64(uint64_t const *ptr) { return vld3_u64(ptr); #else simde_uint64x1_private r_[3]; - - for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { - for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { - r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } } - } - + #endif simde_uint64x1x3_t r = { { simde_uint64x1_from_private(r_[0]), simde_uint64x1_from_private(r_[1]), @@ -315,16 +400,61 @@ simde_vld3_u64(uint64_t const *ptr) { return r; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vld3_u64 #define vld3_u64(a) simde_vld3_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x3_t +simde_vld3q_f16(simde_float16_t const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld3q_f16(ptr); + #else + simde_float16x8_private r_[3]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x3_t dest = __riscv_vlseg3e16_v_f16m1x3((_Float16 *)&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_f16m1x3_f16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f16m1x3_f16m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_f16m1x3_f16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + simde_float16x8x3_t r = { { + simde_float16x8_from_private(r_[0]), + simde_float16x8_from_private(r_[1]), + simde_float16x8_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_f16 + #define vld3q_f16(a) simde_vld3q_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4x3_t simde_vld3q_f32(simde_float32 const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_f32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private r_[3]; + vfloat32m1x3_t dest = __riscv_vlseg3e32_v_f32m1x3(&ptr[0], 4); + r_[0].sv128 = __riscv_vget_v_f32m1x3_f32m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f32m1x3_f32m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_f32m1x3_f32m1(dest, 2); + 
simde_float32x4x3_t r = { { + simde_float32x4_from_private(r_[0]), + simde_float32x4_from_private(r_[1]), + simde_float32x4_from_private(r_[2]) + } }; + return r; #else simde_float32x4_private r_[3]; @@ -353,6 +483,18 @@ simde_float64x2x3_t simde_vld3q_f64(simde_float64 const *ptr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld3q_f64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float64x2_private r_[3]; + vfloat64m1x3_t dest = __riscv_vlseg3e64_v_f64m1x3(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_f64m1x3_f64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_f64m1x3_f64m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_f64m1x3_f64m1(dest, 2); + simde_float64x2x3_t r = { { + simde_float64x2_from_private(r_[0]), + simde_float64x2_from_private(r_[1]), + simde_float64x2_from_private(r_[2]) + } }; + return r; #else simde_float64x2_private r_[3]; @@ -381,6 +523,18 @@ simde_int8x16x3_t simde_vld3q_s8(int8_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_s8(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private r_[3]; + vint8m1x3_t dest = __riscv_vlseg3e8_v_i8m1x3(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_i8m1x3_i8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i8m1x3_i8m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_i8m1x3_i8m1(dest, 2); + simde_int8x16x3_t r = { { + simde_int8x16_from_private(r_[0]), + simde_int8x16_from_private(r_[1]), + simde_int8x16_from_private(r_[2]) + } }; + return r; #else simde_int8x16_private r_[3]; @@ -409,6 +563,18 @@ simde_int16x8x3_t simde_vld3q_s16(int16_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_s16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private r_[3]; + vint16m1x3_t dest = __riscv_vlseg3e16_v_i16m1x3(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_i16m1x3_i16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i16m1x3_i16m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_i16m1x3_i16m1(dest, 2); + simde_int16x8x3_t r = { { + simde_int16x8_from_private(r_[0]), + simde_int16x8_from_private(r_[1]), + simde_int16x8_from_private(r_[2]) + } }; + return r; #else simde_int16x8_private r_[3]; @@ -437,6 +603,18 @@ simde_int32x4x3_t simde_vld3q_s32(int32_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_s32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private r_[3]; + vint32m1x3_t dest = __riscv_vlseg3e32_v_i32m1x3(&ptr[0], 4); + r_[0].sv128 = __riscv_vget_v_i32m1x3_i32m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i32m1x3_i32m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_i32m1x3_i32m1(dest, 2); + simde_int32x4x3_t r = { { + simde_int32x4_from_private(r_[0]), + simde_int32x4_from_private(r_[1]), + simde_int32x4_from_private(r_[2]) + } }; + return r; #else simde_int32x4_private r_[3]; @@ -465,6 +643,18 @@ simde_int64x2x3_t simde_vld3q_s64(int64_t const *ptr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld3q_s64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private r_[3]; + vint64m1x3_t dest = __riscv_vlseg3e64_v_i64m1x3(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_i64m1x3_i64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_i64m1x3_i64m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_i64m1x3_i64m1(dest, 2); + simde_int64x2x3_t r = { { + simde_int64x2_from_private(r_[0]), + simde_int64x2_from_private(r_[1]), + simde_int64x2_from_private(r_[2]) + } }; + return r; #else simde_int64x2_private r_[3]; @@ -494,6 +684,18 @@ simde_uint8x16x3_t simde_vld3q_u8(uint8_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_u8(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + 
simde_uint8x16_private r_[3]; + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 2); + simde_uint8x16x3_t r = { { + simde_uint8x16_from_private(r_[0]), + simde_uint8x16_from_private(r_[1]), + simde_uint8x16_from_private(r_[2]) + } }; + return r; #else simde_uint8x16_private r_[3]; @@ -522,6 +724,18 @@ simde_uint16x8x3_t simde_vld3q_u16(uint16_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_u16(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private r_[3]; + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 2); + simde_uint16x8x3_t r = { { + simde_uint16x8_from_private(r_[0]), + simde_uint16x8_from_private(r_[1]), + simde_uint16x8_from_private(r_[2]) + } }; + return r; #else simde_uint16x8_private r_[3]; @@ -550,6 +764,18 @@ simde_uint32x4x3_t simde_vld3q_u32(uint32_t const *ptr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vld3q_u32(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private r_[3]; + vuint32m1x3_t dest = __riscv_vlseg3e32_v_u32m1x3(&ptr[0], 4); + r_[0].sv128 = __riscv_vget_v_u32m1x3_u32m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u32m1x3_u32m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u32m1x3_u32m1(dest, 2); + simde_uint32x4x3_t r = { { + simde_uint32x4_from_private(r_[0]), + simde_uint32x4_from_private(r_[1]), + simde_uint32x4_from_private(r_[2]) + } }; + return r; #else simde_uint32x4_private r_[3]; @@ -578,6 +804,18 @@ simde_uint64x2x3_t simde_vld3q_u64(uint64_t const *ptr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vld3q_u64(ptr); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private r_[3]; + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 2); + simde_uint64x2x3_t r = { { + simde_uint64x2_from_private(r_[0]), + simde_uint64x2_from_private(r_[1]), + simde_uint64x2_from_private(r_[2]) + } }; + return r; #else simde_uint64x2_private r_[3]; @@ -601,6 +839,272 @@ simde_vld3q_u64(uint64_t const *ptr) { #define vld3q_u64(a) simde_vld3q_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x3_t +simde_vld3_p8(simde_poly8_t const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_p8(ptr); + #else + simde_poly8x8_private r_[3]; + + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 8); + r_[0].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u8m1x3_u8m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly8x8x3_t r = { { + simde_poly8x8_from_private(r_[0]), + simde_poly8x8_from_private(r_[1]), + simde_poly8x8_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_p8 + #define vld3_p8(a) simde_vld3_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x3_t +simde_vld3_p16(simde_poly16_t const *ptr) { + #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_p16(ptr); + #else + simde_poly16x4_private r_[3]; + + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 4); + r_[0].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u16m1x3_u16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly16x4x3_t r = { { + simde_poly16x4_from_private(r_[0]), + simde_poly16x4_from_private(r_[1]), + simde_poly16x4_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_p16 + #define vld3_p16(a) simde_vld3_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x3_t +simde_vld3_p64(simde_poly64_t const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld3_p64(ptr); + #else + simde_poly64x1_private r_[3]; + + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 1); + r_[0].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 0); + r_[1].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 1); + r_[2].sv64 = __riscv_vget_v_u64m1x3_u64m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly64x1x3_t r = { { + simde_poly64x1_from_private(r_[0]), + simde_poly64x1_from_private(r_[1]), + simde_poly64x1_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld3_p64 + #define vld3_p64(a) simde_vld3_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x3_t +simde_vld3q_p8(simde_poly8_t const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3q_p8(ptr); + #else + simde_poly8x16_private r_[3]; + + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u8m1x3_u8m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly8x16x3_t r = { { + simde_poly8x16_from_private(r_[0]), + simde_poly8x16_from_private(r_[1]), + simde_poly8x16_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_p8 + #define vld3q_p8(a) simde_vld3q_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x3_t +simde_vld3q_p16(simde_poly16_t const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3q_p16(ptr); + #else + simde_poly16x8_private r_[3]; + + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(&ptr[0], 8); + r_[0].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u16m1x3_u16m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + 
(j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly16x8x3_t r = { { + simde_poly16x8_from_private(r_[0]), + simde_poly16x8_from_private(r_[1]), + simde_poly16x8_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_p16 + #define vld3q_p16(a) simde_vld3q_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x3_t +simde_vld3q_p64(simde_poly64_t const *ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_p64(ptr); + #else + simde_poly64x2_private r_[3]; + + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(&ptr[0], 2); + r_[0].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u64m1x3_u64m1(dest, 2); + #else + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + #endif + + simde_poly64x2x3_t r = { { + simde_poly64x2_from_private(r_[0]), + simde_poly64x2_from_private(r_[1]), + simde_poly64x2_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_p64 + #define vld3q_p64(a) simde_vld3q_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x3_t +simde_vld3_bf16(simde_bfloat16 const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld3_bf16(ptr); + #else + simde_bfloat16x4_private r_[3]; + + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + + simde_bfloat16x4x3_t r = { { + simde_bfloat16x4_from_private(r_[0]), + simde_bfloat16x4_from_private(r_[1]), + simde_bfloat16x4_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld3_bf16 + #define vld3_bf16(a) simde_vld3_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x3_t +simde_vld3q_bf16(simde_bfloat16 const *ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld3q_bf16(ptr); + #else + simde_bfloat16x8_private r_[3]; + + for (size_t i = 0; i < (sizeof(r_) / sizeof(r_[0])); i++) { + for (size_t j = 0 ; j < (sizeof(r_[0].values) / sizeof(r_[0].values[0])) ; j++) { + r_[i].values[j] = ptr[i + (j * (sizeof(r_) / sizeof(r_[0])))]; + } + } + + simde_bfloat16x8x3_t r = { { + simde_bfloat16x8_from_private(r_[0]), + simde_bfloat16x8_from_private(r_[1]), + simde_bfloat16x8_from_private(r_[2]) + } }; + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_bf16 + #define vld3q_bf16(a) simde_vld3q_bf16((a)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/arm/neon/ld3_dup.h b/lib/simd_wrapper/simde/arm/neon/ld3_dup.h new file mode 100644 index 00000000000..25f133b694a --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/ld3_dup.h @@ -0,0 +1,610 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, 
distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_LD3_DUP_H) +#define SIMDE_ARM_NEON_LD3_DUP_H + +#include "dup_n.h" +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x3_t +simde_vld3_dup_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(3)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld3_dup_f16(ptr); + #else + simde_float16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_f16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_f16 + #define vld3_dup_f16(a) simde_vld3_dup_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x3_t +simde_vld3_dup_f32(simde_float32 const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_f32(ptr); + #else + simde_float32x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_f32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_f32 + #define vld3_dup_f32(a) simde_vld3_dup_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x3_t +simde_vld3_dup_f64(simde_float64 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3_dup_f64(ptr); + #else + simde_float64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_f64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_f64 + #define vld3_dup_f64(a) simde_vld3_dup_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x3_t +simde_vld3_dup_s8(int8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_s8(ptr); + #else + simde_int8x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_s8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_s8 + #define vld3_dup_s8(a) simde_vld3_dup_s8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x3_t +simde_vld3_dup_s16(int16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_s16(ptr); + #else + simde_int16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_s16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_s16 + #define vld3_dup_s16(a) simde_vld3_dup_s16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x3_t +simde_vld3_dup_s32(int32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_s32(ptr); + #else + simde_int32x2x3_t r; + + for 
(size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_s32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_s32 + #define vld3_dup_s32(a) simde_vld3_dup_s32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x3_t +simde_vld3_dup_s64(int64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_s64(ptr); + #else + simde_int64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_s64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_s64 + #define vld3_dup_s64(a) simde_vld3_dup_s64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x3_t +simde_vld3_dup_u8(uint8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_u8(ptr); + #else + simde_uint8x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_u8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_u8 + #define vld3_dup_u8(a) simde_vld3_dup_u8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x3_t +simde_vld3_dup_u16(uint16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_u16(ptr); + #else + simde_uint16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_u16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_u16 + #define vld3_dup_u16(a) simde_vld3_dup_u16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x3_t +simde_vld3_dup_u32(uint32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_u32(ptr); + #else + simde_uint32x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_u32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_u32 + #define vld3_dup_u32(a) simde_vld3_dup_u32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x3_t +simde_vld3_dup_u64(uint64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld3_dup_u64(ptr); + #else + simde_uint64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_u64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_u64 + #define vld3_dup_u64(a) simde_vld3_dup_u64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x3_t +simde_vld3q_dup_f16(simde_float16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld3q_dup_f16(ptr); + #else + simde_float16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_f16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_f16 + #define vld3q_dup_f16(a) simde_vld3q_dup_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x3_t +simde_vld3q_dup_f32(simde_float32 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_f32(ptr); + #else + simde_float32x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_f32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_f32 + #define vld3q_dup_f32(a) simde_vld3q_dup_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x3_t +simde_vld3q_dup_f64(simde_float64 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_f64(ptr); + #else + 
simde_float64x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_f64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_f64 + #define vld3q_dup_f64(a) simde_vld3q_dup_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x3_t +simde_vld3q_dup_s8(int8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_s8(ptr); + #else + simde_int8x16x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_s8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_s8 + #define vld3q_dup_s8(a) simde_vld3q_dup_s8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x3_t +simde_vld3q_dup_s16(int16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_s16(ptr); + #else + simde_int16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_s16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_s16 + #define vld3q_dup_s16(a) simde_vld3q_dup_s16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x3_t +simde_vld3q_dup_s32(int32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_s32(ptr); + #else + simde_int32x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_s32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_s32 + #define vld3q_dup_s32(a) simde_vld3q_dup_s32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x3_t +simde_vld3q_dup_s64(int64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_s64(ptr); + #else + simde_int64x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_s64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_s64 + #define vld3q_dup_s64(a) simde_vld3q_dup_s64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x3_t +simde_vld3q_dup_u8(uint8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_u8(ptr); + #else + simde_uint8x16x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_u8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_u8 + #define vld3q_dup_u8(a) simde_vld3q_dup_u8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x3_t +simde_vld3q_dup_u16(uint16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_u16(ptr); + #else + simde_uint16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_u16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_u16 + #define vld3q_dup_u16(a) simde_vld3q_dup_u16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x3_t +simde_vld3q_dup_u32(uint32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_u32(ptr); + #else + simde_uint32x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_u32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_u32 + #define vld3q_dup_u32(a) simde_vld3q_dup_u32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x3_t +simde_vld3q_dup_u64(uint64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_u64(ptr); + #else + simde_uint64x2x3_t 
r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_u64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_u64 + #define vld3q_dup_u64(a) simde_vld3q_dup_u64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x3_t +simde_vld3_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld3_dup_p8(ptr); + #else + simde_poly8x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_p8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_p8 + #define vld3_dup_p8(a) simde_vld3_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x3_t +simde_vld3_dup_p16(simde_poly16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld3_dup_p16(ptr); + #else + simde_poly16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_p16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_p16 + #define vld3_dup_p16(a) simde_vld3_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x3_t +simde_vld3_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld3_dup_p64(ptr); + #else + simde_poly64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_p64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_p64 + #define vld3_dup_p64(a) simde_vld3_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x3_t +simde_vld3q_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld3q_dup_p8(ptr); + #else + simde_poly8x16x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_p8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_p8 + #define vld3q_dup_p8(a) simde_vld3q_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x3_t +simde_vld3q_dup_p16(simde_poly16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld3q_dup_p16(ptr); + #else + simde_poly16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_p16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_p16 + #define vld3q_dup_p16(a) simde_vld3q_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x3_t +simde_vld3q_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld3q_dup_p64(ptr); + #else + simde_poly64x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_p64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_p64 + #define vld3q_dup_p64(a) simde_vld3q_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x3_t +simde_vld3_dup_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld3_dup_bf16(ptr); + #else + simde_bfloat16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdup_n_bf16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld3_dup_bf16 + #define vld3_dup_bf16(a) 
simde_vld3_dup_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x3_t +simde_vld3q_dup_bf16(simde_bfloat16 const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld3q_dup_bf16(ptr); + #else + simde_bfloat16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + r.val[i] = simde_vdupq_n_bf16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_dup_bf16 + #define vld3q_dup_bf16(a) simde_vld3q_dup_bf16((a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD3_DUP_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/ld3_lane.h b/lib/simd_wrapper/simde/arm/neon/ld3_lane.h new file mode 100644 index 00000000000..4950792a8a6 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/ld3_lane.h @@ -0,0 +1,638 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
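The vld3*_dup fallbacks added above all follow one pattern: read three consecutive scalars from ptr and broadcast each into its own vector with vdup(q)_n. A minimal usage sketch (illustrative only, not part of the patch; it assumes the vendored header is reachable as simde/arm/neon.h):

#include <stdio.h>
#include <stdint.h>
#include "simde/arm/neon.h"

int main(void) {
    int32_t src[3] = { 10, 20, 30 };

    /* val[0] holds 10 in every lane, val[1] holds 20, val[2] holds 30. */
    simde_int32x4x3_t d = simde_vld3q_dup_s32(src);

    int32_t lanes[4];
    simde_vst1q_s32(lanes, d.val[1]);
    printf("%d %d %d %d\n", lanes[0], lanes[1], lanes[2], lanes[3]); /* 20 20 20 20 */
    return 0;
}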
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_LD3_LANE_H) +#define SIMDE_ARM_NEON_LD3_LANE_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x3_t simde_vld3_lane_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int8x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_int8x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_int8x8_private tmp_ = simde_int8x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int8x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_s8(ptr, src, lane) vld3_lane_s8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_s8 + #define vld3_lane_s8(ptr, src, lane) simde_vld3_lane_s8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x3_t simde_vld3_lane_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int16x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_int16x4_private tmp_ = simde_int16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_s16(ptr, src, lane) vld3_lane_s16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_s16 + #define vld3_lane_s16(ptr, src, lane) simde_vld3_lane_s16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x3_t simde_vld3_lane_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int32x2x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_int32x2_private tmp_ = simde_int32x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int32x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_s32(ptr, src, lane) vld3_lane_s32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_s32 + #define vld3_lane_s32(ptr, src, lane) simde_vld3_lane_s32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x3_t simde_vld3_lane_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int64x1x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_int64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_int64x1_private tmp_ = simde_int64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3_lane_s64(ptr, src, lane) vld3_lane_s64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_s64 + #define vld3_lane_s64(ptr, src, lane) simde_vld3_lane_s64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x3_t simde_vld3_lane_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint8x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_uint8x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint8x8_private tmp_ = simde_uint8x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint8x8_from_private(tmp_); + } + return r; +} +#if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_u8(ptr, src, lane) vld3_lane_u8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_u8 + #define vld3_lane_u8(ptr, src, lane) simde_vld3_lane_u8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x3_t simde_vld3_lane_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint16x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_uint16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint16x4_private tmp_ = simde_uint16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_u16(ptr, src, lane) vld3_lane_u16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_u16 + #define vld3_lane_u16(ptr, src, lane) simde_vld3_lane_u16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x3_t simde_vld3_lane_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint32x2x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_uint32x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint32x2_private tmp_ = simde_uint32x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint32x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_u32(ptr, src, lane) vld3_lane_u32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_u32 + #define vld3_lane_u32(ptr, src, lane) simde_vld3_lane_u32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x3_t simde_vld3_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x1x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_uint64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint64x1_private tmp_ = simde_uint64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3_lane_u64(ptr, src, lane) vld3_lane_u64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_u64 + #define vld3_lane_u64(ptr, src, lane) simde_vld3_lane_u64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x3_t simde_vld3_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float16x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_float16x4_private tmp_ = simde_float16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vld3_lane_f16(ptr, src, lane) vld3_lane_f16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_f16 + #define vld3_lane_f16(ptr, src, lane) simde_vld3_lane_f16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x3_t simde_vld3_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float32x2x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float32x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_float32x2_private tmp_ = 
simde_float32x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float32x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_f32(ptr, src, lane) vld3_lane_f32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_f32 + #define vld3_lane_f32(ptr, src, lane) simde_vld3_lane_f32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x3_t simde_vld3_lane_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x1x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_float64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_float64x1_private tmp_ = simde_float64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3_lane_f64(ptr, src, lane) vld3_lane_f64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_f64 + #define vld3_lane_f64(ptr, src, lane) simde_vld3_lane_f64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x3_t simde_vld3q_lane_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int8x16x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_int8x16x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_int8x16_private tmp_ = simde_int8x16_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int8x16_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3q_lane_s8(ptr, src, lane) vld3q_lane_s8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_s8 + #define vld3q_lane_s8(ptr, src, lane) simde_vld3q_lane_s8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x3_t simde_vld3q_lane_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int16x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_int16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_int16x8_private tmp_ = simde_int16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_s16 + #define vld3q_lane_s16(ptr, src, lane) simde_vld3q_lane_s16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x3_t simde_vld3q_lane_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int32x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_int32x4_private tmp_ = simde_int32x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int32x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_s32 + #define vld3q_lane_s32(ptr, src, lane) simde_vld3q_lane_s32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x3_t simde_vld3q_lane_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_int64x2x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int64x2x3_t r; + 
+ for (size_t i = 0 ; i < 3 ; i++) { + simde_int64x2_private tmp_ = simde_int64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_int64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3q_lane_s64(ptr, src, lane) vld3q_lane_s64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_s64 + #define vld3q_lane_s64(ptr, src, lane) simde_vld3q_lane_s64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x3_t simde_vld3q_lane_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint8x16x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_uint8x16x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint8x16_private tmp_ = simde_uint8x16_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint8x16_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3q_lane_u8(ptr, src, lane) vld3q_lane_u8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_u8 + #define vld3q_lane_u8(ptr, src, lane) simde_vld3q_lane_u8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x3_t simde_vld3q_lane_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint16x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_uint16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint16x8_private tmp_ = simde_uint16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_u16 + #define vld3q_lane_u16(ptr, src, lane) simde_vld3q_lane_u16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x3_t simde_vld3q_lane_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint32x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_uint32x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint32x4_private tmp_ = simde_uint32x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint32x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_u32 + #define vld3q_lane_u32(ptr, src, lane) simde_vld3q_lane_u32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x3_t simde_vld3q_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x2x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_uint64x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_uint64x2_private tmp_ = simde_uint64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_uint64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3q_lane_u64(ptr, src, lane) vld3q_lane_u64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_u64 + #define vld3q_lane_u64(ptr, src, lane) simde_vld3q_lane_u64((ptr), (src), (lane)) +#endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x3_t simde_vld3q_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(3)], 
simde_float16x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_float16x8_private tmp_ = simde_float16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_f16 + #define vld3q_lane_f16(ptr, src, lane) simde_vld3q_lane_f16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x3_t simde_vld3q_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float32x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_float32x4_private tmp_ = simde_float32x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float32x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3q_lane_f32(ptr, src, lane) vld3q_lane_f32(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_f32 + #define vld3q_lane_f32(ptr, src, lane) simde_vld3q_lane_f32((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x3_t simde_vld3q_lane_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x2x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float64x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_float64x2_private tmp_ = simde_float64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3q_lane_f64(ptr, src, lane) vld3q_lane_f64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_f64 + #define vld3q_lane_f64(ptr, src, lane) simde_vld3q_lane_f64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x3_t simde_vld3_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly8x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly8x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_poly8x8_private tmp_ = simde_poly8x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly8x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_p8(ptr, src, lane) vld3_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_p8 + #define vld3_lane_p8(ptr, src, lane) simde_vld3_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x3_t simde_vld3_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly16x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_poly16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_poly16x4_private tmp_ = simde_poly16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3_lane_p16(ptr, src, lane) vld3_lane_p16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_p16 + #define vld3_lane_p16(ptr, src, lane) 
simde_vld3_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x3_t simde_vld3_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x1x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_poly64x1x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_poly64x1_private tmp_ = simde_poly64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly64x1_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3_lane_p64(ptr, src, lane) vld3_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_p64 + #define vld3_lane_p64(ptr, src, lane) simde_vld3_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x3_t simde_vld3q_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly8x16x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_poly8x16x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_poly8x16_private tmp_ = simde_poly8x16_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly8x16_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3q_lane_p8(ptr, src, lane) vld3q_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_p8 + #define vld3q_lane_p8(ptr, src, lane) simde_vld3q_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x3_t simde_vld3q_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly16x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_poly16x8_private tmp_ = simde_poly16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld3q_lane_p16(ptr, src, lane) vld3q_lane_p16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_p16 + #define vld3q_lane_p16(ptr, src, lane) simde_vld3q_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x3_t simde_vld3q_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x2x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_poly64x2x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_poly64x2_private tmp_ = simde_poly64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly64x2_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld3q_lane_p64(ptr, src, lane) vld3q_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_p64 + #define vld3q_lane_p64(ptr, src, lane) simde_vld3q_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x3_t simde_vld3_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_bfloat16x4x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_bfloat16x4x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_bfloat16x4_private tmp_ = simde_bfloat16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_bfloat16x4_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld3_lane_bf16(ptr, src, lane) 
vld3_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld3_lane_bf16 + #define vld3_lane_bf16(ptr, src, lane) simde_vld3_lane_bf16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x3_t simde_vld3q_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(3)], simde_bfloat16x8x3_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_bfloat16x8x3_t r; + + for (size_t i = 0 ; i < 3 ; i++) { + simde_bfloat16x8_private tmp_ = simde_bfloat16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_bfloat16x8_from_private(tmp_); + } + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld3q_lane_bf16(ptr, src, lane) vld3q_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld3q_lane_bf16 + #define vld3q_lane_bf16(ptr, src, lane) simde_vld3q_lane_bf16((ptr), (src), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD3_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/ld4.h b/lib/simd_wrapper/simde/arm/neon/ld4.h index b936182485f..777c24f73c4 100644 --- a/lib/simd_wrapper/simde/arm/neon/ld4.h +++ b/lib/simd_wrapper/simde/arm/neon/ld4.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_LD4_H) @@ -39,6 +41,34 @@ SIMDE_BEGIN_DECLS_ #if !defined(SIMDE_BUG_INTEL_857088) +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x4_t +simde_vld4_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld4_f16(ptr); + #else + simde_float16x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x4_t dest = __riscv_vlseg4e16_v_f16m1x4((_Float16 *)&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_f16m1x4_f16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float16x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_float16x4x4_t s_ = { { simde_float16x4_from_private(a_[0]), simde_float16x4_from_private(a_[1]), + simde_float16x4_from_private(a_[2]), simde_float16x4_from_private(a_[3]) } }; + return (s_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_f16 + #define vld4_f16(a) simde_vld4_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2x4_t simde_vld4_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) { @@ -46,9 +76,17 @@ simde_vld4_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4_f32(ptr); #else simde_float32x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_float32x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x4_t dest = __riscv_vlseg4e32_v_f32m1x4(&ptr[0], 2); + a_[0].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_f32m1x4_f32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float32x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } 
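The vld3*_lane emulations above share one shape: copy the three source vectors, overwrite lane `lane` of each with one element of the interleaved triple at ptr, and return the result. A small illustrative sketch (not part of the patch; include path assumed as in the previous example):

#include <stdio.h>
#include <stdint.h>
#include "simde/arm/neon.h"

int main(void) {
    int16_t triple[3] = { 7, 8, 9 };

    simde_int16x4x3_t acc;
    acc.val[0] = simde_vdup_n_s16(0);
    acc.val[1] = simde_vdup_n_s16(0);
    acc.val[2] = simde_vdup_n_s16(0);

    /* Insert the triple into lane 2 of each vector; all other lanes stay 0. */
    acc = simde_vld3_lane_s16(triple, acc, 2);

    int16_t out[4];
    simde_vst1_s16(out, acc.val[1]);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 0 0 8 0 */
    return 0;
}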
+ #endif simde_float32x2x4_t s_ = { { simde_float32x2_from_private(a_[0]), simde_float32x2_from_private(a_[1]), simde_float32x2_from_private(a_[2]), simde_float32x2_from_private(a_[3]) } }; return (s_); @@ -66,9 +104,17 @@ simde_vld4_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld4_f64(ptr); #else simde_float64x1_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_float64x1_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x4_t dest = __riscv_vlseg4e64_v_f64m1x4(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_f64m1x4_f64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float64x1_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_float64x1x4_t s_ = { { simde_float64x1_from_private(a_[0]), simde_float64x1_from_private(a_[1]), simde_float64x1_from_private(a_[2]), simde_float64x1_from_private(a_[3]) } }; return s_; @@ -86,9 +132,17 @@ simde_vld4_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld4_s8(ptr); #else simde_int8x8_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int8x8_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x4_t dest = __riscv_vlseg4e8_v_i8m1x4(&ptr[0], 8); + a_[0].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_i8m1x4_i8m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int8x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int8x8x4_t s_ = { { simde_int8x8_from_private(a_[0]), simde_int8x8_from_private(a_[1]), simde_int8x8_from_private(a_[2]), simde_int8x8_from_private(a_[3]) } }; return s_; @@ -106,9 +160,17 @@ simde_vld4_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4_s16(ptr); #else simde_int16x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int16x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x4_t dest = __riscv_vlseg4e16_v_i16m1x4(&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_i16m1x4_i16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int16x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int16x4x4_t s_ = { { simde_int16x4_from_private(a_[0]), simde_int16x4_from_private(a_[1]), simde_int16x4_from_private(a_[2]), simde_int16x4_from_private(a_[3]) } }; return s_; @@ -126,9 +188,17 @@ simde_vld4_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4_s32(ptr); #else simde_int32x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int32x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x4_t dest = __riscv_vlseg4e32_v_i32m1x4(&ptr[0], 2); + a_[0].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_i32m1x4_i32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int32x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 
4] = ptr[i]; + } + #endif simde_int32x2x4_t s_ = { { simde_int32x2_from_private(a_[0]), simde_int32x2_from_private(a_[1]), simde_int32x2_from_private(a_[2]), simde_int32x2_from_private(a_[3]) } }; return s_; @@ -146,15 +216,23 @@ simde_vld4_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld4_s64(ptr); #else simde_int64x1_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int64x1_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x4_t dest = __riscv_vlseg4e64_v_i64m1x4(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_i64m1x4_i64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int64x1_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int64x1x4_t s_ = { { simde_int64x1_from_private(a_[0]), simde_int64x1_from_private(a_[1]), simde_int64x1_from_private(a_[2]), simde_int64x1_from_private(a_[3]) } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vld4_s64 #define vld4_s64(a) simde_vld4_s64((a)) #endif @@ -166,9 +244,17 @@ simde_vld4_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld4_u8(ptr); #else simde_uint8x8_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint8x8_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 8); + a_[0].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint8x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint8x8x4_t s_ = { { simde_uint8x8_from_private(a_[0]), simde_uint8x8_from_private(a_[1]), simde_uint8x8_from_private(a_[2]), simde_uint8x8_from_private(a_[3]) } }; return s_; @@ -186,9 +272,17 @@ simde_vld4_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4_u16(ptr); #else simde_uint16x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint16x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint16x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint16x4x4_t s_ = { { simde_uint16x4_from_private(a_[0]), simde_uint16x4_from_private(a_[1]), simde_uint16x4_from_private(a_[2]), simde_uint16x4_from_private(a_[3]) } }; return s_; @@ -206,9 +300,17 @@ simde_vld4_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4_u32(ptr); #else simde_uint32x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint32x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1x4_t dest = __riscv_vlseg4e32_v_u32m1x4(&ptr[0], 2); + a_[0].sv64 = __riscv_vget_v_u32m1x4_u32m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u32m1x4_u32m1(dest, 1); + a_[2].sv64 = 
__riscv_vget_v_u32m1x4_u32m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u32m1x4_u32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint32x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint32x2x4_t s_ = { { simde_uint32x2_from_private(a_[0]), simde_uint32x2_from_private(a_[1]), simde_uint32x2_from_private(a_[2]), simde_uint32x2_from_private(a_[3]) } }; return s_; @@ -226,19 +328,55 @@ simde_vld4_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { return vld4_u64(ptr); #else simde_uint64x1_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint64x1_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint64x1_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint64x1x4_t s_ = { { simde_uint64x1_from_private(a_[0]), simde_uint64x1_from_private(a_[1]), simde_uint64x1_from_private(a_[2]), simde_uint64x1_from_private(a_[3]) } }; return s_; #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vld4_u64 #define vld4_u64(a) simde_vld4_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x4_t +simde_vld4q_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld4q_f16(ptr); + #else + simde_float16x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x4_t dest = __riscv_vlseg4e16_v_f16m1x4((_Float16 *)&ptr[0], 8); + a_[0].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_f16m1x4_f16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float16x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_float16x8x4_t s_ = { { simde_float16x8_from_private(a_[0]), simde_float16x8_from_private(a_[1]), + simde_float16x8_from_private(a_[2]), simde_float16x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4q_f16 + #define vld4q_f16(a) simde_vld4q_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4x4_t simde_vld4q_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(16)]) { @@ -246,9 +384,17 @@ simde_vld4q_f32(simde_float32 const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4q_f32(ptr); #else simde_float32x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_float32x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x4_t dest = __riscv_vlseg4e32_v_f32m1x4(&ptr[0], 4); + a_[0].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_f32m1x4_f32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float32x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_float32x4x4_t s_ = { { simde_float32x4_from_private(a_[0]), 
simde_float32x4_from_private(a_[1]), simde_float32x4_from_private(a_[2]), simde_float32x4_from_private(a_[3]) } }; return s_; @@ -266,9 +412,17 @@ simde_vld4q_f64(simde_float64 const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4q_f64(ptr); #else simde_float64x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_float64x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x4_t dest = __riscv_vlseg4e64_v_f64m1x4(&ptr[0], 2); + a_[0].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_f64m1x4_f64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_float64x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_float64x2x4_t s_ = { { simde_float64x2_from_private(a_[0]), simde_float64x2_from_private(a_[1]), simde_float64x2_from_private(a_[2]), simde_float64x2_from_private(a_[3]) } }; return s_; @@ -286,9 +440,17 @@ simde_vld4q_s8(int8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { return vld4q_s8(ptr); #else simde_int8x16_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int8x16_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x4_t dest = __riscv_vlseg4e8_v_i8m1x4(&ptr[0], 16); + a_[0].sv128 = __riscv_vget_v_i8m1x4_i8m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i8m1x4_i8m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_i8m1x4_i8m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_i8m1x4_i8m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int8x16_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int8x16x4_t s_ = { { simde_int8x16_from_private(a_[0]), simde_int8x16_from_private(a_[1]), simde_int8x16_from_private(a_[2]), simde_int8x16_from_private(a_[3]) } }; return s_; @@ -306,9 +468,17 @@ simde_vld4q_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld4q_s16(ptr); #else simde_int16x8_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int16x8_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x4_t dest = __riscv_vlseg4e16_v_i16m1x4(&ptr[0], 8); + a_[0].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_i16m1x4_i16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int16x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int16x8x4_t s_ = { { simde_int16x8_from_private(a_[0]), simde_int16x8_from_private(a_[1]), simde_int16x8_from_private(a_[2]), simde_int16x8_from_private(a_[3]) } }; return s_; @@ -326,9 +496,17 @@ simde_vld4q_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4q_s32(ptr); #else simde_int32x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int32x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x4_t dest = __riscv_vlseg4e32_v_i32m1x4(&ptr[0], 4); + a_[0].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_i32m1x4_i32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int32x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif 
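In the vld4 hunks above, the second argument passed to the __riscv_vlseg4e* intrinsics (1, 2, 4, 8 or 16) is the number of 4-field segments to load, i.e. the lane count of each destination vector, and the portable fallback loop performs the same de-interleave in scalar code. A standalone scalar model of that loop (illustrative only, not part of the patch):

#include <stddef.h>
#include <stdio.h>

/* Element i of the interleaved stream goes to vector (i % 4), lane (i / 4). */
static void deinterleave4_f32(const float *ptr, float out[4][4], size_t lanes) {
    for (size_t i = 0; i < lanes * 4; i++) {
        out[i % 4][i / 4] = ptr[i];
    }
}

int main(void) {
    float interleaved[16];
    for (int i = 0; i < 16; i++) interleaved[i] = (float)i;

    float out[4][4];
    deinterleave4_f32(interleaved, out, 4); /* 4 lanes, as in vld4q_f32 */
    printf("%g %g %g %g\n", out[1][0], out[1][1], out[1][2], out[1][3]); /* 1 5 9 13 */
    return 0;
}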
simde_int32x4x4_t s_ = { { simde_int32x4_from_private(a_[0]), simde_int32x4_from_private(a_[1]), simde_int32x4_from_private(a_[2]), simde_int32x4_from_private(a_[3]) } }; return s_; @@ -346,9 +524,17 @@ simde_vld4q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4q_s64(ptr); #else simde_int64x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_int64x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x4_t dest = __riscv_vlseg4e64_v_i64m1x4(&ptr[0], 2); + a_[0].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_i64m1x4_i64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_int64x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_int64x2x4_t s_ = { { simde_int64x2_from_private(a_[0]), simde_int64x2_from_private(a_[1]), simde_int64x2_from_private(a_[2]), simde_int64x2_from_private(a_[3]) } }; return s_; @@ -358,7 +544,6 @@ simde_vld4q_s64(int64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #undef vld4q_s64 #define vld4q_s64(a) simde_vld4q_s64((a)) #endif - SIMDE_FUNCTION_ATTRIBUTES simde_uint8x16x4_t simde_vld4q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { @@ -403,6 +588,20 @@ simde_vld4q_u8(uint8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { simde_uint8x16_from_private(r_[2]), simde_uint8x16_from_private(r_[3])}}; return s_; + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private r_[4]; + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 16); + r_[0].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 0); + r_[1].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 1); + r_[2].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 2); + r_[3].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 3); + simde_uint8x16x4_t r = { { + simde_uint8x16_from_private(r_[0]), + simde_uint8x16_from_private(r_[1]), + simde_uint8x16_from_private(r_[2]), + simde_uint8x16_from_private(r_[3]) + } }; + return r; #else simde_uint8x16_private a_[4]; for (size_t i = 0; i < (sizeof(simde_uint8x16_t) / sizeof(*ptr)) * 4 ; i++) { @@ -425,9 +624,17 @@ simde_vld4q_u16(uint16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { return vld4q_u16(ptr); #else simde_uint16x8_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint16x8_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 8); + a_[0].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint16x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint16x8x4_t s_ = { { simde_uint16x8_from_private(a_[0]), simde_uint16x8_from_private(a_[1]), simde_uint16x8_from_private(a_[2]), simde_uint16x8_from_private(a_[3]) } }; return s_; @@ -445,9 +652,17 @@ simde_vld4q_u32(uint32_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { return vld4q_u32(ptr); #else simde_uint32x4_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint32x4_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1x4_t dest = __riscv_vlseg4e32_v_u32m1x4(&ptr[0], 4); + a_[0].sv128 = __riscv_vget_v_u32m1x4_u32m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u32m1x4_u32m1(dest, 1); + a_[2].sv128 = 
__riscv_vget_v_u32m1x4_u32m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u32m1x4_u32m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint32x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint32x4x4_t s_ = { { simde_uint32x4_from_private(a_[0]), simde_uint32x4_from_private(a_[1]), simde_uint32x4_from_private(a_[2]), simde_uint32x4_from_private(a_[3]) } }; return s_; @@ -465,9 +680,17 @@ simde_vld4q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { return vld4q_u64(ptr); #else simde_uint64x2_private a_[4]; - for (size_t i = 0; i < (sizeof(simde_uint64x2_t) / sizeof(*ptr)) * 4 ; i++) { - a_[i % 4].values[i / 4] = ptr[i]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 2); + a_[0].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_uint64x2_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif simde_uint64x2x4_t s_ = { { simde_uint64x2_from_private(a_[0]), simde_uint64x2_from_private(a_[1]), simde_uint64x2_from_private(a_[2]), simde_uint64x2_from_private(a_[3]) } }; return s_; @@ -478,6 +701,214 @@ simde_vld4q_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { #define vld4q_u64(a) simde_vld4q_u64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x4_t +simde_vld4_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_p8(ptr); + #else + simde_poly8x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 8); + a_[0].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u8m1x4_u8m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly8x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_poly8x8x4_t s_ = { { simde_poly8x8_from_private(a_[0]), simde_poly8x8_from_private(a_[1]), + simde_poly8x8_from_private(a_[2]), simde_poly8x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_p8 + #define vld4_p8(a) simde_vld4_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x4_t +simde_vld4_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_p16(ptr); + #else + simde_poly16x4_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 4); + a_[0].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u16m1x4_u16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly16x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_poly16x4x4_t s_ = { { simde_poly16x4_from_private(a_[0]), simde_poly16x4_from_private(a_[1]), + simde_poly16x4_from_private(a_[2]), simde_poly16x4_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_p16 + #define vld4_p16(a) simde_vld4_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x4_t +simde_vld4_p64(simde_poly64_t const 
ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld4_p64(ptr); + #else + simde_poly64x1_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 1); + a_[0].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 0); + a_[1].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 1); + a_[2].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 2); + a_[3].sv64 = __riscv_vget_v_u64m1x4_u64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly64x1_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_poly64x1x4_t s_ = { { simde_poly64x1_from_private(a_[0]), simde_poly64x1_from_private(a_[1]), + simde_poly64x1_from_private(a_[2]), simde_poly64x1_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4_p64 + #define vld4_p64(a) simde_vld4_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x4_t +simde_vld4q_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(64)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4q_p8(ptr); + #else + simde_poly8x16_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(&ptr[0], 16); + a_[0].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u8m1x4_u8m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly8x16_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_poly8x16x4_t s_ = { { simde_poly8x16_from_private(a_[0]), simde_poly8x16_from_private(a_[1]), + simde_poly8x16_from_private(a_[2]), simde_poly8x16_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4q_p8 + #define vld4q_p8(a) simde_vld4q_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x4_t +simde_vld4q_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4q_p16(ptr); + #else + simde_poly16x8_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(&ptr[0], 8); + a_[0].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u16m1x4_u16m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly16x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_poly16x8x4_t s_ = { { simde_poly16x8_from_private(a_[0]), simde_poly16x8_from_private(a_[1]), + simde_poly16x8_from_private(a_[2]), simde_poly16x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4q_p16 + #define vld4q_p16(a) simde_vld4q_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x4_t +simde_vld4q_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(8)]) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_p64(ptr); + #else + simde_poly64x2_private a_[4]; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(&ptr[0], 2); + a_[0].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 0); + a_[1].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 1); + a_[2].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 2); + a_[3].sv128 = __riscv_vget_v_u64m1x4_u64m1(dest, 3); + #else + for (size_t i = 0; i < (sizeof(simde_poly64x2_t) / 
sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + #endif + simde_poly64x2x4_t s_ = { { simde_poly64x2_from_private(a_[0]), simde_poly64x2_from_private(a_[1]), + simde_poly64x2_from_private(a_[2]), simde_poly64x2_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_p64 + #define vld4q_p64(a) simde_vld4q_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x4_t +simde_vld4_bf16(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(16)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld4_bf16(ptr); + #else + simde_bfloat16x4_private a_[4]; + for (size_t i = 0; i < (sizeof(simde_bfloat16x4_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + simde_bfloat16x4x4_t s_ = { { simde_bfloat16x4_from_private(a_[0]), simde_bfloat16x4_from_private(a_[1]), + simde_bfloat16x4_from_private(a_[2]), simde_bfloat16x4_from_private(a_[3]) } }; + return (s_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld4_bf16 + #define vld4_bf16(a) simde_vld4_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x4_t +simde_vld4q_bf16(simde_bfloat16 const ptr[HEDLEY_ARRAY_PARAM(32)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld4q_bf16(ptr); + #else + simde_bfloat16x8_private a_[4]; + for (size_t i = 0; i < (sizeof(simde_bfloat16x8_t) / sizeof(*ptr)) * 4 ; i++) { + a_[i % 4].values[i / 4] = ptr[i]; + } + simde_bfloat16x8x4_t s_ = { { simde_bfloat16x8_from_private(a_[0]), simde_bfloat16x8_from_private(a_[1]), + simde_bfloat16x8_from_private(a_[2]), simde_bfloat16x8_from_private(a_[3]) } }; + return s_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_bf16 + #define vld4q_bf16(a) simde_vld4q_bf16((a)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/arm/neon/ld4_dup.h b/lib/simd_wrapper/simde/arm/neon/ld4_dup.h new file mode 100644 index 00000000000..c2100af147d --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/ld4_dup.h @@ -0,0 +1,610 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
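A typical use of the vld4 de-interleaving loads covered above is splitting packed RGBA pixels into per-channel vectors. An illustrative sketch (not part of the patch; include path assumed as before):

#include <stdio.h>
#include <stdint.h>
#include "simde/arm/neon.h"

int main(void) {
    /* 8 packed RGBA pixels -> 32 interleaved bytes. */
    uint8_t rgba[32];
    for (int i = 0; i < 8; i++) {
        rgba[4 * i + 0] = (uint8_t)i;          /* R */
        rgba[4 * i + 1] = (uint8_t)(100 + i);  /* G */
        rgba[4 * i + 2] = (uint8_t)(200 + i);  /* B */
        rgba[4 * i + 3] = 255;                 /* A */
    }

    simde_uint8x8x4_t px = simde_vld4_u8(rgba);

    uint8_t g[8];
    simde_vst1_u8(g, px.val[1]);               /* the green channel */
    printf("%u %u\n", g[0], g[7]);             /* 100 107 */
    return 0;
}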
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_LD4_DUP_H) +#define SIMDE_ARM_NEON_LD4_DUP_H + +#include "dup_n.h" +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x4_t +simde_vld4_dup_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)]) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld4_dup_f16(ptr); + #else + simde_float16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_f16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_f16 + #define vld4_dup_f16(a) simde_vld4_dup_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2x4_t +simde_vld4_dup_f32(simde_float32 const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_f32(ptr); + #else + simde_float32x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_f32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_f32 + #define vld4_dup_f32(a) simde_vld4_dup_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1x4_t +simde_vld4_dup_f64(simde_float64 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4_dup_f64(ptr); + #else + simde_float64x1x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_f64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_f64 + #define vld4_dup_f64(a) simde_vld4_dup_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8x4_t +simde_vld4_dup_s8(int8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_s8(ptr); + #else + simde_int8x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_s8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_s8 + #define vld4_dup_s8(a) simde_vld4_dup_s8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4x4_t +simde_vld4_dup_s16(int16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_s16(ptr); + #else + simde_int16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_s16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_s16 + #define vld4_dup_s16(a) simde_vld4_dup_s16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2x4_t +simde_vld4_dup_s32(int32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_s32(ptr); + #else + simde_int32x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_s32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_s32 + #define vld4_dup_s32(a) simde_vld4_dup_s32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1x4_t +simde_vld4_dup_s64(int64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_s64(ptr); + #else + simde_int64x1x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_s64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_s64 + #define vld4_dup_s64(a) simde_vld4_dup_s64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8x4_t +simde_vld4_dup_u8(uint8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + 
return vld4_dup_u8(ptr); + #else + simde_uint8x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_u8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_u8 + #define vld4_dup_u8(a) simde_vld4_dup_u8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4x4_t +simde_vld4_dup_u16(uint16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_u16(ptr); + #else + simde_uint16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_u16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_u16 + #define vld4_dup_u16(a) simde_vld4_dup_u16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2x4_t +simde_vld4_dup_u32(uint32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_u32(ptr); + #else + simde_uint32x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_u32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_u32 + #define vld4_dup_u32(a) simde_vld4_dup_u32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1x4_t +simde_vld4_dup_u64(uint64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld4_dup_u64(ptr); + #else + simde_uint64x1x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_u64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_u64 + #define vld4_dup_u64(a) simde_vld4_dup_u64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x4_t +simde_vld4q_dup_f16(simde_float16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vld4q_dup_f16(ptr); + #else + simde_float16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_f16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_f16 + #define vld4q_dup_f16(a) simde_vld4q_dup_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4x4_t +simde_vld4q_dup_f32(simde_float32 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_f32(ptr); + #else + simde_float32x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_f32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_f32 + #define vld4q_dup_f32(a) simde_vld4q_dup_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2x4_t +simde_vld4q_dup_f64(simde_float64 const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_f64(ptr); + #else + simde_float64x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_f64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_f64 + #define vld4q_dup_f64(a) simde_vld4q_dup_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16x4_t +simde_vld4q_dup_s8(int8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_s8(ptr); + #else + simde_int8x16x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_s8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_s8 + #define vld4q_dup_s8(a) simde_vld4q_dup_s8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8x4_t +simde_vld4q_dup_s16(int16_t const * ptr) { + #if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_s16(ptr); + #else + simde_int16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_s16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_s16 + #define vld4q_dup_s16(a) simde_vld4q_dup_s16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4x4_t +simde_vld4q_dup_s32(int32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_s32(ptr); + #else + simde_int32x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_s32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_s32 + #define vld4q_dup_s32(a) simde_vld4q_dup_s32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2x4_t +simde_vld4q_dup_s64(int64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_s64(ptr); + #else + simde_int64x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_s64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_s64 + #define vld4q_dup_s64(a) simde_vld4q_dup_s64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16x4_t +simde_vld4q_dup_u8(uint8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_u8(ptr); + #else + simde_uint8x16x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_u8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_u8 + #define vld4q_dup_u8(a) simde_vld4q_dup_u8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8x4_t +simde_vld4q_dup_u16(uint16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_u16(ptr); + #else + simde_uint16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_u16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_u16 + #define vld4q_dup_u16(a) simde_vld4q_dup_u16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4x4_t +simde_vld4q_dup_u32(uint32_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_u32(ptr); + #else + simde_uint32x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_u32(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_u32 + #define vld4q_dup_u32(a) simde_vld4q_dup_u32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2x4_t +simde_vld4q_dup_u64(uint64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_u64(ptr); + #else + simde_uint64x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_u64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_u64 + #define vld4q_dup_u64(a) simde_vld4q_dup_u64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x4_t +simde_vld4_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld4_dup_p8(ptr); + #else + simde_poly8x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_p8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_p8 + #define vld4_dup_p8(a) simde_vld4_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x4_t +simde_vld4_dup_p16(simde_poly16_t 
const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld4_dup_p16(ptr); + #else + simde_poly16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_p16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_p16 + #define vld4_dup_p16(a) simde_vld4_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x4_t +simde_vld4_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vld4_dup_p64(ptr); + #else + simde_poly64x1x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_p64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_p64 + #define vld4_dup_p64(a) simde_vld4_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x4_t +simde_vld4q_dup_p8(simde_poly8_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld4q_dup_p8(ptr); + #else + simde_poly8x16x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_p8(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_p8 + #define vld4q_dup_p8(a) simde_vld4q_dup_p8((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x4_t +simde_vld4q_dup_p16(simde_poly16_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_95399) + return vld4q_dup_p16(ptr); + #else + simde_poly16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_p16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_p16 + #define vld4q_dup_p16(a) simde_vld4q_dup_p16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x4_t +simde_vld4q_dup_p64(simde_poly64_t const * ptr) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vld4q_dup_p64(ptr); + #else + simde_poly64x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_p64(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_p64 + #define vld4q_dup_p64(a) simde_vld4q_dup_p64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x4_t +simde_vld4_dup_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(2)]) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld4_dup_bf16(ptr); + #else + simde_bfloat16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdup_n_bf16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld4_dup_bf16 + #define vld4_dup_bf16(a) simde_vld4_dup_bf16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x4_t +simde_vld4q_dup_bf16(simde_bfloat16 const * ptr) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vld4q_dup_bf16(ptr); + #else + simde_bfloat16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + r.val[i] = simde_vdupq_n_bf16(ptr[i]); + } + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_dup_bf16 + #define vld4q_dup_bf16(a) simde_vld4q_dup_bf16((a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_LD3_DUP_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/ld4_lane.h b/lib/simd_wrapper/simde/arm/neon/ld4_lane.h index c525755d2bf..ed8a7e4d2ad 100644 --- a/lib/simd_wrapper/simde/arm/neon/ld4_lane.h +++ 
b/lib/simd_wrapper/simde/arm/neon/ld4_lane.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ /* In older versions of clang, __builtin_neon_vld4_lane_v would @@ -99,6 +100,7 @@ simde_vld4_lane_s16(int16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_int16x4x4_t #define vld4_lane_s16(ptr, src, lane) simde_vld4_lane_s16((ptr), (src), (lane)) #endif + SIMDE_FUNCTION_ATTRIBUTES simde_int32x2x4_t simde_vld4_lane_s32(int32_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_int32x2x4_t src, const int lane) @@ -261,6 +263,33 @@ simde_vld4_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x1x4_ #define vld4_lane_u64(ptr, src, lane) simde_vld4_lane_u64((ptr), (src), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x4_t +simde_vld4_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x4x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_float16x4_private tmp_ = simde_float16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float16x4_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0) + #define simde_vld4_lane_f16(ptr, src, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vld4_lane_f16(ptr, src, lane)) + #else + #define simde_vld4_lane_f16(ptr, src, lane) vld4_lane_f16(ptr, src, lane) + #endif +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_lane_f16 + #define vld4_lane_f16(ptr, src, lane) simde_vld4_lane_f16((ptr), (src), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2x4_t simde_vld4_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x2x4_t src, const int lane) @@ -531,6 +560,33 @@ simde_vld4q_lane_u64(uint64_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x2x4 #define vld4q_lane_u64(ptr, src, lane) simde_vld4q_lane_u64((ptr), (src), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x4_t +simde_vld4q_lane_f16(simde_float16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x8x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_float16x8_private tmp_ = simde_float16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_float16x8_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0) + #define simde_vld4q_lane_f16(ptr, src, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vld4q_lane_f16(ptr, src, lane)) + #else + #define simde_vld4q_lane_f16(ptr, src, lane) vld4q_lane_f16(ptr, src, lane) + #endif +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4q_lane_f16 + #define vld4q_lane_f16(ptr, src, lane) simde_vld4q_lane_f16((ptr), (src), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4x4_t simde_vld4q_lane_f32(simde_float32_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x4x4_t src, const int lane) @@ -585,6 +641,182 @@ simde_vld4q_lane_f64(simde_float64_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_flo #define vld4q_lane_f64(ptr, src, lane) simde_vld4q_lane_f64((ptr), (src), (lane)) #endif 
+SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x4_t +simde_vld4_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly8x8x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly8x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_poly8x8_private tmp_ = simde_poly8x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly8x8_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld4_lane_p8(ptr, src, lane) vld4_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_lane_p8 + #define vld4_lane_p8(ptr, src, lane) simde_vld4_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x4_t +simde_vld4_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly16x4x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_poly16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_poly16x4_private tmp_ = simde_poly16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly16x4_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld4_lane_p16(ptr, src, lane) vld4_lane_p16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vld4_lane_p16 + #define vld4_lane_p16(ptr, src, lane) simde_vld4_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1x4_t +simde_vld4_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x1x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_poly64x1x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_poly64x1_private tmp_ = simde_poly64x1_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly64x1_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld4_lane_p64(ptr, src, lane) vld4_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4_lane_p64 + #define vld4_lane_p64(ptr, src, lane) simde_vld4_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x4_t +simde_vld4q_lane_p8(simde_poly8_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly8x16x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_poly8x16x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_poly8x16_private tmp_ = simde_poly8x16_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly8x16_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld4q_lane_p8(ptr, src, lane) vld4q_lane_p8(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_lane_p8 + #define vld4q_lane_p8(ptr, src, lane) simde_vld4q_lane_p8((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x4_t +simde_vld4q_lane_p16(simde_poly16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly16x8x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_poly16x8_private tmp_ = simde_poly16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly16x8_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vld4q_lane_p16(ptr, src, lane) vld4q_lane_p16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef 
vld4q_lane_p16 + #define vld4q_lane_p16(ptr, src, lane) simde_vld4q_lane_p16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2x4_t +simde_vld4q_lane_p64(simde_poly64_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x2x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_poly64x2x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_poly64x2_private tmp_ = simde_poly64x2_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_poly64x2_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vld4q_lane_p64(ptr, src, lane) vld4q_lane_p64(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_lane_p64 + #define vld4q_lane_p64(ptr, src, lane) simde_vld4q_lane_p64((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4x4_t +simde_vld4_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_bfloat16x4x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_bfloat16x4x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_bfloat16x4_private tmp_ = simde_bfloat16x4_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_bfloat16x4_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld4_lane_bf16(ptr, src, lane) vld4_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld4_lane_bf16 + #define vld4_lane_bf16(ptr, src, lane) simde_vld4_lane_bf16((ptr), (src), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8x4_t +simde_vld4q_lane_bf16(simde_bfloat16_t const ptr[HEDLEY_ARRAY_PARAM(4)], simde_bfloat16x8x4_t src, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_bfloat16x8x4_t r; + + for (size_t i = 0 ; i < 4 ; i++) { + simde_bfloat16x8_private tmp_ = simde_bfloat16x8_to_private(src.val[i]); + tmp_.values[lane] = ptr[i]; + r.val[i] = simde_bfloat16x8_from_private(tmp_); + } + + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + #define simde_vld4q_lane_bf16(ptr, src, lane) vld4q_lane_bf16(ptr, src, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vld4q_lane_bf16 + #define vld4q_lane_bf16(ptr, src, lane) simde_vld4q_lane_bf16((ptr), (src), (lane)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/arm/neon/max.h b/lib/simd_wrapper/simde/arm/neon/max.h index 1e2b449e34b..04c38184aa3 100644 --- a/lib/simd_wrapper/simde/arm/neon/max.h +++ b/lib/simd_wrapper/simde/arm/neon/max.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MAX_H) @@ -36,6 +37,52 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmaxh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmaxh_f16(a, b); + #else + simde_float32_t r_; + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + #if !defined(SIMDE_FAST_NANS) + r_ = (a_ >= b_) ? a_ : ((a_ < b_) ? b_ : SIMDE_MATH_NANF); + #else + r_ = (a_ > b_) ? 
a_ : b_; + #endif + return simde_float16_from_float32(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmaxh_f16 + #define vmaxh_f16(a, b) simde_vmaxh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmax_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmax_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmaxh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmax_f16 + #define vmax_f16(a, b) simde_vmax_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vmax_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -293,6 +340,30 @@ simde_x_vmax_u64(simde_uint64x1_t a, simde_uint64x1_t b) { #endif } +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmaxq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmaxq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmaxh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmaxq_f16 + #define vmaxq_f16(a, b) simde_vmaxq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vmaxq_f32(simde_float32x4_t a, simde_float32x4_t b) { diff --git a/lib/simd_wrapper/simde/arm/neon/maxnm.h b/lib/simd_wrapper/simde/arm/neon/maxnm.h index b9aceb02ce0..8101dd2ca3a 100644 --- a/lib/simd_wrapper/simde/arm/neon/maxnm.h +++ b/lib/simd_wrapper/simde/arm/neon/maxnm.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MAXNM_H) @@ -35,6 +36,84 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmaxnmh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16) + return vmaxnmh_f16(a, b); + #else + #if defined(simde_math_fmaxf) + return simde_float16_from_float32(simde_math_fmaxf(simde_float16_to_float32(a), simde_float16_to_float32(b))); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + simde_float32_t r_; + if (a_ > b_) { + r_ = a_; + } else if (a_ < b_) { + r_ = b_; + } else if (a_ == a_) { + r_ = a_; + } else { + r_ = b_; + } + return simde_float16_from_float32(r_); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmaxnmh_f16 + #define vmaxnmh_f16(a, b) simde_vmaxnmh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmaxnm_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16) + return vmaxnm_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + 
SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmaxnmh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmaxnm_f16 + #define vmaxnm_f16(a, b) simde_vmaxnm_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmaxnmq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16) + return vmaxnmq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmaxnmh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmaxnmq_f16 + #define vmaxnmq_f16(a, b) simde_vmaxnmq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vmaxnm_f32(simde_float32x2_t a, simde_float32x2_t b) { diff --git a/lib/simd_wrapper/simde/arm/neon/maxnmv.h b/lib/simd_wrapper/simde/arm/neon/maxnmv.h new file mode 100644 index 00000000000..7f00628e19b --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/maxnmv.h @@ -0,0 +1,172 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MAXNMV_H) +#define SIMDE_ARM_NEON_MAXNMV_H + +#include "types.h" +#include + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vmaxnmv_f32(simde_float32x2_t a) { + simde_float32_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vmaxnmv_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + + r = -SIMDE_MATH_INFINITYF; + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r = a_.values[i] > r ? 
a_.values[i] : r; + } + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmaxnmv_f32 + #define vmaxnmv_f32(v) simde_vmaxnmv_f32(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vmaxnmvq_f32(simde_float32x4_t a) { + simde_float32_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vmaxnmvq_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + + r = -SIMDE_MATH_INFINITYF; + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r = a_.values[i] > r ? a_.values[i] : r; + } + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmaxnmvq_f32 + #define vmaxnmvq_f32(v) simde_vmaxnmvq_f32(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vmaxnmvq_f64(simde_float64x2_t a) { + simde_float64_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vmaxnmvq_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + + r = -SIMDE_MATH_INFINITY; + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + r = a_.values[i] > r ? a_.values[i] : r; + } + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmaxnmvq_f64 + #define vmaxnmvq_f64(v) simde_vmaxnmvq_f64(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmaxnmv_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmaxnmv_f16(a); + #else + simde_float32_t r_ = simde_float16_to_float32(SIMDE_NINFINITYHF); + simde_float16x4_private a_ = simde_float16x4_to_private(a); + + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(max:r_) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r_ = tmp_a > r_ ? tmp_a : r_; + #else + r_ = (tmp_a > r_) ? tmp_a : ((tmp_a <= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a)); + #endif + } + return simde_float16_from_float32(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmaxnmv_f16 + #define vmaxnmv_f16(v) simde_vmaxnmv_f16(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmaxnmvq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmaxnmvq_f16(a); + #else + simde_float32_t r_ = simde_float16_to_float32(SIMDE_NINFINITYHF); + simde_float16x8_private a_ = simde_float16x8_to_private(a); + + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(max:r_) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r_ = tmp_a > r_ ? tmp_a : r_; + #else + r_ = (tmp_a > r_) ? tmp_a : ((tmp_a <= r_) ? r_ : ((tmp_a == tmp_a) ? 
r_ : tmp_a)); + #endif + } + return simde_float16_from_float32(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmaxnmvq_f16 + #define vmaxnmvq_f16(v) simde_vmaxnmvq_f16(v) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MAXNMV_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/maxv.h b/lib/simd_wrapper/simde/arm/neon/maxv.h index 37437b04d09..39c9e0cae5d 100644 --- a/lib/simd_wrapper/simde/arm/neon/maxv.h +++ b/lib/simd_wrapper/simde/arm/neon/maxv.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MAXV_H) @@ -34,6 +35,38 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmaxv_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmaxv_f16(a); + #else + simde_float32_t r; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + + r = simde_float16_to_float32(SIMDE_NINFINITYHF); + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(max:r) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t a32 = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r = a32 > r ? a32 : r; + #else + r = a32 > r ? a32 : (a32 <= r ? r : ((a32 == a32) ? r : a32)); + #endif + } + + return simde_float16_from_float32(r); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmaxv_f16 + #define vmaxv_f16(v) simde_vmaxv_f16(v) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vmaxv_f32(simde_float32x2_t a) { @@ -202,6 +235,38 @@ simde_vmaxv_u32(simde_uint32x2_t a) { #define vmaxv_u32(v) simde_vmaxv_u32(v) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmaxvq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmaxvq_f16(a); + #else + simde_float32_t r; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + + r = simde_float16_to_float32(SIMDE_NINFINITYHF); + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(max:r) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t a32 = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r = a32 > r ? a32 : r; + #else + r = a32 > r ? a32 : (a32 <= r ? r : ((a32 == a32) ? 
r : a32)); + #endif + } + + return simde_float16_from_float32(r); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmaxvq_f16 + #define vmaxvq_f16(v) simde_vmaxvq_f16(v) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vmaxvq_f32(simde_float32x4_t a) { diff --git a/lib/simd_wrapper/simde/arm/neon/min.h b/lib/simd_wrapper/simde/arm/neon/min.h index 08ea4d00355..469e65aa035 100644 --- a/lib/simd_wrapper/simde/arm/neon/min.h +++ b/lib/simd_wrapper/simde/arm/neon/min.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MIN_H) @@ -36,6 +37,52 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vminh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vminh_f16(a, b); + #else + simde_float32_t r_; + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + #if !defined(SIMDE_FAST_NANS) + r_ = (a_ <= b_) ? a_ : ((a_ > b_) ? b_ : SIMDE_MATH_NANF); + #else + r_ = (a_ < b_) ? a_ : b_; + #endif + return simde_float16_from_float32(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vminh_f16 + #define vminh_f16(a, b) simde_vminh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmin_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmin_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vminh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmin_f16 + #define vmin_f16(a, b) simde_vmin_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vmin_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -159,14 +206,10 @@ simde_vmin_s16(simde_int16x4_t a, simde_int16x4_t b) { a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b); - #if defined(SIMDE_X86_MMX_NATIVE) - r_.m64 = _mm_sub_pi16(a_.m64, _mm_subs_pu16(b_.m64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = (a_.values[i] < b_.values[i]) ? a_.values[i] : b_.values[i]; - } - #endif + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = (a_.values[i] < b_.values[i]) ? 
a_.values[i] : b_.values[i]; + } return simde_int16x4_from_private(r_); #endif @@ -325,6 +368,30 @@ simde_x_vmin_u64(simde_uint64x1_t a, simde_uint64x1_t b) { #endif } +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vminq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vminq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vminh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vminq_f16 + #define vminq_f16(a, b) simde_vminq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vminq_f32(simde_float32x4_t a, simde_float32x4_t b) { diff --git a/lib/simd_wrapper/simde/arm/neon/minnm.h b/lib/simd_wrapper/simde/arm/neon/minnm.h index b68a28cb750..341a3150ca9 100644 --- a/lib/simd_wrapper/simde/arm/neon/minnm.h +++ b/lib/simd_wrapper/simde/arm/neon/minnm.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MINNM_H) @@ -35,6 +36,60 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vminnmh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16) + return vminnmh_f16(a, b); + #else + #if defined(simde_math_fminf) + return simde_float16_from_float32(simde_math_fminf(simde_float16_to_float32(a), simde_float16_to_float32(b))); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + simde_float32_t r_; + if (a_ < b_) { + r_ = a_; + } else if (a_ > b_) { + r_ = b_; + } else if (a_ == a_) { + r_ = a_; + } else { + r_ = b_; + } + return simde_float16_from_float32(r_); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vminnmh_f16 + #define vminnmh_f16(a, b) simde_vminnmh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vminnm_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16) + return vminnm_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vminnmh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vminnm_f16 + #define vminnm_f16(a, b) simde_vminnm_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vminnm_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -107,6 +162,30 @@ simde_vminnm_f64(simde_float64x1_t a, simde_float64x1_t b) { #define vminnm_f64(a, b) simde_vminnm_f64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vminnmq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && (__ARM_NEON_FP >= 6) && defined(SIMDE_ARM_NEON_FP16) + return vminnmq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + 
b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vminnmh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vminnmq_f16 + #define vminnmq_f16(a, b) simde_vminnmq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vminnmq_f32(simde_float32x4_t a, simde_float32x4_t b) { diff --git a/lib/simd_wrapper/simde/arm/neon/minnmv.h b/lib/simd_wrapper/simde/arm/neon/minnmv.h new file mode 100644 index 00000000000..11e1b3438db --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/minnmv.h @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MINNMV_H) +#define SIMDE_ARM_NEON_MINNMV_H + +#include "types.h" +#include + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vminnmv_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vminnmv_f16(a); + #else + simde_float32_t r_ = simde_float16_to_float32(SIMDE_INFINITYHF); + simde_float16x4_private a_ = simde_float16x4_to_private(a); + + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(min:r_) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r_ = tmp_a < r_ ? tmp_a : r_; + #else + r_ = (tmp_a < r_) ? tmp_a : ((tmp_a >= r_) ? r_ : ((tmp_a == tmp_a) ? 
r_ : tmp_a)); + #endif + } + return simde_float16_from_float32(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vminnmv_f16 + #define vminnmv_f16(v) simde_vminnmv_f16(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vminnmv_f32(simde_float32x2_t a) { + simde_float32_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vminnmv_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + + r = SIMDE_MATH_INFINITYF; + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(min:r) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + #if defined(SIMDE_FAST_NANS) + r = a_.values[i] < r ? a_.values[i] : r; + #else + r = (a_.values[i] < r) ? a_.values[i] : ((a_.values[i] >= r) ? r : ((a_.values[i] == a_.values[i]) ? r : a_.values[i])); + #endif + } + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vminnmv_f32 + #define vminnmv_f32(v) simde_vminnmv_f32(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vminnmvq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vminnmvq_f16(a); + #else + simde_float32_t r_ = simde_float16_to_float32(SIMDE_INFINITYHF); + simde_float16x8_private a_ = simde_float16x8_to_private(a); + + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(min:r_) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t tmp_a = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r_ = tmp_a < r_ ? tmp_a : r_; + #else + r_ = (tmp_a < r_) ? tmp_a : ((tmp_a >= r_) ? r_ : ((tmp_a == tmp_a) ? r_ : tmp_a)); + #endif + } + return simde_float16_from_float32(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vminnmvq_f16 + #define vminnmvq_f16(v) simde_vminnmvq_f16(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vminnmvq_f32(simde_float32x4_t a) { + simde_float32_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vminnmvq_f32(a); + #else + simde_float32x4_private a_ = simde_float32x4_to_private(a); + + r = SIMDE_MATH_INFINITYF; + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(min:r) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + #if defined(SIMDE_FAST_NANS) + r = a_.values[i] < r ? a_.values[i] : r; + #else + r = (a_.values[i] < r) ? a_.values[i] : ((a_.values[i] >= r) ? r : ((a_.values[i] == a_.values[i]) ? r : a_.values[i])); + #endif + } + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vminnmvq_f32 + #define vminnmvq_f32(v) simde_vminnmvq_f32(v) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vminnmvq_f64(simde_float64x2_t a) { + simde_float64_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vminnmvq_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + + r = SIMDE_MATH_INFINITY; + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(min:r) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + #if defined(SIMDE_FAST_NANS) + r = a_.values[i] < r ? a_.values[i] : r; + #else + r = (a_.values[i] < r) ? a_.values[i] : ((a_.values[i] >= r) ? r : ((a_.values[i] == a_.values[i]) ? 
r : a_.values[i])); + #endif + } + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vminnmvq_f64 + #define vminnmvq_f64(v) simde_vminnmvq_f64(v) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MINNMV_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/minv.h b/lib/simd_wrapper/simde/arm/neon/minv.h index 93028d74fe8..2c7b5e3f7af 100644 --- a/lib/simd_wrapper/simde/arm/neon/minv.h +++ b/lib/simd_wrapper/simde/arm/neon/minv.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MINV_H) @@ -34,6 +35,38 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vminv_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vminv_f16(a); + #else + simde_float32_t r; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + + r = simde_float16_to_float32(SIMDE_INFINITYHF); + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(min:r) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t a32 = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r = a32 < r ? a32 : r; + #else + r = a32 < r ? a32 : (a32 >= r ? r : ((a32 == a32) ? r : a32)); + #endif + } + + return simde_float16_from_float32(r); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vminv_f16 + #define vminv_f16(v) simde_vminv_f16(v) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vminv_f32(simde_float32x2_t a) { @@ -210,6 +243,38 @@ simde_vminv_u32(simde_uint32x2_t a) { #define vminv_u32(v) simde_vminv_u32(v) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vminvq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vminvq_f16(a); + #else + simde_float32_t r; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + + r = simde_float16_to_float32(SIMDE_INFINITYHF); + #if defined(SIMDE_FAST_NANS) + SIMDE_VECTORIZE_REDUCTION(min:r) + #else + SIMDE_VECTORIZE + #endif + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0])) ; i++) { + simde_float32_t a32 = simde_float16_to_float32(a_.values[i]); + #if defined(SIMDE_FAST_NANS) + r = a32 < r ? a32 : r; + #else + r = a32 < r ? a32 : (a32 >= r ? r : ((a32 == a32) ? 
r : a32)); + #endif + } + + return simde_float16_from_float32(r); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vminvq_f16 + #define vminvq_f16(v) simde_vminvq_f16(v) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vminvq_f32(simde_float32x4_t a) { diff --git a/lib/simd_wrapper/simde/arm/neon/mla_lane.h b/lib/simd_wrapper/simde/arm/neon/mla_lane.h new file mode 100644 index 00000000000..ad383d473ec --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/mla_lane.h @@ -0,0 +1,241 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MLA_LANE_H) +#define SIMDE_ARM_NEON_MLA_LANE_H + +#include "mla.h" +#include "dup_lane.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmla_lane_f32(a, b, v, lane) vmla_lane_f32((a), (b), (v), (lane)) +#else + #define simde_vmla_lane_f32(a, b, v, lane) simde_vmla_f32((a), (b), simde_vdup_lane_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmla_lane_f32 + #define vmla_lane_f32(a, b, v, lane) simde_vmla_lane_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmla_laneq_f32(a, b, v, lane) vmla_laneq_f32((a), (b), (v), (lane)) +#else + #define simde_vmla_laneq_f32(a, b, v, lane) simde_vmla_f32((a), (b), simde_vdup_laneq_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmla_laneq_f32 + #define vmla_laneq_f32(a, b, v, lane) simde_vmla_laneq_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlaq_laneq_f32(a, b, v, lane) vmlaq_laneq_f32((a), (b), (v), (lane)) +#else + #define simde_vmlaq_laneq_f32(a, b, v, lane) simde_vmlaq_f32((a), (b), simde_vdupq_laneq_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlaq_laneq_f32 + #define vmlaq_laneq_f32(a, b, v, lane) simde_vmlaq_laneq_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmla_lane_s16(a, b, v, lane) vmla_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vmla_lane_s16(a, b, v, lane) simde_vmla_s16((a), (b), simde_vdup_lane_s16((v), (lane))) +#endif +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmla_lane_s16 + #define vmla_lane_s16(a, b, v, lane) simde_vmla_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmla_laneq_s16(a, b, v, lane) vmla_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vmla_laneq_s16(a, b, v, lane) simde_vmla_s16((a), (b), simde_vdup_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmla_laneq_s16 + #define vmla_laneq_s16(a, b, v, lane) simde_vmla_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlaq_laneq_s16(a, b, v, lane) vmlaq_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vmlaq_laneq_s16(a, b, v, lane) simde_vmlaq_s16((a), (b), simde_vdupq_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlaq_laneq_s16 + #define vmlaq_laneq_s16(a, b, v, lane) simde_vmlaq_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmla_lane_s32(a, b, v, lane) vmla_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vmla_lane_s32(a, b, v, lane) simde_vmla_s32((a), (b), simde_vdup_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmla_lane_s32 + #define vmla_lane_s32(a, b, v, lane) simde_vmla_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmla_laneq_s32(a, b, v, lane) vmla_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vmla_laneq_s32(a, b, v, lane) simde_vmla_s32((a), (b), simde_vdup_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmla_laneq_s32 + #define vmla_laneq_s32(a, b, v, lane) simde_vmla_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlaq_laneq_s32(a, b, v, lane) vmlaq_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vmlaq_laneq_s32(a, b, v, lane) simde_vmlaq_s32((a), (b), simde_vdupq_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlaq_laneq_s32 + #define vmlaq_laneq_s32(a, b, v, lane) simde_vmlaq_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmla_lane_u16(a, b, v, lane) vmla_lane_u16((a), (b), (v), (lane)) +#else + #define simde_vmla_lane_u16(a, b, v, lane) simde_vmla_u16((a), (b), simde_vdup_lane_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmla_lane_u16 + #define vmla_lane_u16(a, b, v, lane) simde_vmla_lane_u16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmla_laneq_u16(a, b, v, lane) vmla_laneq_u16((a), (b), (v), (lane)) +#else + #define simde_vmla_laneq_u16(a, b, v, lane) simde_vmla_u16((a), (b), simde_vdup_laneq_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmla_laneq_u16 + #define vmla_laneq_u16(a, b, v, lane) simde_vmla_laneq_u16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlaq_laneq_u16(a, b, v, lane) vmlaq_laneq_u16((a), (b), (v), (lane)) +#else + #define simde_vmlaq_laneq_u16(a, b, v, lane) simde_vmlaq_u16((a), (b), simde_vdupq_laneq_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlaq_laneq_u16 + #define vmlaq_laneq_u16(a, b, v, lane) simde_vmlaq_laneq_u16((a), (b), (v), (lane)) +#endif + 
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmla_lane_u32(a, b, v, lane) vmla_lane_u32((a), (b), (v), (lane)) +#else + #define simde_vmla_lane_u32(a, b, v, lane) simde_vmla_u32((a), (b), simde_vdup_lane_u32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmla_lane_u32 + #define vmla_lane_u32(a, b, v, lane) simde_vmla_lane_u32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmla_laneq_u32(a, b, v, lane) vmla_laneq_u32((a), (b), (v), (lane)) +#else + #define simde_vmla_laneq_u32(a, b, v, lane) simde_vmla_u32((a), (b), simde_vdup_laneq_u32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmla_laneq_u32 + #define vmla_laneq_u32(a, b, v, lane) simde_vmla_laneq_u32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlaq_laneq_u32(a, b, v, lane) vmlaq_laneq_u32((a), (b), (v), (lane)) +#else + #define simde_vmlaq_laneq_u32(a, b, v, lane) simde_vmlaq_u32((a), (b), simde_vdupq_laneq_u32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlaq_laneq_u32 + #define vmlaq_laneq_u32(a, b, v, lane) simde_vmlaq_laneq_u32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlaq_lane_f32(a, b, v, lane) vmlaq_lane_f32((a), (b), (v), (lane)) +#else + #define simde_vmlaq_lane_f32(a, b, v, lane) simde_vmlaq_f32((a), (b), simde_vdupq_lane_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlaq_lane_f32 + #define vmlaq_lane_f32(a, b, v, lane) simde_vmlaq_lane_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlaq_lane_s16(a, b, v, lane) vmlaq_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vmlaq_lane_s16(a, b, v, lane) simde_vmlaq_s16((a), (b), simde_vdupq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlaq_lane_s16 + #define vmlaq_lane_s16(a, b, v, lane) simde_vmlaq_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlaq_lane_s32(a, b, v, lane) vmlaq_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vmlaq_lane_s32(a, b, v, lane) simde_vmlaq_s32((a), (b), simde_vdupq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlaq_lane_s32 + #define vmlaq_lane_s32(a, b, v, lane) simde_vmlaq_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlaq_lane_u16(a, b, v, lane) vmlaq_lane_u16((a), (b), (v), (lane)) +#else + #define simde_vmlaq_lane_u16(a, b, v, lane) simde_vmlaq_u16((a), (b), simde_vdupq_lane_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlaq_lane_u16 + #define vmlaq_lane_u16(a, b, v, lane) simde_vmlaq_lane_u16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlaq_lane_u32(a, b, v, lane) vmlaq_lane_u32((a), (b), (v), (lane)) +#else + #define simde_vmlaq_lane_u32(a, b, v, lane) simde_vmlaq_u32((a), (b), simde_vdupq_lane_u32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlaq_lane_u32 + #define vmlaq_lane_u32(a, b, v, lane) simde_vmlaq_lane_u32((a), (b), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MLA_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/mlal_high_lane.h 
b/lib/simd_wrapper/simde/arm/neon/mlal_high_lane.h new file mode 100644 index 00000000000..50018a95d85 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/mlal_high_lane.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MLAL_HIGH_LANE_H) +#define SIMDE_ARM_NEON_MLAL_HIGH_LANE_H + +#include "movl_high.h" +#include "mlal_high.h" +#include "dup_n.h" +#include "mla.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmlal_high_lane_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlal_high_s16(a, b, simde_vdupq_n_s16(simde_int16x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_lane_s16(a, b, v, lane) vmlal_high_lane_s16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_lane_s16 + #define vmlal_high_lane_s16(a, b, v, lane) simde_vmlal_high_lane_s16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmlal_high_laneq_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vmlal_high_s16(a, b, simde_vdupq_n_s16(simde_int16x8_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_laneq_s16(a, b, v, lane) vmlal_high_laneq_s16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_laneq_s16 + #define vmlal_high_laneq_s16(a, b, v, lane) simde_vmlal_high_laneq_s16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vmlal_high_lane_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return simde_vmlal_high_s32(a, b, simde_vdupq_n_s32(simde_int32x2_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_lane_s32(a, b, v, lane) vmlal_high_lane_s32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_lane_s32 + #define vmlal_high_lane_s32(a, b, v, lane) simde_vmlal_high_lane_s32((a), (b), (v), (lane)) +#endif + 
+SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vmlal_high_laneq_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlal_high_s32(a, b, simde_vdupq_n_s32(simde_int32x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_laneq_s32(a, b, v, lane) vmlal_high_laneq_s32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_laneq_s32 + #define vmlal_high_laneq_s32(a, b, v, lane) simde_vmlal_high_laneq_s32((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmlal_high_lane_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlal_high_u16(a, b, simde_vdupq_n_u16(simde_uint16x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_lane_u16(a, b, v, lane) vmlal_high_lane_u16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_lane_u16 + #define vmlal_high_lane_u16(a, b, v, lane) simde_vmlal_high_lane_u16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmlal_high_laneq_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vmlal_high_u16(a, b, simde_vdupq_n_u16(simde_uint16x8_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_laneq_u16(a, b, v, lane) vmlal_high_laneq_u16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_laneq_u16 + #define vmlal_high_laneq_u16(a, b, v, lane) simde_vmlal_high_laneq_u16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vmlal_high_lane_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return simde_vmlal_high_u32(a, b, simde_vdupq_n_u32(simde_uint32x2_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_lane_u32(a, b, v, lane) vmlal_high_lane_u32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_lane_u32 + #define vmlal_high_lane_u32(a, b, v, lane) simde_vmlal_high_lane_u32((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vmlal_high_laneq_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlal_high_u32(a, b, simde_vdupq_n_u32(simde_uint32x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlal_high_laneq_u32(a, b, v, lane) vmlal_high_laneq_u32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlal_high_laneq_u32 + #define vmlal_high_laneq_u32(a, b, v, lane) simde_vmlal_high_laneq_u32((a), (b), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MLAL_HIGH_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/mls.h b/lib/simd_wrapper/simde/arm/neon/mls.h index 83fb42fc734..c92547f7de0 100644 --- a/lib/simd_wrapper/simde/arm/neon/mls.h +++ b/lib/simd_wrapper/simde/arm/neon/mls.h @@ -151,18 +151,13 @@ simde_float32x4_t simde_vmlsq_f32(simde_float32x4_t a, simde_float32x4_t b, simde_float32x4_t c) { #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlsq_f32(a, b, c); - #elif \ - defined(SIMDE_X86_FMA_NATIVE) + #elif defined(SIMDE_X86_FMA_NATIVE) simde_float32x4_private r_, a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b), c_ = simde_float32x4_to_private(c); - - #if defined(SIMDE_X86_FMA_NATIVE) - r_.m128 = _mm_fnmadd_ps(b_.m128, c_.m128, a_.m128); - #endif - + r_.m128 = _mm_fnmadd_ps(b_.m128, c_.m128, a_.m128); return simde_float32x4_from_private(r_); #else return simde_vsubq_f32(a, simde_vmulq_f32(b, c)); @@ -178,18 +173,13 @@ simde_float64x2_t simde_vmlsq_f64(simde_float64x2_t a, simde_float64x2_t b, simde_float64x2_t c) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vmlsq_f64(a, b, c); - #elif \ - defined(SIMDE_X86_FMA_NATIVE) + #elif defined(SIMDE_X86_FMA_NATIVE) simde_float64x2_private r_, a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b), c_ = simde_float64x2_to_private(c); - - #if defined(SIMDE_X86_FMA_NATIVE) - r_.m128d = _mm_fnmadd_pd(b_.m128d, c_.m128d, a_.m128d); - #endif - + r_.m128d = _mm_fnmadd_pd(b_.m128d, c_.m128d, a_.m128d); return simde_float64x2_from_private(r_); #else return simde_vsubq_f64(a, simde_vmulq_f64(b, c)); diff --git a/lib/simd_wrapper/simde/arm/neon/mls_lane.h b/lib/simd_wrapper/simde/arm/neon/mls_lane.h new file mode 100644 index 00000000000..35855a2b7c6 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/mls_lane.h @@ -0,0 +1,240 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MLS_LANE_H) +#define SIMDE_ARM_NEON_MLS_LANE_H + +#include "mls.h" +#include "dup_lane.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmls_lane_f32(a, b, v, lane) vmls_lane_f32((a), (b), (v), (lane)) +#else + #define simde_vmls_lane_f32(a, b, v, lane) simde_vmls_f32((a), (b), simde_vdup_lane_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmls_lane_f32 + #define vmls_lane_f32(a, b, v, lane) simde_vmls_lane_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmls_laneq_f32(a, b, v, lane) vmls_laneq_f32((a), (b), (v), (lane)) +#else + #define simde_vmls_laneq_f32(a, b, v, lane) simde_vmls_f32((a), (b), simde_vdup_laneq_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmls_laneq_f32 + #define vmls_laneq_f32(a, b, v, lane) simde_vmls_laneq_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsq_laneq_f32(a, b, v, lane) vmlsq_laneq_f32((a), (b), (v), (lane)) +#else + #define simde_vmlsq_laneq_f32(a, b, v, lane) simde_vmlsq_f32((a), (b), simde_vdupq_laneq_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsq_laneq_f32 + #define vmlsq_laneq_f32(a, b, v, lane) simde_vmlsq_laneq_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmls_lane_s16(a, b, v, lane) vmls_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vmls_lane_s16(a, b, v, lane) simde_vmls_s16((a), (b), simde_vdup_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmls_lane_s16 + #define vmls_lane_s16(a, b, v, lane) simde_vmls_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmls_laneq_s16(a, b, v, lane) vmls_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vmls_laneq_s16(a, b, v, lane) simde_vmls_s16((a), (b), simde_vdup_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmls_laneq_s16 + #define vmls_laneq_s16(a, b, v, lane) simde_vmls_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsq_laneq_s16(a, b, v, lane) vmlsq_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vmlsq_laneq_s16(a, b, v, lane) simde_vmlsq_s16((a), (b), simde_vdupq_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsq_laneq_s16 + #define vmlsq_laneq_s16(a, b, v, lane) simde_vmlsq_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmls_lane_s32(a, b, v, lane) vmls_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vmls_lane_s32(a, b, v, lane) simde_vmls_s32((a), (b), simde_vdup_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmls_lane_s32 + #define vmls_lane_s32(a, b, v, lane) simde_vmls_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmls_laneq_s32(a, b, v, lane) vmls_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vmls_laneq_s32(a, b, v, lane) simde_vmls_s32((a), (b), simde_vdup_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef 
vmls_laneq_s32 + #define vmls_laneq_s32(a, b, v, lane) simde_vmls_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsq_laneq_s32(a, b, v, lane) vmlsq_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vmlsq_laneq_s32(a, b, v, lane) simde_vmlsq_s32((a), (b), simde_vdupq_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsq_laneq_s32 + #define vmlsq_laneq_s32(a, b, v, lane) simde_vmlsq_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmls_lane_u16(a, b, v, lane) vmls_lane_u16((a), (b), (v), (lane)) +#else + #define simde_vmls_lane_u16(a, b, v, lane) simde_vmls_u16((a), (b), simde_vdup_lane_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmls_lane_u16 + #define vmls_lane_u16(a, b, v, lane) simde_vmls_lane_u16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmls_laneq_u16(a, b, v, lane) vmls_laneq_u16((a), (b), (v), (lane)) +#else + #define simde_vmls_laneq_u16(a, b, v, lane) simde_vmls_u16((a), (b), simde_vdup_laneq_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmls_laneq_u16 + #define vmls_laneq_u16(a, b, v, lane) simde_vmls_laneq_u16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsq_laneq_u16(a, b, v, lane) vmlsq_laneq_u16((a), (b), (v), (lane)) +#else + #define simde_vmlsq_laneq_u16(a, b, v, lane) simde_vmlsq_u16((a), (b), simde_vdupq_laneq_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsq_laneq_u16 + #define vmlsq_laneq_u16(a, b, v, lane) simde_vmlsq_laneq_u16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmls_lane_u32(a, b, v, lane) vmls_lane_u32((a), (b), (v), (lane)) +#else + #define simde_vmls_lane_u32(a, b, v, lane) simde_vmls_u32((a), (b), simde_vdup_lane_u32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmls_lane_u32 + #define vmls_lane_u32(a, b, v, lane) simde_vmls_lane_u32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmls_laneq_u32(a, b, v, lane) vmls_laneq_u32((a), (b), (v), (lane)) +#else + #define simde_vmls_laneq_u32(a, b, v, lane) simde_vmls_u32((a), (b), simde_vdup_laneq_u32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmls_laneq_u32 + #define vmls_laneq_u32(a, b, v, lane) simde_vmls_laneq_u32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsq_laneq_u32(a, b, v, lane) vmlsq_laneq_u32((a), (b), (v), (lane)) +#else + #define simde_vmlsq_laneq_u32(a, b, v, lane) simde_vmlsq_u32((a), (b), simde_vdupq_laneq_u32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsq_laneq_u32 + #define vmlsq_laneq_u32(a, b, v, lane) simde_vmlsq_laneq_u32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlsq_lane_f32(a, b, v, lane) vmlsq_lane_f32((a), (b), (v), (lane)) +#else + #define simde_vmlsq_lane_f32(a, b, v, lane) simde_vmlsq_f32((a), (b), simde_vdupq_lane_f32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlsq_lane_f32 + #define vmlsq_lane_f32(a, b, v, lane) simde_vmlsq_lane_f32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define 
simde_vmlsq_lane_s16(a, b, v, lane) vmlsq_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vmlsq_lane_s16(a, b, v, lane) simde_vmlsq_s16((a), (b), simde_vdupq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlsq_lane_s16 + #define vmlsq_lane_s16(a, b, v, lane) simde_vmlsq_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlsq_lane_s32(a, b, v, lane) vmlsq_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vmlsq_lane_s32(a, b, v, lane) simde_vmlsq_s32((a), (b), simde_vdupq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlsq_lane_s32 + #define vmlsq_lane_s32(a, b, v, lane) simde_vmlsq_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlsq_lane_u16(a, b, v, lane) vmlsq_lane_u16((a), (b), (v), (lane)) +#else + #define simde_vmlsq_lane_u16(a, b, v, lane) simde_vmlsq_u16((a), (b), simde_vdupq_lane_u16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlsq_lane_u16 + #define vmlsq_lane_u16(a, b, v, lane) simde_vmlsq_lane_u16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vmlsq_lane_u32(a, b, v, lane) vmlsq_lane_u32((a), (b), (v), (lane)) +#else + #define simde_vmlsq_lane_u32(a, b, v, lane) simde_vmlsq_u32((a), (b), simde_vdupq_lane_u32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmlsq_lane_u32 + #define vmlsq_lane_u32(a, b, v, lane) simde_vmlsq_lane_u32((a), (b), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MLS_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/mlsl_high_lane.h b/lib/simd_wrapper/simde/arm/neon/mlsl_high_lane.h new file mode 100644 index 00000000000..f45b7d98930 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/mlsl_high_lane.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MLSL_HIGH_LANE_H) +#define SIMDE_ARM_NEON_MLSL_HIGH_LANE_H + +#include "movl_high.h" +#include "mlsl_high.h" +#include "dup_n.h" +#include "mls.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmlsl_high_lane_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlsl_high_s16(a, b, simde_vdupq_n_s16(simde_int16x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_lane_s16(a, b, v, lane) vmlsl_high_lane_s16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_lane_s16 + #define vmlsl_high_lane_s16(a, b, v, lane) simde_vmlsl_high_lane_s16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmlsl_high_laneq_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vmlsl_high_s16(a, b, simde_vdupq_n_s16(simde_int16x8_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_laneq_s16(a, b, v, lane) vmlsl_high_laneq_s16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_laneq_s16 + #define vmlsl_high_laneq_s16(a, b, v, lane) simde_vmlsl_high_laneq_s16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vmlsl_high_lane_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return simde_vmlsl_high_s32(a, b, simde_vdupq_n_s32(simde_int32x2_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_lane_s32(a, b, v, lane) vmlsl_high_lane_s32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_lane_s32 + #define vmlsl_high_lane_s32(a, b, v, lane) simde_vmlsl_high_lane_s32((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vmlsl_high_laneq_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlsl_high_s32(a, b, simde_vdupq_n_s32(simde_int32x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_laneq_s32(a, b, v, lane) vmlsl_high_laneq_s32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_laneq_s32 + #define vmlsl_high_laneq_s32(a, b, v, lane) simde_vmlsl_high_laneq_s32((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmlsl_high_lane_u16(simde_uint32x4_t a, simde_uint16x8_t b, simde_uint16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlsl_high_u16(a, b, simde_vdupq_n_u16(simde_uint16x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_lane_u16(a, b, v, lane) vmlsl_high_lane_u16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_lane_u16 + #define vmlsl_high_lane_u16(a, b, v, lane) simde_vmlsl_high_lane_u16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmlsl_high_laneq_u16(simde_uint32x4_t a, 
simde_uint16x8_t b, simde_uint16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vmlsl_high_u16(a, b, simde_vdupq_n_u16(simde_uint16x8_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_laneq_u16(a, b, v, lane) vmlsl_high_laneq_u16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_laneq_u16 + #define vmlsl_high_laneq_u16(a, b, v, lane) simde_vmlsl_high_laneq_u16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vmlsl_high_lane_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return simde_vmlsl_high_u32(a, b, simde_vdupq_n_u32(simde_uint32x2_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_lane_u32(a, b, v, lane) vmlsl_high_lane_u32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_lane_u32 + #define vmlsl_high_lane_u32(a, b, v, lane) simde_vmlsl_high_lane_u32((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vmlsl_high_laneq_u32(simde_uint64x2_t a, simde_uint32x4_t b, simde_uint32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmlsl_high_u32(a, b, simde_vdupq_n_u32(simde_uint32x4_to_private(v).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmlsl_high_laneq_u32(a, b, v, lane) vmlsl_high_laneq_u32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmlsl_high_laneq_u32 + #define vmlsl_high_laneq_u32(a, b, v, lane) simde_vmlsl_high_laneq_u32((a), (b), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MLSL_HIGH_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/mmlaq.h b/lib/simd_wrapper/simde/arm/neon/mmlaq.h new file mode 100644 index 00000000000..a5685385305 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/mmlaq.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MMLAQ_H) +#define SIMDE_ARM_NEON_MMLAQ_H + +#include "types.h" +#include "cgt.h" +#include "bsl.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmmlaq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_int8x16_t b) { + // I8MM is optional feature. src: https://patchwork.ffmpeg.org/project/ffmpeg/patch/20230530123043.52940-2-martin@martin.st/ + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + return vmmlaq_s32(r, a, b); + #else + simde_int8x16_private + a_ = simde_int8x16_to_private(a), + b_ = simde_int8x16_to_private(b); + simde_int32x4_private + r_ = simde_int32x4_to_private(r), + ret; + + for (size_t k = 0 ; k < (sizeof(ret.values) / sizeof(ret.values[0])) ; k++) { + ret.values[k] = r_.values[k]; + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0]) / 2) ; i++) { + ret.values[k] += a_.values[(k/2)*8+i] * b_.values[(k%2)*8+i]; + } + } + return simde_int32x4_from_private(ret); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmmlaq_s32 + #define vmmlaq_s32(r, a, b) simde_vmmlaq_s32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmmlaq_u32(simde_uint32x4_t r, simde_uint8x16_t a, simde_uint8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + return vmmlaq_u32(r, a, b); + #else + simde_uint8x16_private + a_ = simde_uint8x16_to_private(a), + b_ = simde_uint8x16_to_private(b); + simde_uint32x4_private + r_ = simde_uint32x4_to_private(r), + ret; + + for (size_t k = 0 ; k < (sizeof(ret.values) / sizeof(ret.values[0])) ; k++) { + ret.values[k] = r_.values[k]; + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0]) / 2) ; i++) { + ret.values[k] += a_.values[(k/2)*8+i] * b_.values[(k%2)*8+i]; + } + } + return simde_uint32x4_from_private(ret); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmmlaq_u32 + #define vmmlaq_u32(r, a, b) simde_vmmlaq_u32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vusmmlaq_s32(simde_int32x4_t r, simde_uint8x16_t a, simde_int8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + return vusmmlaq_s32(r, a, b); + #else + simde_uint8x16_private + a_ = simde_uint8x16_to_private(a); + simde_int8x16_private + b_ = simde_int8x16_to_private(b); + simde_int32x4_private + r_ = simde_int32x4_to_private(r), + ret; + + for (size_t k = 0 ; k < (sizeof(ret.values) / sizeof(ret.values[0])) ; k++) { + ret.values[k] = r_.values[k]; + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0]) / 2) ; i++) { + ret.values[k] += a_.values[(k/2)*8+i] * b_.values[(k%2)*8+i]; + } + } + return simde_int32x4_from_private(ret); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vusmmlaq_s32 + #define vusmmlaq_s32(r, a, b) simde_vusmmlaq_s32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vbfmmlaq_f32(simde_float32x4_t r, simde_bfloat16x8_t a, simde_bfloat16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) && \ + defined(SIMDE_ARM_NEON_BF16) + return vbfmmlaq_f32(r, a, b); + #else + simde_bfloat16x8_private + a_ = simde_bfloat16x8_to_private(a), + b_ = simde_bfloat16x8_to_private(b); + simde_float32x4_private + r_ = 
simde_float32x4_to_private(r), + ret; + + for (size_t k = 0 ; k < (sizeof(ret.values) / sizeof(ret.values[0])) ; k++) { + ret.values[k] = r_.values[k]; + for (size_t i = 0 ; i < (sizeof(a_.values) / sizeof(a_.values[0]) / 2) ; i++) { + ret.values[k] += simde_bfloat16_to_float32(a_.values[(k/2)*4+i]) * + simde_bfloat16_to_float32(b_.values[(k%2)*4+i]); + } + } + return simde_float32x4_from_private(ret); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vbfmmlaq_f32 + #define vbfmmlaq_f32(r, a, b) simde_vbfmmlaq_f32((r), (a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MMLAQ_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/mul.h b/lib/simd_wrapper/simde/arm/neon/mul.h index 48de8a24031..590b0eae54a 100644 --- a/lib/simd_wrapper/simde/arm/neon/mul.h +++ b/lib/simd_wrapper/simde/arm/neon/mul.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Yung-Cheng Su (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_MUL_H) @@ -36,6 +38,49 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmulh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmulh_f16(a, b); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + simde_float32_t b_ = simde_float16_to_float32(b); + + return simde_float16_from_float32(a_ * b_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmulh_f16 + #define vmulh_f16(a, b) simde_vmulh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmul_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmul_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t tmp_a_ = simde_float16_to_float32(a_.values[i]); + simde_float32_t tmp_b_ = simde_float16_to_float32(b_.values[i]); + r_.values[i] = simde_float16_from_float32(tmp_a_ * tmp_b_); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmul_f16 + #define vmul_f16(a, b) simde_vmul_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vmul_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -47,7 +92,9 @@ simde_vmul_f32(simde_float32x2_t a, simde_float32x2_t b) { a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vv_f32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -75,7 +122,9 @@ simde_vmul_f64(simde_float64x1_t a, simde_float64x1_t b) { a_ = simde_float64x1_to_private(a), b_ = simde_float64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vv_f64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -103,7 +152,9 @@ simde_vmul_s8(simde_int8x8_t a, simde_int8x8_t b) { a_ = simde_int8x8_to_private(a), b_ = 
simde_int8x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_i8m1(a_.sv64, b_.sv64, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -133,6 +184,8 @@ simde_vmul_s16(simde_int16x4_t a, simde_int16x4_t b) { #if defined(SIMDE_X86_MMX_NATIVE) r_.m64 = _m_pmullw(a_.m64, b_.m64); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_i16m1(a_.sv64, b_.sv64, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else @@ -161,7 +214,9 @@ simde_vmul_s32(simde_int32x2_t a, simde_int32x2_t b) { a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_i32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -186,7 +241,9 @@ simde_x_vmul_s64(simde_int64x1_t a, simde_int64x1_t b) { a_ = simde_int64x1_to_private(a), b_ = simde_int64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_i64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -209,7 +266,9 @@ simde_vmul_u8(simde_uint8x8_t a, simde_uint8x8_t b) { a_ = simde_uint8x8_to_private(a), b_ = simde_uint8x8_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_u8m1(a_.sv64, b_.sv64, 8); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -237,7 +296,9 @@ simde_vmul_u16(simde_uint16x4_t a, simde_uint16x4_t b) { a_ = simde_uint16x4_to_private(a), b_ = simde_uint16x4_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_u16m1(a_.sv64, b_.sv64, 4); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -265,7 +326,9 @@ simde_vmul_u32(simde_uint32x2_t a, simde_uint32x2_t b) { a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_u32m1(a_.sv64, b_.sv64, 2); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_GCC_100762) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -290,7 +353,9 @@ simde_x_vmul_u64(simde_uint64x1_t a, simde_uint64x1_t b) { a_ = simde_uint64x1_to_private(a), b_ = simde_uint64x1_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vv_u64m1(a_.sv64, b_.sv64, 1); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else SIMDE_VECTORIZE @@ -302,6 +367,32 @@ simde_x_vmul_u64(simde_uint64x1_t a, simde_uint64x1_t b) { return simde_uint64x1_from_private(r_); } +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return 
vmulq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t tmp_a_ = simde_float16_to_float32(a_.values[i]); + simde_float32_t tmp_b_ = simde_float16_to_float32(b_.values[i]); + r_.values[i] = simde_float16_from_float32(tmp_a_ * tmp_b_); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmulq_f16 + #define vmulq_f16(a, b) simde_vmulq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vmulq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -317,6 +408,8 @@ simde_vmulq_f32(simde_float32x4_t a, simde_float32x4_t b) { r_.m128 = _mm_mul_ps(a_.m128, b_.m128); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_mul(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vv_f32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -349,6 +442,8 @@ simde_vmulq_f64(simde_float64x2_t a, simde_float64x2_t b) { r_.m128d = _mm_mul_pd(a_.m128d, b_.m128d); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_mul(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vv_f64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -400,6 +495,8 @@ simde_vmulq_s8(simde_int8x16_t a, simde_int8x16_t b) { ) #endif ); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vv_i8m1(a_.sv128, b_.sv128, 16); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -430,6 +527,8 @@ simde_vmulq_s16(simde_int16x8_t a, simde_int16x8_t b) { #if defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_mullo_epi16(a_.m128i, b_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vv_i16m1(a_.sv128, b_.sv128, 8); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -460,6 +559,8 @@ simde_vmulq_s32(simde_int32x4_t a, simde_int32x4_t b) { #if defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_i32x4_mul(a_.v128, b_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vv_i32m1(a_.sv128, b_.sv128, 4); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -489,6 +590,8 @@ simde_x_vmulq_s64(simde_int64x2_t a, simde_int64x2_t b) { r_.v128 = wasm_i64x2_mul(a_.v128, b_.v128); #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) r_.m128i = _mm_mullo_epi64(a_.m128i, b_.m128i); + #elif defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vv_i64m1(a_.sv128, b_.sv128, 2); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.values = a_.values * b_.values; #else @@ -506,6 +609,13 @@ simde_uint8x16_t simde_vmulq_u8(simde_uint8x16_t a, simde_uint8x16_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmulq_u8(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a), + b_ = simde_uint8x16_to_private(b); + r_.sv128 = __riscv_vmul_vv_u8m1(a_.sv128, b_.sv128, 16); + return simde_uint8x16_from_private(r_); #else return simde_vreinterpretq_u8_s8( @@ -526,6 +636,13 @@ simde_uint16x8_t simde_vmulq_u16(simde_uint16x8_t a, simde_uint16x8_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmulq_u16(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private + r_, + 
a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b); + r_.sv128 = __riscv_vmul_vv_u16m1(a_.sv128, b_.sv128, 8); + return simde_uint16x8_from_private(r_); #else return simde_vreinterpretq_u16_s16( @@ -546,6 +663,13 @@ simde_uint32x4_t simde_vmulq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmulq_u32(a, b); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b); + r_.sv128 = __riscv_vmul_vv_u32m1(a_.sv128, b_.sv128, 4); + return simde_uint32x4_from_private(r_); #else return simde_vreinterpretq_u32_s32( @@ -564,14 +688,85 @@ simde_vmulq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { SIMDE_FUNCTION_ATTRIBUTES simde_uint64x2_t simde_x_vmulq_u64(simde_uint64x2_t a, simde_uint64x2_t b) { - return - simde_vreinterpretq_u64_s64( - simde_x_vmulq_s64( - simde_vreinterpretq_s64_u64(a), - simde_vreinterpretq_s64_u64(b) - ) - ); + #if defined(SIMDE_RISCV_V_NATIVE) + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a), + b_ = simde_uint64x2_to_private(b); + r_.sv128 = __riscv_vmul_vv_u64m1(a_.sv128, b_.sv128, 2); + return simde_uint64x2_from_private(r_); + #else + return + simde_vreinterpretq_u64_s64( + simde_x_vmulq_s64( + simde_vreinterpretq_s64_u64(a), + simde_vreinterpretq_s64_u64(b) + ) + ); + #endif +} + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vmul_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vmul_p8(a, b); + #else + simde_uint8x8_private + r_, + a_ = simde_uint8x8_to_private(simde_vreinterpret_u8_p8(a)), + b_ = simde_uint8x8_to_private(simde_vreinterpret_u8_p8(b)); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + uint16_t extend_op2 = HEDLEY_STATIC_CAST(uint16_t, b_.values[i]); + uint16_t result = 0; + for(uint16_t j = 0; j < 8; ++j) { + if (a_.values[i] & (1 << j)) { + result = HEDLEY_STATIC_CAST(uint16_t, result ^ (extend_op2 << j)); + } + } + r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, (result & (0xFF))); + } + + return simde_vreinterpret_p8_u8(simde_uint8x8_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmul_p8 + #define vmul_p8(a, b) simde_vmul_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vmulq_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vmulq_p8(a, b); + #else + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(simde_vreinterpretq_u8_p8(a)), + b_ = simde_uint8x16_to_private(simde_vreinterpretq_u8_p8(b)); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + uint16_t extend_op2 = HEDLEY_STATIC_CAST(uint16_t, b_.values[i]); + uint16_t result = 0; + for(uint16_t j = 0; j < 8; ++j) { + if (a_.values[i] & (1 << j)) { + result = HEDLEY_STATIC_CAST(uint16_t, result ^ (extend_op2 << j)); + } + } + r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, (result & (0xFF))); + } + + return simde_vreinterpretq_p8_u8(simde_uint8x16_from_private(r_)); + #endif } +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmulq_p8 + #define vmulq_p8(a, b) simde_vmulq_p8((a), (b)) +#endif SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/mul_lane.h b/lib/simd_wrapper/simde/arm/neon/mul_lane.h index f7b1f2e5141..72c032eea62 100644 --- a/lib/simd_wrapper/simde/arm/neon/mul_lane.h +++ 
b/lib/simd_wrapper/simde/arm/neon/mul_lane.h @@ -22,17 +22,39 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Yung-Cheng Su */ #if !defined(SIMDE_ARM_NEON_MUL_LANE_H) #define SIMDE_ARM_NEON_MUL_LANE_H #include "types.h" +#include "mul.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmulh_lane_f16(simde_float16_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vmulh_f16(a, simde_float16x4_to_private(b).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vmulh_lane_f16(a, b, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vmulh_lane_f16(a, b, lane)) + #else + #define simde_vmulh_lane_f16(a, b, lane) vmulh_lane_f16((a), (b), (lane)) + #endif +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulh_lane_f16 + #define vmulh_lane_f16(a, b, lane) simde_vmulh_lane_f16(a, b, lane) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float64_t simde_vmuld_lane_f64(simde_float64_t a, simde_float64x1_t b, const int lane) @@ -90,6 +112,25 @@ simde_vmuls_lane_f32(simde_float32_t a, simde_float32x2_t b, const int lane) #define vmuls_lane_f32(a, b, lane) simde_vmuls_lane_f32(a, b, lane) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmulh_laneq_f16(simde_float16_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vmulh_f16(a, simde_float16x8_to_private(b).values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + #define simde_vmulh_laneq_f16(a, b, lane) \ + SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vmulh_laneq_f16(a, b, lane)) + #else + #define simde_vmulh_laneq_f16(a, b, lane) vmulh_laneq_f16((a), (b), (lane)) + #endif +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulh_laneq_f16 + #define vmulh_laneq_f16(a, b, lane) simde_vmulh_laneq_f16(a, b, lane) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vmuls_laneq_f32(simde_float32_t a, simde_float32x4_t b, const int lane) @@ -109,6 +150,30 @@ simde_vmuls_laneq_f32(simde_float32_t a, simde_float32x4_t b, const int lane) #define vmuls_laneq_f32(a, b, lane) simde_vmuls_laneq_f32(a, b, lane) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmul_lane_f16(simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmulh_f16(a_.values[i], b_.values[lane]); + } + + return simde_float16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmul_lane_f16(a, b, lane) vmul_lane_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmul_lane_f16 + #define vmul_lane_f16(a, b, lane) simde_vmul_lane_f16((a), (b), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vmul_lane_f32(simde_float32x2_t a, simde_float32x2_t b, const int lane) @@ -118,10 +183,14 @@ 
simde_vmul_lane_f32(simde_float32x2_t a, simde_float32x2_t b, const int lane) a_ = simde_float32x2_to_private(a), b_ = simde_float32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x2_from_private(r_); } @@ -142,10 +211,14 @@ simde_vmul_lane_f64(simde_float64x1_t a, simde_float64x1_t b, const int lane) a_ = simde_float64x1_to_private(a), b_ = simde_float64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, b_.values[lane], 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x1_from_private(r_); } @@ -166,10 +239,14 @@ simde_vmul_lane_s16(simde_int16x4_t a, simde_int16x4_t b, const int lane) a_ = simde_int16x4_to_private(a), b_ = simde_int16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_i16m1(a_.sv64, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int16x4_from_private(r_); } @@ -190,10 +267,14 @@ simde_vmul_lane_s32(simde_int32x2_t a, simde_int32x2_t b, const int lane) a_ = simde_int32x2_to_private(a), b_ = simde_int32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_i32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int32x2_from_private(r_); } @@ -214,10 +295,14 @@ simde_vmul_lane_u16(simde_uint16x4_t a, simde_uint16x4_t b, const int lane) a_ = simde_uint16x4_to_private(a), b_ = simde_uint16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_u16m1(a_.sv64, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint16x4_from_private(r_); } @@ -238,10 +323,14 @@ simde_vmul_lane_u32(simde_uint32x2_t a, simde_uint32x2_t b, const int lane) a_ = simde_uint32x2_to_private(a), b_ = simde_uint32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_u32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for 
(size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint32x2_from_private(r_); } @@ -263,10 +352,14 @@ simde_vmul_laneq_s16(simde_int16x4_t a, simde_int16x8_t b, const int lane) simde_int16x8_private b_ = simde_int16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_i16m1(a_.sv64, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int16x4_from_private(r_); } @@ -288,10 +381,14 @@ simde_vmul_laneq_s32(simde_int32x2_t a, simde_int32x4_t b, const int lane) simde_int32x4_private b_ = simde_int32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_i32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int32x2_from_private(r_); } @@ -313,10 +410,14 @@ simde_vmul_laneq_u16(simde_uint16x4_t a, simde_uint16x8_t b, const int lane) simde_uint16x8_private b_ = simde_uint16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_u16m1(a_.sv64, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint16x4_from_private(r_); } @@ -338,10 +439,14 @@ simde_vmul_laneq_u32(simde_uint32x2_t a, simde_uint32x4_t b, const int lane) simde_uint32x4_private b_ = simde_uint32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vmul_vx_u32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint32x2_from_private(r_); } @@ -353,6 +458,30 @@ simde_vmul_laneq_u32(simde_uint32x2_t a, simde_uint32x4_t b, const int lane) #define vmul_laneq_u32(a, b, lane) simde_vmul_laneq_u32((a), (b), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulq_lane_f16(simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + simde_float16x4_private b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmulh_f16(a_.values[i], b_.values[lane]); + } + + return simde_float16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulq_lane_f16(a, b, lane) vmulq_lane_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef 
vmulq_lane_f16 + #define vmulq_lane_f16(a, b, lane) simde_vmulq_lane_f16((a), (b), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vmulq_lane_f32(simde_float32x4_t a, simde_float32x2_t b, const int lane) @@ -362,10 +491,14 @@ simde_vmulq_lane_f32(simde_float32x4_t a, simde_float32x2_t b, const int lane) a_ = simde_float32x4_to_private(a); simde_float32x2_private b_ = simde_float32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x4_from_private(r_); } @@ -386,10 +519,14 @@ simde_vmulq_lane_f64(simde_float64x2_t a, simde_float64x1_t b, const int lane) a_ = simde_float64x2_to_private(a); simde_float64x1_private b_ = simde_float64x1_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x2_from_private(r_); } @@ -410,10 +547,14 @@ simde_vmulq_lane_s16(simde_int16x8_t a, simde_int16x4_t b, const int lane) a_ = simde_int16x8_to_private(a); simde_int16x4_private b_ = simde_int16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_i16m1(a_.sv128, b_.values[lane], 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int16x8_from_private(r_); } @@ -434,10 +575,14 @@ simde_vmulq_lane_s32(simde_int32x4_t a, simde_int32x2_t b, const int lane) a_ = simde_int32x4_to_private(a); simde_int32x2_private b_ = simde_int32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_i32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int32x4_from_private(r_); } @@ -458,10 +603,14 @@ simde_vmulq_lane_u16(simde_uint16x8_t a, simde_uint16x4_t b, const int lane) a_ = simde_uint16x8_to_private(a); simde_uint16x4_private b_ = simde_uint16x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_u16m1(a_.sv128, b_.values[lane], 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint16x8_from_private(r_); } @@ -482,10 +631,14 @@ simde_vmulq_lane_u32(simde_uint32x4_t a, 
simde_uint32x2_t b, const int lane) a_ = simde_uint32x4_to_private(a); simde_uint32x2_private b_ = simde_uint32x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_u32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint32x4_from_private(r_); } @@ -497,6 +650,30 @@ simde_vmulq_lane_u32(simde_uint32x4_t a, simde_uint32x2_t b, const int lane) #define vmulq_lane_u32(a, b, lane) simde_vmulq_lane_u32((a), (b), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulq_laneq_f16(simde_float16x8_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmulh_f16(a_.values[i], b_.values[lane]); + } + + return simde_float16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulq_laneq_f16(a, b, lane) vmulq_laneq_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulq_laneq_f16 + #define vmulq_laneq_f16(a, b, lane) simde_vmulq_laneq_f16((a), (b), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vmulq_laneq_f32(simde_float32x4_t a, simde_float32x4_t b, const int lane) @@ -506,10 +683,14 @@ simde_vmulq_laneq_f32(simde_float32x4_t a, simde_float32x4_t b, const int lane) a_ = simde_float32x4_to_private(a), b_ = simde_float32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x4_from_private(r_); } @@ -530,10 +711,14 @@ simde_vmulq_laneq_f64(simde_float64x2_t a, simde_float64x2_t b, const int lane) a_ = simde_float64x2_to_private(a), b_ = simde_float64x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x2_from_private(r_); } @@ -554,10 +739,14 @@ simde_vmulq_laneq_s16(simde_int16x8_t a, simde_int16x8_t b, const int lane) a_ = simde_int16x8_to_private(a), b_ = simde_int16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_i16m1(a_.sv128, b_.values[lane], 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * 
b_.values[lane]; + } + #endif return simde_int16x8_from_private(r_); } @@ -578,10 +767,14 @@ simde_vmulq_laneq_s32(simde_int32x4_t a, simde_int32x4_t b, const int lane) a_ = simde_int32x4_to_private(a), b_ = simde_int32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_i32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_int32x4_from_private(r_); } @@ -602,10 +795,14 @@ simde_vmulq_laneq_u16(simde_uint16x8_t a, simde_uint16x8_t b, const int lane) a_ = simde_uint16x8_to_private(a), b_ = simde_uint16x8_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_u16m1(a_.sv128, b_.values[lane], 8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint16x8_from_private(r_); } @@ -626,10 +823,14 @@ simde_vmulq_laneq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int lane) a_ = simde_uint32x4_to_private(a), b_ = simde_uint32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vmul_vx_u32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_uint32x4_from_private(r_); } @@ -641,6 +842,30 @@ simde_vmulq_laneq_u32(simde_uint32x4_t a, simde_uint32x4_t b, const int lane) #define vmulq_laneq_u32(a, b, lane) simde_vmulq_laneq_u32((a), (b), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmul_laneq_f16(simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + simde_float16x8_private b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmulh_f16(a_.values[i], b_.values[lane]); + } + + return simde_float16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmul_laneq_f16(a, b, lane) vmul_laneq_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmul_laneq_f16 + #define vmul_laneq_f16(a, b, lane) simde_vmul_laneq_f16((a), (b), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vmul_laneq_f32(simde_float32x2_t a, simde_float32x4_t b, const int lane) @@ -650,10 +875,14 @@ simde_vmul_laneq_f32(simde_float32x2_t a, simde_float32x4_t b, const int lane) a_ = simde_float32x2_to_private(a); simde_float32x4_private b_ = simde_float32x4_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, b_.values[lane], 2); + 
#else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float32x2_from_private(r_); } @@ -674,10 +903,14 @@ simde_vmul_laneq_f64(simde_float64x1_t a, simde_float64x2_t b, const int lane) a_ = simde_float64x1_to_private(a); simde_float64x2_private b_ = simde_float64x2_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - r_.values[i] = a_.values[i] * b_.values[lane]; - } + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, b_.values[lane], 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif return simde_float64x1_from_private(r_); } diff --git a/lib/simd_wrapper/simde/arm/neon/mul_n.h b/lib/simd_wrapper/simde/arm/neon/mul_n.h index 5c73ad2e7f8..53375427933 100644 --- a/lib/simd_wrapper/simde/arm/neon/mul_n.h +++ b/lib/simd_wrapper/simde/arm/neon/mul_n.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MUL_N_H) @@ -36,6 +37,20 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmul_n_f16(simde_float16x4_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmul_n_f16(a, b); + #else + return simde_vmul_f16(a, simde_vdup_n_f16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmul_n_f16 + #define vmul_n_f16(a, b) simde_vmul_n_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vmul_n_f32(simde_float32x2_t a, simde_float32 b) { @@ -120,6 +135,20 @@ simde_vmul_n_u32(simde_uint32x2_t a, uint32_t b) { #define vmul_n_u32(a, b) simde_vmul_n_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulq_n_f16(simde_float16x8_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmulq_n_f16(a, b); + #else + return simde_vmulq_f16(a, simde_vdupq_n_f16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmulq_n_f16 + #define vmulq_n_f16(a, b) simde_vmulq_n_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vmulq_n_f32(simde_float32x4_t a, simde_float32 b) { diff --git a/lib/simd_wrapper/simde/arm/neon/mull.h b/lib/simd_wrapper/simde/arm/neon/mull.h index bfad62a2f3c..cd5c9112f49 100644 --- a/lib/simd_wrapper/simde/arm/neon/mull.h +++ b/lib/simd_wrapper/simde/arm/neon/mull.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MULL_H) @@ -230,6 +231,62 @@ simde_vmull_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vmull_u32(a, b) simde_vmull_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vmull_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vmull_p8(a, b); + #else + simde_uint8x8_private + a_ = simde_uint8x8_to_private(simde_vreinterpret_u8_p8(a)), + b_ = simde_uint8x8_to_private(simde_vreinterpret_u8_p8(b)); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / 
sizeof(r_.values[0])) ; i++) { + uint16_t extend_op2 = HEDLEY_STATIC_CAST(uint16_t, b_.values[i]); + uint16_t result = 0; + for(size_t j = 0; j < 8; ++j) { + if (a_.values[i] & (1 << j)) { + result = HEDLEY_STATIC_CAST(uint16_t, result ^ (extend_op2 << j)); + } + } + r_.values[i] = result; + } + + return simde_vreinterpretq_p16_u16(simde_uint16x8_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmull_p8 + #define vmull_p8(a, b) simde_vmull_p8((a), (b)) +#endif + +#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vmull_p64(simde_poly64_t a, simde_poly64_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vmull_p64(a, b); + #else + simde_poly128_t extend_op2 = HEDLEY_STATIC_CAST(simde_poly128_t, b); + simde_poly128_t result = 0; + SIMDE_VECTORIZE + for(size_t j = 0; j < 64; ++j) { + if (a & (1ull << j)) { + result = result ^ (extend_op2 << j); + } + } + return result; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vmull_p64 + #define vmull_p64(a, b) simde_vmull_p64((a), (b)) +#endif + +#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */ + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/mull_high.h b/lib/simd_wrapper/simde/arm/neon/mull_high.h index 658d151f709..87e83369a96 100644 --- a/lib/simd_wrapper/simde/arm/neon/mull_high.h +++ b/lib/simd_wrapper/simde/arm/neon/mull_high.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MULL_HIGH_H) @@ -30,6 +31,7 @@ #include "types.h" #include "mul.h" #include "movl_high.h" +#include "mull.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -119,6 +121,57 @@ simde_vmull_high_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #define vmull_high_u32(a, b) simde_vmull_high_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vmull_high_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmull_high_p8(a, b); + #else + simde_uint8x16_private + a_ = simde_uint8x16_to_private(simde_vreinterpretq_u8_p8(a)), + b_ = simde_uint8x16_to_private(simde_vreinterpretq_u8_p8(b)); + simde_uint16x8_private r_; + + size_t high_offset = (sizeof(r_.values) / sizeof(r_.values[0])); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + uint16_t extend_op2 = HEDLEY_STATIC_CAST(uint16_t, b_.values[i+high_offset]); + uint16_t result = 0; + for(size_t j = 0; j < 8; ++j) { + if (a_.values[i+high_offset] & (1 << j)) { + result = HEDLEY_STATIC_CAST(uint16_t, result ^ (extend_op2 << j)); + } + } + r_.values[i] = result; + } + + return simde_vreinterpretq_p16_u16(simde_uint16x8_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_p8 + #define vmull_high_p8(a, b) simde_vmull_high_p8((a), (b)) +#endif + +#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vmull_high_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vmull_high_p64(a, b); + #else + simde_poly64x2_private + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + return simde_vmull_p64(a_.values[1], b_.values[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef 
vmull_high_p64 + #define vmull_high_p64(a, b) simde_vmull_high_p64((a), (b)) +#endif +#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */ + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/mull_high_lane.h b/lib/simd_wrapper/simde/arm/neon/mull_high_lane.h new file mode 100644 index 00000000000..226dbf862dc --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/mull_high_lane.h @@ -0,0 +1,170 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MULL_HIGH_LANE_H) +#define SIMDE_ARM_NEON_MULL_HIGH_LANE_H + +#include "combine.h" +#include "mull.h" +#include "dup_n.h" +#include "get_high.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmull_high_lane_s16(simde_int16x8_t a, simde_int16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int16x4_private + v_ = simde_int16x4_to_private(v); + return simde_vmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmull_high_lane_s16(a, v, lane) vmull_high_lane_s16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_lane_s16 + #define vmull_high_lane_s16(a, v, lane) simde_vmull_high_lane_s16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmull_high_laneq_s16(simde_int16x8_t a, simde_int16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_int16x8_private + v_ = simde_int16x8_to_private(v); + return simde_vmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmull_high_laneq_s16(a, v, lane) vmull_high_laneq_s16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_laneq_s16 + #define vmull_high_laneq_s16(a, v, lane) simde_vmull_high_laneq_s16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vmull_high_lane_s32(simde_int32x4_t a, simde_int32x2_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x2_private + v_ = simde_int32x2_to_private(v); + return simde_vmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define 
simde_vmull_high_lane_s32(a, v, lane) vmull_high_lane_s32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_lane_s32 + #define vmull_high_lane_s32(a, v, lane) simde_vmull_high_lane_s32((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vmull_high_laneq_s32(simde_int32x4_t a, simde_int32x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4_private + v_ = simde_int32x4_to_private(v); + return simde_vmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmull_high_laneq_s32(a, v, lane) vmull_high_laneq_s32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_laneq_s32 + #define vmull_high_laneq_s32(a, v, lane) simde_vmull_high_laneq_s32((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmull_high_lane_u16(simde_uint16x8_t a, simde_uint16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_uint16x4_private + v_ = simde_uint16x4_to_private(v); + return simde_vmull_u16(simde_vget_high_u16(a), simde_vdup_n_u16(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmull_high_lane_u16(a, v, lane) vmull_high_lane_u16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_lane_u16 + #define vmull_high_lane_u16(a, v, lane) simde_vmull_high_lane_u16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmull_high_laneq_u16(simde_uint16x8_t a, simde_uint16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_uint16x8_private + v_ = simde_uint16x8_to_private(v); + return simde_vmull_u16(simde_vget_high_u16(a), simde_vdup_n_u16(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmull_high_laneq_u16(a, v, lane) vmull_high_laneq_u16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_laneq_u16 + #define vmull_high_laneq_u16(a, v, lane) simde_vmull_high_laneq_u16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vmull_high_lane_u32(simde_uint32x4_t a, simde_uint32x2_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_uint32x2_private + v_ = simde_uint32x2_to_private(v); + return simde_vmull_u32(simde_vget_high_u32(a), simde_vdup_n_u32(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmull_high_lane_u32(a, v, lane) vmull_high_lane_u32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_lane_u32 + #define vmull_high_lane_u32(a, v, lane) simde_vmull_high_lane_u32((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vmull_high_laneq_u32(simde_uint32x4_t a, simde_uint32x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_uint32x4_private + v_ = simde_uint32x4_to_private(v); + return simde_vmull_u32(simde_vget_high_u32(a), simde_vdup_n_u32(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmull_high_laneq_u32(a, v, lane) vmull_high_laneq_u32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_laneq_u32 + #define vmull_high_laneq_u32(a, v, lane) simde_vmull_high_laneq_u32((a), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMULL_HIGH_LANE_H) */ diff 
--git a/lib/simd_wrapper/simde/arm/neon/mull_high_n.h b/lib/simd_wrapper/simde/arm/neon/mull_high_n.h new file mode 100644 index 00000000000..d6a5b356f4a --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/mull_high_n.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MULL_HIGH_N_H) +#define SIMDE_ARM_NEON_MULL_HIGH_N_H + +#include "combine.h" +#include "get_high.h" +#include "dup_n.h" +#include "mull.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vmull_high_n_s16(simde_int16x8_t a, int16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmull_high_n_s16(a, b); + #else + return simde_vmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_n_s16 + #define vmull_high_n_s16(a, b) simde_vmull_high_n_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vmull_high_n_s32(simde_int32x4_t a, int32_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmull_high_n_s32(a, b); + #else + return simde_vmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_n_s32 + #define vmull_high_n_s32(a, b) simde_vmull_high_n_s32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vmull_high_n_u16(simde_uint16x8_t a, uint16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmull_high_n_u16(a, b); + #else + return simde_vmull_u16(simde_vget_high_u16(a), simde_vdup_n_u16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_n_u16 + #define vmull_high_n_u16(a, b) simde_vmull_high_n_u16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vmull_high_n_u32(simde_uint32x4_t a, uint32_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmull_high_n_u32(a, b); + #else + return simde_vmull_u32(simde_vget_high_u32(a), simde_vdup_n_u32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmull_high_n_u32 + #define vmull_high_n_u32(a, b) simde_vmull_high_n_u32((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MULL_HIGH_N_H) */ diff --git 
a/lib/simd_wrapper/simde/arm/neon/mulx.h b/lib/simd_wrapper/simde/arm/neon/mulx.h new file mode 100644 index 00000000000..a089125f64f --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/mulx.h @@ -0,0 +1,237 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MULX_H) +#define SIMDE_ARM_NEON_MULX_H + +#include "types.h" + +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmulxh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmulxh_f16(a, b); + #else + return simde_float16_from_float32( + simde_float16_to_float32(a) * + simde_float16_to_float32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxh_f16 + #define vmulxh_f16(a, b) simde_vmulxh_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vmulxs_f32(simde_float32_t a, simde_float32_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmulxs_f32(a, b); + #else + return a * b; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxs_f32 + #define vmulxs_f32(a, b) simde_vmulxs_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vmulxd_f64(simde_float64_t a, simde_float64_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmulxd_f64(a, b); + #else + return a * b; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxd_f64 + #define vmulxd_f64(a, b) simde_vmulxd_f64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmulx_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmulx_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmulxh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_f16 + #define vmulx_f16(a, b) simde_vmulx_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t 
+simde_vmulx_f32(simde_float32x2_t a, simde_float32x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmulx_f32(a, b); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b); + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values * b_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[i]; + } + #endif + + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_f32 + #define vmulx_f32(a, b) simde_vmulx_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vmulx_f64(simde_float64x1_t a, simde_float64x1_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmulx_f64(a, b); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b); + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.values = a_.values * b_.values; + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[i]; + } + #endif + + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_f64 + #define vmulx_f64(a, b) simde_vmulx_f64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulxq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmulxq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vmulxh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_f16 + #define vmulxq_f16(a, b) simde_vmulxq_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vmulxq_f32(simde_float32x4_t a, simde_float32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmulxq_f32(a, b); + #else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[i]; + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_f32 + #define vmulxq_f32(a, b) simde_vmulxq_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vmulxq_f64(simde_float64x2_t a, simde_float64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vmulxq_f64(a, b); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a), + b_ = simde_float64x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[i]; + } + + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_f64 + #define vmulxq_f64(a, b) simde_vmulxq_f64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MULX_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/mulx_lane.h 
b/lib/simd_wrapper/simde/arm/neon/mulx_lane.h new file mode 100644 index 00000000000..eed553651c1 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/mulx_lane.h @@ -0,0 +1,455 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MULX_LANE_H) +#define SIMDE_ARM_NEON_MULX_LANE_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmulxh_lane_f16(simde_float16_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_float16_from_float32( + simde_float16_to_float32(a) * + simde_float16_to_float32(simde_float16x4_to_private(b).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulxh_lane_f16(a, b, lane) vmulxh_lane_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxh_lane_f16 + #define vmulxh_lane_f16(a, b, lane) simde_vmulxh_lane_f16(a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vmulxs_lane_f32(simde_float32_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return a * simde_float32x2_to_private(b).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxs_lane_f32(a, b, lane) vmulxs_lane_f32((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxs_lane_f32 + #define vmulxs_lane_f32(a, b, lane) simde_vmulxs_lane_f32(a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vmulxd_lane_f64(simde_float64_t a, simde_float64x1_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + return a * simde_float64x1_to_private(b).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxd_lane_f64(a, b, lane) vmulxd_lane_f64((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxd_lane_f64 + #define vmulxd_lane_f64(a, b, lane) simde_vmulxd_lane_f64(a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vmulxh_laneq_f16(simde_float16_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_float16_from_float32( + simde_float16_to_float32(a) * + 
simde_float16_to_float32(simde_float16x8_to_private(b).values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulxh_laneq_f16(a, b, lane) vmulxh_laneq_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxh_laneq_f16 + #define vmulxh_laneq_f16(a, b, lane) simde_vmulxh_laneq_f16(a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vmulxs_laneq_f32(simde_float32_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return a * simde_float32x4_to_private(b).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxs_laneq_f32(a, b, lane) vmulxs_laneq_f32((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxs_laneq_f32 + #define vmulxs_laneq_f32(a, b, lane) simde_vmulxs_laneq_f32(a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vmulxd_laneq_f64(simde_float64_t a, simde_float64x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + return a * simde_float64x2_to_private(b).values[lane]; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxd_laneq_f64(a, b, lane) vmulxd_laneq_f64((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxd_laneq_f64 + #define vmulxd_laneq_f64(a, b, lane) simde_vmulxd_laneq_f64(a, b, lane) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmulx_lane_f16(simde_float16x4_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + simde_float32_t b_lane_ = simde_float16_to_float32(b_.values[lane]); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32( + simde_float16_to_float32(a_.values[i]) * b_lane_); + } + + return simde_float16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulx_lane_f16(a, b, lane) vmulx_lane_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_lane_f16 + #define vmulx_lane_f16(a, b, lane) simde_vmulx_lane_f16((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vmulx_lane_f32(simde_float32x2_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a), + b_ = simde_float32x2_to_private(b); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulx_lane_f32(a, b, lane) vmulx_lane_f32((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_lane_f32 + #define vmulx_lane_f32(a, b, lane) simde_vmulx_lane_f32((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vmulx_lane_f64(simde_float64x1_t a, simde_float64x1_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a), + b_ = simde_float64x1_to_private(b); + + #if 
defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, b_.values[lane], 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulx_lane_f64(a, b, lane) vmulx_lane_f64((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_lane_f64 + #define vmulx_lane_f64(a, b, lane) simde_vmulx_lane_f64((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulxq_lane_f16(simde_float16x8_t a, simde_float16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + simde_float16x4_private b_ = simde_float16x4_to_private(b); + simde_float32_t b_lane_ = simde_float16_to_float32(b_.values[lane]); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32( + simde_float16_to_float32(a_.values[i]) * b_lane_); + } + + return simde_float16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulxq_lane_f16(a, b, lane) vmulxq_lane_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_lane_f16 + #define vmulxq_lane_f16(a, b, lane) simde_vmulxq_lane_f16((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vmulxq_lane_f32(simde_float32x4_t a, simde_float32x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + simde_float32x2_private b_ = simde_float32x2_to_private(b); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxq_lane_f32(a, b, lane) vmulxq_lane_f32((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_lane_f32 + #define vmulxq_lane_f32(a, b, lane) simde_vmulxq_lane_f32((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vmulxq_lane_f64(simde_float64x2_t a, simde_float64x1_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + simde_float64x1_private b_ = simde_float64x1_to_private(b); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxq_lane_f64(a, b, lane) vmulxq_lane_f64((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_lane_f64 + #define vmulxq_lane_f64(a, b, lane) simde_vmulxq_lane_f64((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulxq_laneq_f16(simde_float16x8_t a, simde_float16x8_t b, const int lane) + 
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + simde_float32_t b_lane_ = simde_float16_to_float32(b_.values[lane]); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32( + simde_float16_to_float32(a_.values[i]) * b_lane_); + } + + return simde_float16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulxq_laneq_f16(a, b, lane) vmulxq_laneq_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_laneq_f16 + #define vmulxq_laneq_f16(a, b, lane) simde_vmulxq_laneq_f16((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vmulxq_laneq_f32(simde_float32x4_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a), + b_ = simde_float32x4_to_private(b); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f32m1(a_.sv128, b_.values[lane], 4); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxq_laneq_f32(a, b, lane) vmulxq_laneq_f32((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_laneq_f32 + #define vmulxq_laneq_f32(a, b, lane) simde_vmulxq_laneq_f32((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vmulxq_laneq_f64(simde_float64x2_t a, simde_float64x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a), + b_ = simde_float64x2_to_private(b); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv128 = __riscv_vfmul_vf_f64m1(a_.sv128, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulxq_laneq_f64(a, b, lane) vmulxq_laneq_f64((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_laneq_f64 + #define vmulxq_laneq_f64(a, b, lane) simde_vmulxq_laneq_f64((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmulx_laneq_f16(simde_float16x4_t a, simde_float16x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + simde_float16x8_private b_ = simde_float16x8_to_private(b); + simde_float32_t b_lane_ = simde_float16_to_float32(b_.values[lane]); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_float16_from_float32( + simde_float16_to_float32(a_.values[i]) * b_lane_); + } + + return simde_float16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vmulx_laneq_f16(a, b, lane) vmulx_laneq_f16((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_laneq_f16 + #define vmulx_laneq_f16(a, b, lane) simde_vmulx_laneq_f16((a), (b), (lane)) 
+#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vmulx_laneq_f32(simde_float32x2_t a, simde_float32x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + simde_float32x4_private b_ = simde_float32x4_to_private(b); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f32m1(a_.sv64, b_.values[lane], 2); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulx_laneq_f32(a, b, lane) vmulx_laneq_f32((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_laneq_f32 + #define vmulx_laneq_f32(a, b, lane) simde_vmulx_laneq_f32((a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vmulx_laneq_f64(simde_float64x1_t a, simde_float64x2_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + simde_float64x2_private b_ = simde_float64x2_to_private(b); + + #if defined(SIMDE_RISCV_V_NATIVE) + r_.sv64 = __riscv_vfmul_vf_f64m1(a_.sv64, b_.values[lane], 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] * b_.values[lane]; + } + #endif + + return simde_float64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vmulx_laneq_f64(a, b, lane) vmulx_laneq_f64((a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_laneq_f64 + #define vmulx_laneq_f64(a, b, lane) simde_vmulx_laneq_f64((a), (b), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MULX_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/mulx_n.h b/lib/simd_wrapper/simde/arm/neon/mulx_n.h new file mode 100644 index 00000000000..be78a834d6a --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/mulx_n.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_MULX_N_H) +#define SIMDE_ARM_NEON_MULX_N_H + +#include "types.h" +#include "mul.h" +#include "dup_n.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vmulx_n_f16(simde_float16x4_t a, simde_float16 b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmulx_n_f16(a, b); + #else + return simde_vmul_f16(a, simde_vdup_n_f16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulx_n_f16 + #define vmulx_n_f16(a, b) simde_vmulx_n_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vmulxq_n_f16(simde_float16x8_t a, simde_float16 b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vmulxq_n_f16(a, b); + #else + return simde_vmulq_f16(a, simde_vdupq_n_f16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vmulxq_n_f16 + #define vmulxq_n_f16(a, b) simde_vmulxq_n_f16((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_MULX_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/mvn.h b/lib/simd_wrapper/simde/arm/neon/mvn.h index 654455ec2bc..1cd35591d69 100644 --- a/lib/simd_wrapper/simde/arm/neon/mvn.h +++ b/lib/simd_wrapper/simde/arm/neon/mvn.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_MVN_H) @@ -420,6 +421,52 @@ simde_vmvn_u32(simde_uint32x2_t a) { #define vmvn_u32(a) simde_vmvn_u32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vmvn_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vmvn_p8(a); + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ~(a_.values[i]); + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmvn_p8 + #define vmvn_p8(a) simde_vmvn_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vmvnq_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vmvnq_p8(a); + #else + simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ~(a_.values[i]); + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vmvnq_p8 + #define vmvnq_p8(a) simde_vmvnq_p8(a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/neg.h b/lib/simd_wrapper/simde/arm/neon/neg.h index 779238950a3..e6b2a8e480d 100644 --- a/lib/simd_wrapper/simde/arm/neon/neg.h +++ b/lib/simd_wrapper/simde/arm/neon/neg.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_NEG_H) @@ -47,6 +48,43 @@ simde_vnegd_s64(int64_t a) { #define vnegd_s64(a) simde_vnegd_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vnegh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vnegh_f16(a); + #else + return 
simde_float16_from_float32(-simde_float16_to_float32(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vnegh_f16 + #define vnegh_f16(a) simde_vnegh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vneg_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vneg_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vnegh_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vneg_f16 + #define vneg_f16(a) simde_vneg_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vneg_f32(simde_float32x2_t a) { @@ -209,6 +247,29 @@ simde_vneg_s64(simde_int64x1_t a) { #define vneg_s64(a) simde_vneg_s64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vnegq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vnegq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vnegh_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vnegq_f16 + #define vnegq_f16(a) simde_vnegq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vnegq_f32(simde_float32x4_t a) { diff --git a/lib/simd_wrapper/simde/arm/neon/padd.h b/lib/simd_wrapper/simde/arm/neon/padd.h index 6cfd99a2d71..5c34cbe8960 100644 --- a/lib/simd_wrapper/simde/arm/neon/padd.h +++ b/lib/simd_wrapper/simde/arm/neon/padd.h @@ -23,6 +23,7 @@ * Copyright: * 2020-2021 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_PADD_H) @@ -96,6 +97,20 @@ simde_vpadds_f32(simde_float32x2_t a) { #define vpadds_f32(a) simde_vpadds_f32((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vpadd_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !SIMDE_DETECT_CLANG_VERSION_NOT(9,0,0) && defined(SIMDE_ARM_NEON_FP16) + return vpadd_f16(a, b); + #else + return simde_vadd_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vpadd_f16 + #define vpadd_f16(a, b) simde_vpadd_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vpadd_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -198,6 +213,20 @@ simde_vpadd_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vpadd_u32(a, b) simde_vpadd_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vpaddq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpaddq_f16(a, b); + #else + return simde_vaddq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpaddq_f16 + #define vpaddq_f16(a, b) simde_vpaddq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vpaddq_f32(simde_float32x4_t a, simde_float32x4_t b) { diff --git a/lib/simd_wrapper/simde/arm/neon/paddl.h 
b/lib/simd_wrapper/simde/arm/neon/paddl.h index 203fbad9fcd..3b36e0dcdef 100644 --- a/lib/simd_wrapper/simde/arm/neon/paddl.h +++ b/lib/simd_wrapper/simde/arm/neon/paddl.h @@ -286,7 +286,7 @@ simde_vpaddlq_u16(simde_uint16x8_t a) { simde_uint32x4_private r_; #if defined(SIMDE_X86_XOP_NATIVE) - r_.sse_m128i = _mm_haddd_epu16(a_.sse_m128i); + r_.m128i = _mm_haddd_epu16(a_.m128i); #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128i = _mm_add_epi32( diff --git a/lib/simd_wrapper/simde/arm/neon/pmax.h b/lib/simd_wrapper/simde/arm/neon/pmax.h index ecf31a1a93a..d8de39d763e 100644 --- a/lib/simd_wrapper/simde/arm/neon/pmax.h +++ b/lib/simd_wrapper/simde/arm/neon/pmax.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_PMAX_H) @@ -67,6 +68,20 @@ simde_vpmaxqd_f64(simde_float64x2_t a) { #define vpmaxqd_f64(a) simde_vpmaxqd_f64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vpmax_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpmax_f16(a, b); + #else + return simde_vmax_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vpmax_f16 + #define vpmax_f16(a, b) simde_vpmax_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vpmax_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -165,6 +180,20 @@ simde_vpmax_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vpmax_u32(a, b) simde_vpmax_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vpmaxq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpmaxq_f16(a, b); + #else + return simde_vmaxq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpmaxq_f16 + #define vpmaxq_f16(a, b) simde_vpmaxq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vpmaxq_f32(simde_float32x4_t a, simde_float32x4_t b) { diff --git a/lib/simd_wrapper/simde/arm/neon/pmaxnm.h b/lib/simd_wrapper/simde/arm/neon/pmaxnm.h new file mode 100644 index 00000000000..5fa519d5e78 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/pmaxnm.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_PMAXNM_H) +#define SIMDE_ARM_NEON_PMAXNM_H + +#include "types.h" +#include "max.h" +#include "uzp1.h" +#include "uzp2.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vpmaxnms_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpmaxnms_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + return (a_.values[0] > a_.values[1]) ? a_.values[0] : a_.values[1]; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpmaxnms_f32 + #define vpmaxnms_f32(a) simde_vpmaxnms_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vpmaxnmqd_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpmaxnmqd_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + return (a_.values[0] > a_.values[1]) ? a_.values[0] : a_.values[1]; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpmaxnmqd_f64 + #define vpmaxnmqd_f64(a) simde_vpmaxnmqd_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vpmaxnm_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpmaxnm_f16(a, b); + #else + return simde_vmax_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpmaxnm_f16 + #define vpmaxnm_f16(a, b) simde_vpmaxnm_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vpmaxnm_f32(simde_float32x2_t a, simde_float32x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpmaxnm_f32(a, b); + #else + return simde_vmax_f32(simde_vuzp1_f32(a, b), simde_vuzp2_f32(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpmaxnm_f32 + #define vpmaxnm_f32(a, b) simde_vpmaxnm_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vpmaxnmq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpmaxnmq_f16(a, b); + #else + return simde_vmaxq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpmaxnmq_f16 + #define vpmaxnmq_f16(a, b) simde_vpmaxnmq_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vpmaxnmq_f32(simde_float32x4_t a, simde_float32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpmaxnmq_f32(a, b); + #else + return simde_vmaxq_f32(simde_vuzp1q_f32(a, b), simde_vuzp2q_f32(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpmaxnmq_f32 + #define vpmaxnmq_f32(a, b) simde_vpmaxnmq_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vpmaxnmq_f64(simde_float64x2_t a, simde_float64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpmaxnmq_f64(a, b); + #else + return simde_vmaxq_f64(simde_vuzp1q_f64(a, b), simde_vuzp2q_f64(a, b)); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpmaxnmq_f64 + #define vpmaxnmq_f64(a, b) simde_vpmaxnmq_f64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_PMAXNM_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/pmin.h b/lib/simd_wrapper/simde/arm/neon/pmin.h index eaf58e45503..2f76c63801f 100644 --- a/lib/simd_wrapper/simde/arm/neon/pmin.h +++ b/lib/simd_wrapper/simde/arm/neon/pmin.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_PMIN_H) @@ -66,6 +67,20 @@ simde_vpminqd_f64(simde_float64x2_t a) { #define vpminqd_f64(a) simde_vpminqd_f64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vpmin_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpmin_f16(a, b); + #else + return simde_vmin_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vpmin_f16 + #define vpmin_f16(a, b) simde_vpmin_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vpmin_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -164,6 +179,20 @@ simde_vpmin_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vpmin_u32(a, b) simde_vpmin_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vpminq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpminq_f16(a, b); + #else + return simde_vminq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpminq_f16 + #define vpminq_f16(a, b) simde_vpminq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vpminq_f32(simde_float32x4_t a, simde_float32x4_t b) { diff --git a/lib/simd_wrapper/simde/arm/neon/pminnm.h b/lib/simd_wrapper/simde/arm/neon/pminnm.h new file mode 100644 index 00000000000..99de03555b0 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/pminnm.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_PMINNM_H) +#define SIMDE_ARM_NEON_PMINNM_H + +#include "types.h" +#include "min.h" +#include "uzp1.h" +#include "uzp2.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vpminnms_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpminnms_f32(a); + #else + simde_float32x2_private a_ = simde_float32x2_to_private(a); + return (a_.values[0] < a_.values[1]) ? a_.values[0] : a_.values[1]; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpminnms_f32 + #define vpminnms_f32(a) simde_vpminnms_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vpminnmqd_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpminnmqd_f64(a); + #else + simde_float64x2_private a_ = simde_float64x2_to_private(a); + return (a_.values[0] < a_.values[1]) ? a_.values[0] : a_.values[1]; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpminnmqd_f64 + #define vpminnmqd_f64(a) simde_vpminnmqd_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vpminnm_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpminnm_f16(a, b); + #else + return simde_vmin_f16(simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpminnm_f16 + #define vpminnm_f16(a, b) simde_vpminnm_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vpminnm_f32(simde_float32x2_t a, simde_float32x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpminnm_f32(a, b); + #else + return simde_vmin_f32(simde_vuzp1_f32(a, b), simde_vuzp2_f32(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpminnm_f32 + #define vpminnm_f32(a, b) simde_vpminnm_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vpminnmq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vpminnmq_f16(a, b); + #else + return simde_vminq_f16(simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpminnmq_f16 + #define vpminnmq_f16(a, b) simde_vpminnmq_f16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vpminnmq_f32(simde_float32x4_t a, simde_float32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpminnmq_f32(a, b); + #else + return simde_vminq_f32(simde_vuzp1q_f32(a, b), simde_vuzp2q_f32(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpminnmq_f32 + #define vpminnmq_f32(a, b) simde_vpminnmq_f32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vpminnmq_f64(simde_float64x2_t a, simde_float64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vpminnmq_f64(a, b); + #else + return simde_vminq_f64(simde_vuzp1q_f64(a, b), simde_vuzp2q_f64(a, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vpminnmq_f64 + #define vpminnmq_f64(a, b) simde_vpminnmq_f64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_PMINNM_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qabs.h 
b/lib/simd_wrapper/simde/arm/neon/qabs.h index 6e956f1e144..9ad7d7c8300 100644 --- a/lib/simd_wrapper/simde/arm/neon/qabs.h +++ b/lib/simd_wrapper/simde/arm/neon/qabs.h @@ -162,7 +162,7 @@ simde_int8x16_t simde_vqabsq_s8(simde_int8x16_t a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vqabsq_s8(a); - #elif defined(SIMDE_X86_SSE4_1_NATIVE) + #elif defined(SIMDE_X86_SSE2_NATIVE) simde_int8x16_private r_, a_ = simde_int8x16_to_private(simde_vabsq_s8(a)); diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlal.h b/lib/simd_wrapper/simde/arm/neon/qdmlal.h new file mode 100644 index 00000000000..fe96b0fc813 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmlal.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLAL_H) +#define SIMDE_ARM_NEON_QDMLAL_H + +#include "add.h" +#include "mul.h" +#include "mul_n.h" +#include "movl.h" +#include "qadd.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqdmlalh_s16(int32_t a, int16_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlalh_s16(a, b, c); + #else + return HEDLEY_STATIC_CAST(int32_t, b) * HEDLEY_STATIC_CAST(int32_t, c) * 2 + a; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlalh_s16 + #define vqdmlalh_s16(a, b, c) simde_vqdmlalh_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vqdmlals_s32(int64_t a, int32_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlals_s32(a, b, c); + #else + return HEDLEY_STATIC_CAST(int64_t, b) * HEDLEY_STATIC_CAST(int64_t, c) * 2 + a; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlals_s32 + #define vqdmlals_s32(a, b, c) simde_vqdmlals_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlal_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlal_s16(a, b, c); + #else + simde_int32x4_t temp = simde_vmulq_s32(simde_vmovl_s16(b), simde_vmovl_s16(c)); + return simde_vqaddq_s32(simde_vqaddq_s32(temp, temp), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_s16 + #define vqdmlal_s16(a, b, c) simde_vqdmlal_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlal_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlal_s32(a, b, c); + #else + simde_int64x2_t r = simde_x_vmulq_s64( + simde_vmovl_s32(b), + simde_vmovl_s32(c)); + return simde_vqaddq_s64(a, simde_vqaddq_s64(r, r)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_s32 + #define vqdmlal_s32(a, b, c) simde_vqdmlal_s32((a), (b), (c)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlal_high.h b/lib/simd_wrapper/simde/arm/neon/qdmlal_high.h new file mode 100644 index 00000000000..016deb01191 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmlal_high.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_H) +#define SIMDE_ARM_NEON_QDMLAL_HIGH_H + +#include "movl_high.h" +#include "mla.h" +#include "mul_n.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlal_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlal_high_s16(a, b, c); + #else + return simde_vaddq_s32( + simde_vmulq_n_s32( + simde_vmulq_s32( + simde_vmovl_high_s16(b), simde_vmovl_high_s16(c)), 2), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_s16 + #define vqdmlal_high_s16(a, b, c) simde_vqdmlal_high_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlal_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlal_high_s32(a, b, c); + #else + simde_int64x2_private r_ = simde_int64x2_to_private( + simde_x_vmulq_s64( + simde_vmovl_high_s32(b), + simde_vmovl_high_s32(c))); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2); + } + + return simde_vaddq_s64(a, simde_int64x2_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_s32 + #define vqdmlal_high_s32(a, b, c) simde_vqdmlal_high_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlal_high_lane.h b/lib/simd_wrapper/simde/arm/neon/qdmlal_high_lane.h new file mode 100644 index 00000000000..b2d6a8b4283 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmlal_high_lane.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_LANE_H) +#define SIMDE_ARM_NEON_QDMLAL_HIGH_LANE_H + +#include "movl_high.h" +#include "add.h" +#include "mul.h" +#include "mul_n.h" +#include "dup_n.h" +#include "mla.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlal_high_lane_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vaddq_s32( + simde_vmulq_n_s32( + simde_vmulq_s32( + simde_vmovl_high_s16(b), + simde_vmovl_high_s16(simde_vdupq_n_s16(simde_int16x4_to_private(v).values[lane]))), 2), a); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlal_high_lane_s16(a, b, v, lane) vqdmlal_high_lane_s16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_lane_s16 + #define vqdmlal_high_lane_s16(a, b, v, lane) simde_vqdmlal_high_lane_s16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlal_high_laneq_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vaddq_s32( + simde_vmulq_n_s32( + simde_vmulq_s32( + simde_vmovl_high_s16(b), + simde_vmovl_high_s16(simde_vdupq_n_s16(simde_int16x8_to_private(v).values[lane]))), 2), a); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlal_high_laneq_s16(a, b, v, lane) vqdmlal_high_laneq_s16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_laneq_s16 + #define vqdmlal_high_laneq_s16(a, b, v, lane) simde_vqdmlal_high_laneq_s16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlal_high_lane_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int64x2_private r_ = simde_int64x2_to_private( + simde_x_vmulq_s64( + simde_vmovl_high_s32(b), + simde_vmovl_high_s32(simde_vdupq_n_s32(simde_int32x2_to_private(v).values[lane])))); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2); + } + + return simde_vaddq_s64(a, simde_int64x2_from_private(r_)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlal_high_lane_s32(a, b, v, lane) vqdmlal_high_lane_s32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_lane_s32 + #define vqdmlal_high_lane_s32(a, b, v, lane) simde_vqdmlal_high_lane_s32((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlal_high_laneq_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int64x2_private r_ = simde_int64x2_to_private( + simde_x_vmulq_s64( + simde_vmovl_high_s32(b), + simde_vmovl_high_s32(simde_vdupq_n_s32(simde_int32x4_to_private(v).values[lane])))); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2); + } + + return simde_vaddq_s64(a, simde_int64x2_from_private(r_)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlal_high_laneq_s32(a, b, v, lane) vqdmlal_high_laneq_s32(a, b, v, lane) +#endif +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_laneq_s32 + #define vqdmlal_high_laneq_s32(a, b, v, lane) simde_vqdmlal_high_laneq_s32((a), (b), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlal_high_n.h b/lib/simd_wrapper/simde/arm/neon/qdmlal_high_n.h new file mode 100644 index 00000000000..205cafbcc16 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmlal_high_n.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_N_H) +#define SIMDE_ARM_NEON_QDMLAL_HIGH_N_H + +#include "movl_high.h" +#include "dup_n.h" +#include "add.h" +#include "mul.h" +#include "mul_n.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlal_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlal_high_n_s16(a, b, c); + #else + return simde_vaddq_s32( + simde_vmulq_n_s32( + simde_vmulq_s32( + simde_vmovl_high_s16(b), + simde_vmovl_high_s16(simde_vdupq_n_s16(c))), 2), a); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_n_s16 + #define vqdmlal_high_n_s16(a, b, c) simde_vqdmlal_high_n_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlal_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlal_high_n_s32(a, b, c); + #else + simde_int64x2_private r_ = simde_int64x2_to_private( + simde_x_vmulq_s64( + simde_vmovl_high_s32(b), + simde_vmovl_high_s32(simde_vdupq_n_s32(c)))); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2); + } + + return simde_vaddq_s64(a, simde_int64x2_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_high_n_s32 + #define vqdmlal_high_n_s32(a, b, c) simde_vqdmlal_high_n_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_HIGH_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlal_lane.h 
b/lib/simd_wrapper/simde/arm/neon/qdmlal_lane.h new file mode 100644 index 00000000000..14a663cd66d --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmlal_lane.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLAL_LANE_H) +#define SIMDE_ARM_NEON_QDMLAL_LANE_H + +#include "qdmlal.h" +#include "dup_lane.h" +#include "get_lane.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqdmlal_lane_s16(a, b, v, lane) vqdmlal_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqdmlal_lane_s16(a, b, v, lane) simde_vqdmlal_s16((a), (b), simde_vdup_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_lane_s16 + #define vqdmlal_lane_s16(a, b, c, lane) simde_vqdmlal_lane_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqdmlal_lane_s32(a, b, v, lane) vqdmlal_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqdmlal_lane_s32(a, b, v, lane) simde_vqdmlal_s32((a), (b), simde_vdup_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_lane_s32 + #define vqdmlal_lane_s32(a, b, c, lane) simde_vqdmlal_lane_s32((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlal_laneq_s16(a, b, v, lane) vqdmlal_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqdmlal_laneq_s16(a, b, v, lane) simde_vqdmlal_s16((a), (b), simde_vdup_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_laneq_s16 + #define vqdmlal_laneq_s16(a, b, c, lane) simde_vqdmlal_laneq_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlal_laneq_s32(a, b, v, lane) vqdmlal_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqdmlal_laneq_s32(a, b, v, lane) simde_vqdmlal_s32((a), (b), simde_vdup_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_laneq_s32 + #define vqdmlal_laneq_s32(a, b, c, lane) simde_vqdmlal_laneq_s32((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlalh_lane_s16(a, b, v, lane) vqdmlalh_lane_s16((a), (b), (v), (lane)) 
+#else + #define simde_vqdmlalh_lane_s16(a, b, v, lane) simde_vqdmlalh_s16((a), (b), simde_vget_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlalh_lane_s16 + #define vqdmlalh_lane_s16(a, b, c, lane) simde_vqdmlalh_lane_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlalh_laneq_s16(a, b, v, lane) vqdmlalh_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqdmlalh_laneq_s16(a, b, v, lane) simde_vqdmlalh_s16((a), (b), simde_vgetq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlalh_laneq_s16 + #define vqdmlalh_laneq_s16(a, b, c, lane) simde_vqdmlalh_laneq_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlals_lane_s32(a, b, v, lane) vqdmlals_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqdmlals_lane_s32(a, b, v, lane) simde_vqdmlals_s32((a), (b), simde_vget_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlals_lane_s32 + #define vqdmlals_lane_s32(a, b, c, lane) simde_vqdmlals_lane_s32((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlals_laneq_s32(a, b, v, lane) vqdmlals_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqdmlals_laneq_s32(a, b, v, lane) simde_vqdmlals_s32((a), (b), simde_vgetq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlals_laneq_s32 + #define vqdmlals_laneq_s32(a, b, c, lane) simde_vqdmlals_laneq_s32((a), (b), (c), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlal_n.h b/lib/simd_wrapper/simde/arm/neon/qdmlal_n.h new file mode 100644 index 00000000000..0a5c69ea376 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmlal_n.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLAL_N_H) +#define SIMDE_ARM_NEON_QDMLAL_N_H + +#include "dup_n.h" +#include "qdmlal.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlal_n_s16(simde_int32x4_t a, simde_int16x4_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlal_n_s16(a, b, c); + #else + return simde_vqdmlal_s16(a, b, simde_vdup_n_s16(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_n_s16 + #define vqdmlal_n_s16(a, b, c) simde_vqdmlal_n_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlal_n_s32(simde_int64x2_t a, simde_int32x2_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlal_n_s32(a, b, c); + #else + return simde_vqdmlal_s32(a, b, simde_vdup_n_s32(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlal_n_s32 + #define vqdmlal_n_s32(a, b, c) simde_vqdmlal_n_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLAL_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlsl.h b/lib/simd_wrapper/simde/arm/neon/qdmlsl.h new file mode 100644 index 00000000000..68e17ca05e0 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmlsl.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLSL_H) +#define SIMDE_ARM_NEON_QDMLSL_H + +#include "sub.h" +#include "mul.h" +#include "mul_n.h" +#include "movl.h" +#include "qadd.h" +#include "qsub.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqdmlslh_s16(int32_t a, int16_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlslh_s16(a, b, c); + #else + return a - HEDLEY_STATIC_CAST(int32_t, b) * HEDLEY_STATIC_CAST(int32_t, c) * 2; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlslh_s16 + #define vqdmlslh_s16(a, b, c) simde_vqdmlslh_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vqdmlsls_s32(int64_t a, int32_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlsls_s32(a, b, c); + #else + return a - HEDLEY_STATIC_CAST(int64_t, b) * HEDLEY_STATIC_CAST(int64_t, c) * 2; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsls_s32 + #define vqdmlsls_s32(a, b, c) simde_vqdmlsls_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlsl_s16(simde_int32x4_t a, simde_int16x4_t b, simde_int16x4_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlsl_s16(a, b, c); + #else + simde_int32x4_t temp = simde_vmulq_s32(simde_vmovl_s16(b), simde_vmovl_s16(c)); + return simde_vqsubq_s32(a, simde_vqaddq_s32(temp, temp)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_s16 + #define vqdmlsl_s16(a, b, c) simde_vqdmlsl_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlsl_s32(simde_int64x2_t a, simde_int32x2_t b, simde_int32x2_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlsl_s32(a, b, c); + #else + simde_int64x2_t r = simde_x_vmulq_s64( + simde_vmovl_s32(b), + simde_vmovl_s32(c)); + return simde_vqsubq_s64(a, simde_vqaddq_s64(r, r)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_s32 + #define vqdmlsl_s32(a, b, c) simde_vqdmlsl_s32((a), (b), (c)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlsl_high.h b/lib/simd_wrapper/simde/arm/neon/qdmlsl_high.h new file mode 100644 index 00000000000..18a6f47fe5a --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmlsl_high.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_H) +#define SIMDE_ARM_NEON_QDMLSL_HIGH_H + +#include "movl_high.h" +#include "sub.h" +#include "mul.h" +#include "mul_n.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlsl_high_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlsl_high_s16(a, b, c); + #else + return simde_vsubq_s32(a, simde_vmulq_n_s32(simde_vmulq_s32(simde_vmovl_high_s16(b), simde_vmovl_high_s16(c)), 2)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_s16 + #define vqdmlsl_high_s16(a, b, c) simde_vqdmlsl_high_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlsl_high_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlsl_high_s32(a, b, c); + #else + simde_int64x2_private r_ = simde_int64x2_to_private( + simde_x_vmulq_s64( + simde_vmovl_high_s32(b), + simde_vmovl_high_s32(c))); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2); + } + + return simde_vsubq_s64(a, simde_int64x2_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_s32 + #define vqdmlsl_high_s32(a, b, c) simde_vqdmlsl_high_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlsl_high_lane.h b/lib/simd_wrapper/simde/arm/neon/qdmlsl_high_lane.h new file mode 100644 index 00000000000..877c72a2aa7 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmlsl_high_lane.h @@ -0,0 +1,124 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_LANE_H) +#define SIMDE_ARM_NEON_QDMLSL_HIGH_LANE_H + +#include "movl_high.h" +#include "sub.h" +#include "mul.h" +#include "mul_n.h" +#include "dup_n.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlsl_high_lane_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + return simde_vsubq_s32(a, + simde_vmulq_n_s32( + simde_vmulq_s32( + simde_vmovl_high_s16(b), + simde_vmovl_high_s16(simde_vdupq_n_s16(simde_int16x4_to_private(v).values[lane]))), 2)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsl_high_lane_s16(a, b, v, lane) vqdmlsl_high_lane_s16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_lane_s16 + #define vqdmlsl_high_lane_s16(a, b, v, lane) simde_vqdmlsl_high_lane_s16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlsl_high_laneq_s16(simde_int32x4_t a, simde_int16x8_t b, simde_int16x8_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + return simde_vsubq_s32(a, + simde_vmulq_n_s32( + simde_vmulq_s32( + simde_vmovl_high_s16(b), + simde_vmovl_high_s16(simde_vdupq_n_s16(simde_int16x8_to_private(v).values[lane]))), 2)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsl_high_laneq_s16(a, b, v, lane) vqdmlsl_high_laneq_s16(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_laneq_s16 + #define vqdmlsl_high_laneq_s16(a, b, v, lane) simde_vqdmlsl_high_laneq_s16((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlsl_high_lane_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x2_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int64x2_private r_ = simde_int64x2_to_private( + simde_x_vmulq_s64( + simde_vmovl_high_s32(b), + simde_vmovl_high_s32(simde_vdupq_n_s32(simde_int32x2_to_private(v).values[lane])))); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2); + } + + return simde_vsubq_s64(a, simde_int64x2_from_private(r_)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsl_high_lane_s32(a, b, v, lane) vqdmlsl_high_lane_s32(a, b, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_lane_s32 + #define vqdmlsl_high_lane_s32(a, b, v, lane) simde_vqdmlsl_high_lane_s32((a), (b), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlsl_high_laneq_s32(simde_int64x2_t a, simde_int32x4_t b, simde_int32x4_t v, const int lane) SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int64x2_private r_ = simde_int64x2_to_private( + simde_x_vmulq_s64( + simde_vmovl_high_s32(b), + simde_vmovl_high_s32(simde_vdupq_n_s32(simde_int32x4_to_private(v).values[lane])))); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2); + } + + return simde_vsubq_s64(a, simde_int64x2_from_private(r_)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsl_high_laneq_s32(a, b, v, lane) vqdmlsl_high_laneq_s32(a, b, v, lane) +#endif +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_laneq_s32 + #define vqdmlsl_high_laneq_s32(a, b, v, lane) simde_vqdmlsl_high_laneq_s32((a), (b), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlsl_high_n.h b/lib/simd_wrapper/simde/arm/neon/qdmlsl_high_n.h new file mode 100644 index 00000000000..9db3d7e045f --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmlsl_high_n.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_N_H) +#define SIMDE_ARM_NEON_QDMLSL_HIGH_N_H + +#include "movl_high.h" +#include "dup_n.h" +#include "sub.h" +#include "mul.h" +#include "mul_n.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlsl_high_n_s16(simde_int32x4_t a, simde_int16x8_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlsl_high_n_s16(a, b, c); + #else + return simde_vsubq_s32(a, + simde_vmulq_n_s32( + simde_vmulq_s32( + simde_vmovl_high_s16(b), + simde_vmovl_high_s16(simde_vdupq_n_s16(c))), 2)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_n_s16 + #define vqdmlsl_high_n_s16(a, b, c) simde_vqdmlsl_high_n_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlsl_high_n_s32(simde_int64x2_t a, simde_int32x4_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmlsl_high_n_s32(a, b, c); + #else + simde_int64x2_private r_ = simde_int64x2_to_private( + simde_x_vmulq_s64( + simde_vmovl_high_s32(b), + simde_vmovl_high_s32(simde_vdupq_n_s32(c)))); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = r_.values[i] * HEDLEY_STATIC_CAST(int64_t, 2); + } + + return simde_vsubq_s64(a, simde_int64x2_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_high_n_s32 + #define vqdmlsl_high_n_s32(a, b, c) simde_vqdmlsl_high_n_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_HIGH_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlsl_lane.h 
b/lib/simd_wrapper/simde/arm/neon/qdmlsl_lane.h new file mode 100644 index 00000000000..d93677da0dc --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmlsl_lane.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLSL_LANE_H) +#define SIMDE_ARM_NEON_QDMLSL_LANE_H + +#include "qdmlsl.h" +#include "dup_lane.h" +#include "get_lane.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqdmlsl_lane_s16(a, b, v, lane) vqdmlsl_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqdmlsl_lane_s16(a, b, v, lane) simde_vqdmlsl_s16((a), (b), simde_vdup_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_lane_s16 + #define vqdmlsl_lane_s16(a, b, c, lane) simde_vqdmlsl_lane_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqdmlsl_lane_s32(a, b, v, lane) vqdmlsl_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqdmlsl_lane_s32(a, b, v, lane) simde_vqdmlsl_s32((a), (b), simde_vdup_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_lane_s32 + #define vqdmlsl_lane_s32(a, b, c, lane) simde_vqdmlsl_lane_s32((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsl_laneq_s16(a, b, v, lane) vqdmlsl_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqdmlsl_laneq_s16(a, b, v, lane) simde_vqdmlsl_s16((a), (b), simde_vdup_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_laneq_s16 + #define vqdmlsl_laneq_s16(a, b, c, lane) simde_vqdmlsl_laneq_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsl_laneq_s32(a, b, v, lane) vqdmlsl_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqdmlsl_laneq_s32(a, b, v, lane) simde_vqdmlsl_s32((a), (b), simde_vdup_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_laneq_s32 + #define vqdmlsl_laneq_s32(a, b, c, lane) simde_vqdmlsl_laneq_s32((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlslh_lane_s16(a, b, v, lane) vqdmlslh_lane_s16((a), (b), (v), (lane)) 
+#else + #define simde_vqdmlslh_lane_s16(a, b, v, lane) simde_vqdmlslh_s16((a), (b), simde_vget_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlslh_lane_s16 + #define vqdmlslh_lane_s16(a, b, c, lane) simde_vqdmlslh_lane_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlslh_laneq_s16(a, b, v, lane) vqdmlslh_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqdmlslh_laneq_s16(a, b, v, lane) simde_vqdmlslh_s16((a), (b), simde_vgetq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlslh_laneq_s16 + #define vqdmlslh_laneq_s16(a, b, c, lane) simde_vqdmlslh_laneq_s16((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsls_lane_s32(a, b, v, lane) vqdmlsls_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqdmlsls_lane_s32(a, b, v, lane) simde_vqdmlsls_s32((a), (b), simde_vget_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsls_lane_s32 + #define vqdmlsls_lane_s32(a, b, c, lane) simde_vqdmlsls_lane_s32((a), (b), (c), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmlsls_laneq_s32(a, b, v, lane) vqdmlsls_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqdmlsls_laneq_s32(a, b, v, lane) simde_vqdmlsls_s32((a), (b), simde_vgetq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmlsls_laneq_s32 + #define vqdmlsls_laneq_s32(a, b, c, lane) simde_vqdmlsls_laneq_s32((a), (b), (c), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qdmlsl_n.h b/lib/simd_wrapper/simde/arm/neon/qdmlsl_n.h new file mode 100644 index 00000000000..5707f4c47b1 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmlsl_n.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMLSL_N_H) +#define SIMDE_ARM_NEON_QDMLSL_N_H + +#include "dup_n.h" +#include "qdmlsl.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmlsl_n_s16(simde_int32x4_t a, simde_int16x4_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlsl_n_s16(a, b, c); + #else + return simde_vqdmlsl_s16(a, b, simde_vdup_n_s16(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_n_s16 + #define vqdmlsl_n_s16(a, b, c) simde_vqdmlsl_n_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmlsl_n_s32(simde_int64x2_t a, simde_int32x2_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmlsl_n_s32(a, b, c); + #else + return simde_vqdmlsl_s32(a, b, simde_vdup_n_s32(c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmlsl_n_s32 + #define vqdmlsl_n_s32(a, b, c) simde_vqdmlsl_n_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMLSL_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qdmulh.h b/lib/simd_wrapper/simde/arm/neon/qdmulh.h index d42e393ad7f..29d1078cb02 100644 --- a/lib/simd_wrapper/simde/arm/neon/qdmulh.h +++ b/lib/simd_wrapper/simde/arm/neon/qdmulh.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QDMULH_H) @@ -63,7 +64,7 @@ simde_vqdmulh_s16(simde_int16x4_t a, simde_int16x4_t b) { #else simde_int16x4_private r_; - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !(HEDLEY_GCC_VERSION_CHECK(12,1,0) && defined(SIMDE_ARCH_ZARCH)) simde_int16x8_private tmp_ = simde_int16x8_to_private( simde_vreinterpretq_s16_s32( @@ -89,6 +90,21 @@ simde_vqdmulh_s16(simde_int16x4_t a, simde_int16x4_t b) { #define vqdmulh_s16(a, b) simde_vqdmulh_s16((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_vqdmulhh_s16(int16_t a, int16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmulhh_s16(a, b); + #else + int32_t tmp = simde_vqdmullh_s16(a, b); + return HEDLEY_STATIC_CAST(int16_t, tmp >> 16); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmulhh_s16 + #define vqdmulhh_s16(a, b) simde_vqdmulhh_s16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_int32x2_t simde_vqdmulh_s32(simde_int32x2_t a, simde_int32x2_t b) { @@ -97,7 +113,7 @@ simde_vqdmulh_s32(simde_int32x2_t a, simde_int32x2_t b) { #else simde_int32x2_private r_; - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !(HEDLEY_GCC_VERSION_CHECK(12,1,0) && defined(SIMDE_ARCH_ZARCH)) simde_int32x4_private tmp_ = simde_int32x4_to_private( simde_vreinterpretq_s32_s64( diff --git a/lib/simd_wrapper/simde/arm/neon/qdmulh_lane.h b/lib/simd_wrapper/simde/arm/neon/qdmulh_lane.h index 3120eb7ad70..32cd22dead3 100644 --- a/lib/simd_wrapper/simde/arm/neon/qdmulh_lane.h +++ b/lib/simd_wrapper/simde/arm/neon/qdmulh_lane.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QDMULH_LANE_H) @@ -37,6 +38,17 @@ HEDLEY_DIAGNOSTIC_PUSH 
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmulhh_lane_s16(a, v, lane) vqdmulhh_lane_s16((a), (v), (lane)) +#else + #define simde_vqdmulhh_lane_s16(a, v, lane) \ + simde_vqdmulhh_s16((a), simde_vget_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmulhh_lane_s16 + #define vqdmulhh_lane_s16(a, v, lane) simde_vqdmulhh_lane_s16((a), (v), (lane)) +#endif + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) #define simde_vqdmulh_lane_s16(a, v, lane) vqdmulh_lane_s16((a), (v), (lane)) #else @@ -81,6 +93,17 @@ SIMDE_BEGIN_DECLS_ #define vqdmulhq_lane_s32(a, v, lane) simde_vqdmulhq_lane_s32((a), (v), (lane)) #endif +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmulhh_laneq_s16(a, v, lane) vqdmulhh_laneq_s16((a), (v), (lane)) +#else + #define simde_vqdmulhh_laneq_s16(a, v, lane) \ + simde_vqdmulhh_s16((a), simde_vgetq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmulhh_laneq_s16 + #define vqdmulhh_laneq_s16(a, v, lane) simde_vqdmulhh_laneq_s16((a), (v), (lane)) +#endif + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #define simde_vqdmulh_laneq_s16(a, v, lane) vqdmulh_laneq_s16((a), (v), (lane)) #else diff --git a/lib/simd_wrapper/simde/arm/neon/qdmull.h b/lib/simd_wrapper/simde/arm/neon/qdmull.h index 88bf50bcbb8..871257f6188 100644 --- a/lib/simd_wrapper/simde/arm/neon/qdmull.h +++ b/lib/simd_wrapper/simde/arm/neon/qdmull.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ /* Implementation notes (seanptmaher): @@ -67,8 +68,8 @@ simde_vqdmulls_s32(int32_t a, int32_t b) { #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) - #undef vqdmulls_s16 - #define vqdmulls_s16(a, b) simde_vqdmulls_s16((a), (b)) + #undef vqdmulls_s32 + #define vqdmulls_s32(a, b) simde_vqdmulls_s32((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES diff --git a/lib/simd_wrapper/simde/arm/neon/qdmull_high.h b/lib/simd_wrapper/simde/arm/neon/qdmull_high.h new file mode 100644 index 00000000000..2c6b26912b2 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmull_high.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMULL_HIGH_H) +#define SIMDE_ARM_NEON_QDMULL_HIGH_H + +#include "combine.h" +#include "get_high.h" +#include "qdmull.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmull_high_s16(simde_int16x8_t a, simde_int16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmull_high_s16(a, b); + #else + return simde_vqdmull_s16(simde_vget_high_s16(a), simde_vget_high_s16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_s16 + #define vqdmull_high_s16(a, b) simde_vqdmull_high_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmull_high_s32(simde_int32x4_t a, simde_int32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmull_high_s32(a, b); + #else + return simde_vqdmull_s32(simde_vget_high_s32(a), simde_vget_high_s32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_s32 + #define vqdmull_high_s32(a, b) simde_vqdmull_high_s32((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMULL_HIGH_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qdmull_high_lane.h b/lib/simd_wrapper/simde/arm/neon/qdmull_high_lane.h new file mode 100644 index 00000000000..f8326b2bf48 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmull_high_lane.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMULL_HIGH_LANE_H) +#define SIMDE_ARM_NEON_QDMULL_HIGH_LANE_H + +#include "combine.h" +#include "qdmull.h" +#include "dup_n.h" +#include "get_high.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmull_high_lane_s16(simde_int16x8_t a, simde_int16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int16x4_private + v_ = simde_int16x4_to_private(v); + return simde_vqdmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmull_high_lane_s16(a, v, lane) vqdmull_high_lane_s16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_lane_s16 + #define vqdmull_high_lane_s16(a, v, lane) simde_vqdmull_high_lane_s16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmull_high_laneq_s16(simde_int16x8_t a, simde_int16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_int16x8_private + v_ = simde_int16x8_to_private(v); + return simde_vqdmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmull_high_laneq_s16(a, v, lane) vqdmull_high_laneq_s16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_laneq_s16 + #define vqdmull_high_laneq_s16(a, v, lane) simde_vqdmull_high_laneq_s16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmull_high_lane_s32(simde_int32x4_t a, simde_int32x2_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x2_private + v_ = simde_int32x2_to_private(v); + return simde_vqdmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmull_high_lane_s32(a, v, lane) vqdmull_high_lane_s32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_lane_s32 + #define vqdmull_high_lane_s32(a, v, lane) simde_vqdmull_high_lane_s32((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmull_high_laneq_s32(simde_int32x4_t a, simde_int32x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4_private + v_ = simde_int32x4_to_private(v); + return simde_vqdmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(v_.values[lane])); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmull_high_laneq_s32(a, v, lane) vqdmull_high_laneq_s32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_laneq_s32 + #define vqdmull_high_laneq_s32(a, v, lane) simde_vqdmull_high_laneq_s32((a), (v), (lane)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMULL_HIGH_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qdmull_high_n.h b/lib/simd_wrapper/simde/arm/neon/qdmull_high_n.h new file mode 100644 index 00000000000..aef31240f69 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmull_high_n.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without 
limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMULL_HIGH_N_H) +#define SIMDE_ARM_NEON_QDMULL_HIGH_N_H + +#include "combine.h" +#include "get_high.h" +#include "dup_n.h" +#include "qdmull.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmull_high_n_s16(simde_int16x8_t a, int16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmull_high_n_s16(a, b); + #else + return simde_vqdmull_s16(simde_vget_high_s16(a), simde_vdup_n_s16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_n_s16 + #define vqdmull_high_n_s16(a, b) simde_vqdmull_high_n_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmull_high_n_s32(simde_int32x4_t a, int32_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqdmull_high_n_s32(a, b); + #else + return simde_vqdmull_s32(simde_vget_high_s32(a), simde_vdup_n_s32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmull_high_n_s32 + #define vqdmull_high_n_s32(a, b) simde_vqdmull_high_n_s32((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMULL_HIGH_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qdmull_lane.h b/lib/simd_wrapper/simde/arm/neon/qdmull_lane.h new file mode 100644 index 00000000000..a7bf68cbdc1 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qdmull_lane.h @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMULL_LANE_H) +#define SIMDE_ARM_NEON_QDMULL_LANE_H + +#include "combine.h" +#include "qdmull.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqdmullh_lane_s16(int16_t a, simde_int16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int16x4_private + v_ = simde_int16x4_to_private(v); + + return simde_vqdmullh_s16(a, v_.values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmullh_lane_s16(a, v, lane) vqdmullh_lane_s16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmullh_lane_s16 + #define vqdmullh_lane_s16(a, v, lane) simde_vqdmullh_lane_s16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqdmullh_laneq_s16(int16_t a, simde_int16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_int16x8_private + v_ = simde_int16x8_to_private(v); + + return simde_vqdmullh_s16(a, v_.values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmullh_laneq_s16(a, v, lane) vqdmullh_laneq_s16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmullh_laneq_s16 + #define vqdmullh_laneq_s16(a, v, lane) simde_vqdmullh_laneq_s16((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vqdmulls_lane_s32(int32_t a, simde_int32x2_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x2_private + v_ = simde_int32x2_to_private(v); + + return simde_vqdmulls_s32(a, v_.values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmulls_lane_s32(a, v, lane) vqdmulls_lane_s32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmulls_lane_s32 + #define vqdmulls_lane_s32(a, v, lane) simde_vqdmulls_lane_s32((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vqdmulls_laneq_s32(int32_t a, simde_int32x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4_private + v_ = simde_int32x4_to_private(v); + + return simde_vqdmulls_s32(a, v_.values[lane]); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqdmulls_laneq_s32(a, v, lane) vqdmulls_laneq_s32(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqdmulls_laneq_s32 + #define vqdmulls_laneq_s32(a, v, lane) simde_vqdmulls_laneq_s32((a), (v), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmull_lane_s16(simde_int16x4_t a, simde_int16x4_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4_private r_; + simde_int16x4_private + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqdmullh_s16(a_.values[i], b_.values[lane]); + } + + return simde_int32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqdmull_lane_s16(a, v, lane) vqdmull_lane_s16(a, v, lane) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef 
vqdmull_lane_s16
+ #define vqdmull_lane_s16(a, v, lane) simde_vqdmull_lane_s16((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int32x4_t
+simde_vqdmull_laneq_s16(simde_int16x4_t a, simde_int16x8_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) {
+ simde_int32x4_private r_;
+ simde_int16x4_private
+ a_ = simde_int16x4_to_private(a);
+ simde_int16x8_private
+ b_ = simde_int16x8_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vqdmullh_s16(a_.values[i], b_.values[lane]);
+ }
+
+ return simde_int32x4_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmull_laneq_s16(a, v, lane) vqdmull_laneq_s16(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_laneq_s16
+ #define vqdmull_laneq_s16(a, v, lane) simde_vqdmull_laneq_s16((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmull_lane_s32(simde_int32x2_t a, simde_int32x2_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) {
+ simde_int64x2_private r_;
+ simde_int32x2_private
+ a_ = simde_int32x2_to_private(a),
+ b_ = simde_int32x2_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vqdmulls_s32(a_.values[i], b_.values[lane]);
+ }
+
+ return simde_int64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
+ #define simde_vqdmull_lane_s32(a, v, lane) vqdmull_lane_s32(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_lane_s32
+ #define vqdmull_lane_s32(a, v, lane) simde_vqdmull_lane_s32((a), (v), (lane))
+#endif
+
+SIMDE_FUNCTION_ATTRIBUTES
+simde_int64x2_t
+simde_vqdmull_laneq_s32(simde_int32x2_t a, simde_int32x4_t b, const int lane)
+ SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) {
+ simde_int64x2_private r_;
+ simde_int32x2_private
+ a_ = simde_int32x2_to_private(a);
+ simde_int32x4_private
+ b_ = simde_int32x4_to_private(b);
+
+ SIMDE_VECTORIZE
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+ r_.values[i] = simde_vqdmulls_s32(a_.values[i], b_.values[lane]);
+ }
+
+ return simde_int64x2_from_private(r_);
+}
+#if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
+ #define simde_vqdmull_laneq_s32(a, v, lane) vqdmull_laneq_s32(a, v, lane)
+#endif
+#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES)
+ #undef vqdmull_laneq_s32
+ #define vqdmull_laneq_s32(a, v, lane) simde_vqdmull_laneq_s32((a), (v), (lane))
+#endif
+
+
+SIMDE_END_DECLS_
+HEDLEY_DIAGNOSTIC_POP
+
+#endif /* !defined(SIMDE_ARM_NEON_QDMULL_LANE_H) */
diff --git a/lib/simd_wrapper/simde/arm/neon/qdmull_n.h b/lib/simd_wrapper/simde/arm/neon/qdmull_n.h
new file mode 100644
index 00000000000..691802637b9
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/qdmull_n.h
@@ -0,0 +1,69 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QDMULL_N_H) +#define SIMDE_ARM_NEON_QDMULL_N_H + +#include "combine.h" +#include "dup_n.h" +#include "qdmull.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqdmull_n_s16(simde_int16x4_t a, int16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmull_n_s16(a, b); + #else + return simde_vqdmull_s16(a, simde_vdup_n_s16(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmull_n_s16 + #define vqdmull_n_s16(a, b) simde_vqdmull_n_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqdmull_n_s32(simde_int32x2_t a, int32_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqdmull_n_s32(a, b); + #else + return simde_vqdmull_s32(a, simde_vdup_n_s32(b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqdmull_n_s32 + #define vqdmull_n_s32(a, b) simde_vqdmull_n_s32((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QDMULL_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qmovun_high.h b/lib/simd_wrapper/simde/arm/neon/qmovun_high.h new file mode 100644 index 00000000000..edb3e17a499 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qmovun_high.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QMOVUN_HIGH_H) +#define SIMDE_ARM_NEON_QMOVUN_HIGH_H + +#include "types.h" + +#include "combine.h" +#include "qmovun.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vqmovun_high_s16(simde_uint8x8_t r, simde_int16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqmovun_high_s16(r, a); + #else + return simde_vcombine_u8(r, simde_vqmovun_s16(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqmovun_high_s16 + #define vqmovun_high_s16(r, a) simde_vqmovun_high_s16((r), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vqmovun_high_s32(simde_uint16x4_t r, simde_int32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqmovun_high_s32(r, a); + #else + return simde_vcombine_u16(r, simde_vqmovun_s32(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqmovun_high_s32 + #define vqmovun_high_s32(r, a) simde_vqmovun_high_s32((r), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vqmovun_high_s64(simde_uint32x2_t r, simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqmovun_high_s64(r, a); + #else + return simde_vcombine_u32(r, simde_vqmovun_s64(a)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqmovun_high_s64 + #define vqmovun_high_s64(r, a) simde_vqmovun_high_s64((r), (a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QMOVUN_HIGH_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qrdmlah.h b/lib/simd_wrapper/simde/arm/neon/qrdmlah.h new file mode 100644 index 00000000000..9442101e312 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qrdmlah.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QRDMLAH_H) +#define SIMDE_ARM_NEON_QRDMLAH_H + +#include "types.h" +#include "qmovn.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_vqrdmlahh_s16(int16_t a, int16_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + return SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vqrdmlahh_s16(a, b, c)); + #else + return vqrdmlahh_s16(a, b, c); + #endif + #else + int64_t r = (((1 << 15) + (HEDLEY_STATIC_CAST(int64_t, a) << 16) + ((HEDLEY_STATIC_CAST(int64_t, (HEDLEY_STATIC_CAST(int64_t, b) * HEDLEY_STATIC_CAST(int64_t, c)))) << 1)) >> 16); + return simde_vqmovns_s32(HEDLEY_STATIC_CAST(int32_t, r)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlahh_s16 + #define vqrdmlahh_s16(a, b, c) simde_vqrdmlahh_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqrdmlahs_s32(int32_t a, int32_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlahs_s32(a, b, c); + #else + int64_t round_const = (HEDLEY_STATIC_CAST(int64_t, 1) << 31); + int64_t a_ = (HEDLEY_STATIC_CAST(int64_t, a) << 32); + int64_t sum = round_const + a_; + int64_t mul = (HEDLEY_STATIC_CAST(int64_t, b) * HEDLEY_STATIC_CAST(int64_t, c)); + int64_t mul2 = mul << 1; + if (mul2 >> 1 != mul) { + if (mul > 0) return INT32_MAX; + else if (mul < 0) return INT32_MIN; + } + int64_t sum2 = sum + mul2; + if (sum > 0 && INT64_MAX - sum < mul2) return INT32_MAX; + if (sum < 0 && INT64_MIN - sum > mul2) return INT32_MIN; + return HEDLEY_STATIC_CAST(int32_t, ((sum2 >> 32) & 0xffffffff)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlahs_s32 + #define vqrdmlahs_s32(a, b, c) simde_vqrdmlahs_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vqrdmlah_s16(simde_int16x4_t a, simde_int16x4_t b, simde_int16x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlah_s16(a, b, c); + #else + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b), + c_ = simde_int16x4_to_private(c); + + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrdmlahh_s16(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlah_s16 + #define vqrdmlah_s16(a, b, c) simde_vqrdmlah_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vqrdmlah_s32(simde_int32x2_t a, simde_int32x2_t b, simde_int32x2_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlah_s32(a, b, c); + #else + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b), + c_ = simde_int32x2_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrdmlahs_s32(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlah_s32 + #define vqrdmlah_s32(a, b, 
c) simde_vqrdmlah_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vqrdmlahq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlahq_s16(a, b, c); + #else + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b), + c_ = simde_int16x8_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrdmlahh_s16(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlahq_s16 + #define vqrdmlahq_s16(a, b, c) simde_vqrdmlahq_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqrdmlahq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlahq_s32(a, b, c); + #else + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b), + c_ = simde_int32x4_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrdmlahs_s32(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlahq_s32 + #define vqrdmlahq_s32(a, b, c) simde_vqrdmlahq_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QRDMLAH_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qrdmlah_lane.h b/lib/simd_wrapper/simde/arm/neon/qrdmlah_lane.h new file mode 100644 index 00000000000..4f18bbb5fb1 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qrdmlah_lane.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QRDMLAH_LANE_H) +#define SIMDE_ARM_NEON_QRDMLAH_LANE_H + +#include "types.h" +#include "qrdmlah.h" +#include "dup_lane.h" +#include "get_lane.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahh_lane_s16(a, b, v, lane) vqrdmlahh_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahh_lane_s16(a, b, v, lane) simde_vqrdmlahh_s16((a), (b), simde_vget_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlahh_lane_s16 + #define vqrdmlahh_lane_s16(a, b, v, lane) simde_vqrdmlahh_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahh_laneq_s16(a, b, v, lane) vqrdmlahh_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahh_laneq_s16(a, b, v, lane) simde_vqrdmlahh_s16((a), (b), simde_vgetq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlahh_laneq_s16 + #define vqrdmlahh_laneq_s16(a, b, v, lane) simde_vqrdmlahh_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahs_lane_s32(a, b, v, lane) vqrdmlahs_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahs_lane_s32(a, b, v, lane) simde_vqrdmlahs_s32((a), (b), simde_vget_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlahs_lane_s32 + #define vqrdmlahs_lane_s32(a, b, v, lane) simde_vqrdmlahs_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahs_laneq_s32(a, b, v, lane) vqrdmlahs_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahs_laneq_s32(a, b, v, lane) simde_vqrdmlahs_s32((a), (b), simde_vgetq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlahs_laneq_s32 + #define vqrdmlahs_laneq_s32(a, b, v, lane) simde_vqrdmlahs_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlah_lane_s16(a, b, v, lane) vqrdmlah_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlah_lane_s16(a, b, v, lane) simde_vqrdmlah_s16((a), (b), simde_vdup_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrdmlah_lane_s16 + #define vqrdmlah_lane_s16(a, b, v, lane) simde_vqrdmlah_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlah_lane_s32(a, b, v, lane) vqrdmlah_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlah_lane_s32(a, b, v, lane) simde_vqrdmlah_s32((a), (b), simde_vdup_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrdmlah_lane_s32 + #define vqrdmlah_lane_s32(a, b, v, lane) simde_vqrdmlah_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahq_lane_s16(a, b, v, lane) vqrdmlahq_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahq_lane_s16(a, b, v, lane) simde_vqrdmlahq_s16((a), (b), simde_vdupq_lane_s16((v), (lane))) +#endif +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrdmlahq_lane_s16 + #define vqrdmlahq_lane_s16(a, b, v, lane) simde_vqrdmlahq_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahq_lane_s32(a, b, v, lane) vqrdmlahq_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahq_lane_s32(a, b, v, lane) simde_vqrdmlahq_s32((a), (b), simde_vdupq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrdmlahq_lane_s32 + #define vqrdmlahq_lane_s32(a, b, v, lane) simde_vqrdmlahq_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlah_laneq_s16(a, b, v, lane) vqrdmlah_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlah_laneq_s16(a, b, v, lane) simde_vqrdmlah_s16((a), (b), simde_vdup_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlah_laneq_s16 + #define vqrdmlah_laneq_s16(a, b, v, lane) simde_vqrdmlah_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlah_laneq_s32(a, b, v, lane) vqrdmlah_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlah_laneq_s32(a, b, v, lane) simde_vqrdmlah_s32((a), (b), simde_vdup_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlah_laneq_s32 + #define vqrdmlah_laneq_s32(a, b, v, lane) simde_vqrdmlah_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahq_laneq_s16(a, b, v, lane) vqrdmlahq_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahq_laneq_s16(a, b, v, lane) simde_vqrdmlahq_s16((a), (b), simde_vdupq_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlahq_laneq_s16 + #define vqrdmlahq_laneq_s16(a, b, v, lane) simde_vqrdmlahq_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlahq_laneq_s32(a, b, v, lane) vqrdmlahq_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlahq_laneq_s32(a, b, v, lane) simde_vqrdmlahq_s32((a), (b), simde_vdupq_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlahq_laneq_s32 + #define vqrdmlahq_laneq_s32(a, b, v, lane) simde_vqrdmlahq_laneq_s32((a), (b), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QRDMLAH_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qrdmlsh.h b/lib/simd_wrapper/simde/arm/neon/qrdmlsh.h new file mode 100644 index 00000000000..eb0be8e7c87 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qrdmlsh.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial 
portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QRDMLSH_H) +#define SIMDE_ARM_NEON_QRDMLSH_H + +#include "types.h" +#include "qmovn.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_vqrdmlshh_s16(int16_t a, int16_t b, int16_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) + return SIMDE_DISABLE_DIAGNOSTIC_EXPR_(SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_, vqrdmlshh_s16(a, b, c)); + #else + return vqrdmlshh_s16(a, b, c); + #endif + #else + int64_t r = (((1 << 15) + (HEDLEY_STATIC_CAST(int64_t, a) << 16) - ((HEDLEY_STATIC_CAST(int64_t, (HEDLEY_STATIC_CAST(int64_t, b) * HEDLEY_STATIC_CAST(int64_t, c)))) << 1)) >> 16); + return simde_vqmovns_s32(HEDLEY_STATIC_CAST(int32_t, r)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlshh_s16 + #define vqrdmlshh_s16(a, b, c) simde_vqrdmlshh_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqrdmlshs_s32(int32_t a, int32_t b, int32_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlshs_s32(a, b, c); + #else + int64_t round_const = (HEDLEY_STATIC_CAST(int64_t, 1) << 31); + int64_t a_ = (HEDLEY_STATIC_CAST(int64_t, a) << 32); + int64_t sum = round_const + a_; + int64_t mul = -(HEDLEY_STATIC_CAST(int64_t, b) * HEDLEY_STATIC_CAST(int64_t, c)); + int64_t mul2 = mul << 1; + if (mul2 >> 1 != mul) { + if (mul > 0) return INT32_MAX; + else if (mul < 0) return INT32_MIN; + } + int64_t sum2 = sum + mul2; + if (sum > 0 && INT64_MAX - sum < mul2) return INT32_MAX; + if (sum < 0 && INT64_MIN - sum > mul2) return INT32_MIN; + return HEDLEY_STATIC_CAST(int32_t, ((sum2 >> 32) & 0xffffffff)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlshs_s32 + #define vqrdmlshs_s32(a, b, c) simde_vqrdmlshs_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vqrdmlsh_s16(simde_int16x4_t a, simde_int16x4_t b, simde_int16x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlsh_s16(a, b, c); + #else + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b), + c_ = simde_int16x4_to_private(c); + + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrdmlshh_s16(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlsh_s16 + #define vqrdmlsh_s16(a, b, c) simde_vqrdmlsh_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vqrdmlsh_s32(simde_int32x2_t a, simde_int32x2_t b, simde_int32x2_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) 
+ return vqrdmlsh_s32(a, b, c); + #else + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b), + c_ = simde_int32x2_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrdmlshs_s32(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlsh_s32 + #define vqrdmlsh_s32(a, b, c) simde_vqrdmlsh_s32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vqrdmlshq_s16(simde_int16x8_t a, simde_int16x8_t b, simde_int16x8_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlshq_s16(a, b, c); + #else + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b), + c_ = simde_int16x8_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrdmlshh_s16(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlshq_s16 + #define vqrdmlshq_s16(a, b, c) simde_vqrdmlshq_s16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqrdmlshq_s32(simde_int32x4_t a, simde_int32x4_t b, simde_int32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + return vqrdmlshq_s32(a, b, c); + #else + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b), + c_ = simde_int32x4_to_private(c); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrdmlshs_s32(a_.values[i], b_.values[i], c_.values[i]); + } + + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlshq_s32 + #define vqrdmlshq_s32(a, b, c) simde_vqrdmlshq_s32((a), (b), (c)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QRDMLSH_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qrdmlsh_lane.h b/lib/simd_wrapper/simde/arm/neon/qrdmlsh_lane.h new file mode 100644 index 00000000000..a9584c60140 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qrdmlsh_lane.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QRDMLSH_LANE_H) +#define SIMDE_ARM_NEON_QRDMLSH_LANE_H + +#include "types.h" +#include "qrdmlsh.h" +#include "dup_lane.h" +#include "get_lane.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshh_lane_s16(a, b, v, lane) vqrdmlshh_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlshh_lane_s16(a, b, v, lane) simde_vqrdmlshh_s16((a), (b), simde_vget_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlshh_lane_s16 + #define vqrdmlshh_lane_s16(a, b, v, lane) simde_vqrdmlshh_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshh_laneq_s16(a, b, v, lane) vqrdmlshh_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlshh_laneq_s16(a, b, v, lane) simde_vqrdmlshh_s16((a), (b), simde_vgetq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlshh_laneq_s16 + #define vqrdmlshh_laneq_s16(a, b, v, lane) simde_vqrdmlshh_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshs_lane_s32(a, b, v, lane) vqrdmlshs_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlshs_lane_s32(a, b, v, lane) simde_vqrdmlshs_s32((a), (b), simde_vget_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlshs_lane_s32 + #define vqrdmlshs_lane_s32(a, b, v, lane) simde_vqrdmlshs_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshs_laneq_s32(a, b, v, lane) vqrdmlshs_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlshs_laneq_s32(a, b, v, lane) simde_vqrdmlshs_s32((a), (b), simde_vgetq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlshs_laneq_s32 + #define vqrdmlshs_laneq_s32(a, b, v, lane) simde_vqrdmlshs_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlsh_lane_s16(a, b, v, lane) vqrdmlsh_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlsh_lane_s16(a, b, v, lane) simde_vqrdmlsh_s16((a), (b), simde_vdup_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrdmlsh_lane_s16 + #define vqrdmlsh_lane_s16(a, b, v, lane) simde_vqrdmlsh_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlsh_lane_s32(a, b, v, lane) vqrdmlsh_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlsh_lane_s32(a, b, v, lane) simde_vqrdmlsh_s32((a), (b), simde_vdup_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrdmlsh_lane_s32 + #define vqrdmlsh_lane_s32(a, b, v, lane) simde_vqrdmlsh_lane_s32((a), (b), (v), (lane)) +#endif + +#if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshq_lane_s16(a, b, v, lane) vqrdmlshq_lane_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlshq_lane_s16(a, b, v, lane) simde_vqrdmlshq_s16((a), (b), simde_vdupq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrdmlshq_lane_s16 + #define vqrdmlshq_lane_s16(a, b, v, lane) simde_vqrdmlshq_lane_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshq_lane_s32(a, b, v, lane) vqrdmlshq_lane_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlshq_lane_s32(a, b, v, lane) simde_vqrdmlshq_s32((a), (b), simde_vdupq_lane_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrdmlshq_lane_s32 + #define vqrdmlshq_lane_s32(a, b, v, lane) simde_vqrdmlshq_lane_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlsh_laneq_s16(a, b, v, lane) vqrdmlsh_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlsh_laneq_s16(a, b, v, lane) simde_vqrdmlsh_s16((a), (b), simde_vdup_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlsh_laneq_s16 + #define vqrdmlsh_laneq_s16(a, b, v, lane) simde_vqrdmlsh_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlsh_laneq_s32(a, b, v, lane) vqrdmlsh_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlsh_laneq_s32(a, b, v, lane) simde_vqrdmlsh_s32((a), (b), simde_vdup_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlsh_laneq_s32 + #define vqrdmlsh_laneq_s32(a, b, v, lane) simde_vqrdmlsh_laneq_s32((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshq_laneq_s16(a, b, v, lane) vqrdmlshq_laneq_s16((a), (b), (v), (lane)) +#else + #define simde_vqrdmlshq_laneq_s16(a, b, v, lane) simde_vqrdmlshq_s16((a), (b), simde_vdupq_laneq_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlshq_laneq_s16 + #define vqrdmlshq_laneq_s16(a, b, v, lane) simde_vqrdmlshq_laneq_s16((a), (b), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_QRDMX) + #define simde_vqrdmlshq_laneq_s32(a, b, v, lane) vqrdmlshq_laneq_s32((a), (b), (v), (lane)) +#else + #define simde_vqrdmlshq_laneq_s32(a, b, v, lane) simde_vqrdmlshq_s32((a), (b), simde_vdupq_laneq_s32((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmlshq_laneq_s32 + #define vqrdmlshq_laneq_s32(a, b, v, lane) simde_vqrdmlshq_laneq_s32((a), (b), (v), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QRDMLSH_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qrdmulh.h b/lib/simd_wrapper/simde/arm/neon/qrdmulh.h index 9a69b92e5e7..55fedfe723c 100644 --- a/lib/simd_wrapper/simde/arm/neon/qrdmulh.h +++ b/lib/simd_wrapper/simde/arm/neon/qrdmulh.h @@ -40,7 +40,10 @@ simde_vqrdmulhh_s16(int16_t a, int16_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqrdmulhh_s16(a, b); #else - return HEDLEY_STATIC_CAST(int16_t, (((1 << 15) + ((HEDLEY_STATIC_CAST(int32_t, (HEDLEY_STATIC_CAST(int32_t, a) * HEDLEY_STATIC_CAST(int32_t, b)))) << 1)) >> 16) & 0xffff); + 
int32_t temp = HEDLEY_STATIC_CAST(int32_t, a) * HEDLEY_STATIC_CAST(int32_t, b); + int32_t r = temp > 0 ? (temp > (INT32_MAX >> 1) ? INT32_MAX : (temp << 1)) : (temp < (INT32_MIN >> 1) ? INT32_MIN : (temp << 1)); + r = (r > (INT32_MAX - (1 << 15))) ? INT32_MAX : ((1 << 15) + r); + return HEDLEY_STATIC_CAST(int16_t, ((r >> 16) & 0xffff)); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -54,7 +57,10 @@ simde_vqrdmulhs_s32(int32_t a, int32_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqrdmulhs_s32(a, b); #else - return HEDLEY_STATIC_CAST(int32_t, (((HEDLEY_STATIC_CAST(int64_t, 1) << 31) + ((HEDLEY_STATIC_CAST(int64_t, (HEDLEY_STATIC_CAST(int64_t, a) * HEDLEY_STATIC_CAST(int64_t, b)))) << 1)) >> 32) & 0xffffffff); + int64_t temp = HEDLEY_STATIC_CAST(int64_t, a) * HEDLEY_STATIC_CAST(int64_t, b); + int64_t r = temp > 0 ? (temp > (INT64_MAX >> 1) ? INT64_MAX : (temp << 1)) : (temp < (INT64_MIN >> 1) ? INT64_MIN : (temp << 1)); + r = (r > (INT64_MAX - (HEDLEY_STATIC_CAST(int64_t, 1) << 31))) ? INT64_MAX : ((HEDLEY_STATIC_CAST(int64_t, 1) << 31) + r); + return HEDLEY_STATIC_CAST(int32_t, ((r >> 32) & 0xffffffff)); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) diff --git a/lib/simd_wrapper/simde/arm/neon/qrdmulh_lane.h b/lib/simd_wrapper/simde/arm/neon/qrdmulh_lane.h index 507064eab7e..2e7f548ec7c 100644 --- a/lib/simd_wrapper/simde/arm/neon/qrdmulh_lane.h +++ b/lib/simd_wrapper/simde/arm/neon/qrdmulh_lane.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QRDMULH_LANE_H) @@ -36,6 +37,26 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrdmulhh_lane_s16(a, v, lane) vqrdmulhh_lane_s16((a), (v), (lane)) +#else + #define simde_vqrdmulhh_lane_s16(a, v, lane) simde_vqrdmulhh_s16((a), simde_vget_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmulhh_lane_s16 + #define vqrdmulhh_lane_s16(a, v, lane) simde_vqrdmulhh_lane_s16((a), (v), (lane)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrdmulhh_laneq_s16(a, v, lane) vqrdmulhh_laneq_s16((a), (v), (lane)) +#else + #define simde_vqrdmulhh_laneq_s16(a, v, lane) simde_vqrdmulhh_s16((a), simde_vgetq_lane_s16((v), (lane))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrdmulhh_laneq_s16 + #define vqrdmulhh_laneq_s16(a, v, lane) simde_vqrdmulhh_laneq_s16((a), (v), (lane)) +#endif + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) #define simde_vqrdmulhs_lane_s32(a, v, lane) \ diff --git a/lib/simd_wrapper/simde/arm/neon/qrshl.h b/lib/simd_wrapper/simde/arm/neon/qrshl.h new file mode 100644 index 00000000000..e91435619d2 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qrshl.h @@ -0,0 +1,744 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in 
all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QRSHL_H) +#define SIMDE_ARM_NEON_QRSHL_H +#include "../../x86/avx.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int8_t +simde_vqrshlb_s8(int8_t a, int8_t b) { + int8_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vqrshlb_s8(a, b); + #else + if (b < -8) { + r = 0; + } else if (b < 0) { + r = HEDLEY_STATIC_CAST(int8_t, a <= 0 + ? ((a + (1 << (-b - 1))) >> -b) + : HEDLEY_STATIC_CAST(int8_t, ((HEDLEY_STATIC_CAST(uint8_t, + (a + (1 << (-b - 1)))) >> -b) & 0x7FUL))); + } else if (b == 0) { + r = a; + } else if (b < 7) { + r = HEDLEY_STATIC_CAST(int8_t, a << b); + if ((r >> b) != a) { + r = (a < 0) ? INT8_MIN : INT8_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = (a < 0) ? INT8_MIN : INT8_MAX; + } + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshlb_s8 + #define vqrshlb_s8(a, b) simde_vqrshlb_s8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_vqrshlh_s16(int16_t a, int16_t b) { + int16_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vqrshlh_s16(a, b); + #else + int8_t b8 = HEDLEY_STATIC_CAST(int8_t, b); + + if (b8 <= -16) { + r = 0; + } else if (b8 < 0) { + r = HEDLEY_STATIC_CAST(int16_t, a <= 0 + ? ((a + (1 << (-b8 - 1))) >> -b8) + : HEDLEY_STATIC_CAST(int16_t, ((HEDLEY_STATIC_CAST(uint16_t, + (a + (1 << (-b8 - 1)))) >> -b8) & 0x7FFFUL))); + } else if (b8 == 0) { + r = a; + } else if (b8 < 15) { + r = HEDLEY_STATIC_CAST(int16_t, a << b8); + if ((r >> b8) != a) { + r = (a < 0) ? INT16_MIN : INT16_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = (a < 0) ? INT16_MIN : INT16_MAX; + } + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshlh_s16 + #define vqrshlh_s16(a, b) simde_vqrshlh_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqrshls_s32(int32_t a, int32_t b) { + int32_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vqrshls_s32(a, b); + #else + int8_t b8 = HEDLEY_STATIC_CAST(int8_t, b); + + if (b8 <= -32) { + r = 0; + } else if (b8 < 0) { + r = a <= 0 + ? ((a + (1 << (-b8 - 1))) >> -b8) + : HEDLEY_STATIC_CAST(int32_t, ((HEDLEY_STATIC_CAST(uint32_t, + (a + (1 << (-b8 - 1)))) >> -b8) & 0x7FFFFFFFUL)); + } else if (b8 == 0) { + r = a; + } else if (b8 < 31) { + r = HEDLEY_STATIC_CAST(int32_t, a << b8); + if ((r >> b8) != a) { + r = (a < 0) ? INT32_MIN : INT32_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = (a < 0) ? 
INT32_MIN : INT32_MAX; + } + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshls_s32 + #define vqrshls_s32(a, b) simde_vqrshls_s32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vqrshld_s64(int64_t a, int64_t b) { + int64_t r; + + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r = vqrshld_s64(a, b); + #else + int8_t b8 = HEDLEY_STATIC_CAST(int8_t, b); + + if (b8 <= -64) { + r = 0; + } else if (b8 < 0) { + r = a <= 0 + ? ((a + (INT64_C(1) << (-b8 - 1))) >> -b8) + : HEDLEY_STATIC_CAST(int64_t, ((HEDLEY_STATIC_CAST(uint64_t, + (a + (INT64_C(1) << (-b8 - 1)))) >> -b8) & 0x7FFFFFFFFFFFFFFFUL)); + } else if (b8 == 0) { + r = a; + } else if (b8 < 63) { + r = HEDLEY_STATIC_CAST(int64_t, a << b8); + if ((r >> b8) != a) { + r = (a < 0) ? INT64_MIN : INT64_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = (a < 0) ? INT64_MIN : INT64_MAX; + } + #endif + + return r; +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshld_s64 + #define vqrshld_s64(a, b) simde_vqrshld_s64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint8_t +simde_vqrshlb_u8(uint8_t a, int8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(11,0,0) + return vqrshlb_u8(a, HEDLEY_STATIC_CAST(uint8_t, b)); + #elif HEDLEY_HAS_WARNING("-Wsign-conversion") + /* https://github.com/llvm/llvm-project/commit/f0a78bdfdc6d56b25e0081884580b3960a3c2429 */ + HEDLEY_DIAGNOSTIC_PUSH + #pragma clang diagnostic ignored "-Wsign-conversion" + return vqrshlb_u8(a, b); + HEDLEY_DIAGNOSTIC_POP + #else + return vqrshlb_u8(a, b); + #endif + #else + uint8_t r; + + if (b < -8) { + r = 0; + } else if (b < 0) { + r = (a >> -b) + ((a >> (-b - 1)) & 1); + } else if (b == 0) { + r = a; + } else if (b < 7) { + r = HEDLEY_STATIC_CAST(uint8_t, a << b); + if ((r >> b) != a) { + r = UINT8_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = UINT8_MAX; + } + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshlb_u8 + #define vqrshlb_u8(a, b) simde_vqrshlb_u8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vqrshlh_u16(uint16_t a, int16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(11,0,0) + return vqrshlh_u16(a, HEDLEY_STATIC_CAST(uint16_t, b)); + #elif HEDLEY_HAS_WARNING("-Wsign-conversion") + HEDLEY_DIAGNOSTIC_PUSH + #pragma clang diagnostic ignored "-Wsign-conversion" + return vqrshlh_u16(a, b); + HEDLEY_DIAGNOSTIC_POP + #else + return vqrshlh_u16(a, b); + #endif + #else + b = HEDLEY_STATIC_CAST(int8_t, b); + uint16_t r; + + if (b < -16) { + r = 0; + } else if (b < 0) { + r = (a >> -b) + ((a >> (-b - 1)) & 1); + } else if (b == 0) { + r = a; + } else if (b < 15) { + r = HEDLEY_STATIC_CAST(uint16_t, a << b); + if ((r >> b) != a) { + r = UINT16_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = UINT16_MAX; + } + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshlh_u16 + #define vqrshlh_u16(a, b) simde_vqrshlh_u16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vqrshls_u32(uint32_t a, int32_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(11,0,0) + return vqrshls_u32(a, HEDLEY_STATIC_CAST(uint16_t, b)); + #elif HEDLEY_HAS_WARNING("-Wsign-conversion") + HEDLEY_DIAGNOSTIC_PUSH + #pragma clang diagnostic ignored "-Wsign-conversion" + return vqrshls_u32(a, b); + 
HEDLEY_DIAGNOSTIC_POP + #else + return vqrshls_u32(a, b); + #endif + #else + b = HEDLEY_STATIC_CAST(int8_t, b); + uint32_t r; + + if (b < -32) { + r = 0; + } else if (b < 0) { + r = (a >> -b) + ((a >> (-b - 1)) & 1); + } else if (b == 0) { + r = a; + } else if (b < 31) { + r = HEDLEY_STATIC_CAST(uint32_t, a << b); + if ((r >> b) != a) { + r = UINT32_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = UINT32_MAX; + } + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshls_u32 + #define vqrshls_u32(a, b) simde_vqrshls_u32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vqrshld_u64(uint64_t a, int64_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(11,0,0) + return vqrshld_u64(a, HEDLEY_STATIC_CAST(uint16_t, b)); + #elif HEDLEY_HAS_WARNING("-Wsign-conversion") + HEDLEY_DIAGNOSTIC_PUSH + #pragma clang diagnostic ignored "-Wsign-conversion" + return vqrshld_u64(a, b); + HEDLEY_DIAGNOSTIC_POP + #else + return vqrshld_u64(a, b); + #endif + #else + b = HEDLEY_STATIC_CAST(int8_t, b); + uint64_t r; + + if (b < -64) { + r = 0; + } else if (b < 0) { + r = (a >> -b) + ((a >> (-b - 1)) & 1); + } else if (b == 0) { + r = a; + } else if (b < 63) { + r = HEDLEY_STATIC_CAST(uint64_t, a << b); + if ((r >> b) != a) { + r = UINT64_MAX; + } + } else if (a == 0) { + r = 0; + } else { + r = UINT64_MAX; + } + + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshld_u64 + #define vqrshld_u64(a, b) simde_vqrshld_u64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vqrshl_s8 (const simde_int8x8_t a, const simde_int8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_s8(a, b); + #else + simde_int8x8_private + r_, + a_ = simde_int8x8_to_private(a), + b_ = simde_int8x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlb_s8(a_.values[i], b_.values[i]); + } + + return simde_int8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_s8 + #define vqrshl_s8(a, b) simde_vqrshl_s8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vqrshl_s16 (const simde_int16x4_t a, const simde_int16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_s16(a, b); + #else + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a), + b_ = simde_int16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlh_s16(a_.values[i], b_.values[i]); + } + + return simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_s16 + #define vqrshl_s16(a, b) simde_vqrshl_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vqrshl_s32 (const simde_int32x2_t a, const simde_int32x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_s32(a, b); + #else + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a), + b_ = simde_int32x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshls_s32(a_.values[i], b_.values[i]); + } + + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_s32 + #define vqrshl_s32(a, b) simde_vqrshl_s32((a), (b)) 
+#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vqrshl_s64 (const simde_int64x1_t a, const simde_int64x1_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_s64(a, b); + #else + simde_int64x1_private + r_, + a_ = simde_int64x1_to_private(a), + b_ = simde_int64x1_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshld_s64(a_.values[i], b_.values[i]); + } + + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_s64 + #define vqrshl_s64(a, b) simde_vqrshl_s64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vqrshl_u8 (const simde_uint8x8_t a, const simde_int8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_u8(a, b); + #else + simde_uint8x8_private + r_, + a_ = simde_uint8x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlb_u8(a_.values[i], b_.values[i]); + } + + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_u8 + #define vqrshl_u8(a, b) simde_vqrshl_u8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vqrshl_u16 (const simde_uint16x4_t a, const simde_int16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_u16(a, b); + #else + simde_uint16x4_private + r_, + a_ = simde_uint16x4_to_private(a); + simde_int16x4_private b_ = simde_int16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlh_u16(a_.values[i], b_.values[i]); + } + + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_u16 + #define vqrshl_u16(a, b) simde_vqrshl_u16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vqrshl_u32 (const simde_uint32x2_t a, const simde_int32x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_u32(a, b); + #else + simde_uint32x2_private + r_, + a_ = simde_uint32x2_to_private(a); + simde_int32x2_private b_ = simde_int32x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshls_u32(a_.values[i], b_.values[i]); + } + + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_u32 + #define vqrshl_u32(a, b) simde_vqrshl_u32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vqrshl_u64 (const simde_uint64x1_t a, const simde_int64x1_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshl_u64(a, b); + #else + simde_uint64x1_private + r_, + a_ = simde_uint64x1_to_private(a); + simde_int64x1_private b_ = simde_int64x1_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshld_u64(a_.values[i], b_.values[i]); + } + + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshl_u64 + #define vqrshl_u64(a, b) simde_vqrshl_u64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vqrshlq_s8 (const simde_int8x16_t a, const simde_int8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + 
return vqrshlq_s8(a, b); + #else + simde_int8x16_private + r_, + a_ = simde_int8x16_to_private(a), + b_ = simde_int8x16_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlb_s8(a_.values[i], b_.values[i]); + } + + return simde_int8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_s8 + #define vqrshlq_s8(a, b) simde_vqrshlq_s8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vqrshlq_s16 (const simde_int16x8_t a, const simde_int16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshlq_s16(a, b); + #else + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlh_s16(a_.values[i], b_.values[i]); + } + + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_s16 + #define vqrshlq_s16(a, b) simde_vqrshlq_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqrshlq_s32 (const simde_int32x4_t a, const simde_int32x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshlq_s32(a, b); + #else + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshls_s32(a_.values[i], b_.values[i]); + } + + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_s32 + #define vqrshlq_s32(a, b) simde_vqrshlq_s32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqrshlq_s64 (const simde_int64x2_t a, const simde_int64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshlq_s64(a, b); + #else + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a), + b_ = simde_int64x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshld_s64(a_.values[i], b_.values[i]); + } + + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_s64 + #define vqrshlq_s64(a, b) simde_vqrshlq_s64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vqrshlq_u8 (const simde_uint8x16_t a, const simde_int8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshlq_u8(a, b); + #else + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlb_u8(a_.values[i], b_.values[i]); + } + + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_u8 + #define vqrshlq_u8(a, b) simde_vqrshlq_u8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vqrshlq_u16 (const simde_uint16x8_t a, const simde_int16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshlq_u16(a, b); + #else + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a); + simde_int16x8_private b_ = simde_int16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / 
sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshlh_u16(a_.values[i], b_.values[i]); + } + + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_u16 + #define vqrshlq_u16(a, b) simde_vqrshlq_u16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vqrshlq_u32 (const simde_uint32x4_t a, const simde_int32x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshlq_u32(a, b); + #else + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a); + simde_int32x4_private b_ = simde_int32x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshls_u32(a_.values[i], b_.values[i]); + } + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_u32 + #define vqrshlq_u32(a, b) simde_vqrshlq_u32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vqrshlq_u64 (const simde_uint64x2_t a, const simde_int64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vqrshlq_u64(a, b); + #else + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a); + simde_int64x2_private b_ = simde_int64x2_to_private(b); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqrshld_u64(a_.values[i], b_.values[i]); + } + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqrshlq_u64 + #define vqrshlq_u64(a, b) simde_vqrshlq_u64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QRSHL_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qrshrn_high_n.h b/lib/simd_wrapper/simde/arm/neon/qrshrn_high_n.h new file mode 100644 index 00000000000..0080e739af1 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qrshrn_high_n.h @@ -0,0 +1,189 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QRSHRN_HIGH_N_H) +#define SIMDE_ARM_NEON_QRSHRN_HIGH_N_H + +#include "combine.h" +#include "qmovn.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vqrshrn_high_n_s16(simde_int8x8_t r, simde_int16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 8) { + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int16_t tmp = HEDLEY_STATIC_CAST(int16_t, (a_.values[i] + (1 << (n - 1))) >> n); + if (tmp > INT8_MAX) tmp = INT8_MAX; + else if (tmp < INT8_MIN) tmp = INT8_MIN; + r_.values[i] = HEDLEY_STATIC_CAST(int8_t, tmp); + } + return simde_vcombine_s8(r, simde_vqmovn_s16(simde_int16x8_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrn_high_n_s16(r, a, n) vqrshrn_high_n_s16((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrn_high_n_s16 + #define vqrshrn_high_n_s16(r, a, n) simde_vqrshrn_high_n_s16((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vqrshrn_high_n_s32(simde_int16x4_t r, simde_int32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int32_t tmp = (a_.values[i] >> ((n == 32) ? 31 : n)) + ((a_.values[i] & HEDLEY_STATIC_CAST(int32_t, UINT32_C(1) << (n - 1))) != 0); + if (tmp > INT16_MAX) tmp = INT16_MAX; + else if (tmp < INT16_MIN) tmp = INT16_MIN; + r_.values[i] = HEDLEY_STATIC_CAST(int16_t, tmp); + } + return simde_vcombine_s16(r, simde_vqmovn_s32(simde_int32x4_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrn_high_n_s32(r, a, n) vqrshrn_high_n_s32((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrn_high_n_s32 + #define vqrshrn_high_n_s32(r, a, n) simde_vqrshrn_high_n_s32((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqrshrn_high_n_s64(simde_int32x2_t r, simde_int64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int64_t tmp = (a_.values[i] >> ((n == 64) ? 
63 : n)) + ((a_.values[i] & HEDLEY_STATIC_CAST(int64_t, UINT64_C(1) << (n - 1))) != 0); + if (tmp > INT32_MAX) tmp = INT32_MAX; + else if (tmp < INT32_MIN) tmp = INT32_MIN; + r_.values[i] = HEDLEY_STATIC_CAST(int32_t, tmp); + } + return simde_vcombine_s32(r, simde_vqmovn_s64(simde_int64x2_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrn_high_n_s64(r, a, n) vqrshrn_high_n_s64((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrn_high_n_s64 + #define vqrshrn_high_n_s64(r, a, n) simde_vqrshrn_high_n_s64((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vqrshrn_high_n_u16(simde_uint8x8_t r, simde_uint16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 8) { + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + uint16_t tmp = HEDLEY_STATIC_CAST(uint16_t, (a_.values[i] + (1 << (n - 1))) >> n); + if (tmp > UINT8_MAX) tmp = UINT8_MAX; + r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, tmp); + } + return simde_vcombine_u8(r, simde_vqmovn_u16(simde_uint16x8_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrn_high_n_u16(r, a, n) vqrshrn_high_n_u16((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrn_high_n_u16 + #define vqrshrn_high_n_u16(r, a, n) simde_vqrshrn_high_n_u16((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vqrshrn_high_n_u32(simde_uint16x4_t r, simde_uint32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + uint32_t tmp = (a_.values[i] >> ((n == 32) ? 31 : n)) + ((a_.values[i] & HEDLEY_STATIC_CAST(uint32_t, UINT32_C(1) << (n - 1))) != 0); + if (tmp > UINT16_MAX) tmp = UINT16_MAX; + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, tmp); + } + return simde_vcombine_u16(r, simde_vqmovn_u32(simde_uint32x4_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrn_high_n_u32(r, a, n) vqrshrn_high_n_u32((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrn_high_n_u32 + #define vqrshrn_high_n_u32(r, a, n) simde_vqrshrn_high_n_u32((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vqrshrn_high_n_u64(simde_uint32x2_t r, simde_uint64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + uint64_t tmp = (a_.values[i] >> ((n == 64) ? 
63 : n)) + ((a_.values[i] & HEDLEY_STATIC_CAST(uint64_t, UINT64_C(1) << (n - 1))) != 0); + if (tmp > UINT32_MAX) tmp = UINT32_MAX; + r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, tmp); + } + return simde_vcombine_u32(r, simde_vqmovn_u64(simde_uint64x2_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrn_high_n_u64(r, a, n) vqrshrn_high_n_u64((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrn_high_n_u64 + #define vqrshrn_high_n_u64(r, a, n) simde_vqrshrn_high_n_u64((r), (a), (n)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RSHRN_HIGH_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qrshrn_n.h b/lib/simd_wrapper/simde/arm/neon/qrshrn_n.h index f5864ae0022..75f0a846c14 100644 --- a/lib/simd_wrapper/simde/arm/neon/qrshrn_n.h +++ b/lib/simd_wrapper/simde/arm/neon/qrshrn_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QRSHRN_N_H) @@ -35,6 +36,26 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrnh_n_s16(a, n) vqrshrnh_n_s16(a, n) +#else + #define simde_vqrshrnh_n_s16(a, n) simde_vqmovnh_s16(simde_x_vrshrh_n_s16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrnh_n_s16 + #define vqrshrnh_n_s16(a, n) simde_vqrshrnh_n_s16(a, n) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqrshrnh_n_u16(a, n) vqrshrnh_n_u16(a, n) +#else + #define simde_vqrshrnh_n_u16(a, n) simde_vqmovnh_u16(simde_x_vrshrh_n_u16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrnh_n_u16 + #define vqrshrnh_n_u16(a, n) simde_vqrshrnh_n_u16(a, n) +#endif + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #define simde_vqrshrns_n_s32(a, n) vqrshrns_n_s32(a, n) #else diff --git a/lib/simd_wrapper/simde/arm/neon/qrshrun_high_n.h b/lib/simd_wrapper/simde/arm/neon/qrshrun_high_n.h new file mode 100644 index 00000000000..b035681c381 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qrshrun_high_n.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QRSHRUN_HIGH_N_H) +#define SIMDE_ARM_NEON_QRSHRUN_HIGH_N_H + +#include "combine.h" +#include "qmovn.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vqrshrun_high_n_s16(simde_uint8x8_t r, simde_int16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 8) { + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int16_t tmp = HEDLEY_STATIC_CAST(int16_t, (a_.values[i] + (1 << (n - 1))) >> n); + if (tmp > UINT8_MAX) tmp = UINT8_MAX; + else if (tmp < 0) tmp = 0; + r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, tmp); + } + return simde_vcombine_u8(r, simde_vqmovn_u16(simde_uint16x8_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__) + #define simde_vqrshrun_high_n_s16(r, a, n) vqrshrun_high_n_s16((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrun_high_n_s16 + #define vqrshrun_high_n_s16(r, a, n) simde_vqrshrun_high_n_s16((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vqrshrun_high_n_s32(simde_uint16x4_t r, simde_int32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_uint32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int32_t tmp = (a_.values[i] >> ((n == 32) ? 31 : n)) + ((a_.values[i] & HEDLEY_STATIC_CAST(int32_t, UINT32_C(1) << (n - 1))) != 0); + if (tmp > UINT16_MAX) tmp = UINT16_MAX; + else if (tmp < 0) tmp = 0; + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, tmp); + } + return simde_vcombine_u16(r, simde_vqmovn_u32(simde_uint32x4_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__) + #define simde_vqrshrun_high_n_s32(r, a, n) vqrshrun_high_n_s32((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrun_high_n_s32 + #define vqrshrun_high_n_s32(r, a, n) simde_vqrshrun_high_n_s32((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vqrshrun_high_n_s64(simde_uint32x2_t r, simde_int64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_uint64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int64_t tmp = (a_.values[i] >> ((n == 64) ? 
63 : n)) + ((a_.values[i] & HEDLEY_STATIC_CAST(int64_t, UINT64_C(1) << (n - 1))) != 0); + if (tmp > UINT32_MAX) tmp = UINT32_MAX; + else if (tmp < 0) tmp = 0; + r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, tmp); + } + return simde_vcombine_u32(r, simde_vqmovn_u64(simde_uint64x2_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(__clang__) + #define simde_vqrshrun_high_n_s64(r, a, n) vqrshrun_high_n_s64((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrun_high_n_s64 + #define vqrshrun_high_n_s64(r, a, n) simde_vqrshrun_high_n_s64((r), (a), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QRSHRUN_HIGH_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qrshrun_n.h b/lib/simd_wrapper/simde/arm/neon/qrshrun_n.h index 8903d9ffb80..7eac1805428 100644 --- a/lib/simd_wrapper/simde/arm/neon/qrshrun_n.h +++ b/lib/simd_wrapper/simde/arm/neon/qrshrun_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QRSHRUN_N_H) @@ -36,7 +37,11 @@ SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - #define simde_vqrshruns_n_s32(a, n) vqrshruns_n_s32(a, n) + #if defined(SIMDE_BUG_CLANG_71751) + #define simde_vqrshruns_n_s32(a, n) HEDLEY_STATIC_CAST(uint16_t, vqrshruns_n_s32((a), (n))) + #else + #define simde_vqrshruns_n_s32(a, n) vqrshruns_n_s32((a), (n)) + #endif #else #define simde_vqrshruns_n_s32(a, n) simde_vqmovuns_s32(simde_x_vrshrs_n_s32(a, n)) #endif @@ -46,15 +51,33 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - #define simde_vqrshrund_n_s64(a, n) vqrshrund_n_s64(a, n) + #if defined(SIMDE_BUG_CLANG_71751) + #define simde_vqrshrund_n_s64(a, n) HEDLEY_STATIC_CAST(uint32_t, vqrshrund_n_s64((a), (n))) + #else + #define simde_vqrshrund_n_s64(a, n) vqrshrund_n_s64((a), (n)) + #endif #else - #define simde_vqrshrund_n_s64(a, n) simde_vqmovund_s64(simde_vrshrd_n_s64(a, n)) + #define simde_vqrshrund_n_s64(a, n) simde_vqmovund_s64(simde_vrshrd_n_s64((a), (n))) #endif #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) #undef vqrshrund_n_s64 #define vqrshrund_n_s64(a, n) simde_vqrshrund_n_s64((a), (n)) #endif +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #if defined(SIMDE_BUG_CLANG_71751) + #define simde_vqrshrunh_n_s16(a, n) HEDLEY_STATIC_CAST(uint8_t, vqrshrunh_n_s16((a), (n))) + #else + #define simde_vqrshrunh_n_s16(a, n) vqrshrunh_n_s16((a), (n)) + #endif +#else + #define simde_vqrshrunh_n_s16(a, n) simde_vqmovunh_s16(simde_x_vrshrh_n_s16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqrshrunh_n_s16 + #define vqrshrunh_n_s16(a, n) simde_vqrshrunh_n_s16((a), (n)) +#endif + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) #define simde_vqrshrun_n_s16(a, n) vqrshrun_n_s16((a), (n)) #else diff --git a/lib/simd_wrapper/simde/arm/neon/qshl.h b/lib/simd_wrapper/simde/arm/neon/qshl.h index 279afe70824..4d3e9dbf9ff 100644 --- a/lib/simd_wrapper/simde/arm/neon/qshl.h +++ b/lib/simd_wrapper/simde/arm/neon/qshl.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QSHL_H) @@ -330,7 +331,7 @@ simde_vqshld_u64(uint64_t a, int64_t b) { #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) - #undef vqshldb_u64 + #undef vqshld_u64 #define vqshld_u64(a, b) 
simde_vqshld_u64((a), (b)) #endif diff --git a/lib/simd_wrapper/simde/arm/neon/qshl_n.h b/lib/simd_wrapper/simde/arm/neon/qshl_n.h new file mode 100644 index 00000000000..e3d4c924d9d --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qshl_n.h @@ -0,0 +1,513 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QSHL_N_H) +#define SIMDE_ARM_NEON_QSHL_N_H + +#include "types.h" +#include "cls.h" +#include "qshl.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +int8_t +simde_vqshlb_n_s8(int8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + return simde_vqshlb_s8(a, HEDLEY_STATIC_CAST(int8_t, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshlb_n_s8(a, n) vqshlb_n_s8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshlb_n_s8 + #define vqshlb_n_s8(a, n) simde_vqshlb_n_s8((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_vqshlh_n_s16(int16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + return simde_vqshlh_s16(a, HEDLEY_STATIC_CAST(int16_t, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshlh_n_s16(a, n) vqshlh_n_s16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshlh_n_s16 + #define vqshlh_n_s16(a, n) simde_vqshlh_n_s16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_vqshls_n_s32(int32_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 31) { + return simde_vqshls_s32(a, n); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshls_n_s32(a, n) vqshls_n_s32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshls_n_s32 + #define vqshls_n_s32(a, n) simde_vqshls_n_s32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_vqshld_n_s64(int64_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 63) { + return simde_vqshld_s64(a, n); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshld_n_s64(a, n) vqshld_n_s64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshld_n_s64 + #define vqshld_n_s64(a, n) simde_vqshld_n_s64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint8_t +simde_vqshlb_n_u8(uint8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + return simde_vqshlb_u8(a, 
HEDLEY_STATIC_CAST(int8_t, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshlb_n_u8(a, n) vqshlb_n_u8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshlb_n_u8 + #define vqshlb_n_u8(a, n) simde_vqshlb_n_u8((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vqshlh_n_u16(uint16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + return simde_vqshlh_u16(a, HEDLEY_STATIC_CAST(int16_t, n)); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshlh_n_u16(a, n) vqshlh_n_u16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshlh_n_u16 + #define vqshlh_n_u16(a, n) simde_vqshlh_n_u16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vqshls_n_u32(uint32_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 31) { + return simde_vqshls_u32(a, n); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshls_n_u32(a, n) vqshls_n_u32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshls_n_u32 + #define vqshls_n_u32(a, n) simde_vqshls_n_u32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_vqshld_n_u64(uint64_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 63) { + return simde_vqshld_u64(a, n); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshld_n_u64(a, n) vqshld_n_u64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshld_n_u64 + #define vqshld_n_u64(a, n) simde_vqshld_n_u64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vqshl_n_s8 (const simde_int8x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + simde_int8x8_private + r_, + a_ = simde_int8x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_8_(simde_vqshlb_n_s8, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + return simde_int8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_s8(a, n) vqshl_n_s8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_s8 + #define vqshl_n_s8(a, n) simde_vqshl_n_s8((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vqshl_n_s16 (const simde_int16x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + simde_int16x4_private + r_, + a_ = simde_int16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_16_(simde_vqshlh_n_s16, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + return simde_int16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_s16(a, n) vqshl_n_s16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_s16 + #define vqshl_n_s16(a, n) simde_vqshl_n_s16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vqshl_n_s32 (const simde_int32x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 31) { + simde_int32x2_private + r_, + a_ = simde_int32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshls_s32(a_.values[i], n); + } + return simde_int32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_s32(a, n) vqshl_n_s32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_s32 
+ #define vqshl_n_s32(a, n) simde_vqshl_n_s32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vqshl_n_s64 (const simde_int64x1_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 63) { + simde_int64x1_private + r_, + a_ = simde_int64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshld_s64(a_.values[i], n); + } + return simde_int64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_s64(a, n) vqshl_n_s64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_s64 + #define vqshl_n_s64(a, n) simde_vqshl_n_s64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vqshl_n_u8 (const simde_uint8x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + simde_uint8x8_private + r_, + a_ = simde_uint8x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_8_(simde_vqshlb_n_u8, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + return simde_uint8x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_u8(a, n) vqshl_n_u8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_u8 + #define vqshl_n_u8(a, n) simde_vqshl_n_u8((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vqshl_n_u16 (const simde_uint16x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + simde_uint16x4_private + r_, + a_ = simde_uint16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_16_(simde_vqshlh_n_u16, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + return simde_uint16x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_u16(a, n) vqshl_n_u16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_u16 + #define vqshl_n_u16(a, n) simde_vqshl_n_u16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vqshl_n_u32 (const simde_uint32x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 31) { + simde_uint32x2_private + r_, + a_ = simde_uint32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshls_u32(a_.values[i], n); + } + return simde_uint32x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_u32(a, n) vqshl_n_u32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_u32 + #define vqshl_n_u32(a, n) simde_vqshl_n_u32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vqshl_n_u64 (const simde_uint64x1_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 63) { + simde_uint64x1_private + r_, + a_ = simde_uint64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshld_u64(a_.values[i], n); + } + return simde_uint64x1_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshl_n_u64(a, n) vqshl_n_u64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshl_n_u64 + #define vqshl_n_u64(a, n) simde_vqshl_n_u64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vqshlq_n_s8 (const simde_int8x16_t a, 
const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + simde_int8x16_private + r_, + a_ = simde_int8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_8_(simde_vqshlb_n_s8, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + + return simde_int8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_s8(a, n) vqshlq_n_s8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_s8 + #define vqshlq_n_s8(a, n) simde_vqshlq_n_s8((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vqshlq_n_s16 (const simde_int16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_16_(simde_vqshlh_n_s16, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + + return simde_int16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_s16(a, n) vqshlq_n_s16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_s16 + #define vqshlq_n_s16(a, n) simde_vqshlq_n_s16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vqshlq_n_s32 (const simde_int32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 31) { + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshls_s32(a_.values[i], n); + } + + return simde_int32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_s32(a, n) vqshlq_n_s32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_s32 + #define vqshlq_n_s32(a, n) simde_vqshlq_n_s32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vqshlq_n_s64 (const simde_int64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 63) { + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshld_s64(a_.values[i], n); + } + + return simde_int64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_s64(a, n) vqshlq_n_s64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_s64 + #define vqshlq_n_s64(a, n) simde_vqshlq_n_s64((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vqshlq_n_u8 (const simde_uint8x16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 7) { + simde_uint8x16_private + r_, + a_ = simde_uint8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_8_(simde_vqshlb_n_u8, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + + return simde_uint8x16_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_u8(a, n) vqshlq_n_u8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_u8 + #define vqshlq_n_u8(a, n) simde_vqshlq_n_u8((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vqshlq_n_u16 (const simde_uint16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + simde_uint16x8_private + 
r_, + a_ = simde_uint16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + SIMDE_CONSTIFY_16_(simde_vqshlh_n_u16, r_.values[i], (HEDLEY_UNREACHABLE(), 0), n, a_.values[i]); + } + + return simde_uint16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_u16(a, n) vqshlq_n_u16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_u16 + #define vqshlq_n_u16(a, n) simde_vqshlq_n_u16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vqshlq_n_u32 (const simde_uint32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 31) { + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshls_u32(a_.values[i], n); + } + + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_u32(a, n) vqshlq_n_u32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_u32 + #define vqshlq_n_u32(a, n) simde_vqshlq_n_u32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vqshlq_n_u64 (const simde_uint64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 63) { + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vqshld_u64(a_.values[i], n); + } + + return simde_uint64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vqshlq_n_u64(a, n) vqshlq_n_u64((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vqshlq_n_u64 + #define vqshlq_n_u64(a, n) simde_vqshlq_n_u64((a), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QSHL_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qshlu_n.h b/lib/simd_wrapper/simde/arm/neon/qshlu_n.h index a39f6795ae5..db9610a0f58 100644 --- a/lib/simd_wrapper/simde/arm/neon/qshlu_n.h +++ b/lib/simd_wrapper/simde/arm/neon/qshlu_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Atharva Nimbalkar + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QSHLU_N_H) @@ -56,6 +57,22 @@ simde_vqshlub_n_s8(int8_t a, const int n) #define vqshlub_n_s8(a, n) simde_vqshlub_n_s8((a), (n)) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_vqshluh_n_s16(int16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 15) { + uint16_t r = HEDLEY_STATIC_CAST(uint16_t, a << n); + r |= (((r >> n) != HEDLEY_STATIC_CAST(uint16_t, a)) ? UINT16_MAX : 0); + return (a < 0) ? 
0 : r; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshluh_n_s16(a, n) HEDLEY_STATIC_CAST(uint16_t, vqshluh_n_s16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshluh_n_s16 + #define vqshluh_n_s16(a, n) simde_vqshluh_n_s16((a), (n)) +#endif + SIMDE_FUNCTION_ATTRIBUTES uint32_t simde_vqshlus_n_s32(int32_t a, const int n) diff --git a/lib/simd_wrapper/simde/arm/neon/qshrn_high_n.h b/lib/simd_wrapper/simde/arm/neon/qshrn_high_n.h new file mode 100644 index 00000000000..59e6d8d9353 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qshrn_high_n.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QSHRN_HIGH_N_H) +#define SIMDE_ARM_NEON_QSHRN_HIGH_N_H + +#include "types.h" +#include "shr_n.h" +#include "qmovn.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrn_high_n_s16(r, a, n) vqshrn_high_n_s16((r), (a), (n)) +#else + #define simde_vqshrn_high_n_s16(r, a, n) simde_vcombine_s8(r, simde_vqmovn_s16(simde_vshrq_n_s16(a, n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrn_high_n_s16 + #define vqshrn_high_n_s16(r, a, n) simde_vqshrn_high_n_s16((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrn_high_n_s32(r, a, n) vqshrn_high_n_s32((r), (a), (n)) +#else + #define simde_vqshrn_high_n_s32(r, a, n) simde_vcombine_s16(r, simde_vqmovn_s32(simde_vshrq_n_s32(a, n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrn_high_n_s32 + #define vqshrn_high_n_s32(r, a, n) simde_vqshrn_high_n_s32((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrn_high_n_s64(r, a, n) vqshrn_high_n_s64((r), (a), (n)) +#else + #define simde_vqshrn_high_n_s64(r, a, n) simde_vcombine_s32(r, simde_vqmovn_s64(simde_vshrq_n_s64(a, n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrn_high_n_s64 + #define vqshrn_high_n_s64(r, a, n) simde_vqshrn_high_n_s64((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrn_high_n_u16(r, a, n) vqshrn_high_n_u16((r), (a), (n)) +#else + #define simde_vqshrn_high_n_u16(r, a, n) simde_vcombine_u8(r, simde_vqmovn_u16(simde_vshrq_n_u16(a, n))) +#endif +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrn_high_n_u16 + #define vqshrn_high_n_u16(r, a, n) simde_vqshrn_high_n_u16((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrn_high_n_u32(r, a, n) vqshrn_high_n_u32((r), (a), (n)) +#else + #define simde_vqshrn_high_n_u32(r, a, n) simde_vcombine_u16(r, simde_vqmovn_u32(simde_vshrq_n_u32(a, n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrn_high_n_u32 + #define vqshrn_high_n_u32(r, a, n) simde_vqshrn_high_n_u32((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrn_high_n_u64(r, a, n) vqshrn_high_n_u64((r), (a), (n)) +#else + #define simde_vqshrn_high_n_u64(r, a, n) simde_vcombine_u32(r, simde_vqmovn_u64(simde_vshrq_n_u64(a, n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrn_high_n_u64 + #define vqshrn_high_n_u64(r, a, n) simde_vqshrn_high_n_u64((r), (a), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QSHRN_HIGH_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qshrn_n.h b/lib/simd_wrapper/simde/arm/neon/qshrn_n.h index 93ab96c1f66..abd47dcf793 100644 --- a/lib/simd_wrapper/simde/arm/neon/qshrn_n.h +++ b/lib/simd_wrapper/simde/arm/neon/qshrn_n.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QSHRN_N_H) @@ -36,6 +37,26 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrnh_n_s16(a, n) vqshrnh_n_s16(a, n) +#else + #define simde_vqshrnh_n_s16(a, n) simde_vqmovnh_s16(simde_x_vshrh_n_s16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrnh_n_s16 + #define vqshrnh_n_s16(a, n) simde_vqshrnh_n_s16(a, n) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrnh_n_u16(a, n) vqshrnh_n_u16(a, n) +#else + #define simde_vqshrnh_n_u16(a, n) simde_vqmovnh_u16(simde_x_vshrh_n_u16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrnh_n_u16 + #define vqshrnh_n_u16(a, n) simde_vqshrnh_n_u16(a, n) +#endif + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #define simde_vqshrns_n_s32(a, n) vqshrns_n_s32(a, n) #else diff --git a/lib/simd_wrapper/simde/arm/neon/qshrun_high_n.h b/lib/simd_wrapper/simde/arm/neon/qshrun_high_n.h new file mode 100644 index 00000000000..c30368600ae --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/qshrun_high_n.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_QSHRUN_HIGH_N_H) +#define SIMDE_ARM_NEON_QSHRUN_HIGH_N_H + +#include "combine.h" +#include "qmovn.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vqshrun_high_n_s16(simde_uint8x8_t r, simde_int16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 8) { + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_uint16x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int16_t tmp = (a_.values[i]) >> n; + if (tmp > UINT8_MAX) tmp = UINT8_MAX; + else if (tmp < 0) tmp = 0; + r_.values[i] = HEDLEY_STATIC_CAST(uint8_t, tmp); + } + return simde_vcombine_u8(r, simde_vqmovn_u16(simde_uint16x8_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71365) + #define simde_vqshrun_high_n_s16(r, a, n) vqshrun_high_n_s16((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrun_high_n_s16 + #define vqshrun_high_n_s16(r, a, n) simde_vqshrun_high_n_s16((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vqshrun_high_n_s32(simde_uint16x4_t r, simde_int32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_uint32x4_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int32_t tmp = (a_.values[i] >> n); + if (tmp > UINT16_MAX) tmp = UINT16_MAX; + else if (tmp < 0) tmp = 0; + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, tmp); + } + return simde_vcombine_u16(r, simde_vqmovn_u32(simde_uint32x4_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71365) + #define simde_vqshrun_high_n_s32(r, a, n) vqshrun_high_n_s32((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrun_high_n_s32 + #define vqshrun_high_n_s32(r, a, n) simde_vqshrun_high_n_s32((r), (a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vqshrun_high_n_s64(simde_uint32x2_t r, simde_int64x2_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 32) { + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_uint64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + int64_t tmp = (a_.values[i] >> n); + if (tmp > UINT32_MAX) tmp = UINT32_MAX; + else if (tmp < 0) tmp = 0; + r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, tmp); + } + return simde_vcombine_u32(r, simde_vqmovn_u64(simde_uint64x2_from_private(r_))); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71365) + #define simde_vqshrun_high_n_s64(r, a, n) vqshrun_high_n_s64((r), (a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrun_high_n_s64 + #define vqshrun_high_n_s64(r, a, n) simde_vqshrun_high_n_s64((r), (a), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_QSHRUN_HIGH_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/qshrun_n.h 
b/lib/simd_wrapper/simde/arm/neon/qshrun_n.h index 4e1aa7395bc..77f8e6af65f 100644 --- a/lib/simd_wrapper/simde/arm/neon/qshrun_n.h +++ b/lib/simd_wrapper/simde/arm/neon/qshrun_n.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QSHRUN_N_H) @@ -35,6 +36,16 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vqshrunh_n_s16(a, n) HEDLEY_STATIC_CAST(uint8_t, vqshrunh_n_s16((a), (n))) +#else + #define simde_vqshrunh_n_s16(a, n) simde_vqmovunh_s16(simde_x_vshrh_n_s16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqshrunh_n_s16 + #define vqshrunh_n_s16(a, n) simde_vqshrunh_n_s16(a, n) +#endif + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) #define simde_vqshruns_n_s32(a, n) HEDLEY_STATIC_CAST(uint16_t, vqshruns_n_s32((a), (n))) #else diff --git a/lib/simd_wrapper/simde/arm/neon/qtbl.h b/lib/simd_wrapper/simde/arm/neon/qtbl.h index 1b7c3b3cd86..f1897d77b87 100644 --- a/lib/simd_wrapper/simde/arm/neon/qtbl.h +++ b/lib/simd_wrapper/simde/arm/neon/qtbl.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QTBL_H) @@ -40,6 +41,10 @@ simde_uint8x8_t simde_vqtbl1_u8(simde_uint8x16_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl1_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x2_t split; + simde_memcpy(&split, &t, sizeof(split)); + return vtbl2_u8(split, idx); #else simde_uint8x16_private t_ = simde_uint8x16_to_private(t); simde_uint8x8_private @@ -86,6 +91,10 @@ simde_uint8x8_t simde_vqtbl2_u8(simde_uint8x16x2_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl2_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x4_t split; + simde_memcpy(&split, &t, sizeof(split)); + return vtbl4_u8(split, idx); #else simde_uint8x16_private t_[2] = { simde_uint8x16_to_private(t.val[0]), simde_uint8x16_to_private(t.val[1]) }; simde_uint8x8_private @@ -135,6 +144,15 @@ simde_uint8x8_t simde_vqtbl3_u8(simde_uint8x16x3_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl3_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8_t idx_hi = vsub_u8(idx, vdup_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x2_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t lo = vtbl4_u8(split_lo, idx); + uint8x8_t hi = vtbl2_u8(split_hi, idx_hi); + return vorr_u8(lo, hi); #else simde_uint8x16_private t_[3] = { simde_uint8x16_to_private(t.val[0]), simde_uint8x16_to_private(t.val[1]), simde_uint8x16_to_private(t.val[2]) }; @@ -187,6 +205,15 @@ simde_uint8x8_t simde_vqtbl4_u8(simde_uint8x16x4_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl4_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8_t idx_hi = vsub_u8(idx, vdup_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x4_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t lo = vtbl4_u8(split_lo, idx); + uint8x8_t hi = vtbl4_u8(split_hi, idx_hi); + return vorr_u8(lo, hi); #else simde_uint8x16_private t_[4] = { simde_uint8x16_to_private(t.val[0]), simde_uint8x16_to_private(t.val[1]), 
simde_uint8x16_to_private(t.val[2]), simde_uint8x16_to_private(t.val[3]) }; @@ -244,6 +271,12 @@ simde_uint8x16_t simde_vqtbl1q_u8(simde_uint8x16_t t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl1q_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x2_t split; + simde_memcpy(&split, &t, sizeof(split)); + uint8x8_t lo = vtbl2_u8(split, vget_low_u8(idx)); + uint8x8_t hi = vtbl2_u8(split, vget_high_u8(idx)); + return vcombine_u8(lo, hi); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_and(vec_perm(t, t, idx), vec_cmplt(idx, vec_splats(HEDLEY_STATIC_CAST(unsigned char, 16)))); #else @@ -292,6 +325,12 @@ simde_uint8x16_t simde_vqtbl2q_u8(simde_uint8x16x2_t t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl2q_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x4_t split; + simde_memcpy(&split, &t, sizeof(split)); + uint8x8_t lo = vtbl4_u8(split, vget_low_u8(idx)); + uint8x8_t hi = vtbl4_u8(split, vget_high_u8(idx)); + return vcombine_u8(lo, hi); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_and(vec_perm(t.val[0], t.val[1], idx), vec_cmplt(idx, vec_splats(HEDLEY_STATIC_CAST(unsigned char, 32)))); @@ -345,6 +384,17 @@ simde_uint8x16_t simde_vqtbl3q_u8(simde_uint8x16x3_t t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl3q_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x16_t idx_hi = vsubq_u8(idx, vdupq_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x2_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t hi_lo = vtbl2_u8(split_hi, vget_low_u8(idx_hi)); + uint8x8_t hi_hi = vtbl2_u8(split_hi, vget_high_u8(idx_hi)); + uint8x8_t lo = vtbx4_u8(hi_lo, split_lo, vget_low_u8(idx)); + uint8x8_t hi = vtbx4_u8(hi_hi, split_lo, vget_high_u8(idx)); + return vcombine_u8(lo, hi); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_01 = vec_perm(t.val[0], t.val[1], idx); SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_2 = vec_perm(t.val[2], t.val[2], idx); @@ -404,6 +454,17 @@ simde_uint8x16_t simde_vqtbl4q_u8(simde_uint8x16x4_t t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbl4q_u8(t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x16_t idx_hi = vsubq_u8(idx, vdupq_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x4_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t lo_lo = vtbl4_u8(split_lo, vget_low_u8(idx)); + uint8x8_t lo_hi = vtbl4_u8(split_lo, vget_high_u8(idx)); + uint8x8_t lo = vtbx4_u8(lo_lo, split_hi, vget_low_u8(idx_hi)); + uint8x8_t hi = vtbx4_u8(lo_hi, split_hi, vget_high_u8(idx_hi)); + return vcombine_u8(lo, hi); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_01 = vec_perm(t.val[0], t.val[1], idx); SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_23 = vec_perm(t.val[2], t.val[3], idx); @@ -462,6 +523,130 @@ simde_vqtbl4q_s8(simde_int8x16x4_t t, simde_uint8x16_t idx) { #define vqtbl4q_s8(t, idx) simde_vqtbl4q_s8((t), (idx)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbl1_p8(simde_poly8x16_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl1_p8(t, idx); + #else + return simde_vreinterpret_p8_u8(simde_vqtbl1_u8(simde_vreinterpretq_u8_p8(t), idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef 
vqtbl1_p8 + #define vqtbl1_p8(t, idx) simde_vqtbl1_p8((t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbl1q_p8(simde_poly8x16_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl1q_p8(t, idx); + #else + return simde_vreinterpretq_p8_u8(simde_vqtbl1q_u8(simde_vreinterpretq_u8_p8(t), idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbl1q_p8 + #define vqtbl1q_p8(t, idx) simde_vqtbl1q_p8((t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbl2_p8(simde_poly8x16x2_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl2_p8(t, idx); + #else + simde_uint8x16x2_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpret_p8_u8(simde_vqtbl2_u8(t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbl2_p8 + #define vqtbl2_p8(t, idx) simde_vqtbl2_p8((t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbl2q_p8(simde_poly8x16x2_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl2q_p8(t, idx); + #else + simde_uint8x16x2_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpretq_p8_u8(simde_vqtbl2q_u8(t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbl2q_p8 + #define vqtbl2q_p8(t, idx) simde_vqtbl2q_p8((t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbl3_p8(simde_poly8x16x3_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl3_p8(t, idx); + #else + simde_uint8x16x3_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpret_p8_u8(simde_vqtbl3_u8(t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbl3_p8 + #define vqtbl3_p8(t, idx) simde_vqtbl3_p8((t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbl3q_p8(simde_poly8x16x3_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl3q_p8(t, idx); + #else + simde_uint8x16x3_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpretq_p8_u8(simde_vqtbl3q_u8(t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbl3q_p8 + #define vqtbl3q_p8(t, idx) simde_vqtbl3q_p8((t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbl4_p8(simde_poly8x16x4_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl4_p8(t, idx); + #else + simde_uint8x16x4_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpret_p8_u8(simde_vqtbl4_u8(t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbl4_p8 + #define vqtbl4_p8(t, idx) simde_vqtbl4_p8((t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbl4q_p8(simde_poly8x16x4_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbl4q_p8(t, idx); + #else + simde_uint8x16x4_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpretq_p8_u8(simde_vqtbl4q_u8(t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbl4q_p8 + #define vqtbl4q_p8(t, idx) simde_vqtbl4q_p8((t), (idx)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/arm/neon/qtbx.h b/lib/simd_wrapper/simde/arm/neon/qtbx.h index 5ba998fb1b7..221c4b5dfbe 100644 --- a/lib/simd_wrapper/simde/arm/neon/qtbx.h +++ 
b/lib/simd_wrapper/simde/arm/neon/qtbx.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_QTBX_H) @@ -40,6 +41,10 @@ simde_uint8x8_t simde_vqtbx1_u8(simde_uint8x8_t a, simde_uint8x16_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx1_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x2_t split; + simde_memcpy(&split, &t, sizeof(split)); + return vtbx2_u8(a, split, idx); #else simde_uint8x16_private t_ = simde_uint8x16_to_private(t); simde_uint8x8_private @@ -89,6 +94,10 @@ simde_uint8x8_t simde_vqtbx2_u8(simde_uint8x8_t a, simde_uint8x16x2_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx2_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x4_t split; + simde_memcpy(&split, &t, sizeof(split)); + return vtbx4_u8(a, split, idx); #else simde_uint8x16_private t_[2] = { simde_uint8x16_to_private(t.val[0]), simde_uint8x16_to_private(t.val[1]) }; simde_uint8x8_private @@ -140,6 +149,14 @@ simde_uint8x8_t simde_vqtbx3_u8(simde_uint8x8_t a, simde_uint8x16x3_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx3_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8_t idx_hi = vsub_u8(idx, vdup_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x2_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t hi = vtbx2_u8(a, split_hi, idx_hi); + return vtbx4_u8(hi, split_lo, idx); #else simde_uint8x16_private t_[3] = { simde_uint8x16_to_private(t.val[0]), simde_uint8x16_to_private(t.val[1]), simde_uint8x16_to_private(t.val[2]) }; simde_uint8x8_private @@ -193,6 +210,14 @@ simde_uint8x8_t simde_vqtbx4_u8(simde_uint8x8_t a, simde_uint8x16x4_t t, simde_uint8x8_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx4_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8_t idx_hi = vsub_u8(idx, vdup_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x4_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t lo = vtbx4_u8(a, split_lo, idx); + return vtbx4_u8(lo, split_hi, idx_hi); #else simde_uint8x16_private t_[4] = { simde_uint8x16_to_private(t.val[0]), simde_uint8x16_to_private(t.val[1]), simde_uint8x16_to_private(t.val[2]), simde_uint8x16_to_private(t.val[3]) }; simde_uint8x8_private @@ -251,6 +276,12 @@ simde_uint8x16_t simde_vqtbx1q_u8(simde_uint8x16_t a, simde_uint8x16_t t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx1q_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x2_t split; + simde_memcpy(&split, &t, sizeof(split)); + uint8x8_t lo = vtbx2_u8(vget_low_u8(a), split, vget_low_u8(idx)); + uint8x8_t hi = vtbx2_u8(vget_high_u8(a), split, vget_high_u8(idx)); + return vcombine_u8(lo, hi); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_sel(a, vec_perm(t, t, idx), @@ -304,6 +335,12 @@ simde_uint8x16_t simde_vqtbx2q_u8(simde_uint8x16_t a, simde_uint8x16x2_t t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx2q_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x8x4_t split; + simde_memcpy(&split, &t, sizeof(split)); + uint8x8_t lo = vtbx4_u8(vget_low_u8(a), split, vget_low_u8(idx)); + uint8x8_t hi = vtbx4_u8(vget_high_u8(a), split, vget_high_u8(idx)); + return vcombine_u8(lo, hi); 
#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_sel(a, vec_perm(t.val[0], t.val[1], idx), vec_cmplt(idx, vec_splats(HEDLEY_STATIC_CAST(unsigned char, 32)))); @@ -360,6 +397,17 @@ simde_uint8x16_t simde_vqtbx3q_u8(simde_uint8x16_t a, simde_uint8x16x3_t t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx3q_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x16_t idx_hi = vsubq_u8(idx, vdupq_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x2_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t hi_lo = vtbx2_u8(vget_low_u8(a), split_hi, vget_low_u8(idx_hi)); + uint8x8_t hi_hi = vtbx2_u8(vget_high_u8(a), split_hi, vget_high_u8(idx_hi)); + uint8x8_t lo_lo = vtbx4_u8(hi_lo, split_lo, vget_low_u8(idx)); + uint8x8_t lo_hi = vtbx4_u8(hi_hi, split_lo, vget_high_u8(idx)); + return vcombine_u8(lo_lo, lo_hi); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_01 = vec_perm(t.val[0], t.val[1], idx); SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_2 = vec_perm(t.val[2], t.val[2], idx); @@ -422,6 +470,17 @@ simde_uint8x16_t simde_vqtbx4q_u8(simde_uint8x16_t a, simde_uint8x16x4_t t, simde_uint8x16_t idx) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vqtbx4q_u8(a, t, idx); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x16_t idx_hi = vsubq_u8(idx, vdupq_n_u8(32)); + uint8x8x4_t split_lo; + uint8x8x4_t split_hi; + simde_memcpy(&split_lo, &t.val[0], sizeof(split_lo)); + simde_memcpy(&split_hi, &t.val[2], sizeof(split_hi)); + uint8x8_t lo_lo = vtbx4_u8(vget_low_u8(a), split_lo, vget_low_u8(idx)); + uint8x8_t lo_hi = vtbx4_u8(vget_high_u8(a), split_lo, vget_high_u8(idx)); + uint8x8_t lo = vtbx4_u8(lo_lo, split_hi, vget_low_u8(idx_hi)); + uint8x8_t hi = vtbx4_u8(lo_hi, split_hi, vget_high_u8(idx_hi)); + return vcombine_u8(lo, hi); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_01 = vec_perm(t.val[0], t.val[1], idx); SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r_23 = vec_perm(t.val[2], t.val[3], idx); @@ -483,6 +542,130 @@ simde_vqtbx4q_s8(simde_int8x16_t a, simde_int8x16x4_t t, simde_uint8x16_t idx) { #define vqtbx4q_s8(a, t, idx) simde_vqtbx4q_s8((a), (t), (idx)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbx1_p8(simde_poly8x8_t a, simde_poly8x16_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx1_p8(a, t, idx); + #else + return simde_vreinterpret_p8_u8(simde_vqtbx1_u8(simde_vreinterpret_u8_p8(a), simde_vreinterpretq_u8_p8(t), idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx1_p8 + #define vqtbx1_p8(a, t, idx) simde_vqtbx1_p8((a), (t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbx1q_p8(simde_poly8x16_t a, simde_poly8x16_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx1q_p8(a, t, idx); + #else + return simde_vreinterpretq_p8_u8(simde_vqtbx1q_u8(simde_vreinterpretq_u8_p8(a), simde_vreinterpretq_u8_p8(t), idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx1q_p8 + #define vqtbx1q_p8(a, t, idx) simde_vqtbx1q_p8((a), (t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbx2_p8(simde_poly8x8_t a, simde_poly8x16x2_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx2_p8(a, t, idx); + #else + simde_uint8x16x2_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return 
simde_vreinterpret_p8_u8(simde_vqtbx2_u8(simde_vreinterpret_u8_p8(a), t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx2_p8 + #define vqtbx2_p8(a, t, idx) simde_vqtbx2_p8((a), (t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbx2q_p8(simde_poly8x16_t a, simde_poly8x16x2_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx2q_p8(a, t, idx); + #else + simde_uint8x16x2_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpretq_p8_u8(simde_vqtbx2q_u8(simde_vreinterpretq_u8_p8(a), t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx2q_p8 + #define vqtbx2q_p8(a, t, idx) simde_vqtbx2q_p8((a), (t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbx3_p8(simde_poly8x8_t a, simde_poly8x16x3_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx3_p8(a, t, idx); + #else + simde_uint8x16x3_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpret_p8_u8(simde_vqtbx3_u8(simde_vreinterpret_u8_p8(a), t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx3_p8 + #define vqtbx3_p8(a, t, idx) simde_vqtbx3_p8((a), (t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbx3q_p8(simde_poly8x16_t a, simde_poly8x16x3_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx3q_p8(a, t, idx); + #else + simde_uint8x16x3_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpretq_p8_u8(simde_vqtbx3q_u8(simde_vreinterpretq_u8_p8(a), t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx3q_p8 + #define vqtbx3q_p8(a, t, idx) simde_vqtbx3q_p8((a), (t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vqtbx4_p8(simde_poly8x8_t a, simde_poly8x16x4_t t, simde_uint8x8_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx4_p8(a, t, idx); + #else + simde_uint8x16x4_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpret_p8_u8(simde_vqtbx4_u8(simde_vreinterpret_u8_p8(a), t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx4_p8 + #define vqtbx4_p8(a, t, idx) simde_vqtbx4_p8((a), (t), (idx)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vqtbx4q_p8(simde_poly8x16_t a, simde_poly8x16x4_t t, simde_uint8x16_t idx) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vqtbx4q_p8(a, t, idx); + #else + simde_uint8x16x4_t t_; + simde_memcpy(&t_, &t, sizeof(t_)); + return simde_vreinterpretq_p8_u8(simde_vqtbx4q_u8(simde_vreinterpretq_u8_p8(a), t_, idx)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vqtbx4q_p8 + #define vqtbx4q_p8(a, t, idx) simde_vqtbx4q_p8((a), (t), (idx)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/arm/neon/raddhn.h b/lib/simd_wrapper/simde/arm/neon/raddhn.h new file mode 100644 index 00000000000..0f16e446e9c --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/raddhn.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the 
Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RADDHN_H) +#define SIMDE_ARM_NEON_RADDHN_H + +#include "add.h" +#include "shr_n.h" +#include "movn.h" + +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vraddhn_s16(simde_int16x8_t a, simde_int16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vraddhn_s16(a, b); + #else + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b); + int16_t round_cast = 1 << 7; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i] + round_cast; + } + return simde_vmovn_s16(simde_vshrq_n_s16(simde_int16x8_from_private(r_), 8)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vraddhn_s16 + #define vraddhn_s16(a, b) simde_vraddhn_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vraddhn_s32(simde_int32x4_t a, simde_int32x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vraddhn_s32(a, b); + #else + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b); + int round_cast = 1 << 15; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] + b_.values[i] + round_cast; + } + return simde_vmovn_s32(simde_vshrq_n_s32(simde_int32x4_from_private(r_), 16)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vraddhn_s32 + #define vraddhn_s32(a, b) simde_vraddhn_s32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vraddhn_s64(simde_int64x2_t a, simde_int64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vraddhn_s64(a, b); + #else + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a), + b_ = simde_int64x2_to_private(b); + int64_t round_cast = 1ll << 31; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] + b_.values[i] + round_cast) >> 32); + } + return simde_vmovn_s64(simde_int64x2_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vraddhn_s64 + #define vraddhn_s64(a, b) simde_vraddhn_s64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vraddhn_u16(simde_uint16x8_t a, simde_uint16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vraddhn_u16(a, b); + #else + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b); + uint16_t round_cast = 1 << 7; + SIMDE_VECTORIZE + for (size_t i 
= 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, a_.values[i] + b_.values[i] + round_cast); + } + return simde_vmovn_u16(simde_vshrq_n_u16(simde_uint16x8_from_private(r_), 8)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vraddhn_u16 + #define vraddhn_u16(a, b) simde_vraddhn_u16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vraddhn_u32(simde_uint32x4_t a, simde_uint32x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vraddhn_u32(a, b); + #else + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b); + uint32_t round_cast = 1 << 15; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, a_.values[i] + b_.values[i] + round_cast); + } + return simde_vmovn_u32(simde_vshrq_n_u32(simde_uint32x4_from_private(r_), 16)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vraddhn_u32 + #define vraddhn_u32(a, b) simde_vraddhn_u32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vraddhn_u64(simde_uint64x2_t a, simde_uint64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vraddhn_u64(a, b); + #else + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a), + b_ = simde_uint64x2_to_private(b); + uint64_t round_cast = 1ull << 31; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] + b_.values[i] + round_cast) >> 32); + } + return simde_vmovn_u64(simde_uint64x2_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vraddhn_u64 + #define vraddhn_u64(a, b) simde_vraddhn_u64((a), (b)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RADDHN_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/raddhn_high.h b/lib/simd_wrapper/simde/arm/neon/raddhn_high.h new file mode 100644 index 00000000000..dc911698cf6 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/raddhn_high.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RADDHN_HIGH_H) +#define SIMDE_ARM_NEON_RADDHN_HIGH_H + +#include "raddhn.h" +#include "combine.h" + +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vraddhn_high_s16(r, a, b) vraddhn_high_s16((r), (a), (b)) +#else + #define simde_vraddhn_high_s16(r, a, b) simde_vcombine_s8(r, simde_vraddhn_s16(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vraddhn_high_s16 + #define vraddhn_high_s16(r, a, b) simde_vraddhn_high_s16((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vraddhn_high_s32(r, a, b) vraddhn_high_s32((r), (a), (b)) +#else + #define simde_vraddhn_high_s32(r, a, b) simde_vcombine_s16(r, simde_vraddhn_s32(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vraddhn_high_s32 + #define vraddhn_high_s32(r, a, b) simde_vraddhn_high_s32((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vraddhn_high_s64(r, a, b) vraddhn_high_s64((r), (a), (b)) +#else + #define simde_vraddhn_high_s64(r, a, b) simde_vcombine_s32(r, simde_vraddhn_s64(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vraddhn_high_s64 + #define vraddhn_high_s64(r, a, b) simde_vraddhn_high_s64((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vraddhn_high_u16(r, a, b) vraddhn_high_u16((r), (a), (b)) +#else + #define simde_vraddhn_high_u16(r, a, b) simde_vcombine_u8(r, simde_vraddhn_u16(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vraddhn_high_u16 + #define vraddhn_high_u16(r, a, b) simde_vraddhn_high_u16((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vraddhn_high_u32(r, a, b) vraddhn_high_u32((r), (a), (b)) +#else + #define simde_vraddhn_high_u32(r, a, b) simde_vcombine_u16(r, simde_vraddhn_u32(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vraddhn_high_u32 + #define vraddhn_high_u32(r, a, b) simde_vraddhn_high_u32((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vraddhn_high_u64(r, a, b) vraddhn_high_u64((r), (a), (b)) +#else + #define simde_vraddhn_high_u64(r, a, b) simde_vcombine_u32(r, simde_vraddhn_u64(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vraddhn_high_u64 + #define vraddhn_high_u64(r, a, b) simde_vraddhn_high_u64((r), (a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RADDHN_HIGH_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/rax.h b/lib/simd_wrapper/simde/arm/neon/rax.h new file mode 100644 index 00000000000..052e9caf664 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/rax.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies 
or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RAX_H) +#define SIMDE_ARM_NEON_RAX_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vrax1q_u64(simde_uint64x2_t a, simde_uint64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) + return vrax1q_u64(a, b); + #else + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a), + b_ = simde_uint64x2_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + b_.values[i] = (b_.values[i] >> 63) | (b_.values[i] << 1); + r_.values[i] = a_.values[i] ^ b_.values[i]; + } + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrax1q_u64 + #define vrax1q_u64(a, b) simde_vrax1q_u64((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RAX_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/rbit.h b/lib/simd_wrapper/simde/arm/neon/rbit.h index c507df72047..647c4c5ae68 100644 --- a/lib/simd_wrapper/simde/arm/neon/rbit.h +++ b/lib/simd_wrapper/simde/arm/neon/rbit.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ /* The GFNI implementation is based on Wojciech Muła's work at @@ -159,6 +160,34 @@ simde_vrbitq_s8(simde_int8x16_t a) { #define vrbitq_s8(a) simde_vrbitq_s8(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vrbit_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrbit_p8(a); + #else + return simde_vreinterpret_p8_u8(simde_vrbit_u8(simde_vreinterpret_u8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrbit_p8 + #define vrbit_p8(a) simde_vrbit_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vrbitq_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrbitq_p8(a); + #else + return simde_vreinterpretq_p8_u8(simde_vrbitq_u8(simde_vreinterpretq_u8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrbitq_p8 + #define vrbitq_p8(a) simde_vrbitq_p8(a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/recpe.h b/lib/simd_wrapper/simde/arm/neon/recpe.h index ed9ef42548d..382d12fc479 100644 --- a/lib/simd_wrapper/simde/arm/neon/recpe.h +++ b/lib/simd_wrapper/simde/arm/neon/recpe.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RECPE_H) @@ -34,6 +35,23 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrecpeh_f16(simde_float16_t a) { + #if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrecpeh_f16(a); + #else + simde_float32_t r_; + simde_float32_t a_ = simde_float16_to_float32(a); + r_ = 1.0f / a_; + return simde_float16_from_float32(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrecpeh_f16 + #define vrecpeh_f16(a) simde_vrecpeh_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vrecpes_f32(simde_float32_t a) { @@ -62,6 +80,29 @@ simde_vrecped_f64(simde_float64_t a) { #define vrecped_f64(a) simde_vrecped_f64((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrecpe_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrecpe_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrecpeh_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrecpe_f16 + #define vrecpe_f16(a) simde_vrecpe_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrecpe_f32(simde_float32x2_t a) { @@ -198,6 +239,29 @@ simde_vrecpeq_f32(simde_float32x4_t a) { #define vrecpeq_f32(a) simde_vrecpeq_f32((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrecpeq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrecpeq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrecpeh_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrecpeq_f16 + #define vrecpeq_f16(a) simde_vrecpeq_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint32x2_t simde_vrecpe_u32(simde_uint32x2_t a){ @@ -210,7 +274,7 @@ simde_vrecpe_u32(simde_uint32x2_t a){ SIMDE_VECTORIZE for(size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - if(a_.values[i] <= 0x7FFFFFFF){ + if (a_.values[i] <= 0x7FFFFFFF){ r_.values[i] = UINT32_MAX; } else { uint32_t a_temp = (a_.values[i] >> 23) & 511; @@ -241,7 +305,7 @@ simde_vrecpeq_u32(simde_uint32x4_t a){ SIMDE_VECTORIZE for(size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { - if(a_.values[i] <= 0x7FFFFFFF){ + if (a_.values[i] <= 0x7FFFFFFF){ r_.values[i] = UINT32_MAX; } else { uint32_t a_temp = (a_.values[i] >> 23) & 511; diff --git a/lib/simd_wrapper/simde/arm/neon/recps.h b/lib/simd_wrapper/simde/arm/neon/recps.h index 85c4f10520f..9d1f7ecc98a 100644 --- a/lib/simd_wrapper/simde/arm/neon/recps.h +++ b/lib/simd_wrapper/simde/arm/neon/recps.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RECPS_H) @@ -35,6 +36,21 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrecpsh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrecpsh_f16(a, b); + #else + return simde_float16_from_float32(SIMDE_FLOAT32_C(2.0) - + simde_float16_to_float32(a) * simde_float16_to_float32(b)); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrecpsh_f16 + #define vrecpsh_f16(a, b) simde_vrecpsh_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vrecpss_f32(simde_float32_t a, simde_float32_t b) { @@ -77,6 +93,30 @@ simde_vrecps_f64(simde_float64x1_t a, simde_float64x1_t b) { #define vrecps_f64(a, b) simde_vrecps_f64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrecps_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrecps_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrecpsh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrecps_f16 + #define vrecps_f16(a, b) simde_vrecps_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrecps_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -119,6 +159,30 @@ simde_vrecpsq_f32(simde_float32x4_t a, simde_float32x4_t b) { #define vrecpsq_f32(a, b) simde_vrecpsq_f32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrecpsq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrecpsq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrecpsh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrecpsq_f16 + #define vrecpsq_f16(a, b) simde_vrecpsq_f16((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP #endif /* !defined(SIMDE_ARM_NEON_RECPS_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/recpx.h b/lib/simd_wrapper/simde/arm/neon/recpx.h new file mode 100644 index 00000000000..c1a36f6507e --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/recpx.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2020 Evan Nemerson + * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RECPX_H) +#define SIMDE_ARM_NEON_RECPX_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrecpxh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrecpxh_f16(a); + #else + if (simde_isnanhf(a)) { + return SIMDE_NANHF; + } + uint16_t n; + simde_memcpy(&n, &a, sizeof(a)); + uint16_t sign = n & 0x8000; + uint16_t exp = n & 0x7c00; + uint16_t result; + if (exp == 0) { + uint16_t max_exp = 0x7b00; + result = sign|max_exp; + } + else { + exp = ~(exp) & 0x7c00; + result = sign|exp; + } + simde_memcpy(&a, &result, sizeof(result)); + return a; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrecpxh_f16 + #define vrecpxh_f16(a) simde_vrecpxh_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32_t +simde_vrecpxs_f32(simde_float32_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrecpxs_f32(a); + #else + if (simde_math_isnanf(a)) { + return SIMDE_MATH_NANF; + } + uint32_t n; + simde_memcpy(&n, &a, sizeof(a)); + uint32_t sign = n & 0x80000000; + uint32_t exp = n & 0x7f800000; + uint32_t result; + if (exp == 0) { + uint32_t max_exp = 0x7f000000; + result = sign|max_exp; + } + else { + exp = ~(exp) & 0x7f800000; + result = sign|exp; + } + simde_memcpy(&a, &result, sizeof(result)); + return a; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrecpxs_f32 + #define vrecpxs_f32(a) simde_vrecpxs_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64_t +simde_vrecpxd_f64(simde_float64_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrecpxd_f64(a); + #else + if (simde_math_isnan(a)) { + return SIMDE_MATH_NAN; + } + uint64_t n; + simde_memcpy(&n, &a, sizeof(a)); + uint64_t sign = n & 0x8000000000000000ull; + uint64_t exp = n & 0x7ff0000000000000ull; + uint64_t result; + if (exp == 0) { + uint64_t max_exp = 0x7fe0000000000000ull; + result = sign|max_exp; + } + else { + exp = ~(exp) & 0x7ff0000000000000ull; + result = sign|exp; + } + simde_memcpy(&a, &result, sizeof(result)); + return a; + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrecpxd_f64 + #define vrecpxd_f64(a) simde_vrecpxd_f64((a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP +#endif /* !defined(SIMDE_ARM_NEON_RECPX_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/reinterpret.h b/lib/simd_wrapper/simde/arm/neon/reinterpret.h index 88bddbe6da4..3af62f7733a 100644 --- a/lib/simd_wrapper/simde/arm/neon/reinterpret.h +++ b/lib/simd_wrapper/simde/arm/neon/reinterpret.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ @@ -2330,6 +2331,23 @@ simde_vreinterpret_u64_u32(simde_uint32x2_t a) { #define vreinterpret_u64_u32 simde_vreinterpret_u64_u32 #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vreinterpret_u64_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_u64_f16(a); + #else + simde_uint64x1_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x1_from_private(r_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u64_f16 + #define vreinterpret_u64_f16 simde_vreinterpret_u64_f16 +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_uint64x1_t simde_vreinterpret_u64_f32(simde_float32x2_t a) { @@ -2670,6 +2688,7 @@ simde_vreinterpret_f32_u64(simde_uint64x1_t a) { #define vreinterpret_f32_u64 simde_vreinterpret_f32_u64 #endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vreinterpret_f32_f64(simde_float64x1_t a) { @@ -3163,6 +3182,4348 @@ simde_vreinterpretq_f64_f32(simde_float32x4_t a) { #define vreinterpretq_f64_f32(a) simde_vreinterpretq_f64_f32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_f32(a); + #else + simde_float16x4_private r_; + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f16_f32 + #define vreinterpret_f16_f32 simde_vreinterpret_f16_f32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_s16(simde_int16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_s16(a); + #else + simde_float16x4_private r_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f16_s16 + #define vreinterpret_f16_s16 simde_vreinterpret_f16_s16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_s32(simde_int32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_s32(a); + #else + simde_float16x4_private r_; + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f16_s32 + #define vreinterpret_f16_s32 simde_vreinterpret_f16_s32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_s64(simde_int64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_s64(a); + #else + simde_float16x4_private r_; + simde_int64x1_private a_ = simde_int64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f16_s64 + #define vreinterpret_f16_s64 simde_vreinterpret_f16_s64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_s8(simde_int8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_s8(a); + #else + simde_float16x4_private r_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f16_s8 + #define vreinterpret_f16_s8 simde_vreinterpret_f16_s8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_u32(simde_uint32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return 
vreinterpret_f16_u32(a); + #else + simde_float16x4_private r_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f16_u32 + #define vreinterpret_f16_u32 simde_vreinterpret_f16_u32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_u64(simde_uint64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_u64(a); + #else + simde_float16x4_private r_; + simde_uint64x1_private a_ = simde_uint64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f16_u64 + #define vreinterpret_f16_u64 simde_vreinterpret_f16_u64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_u8(simde_uint8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_u8(a); + #else + simde_float16x4_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f16_u8 + #define vreinterpret_f16_u8 simde_vreinterpret_f16_u8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_f32(a); + #else + simde_float16x8_private r_; + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f16_f32 + #define vreinterpretq_f16_f32(a) simde_vreinterpretq_f16_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_s16(simde_int16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_s16(a); + #else + simde_float16x8_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f16_s16 + #define vreinterpretq_f16_s16(a) simde_vreinterpretq_f16_s16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_s32(simde_int32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_s32(a); + #else + simde_float16x8_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f16_s32 + #define vreinterpretq_f16_s32(a) simde_vreinterpretq_f16_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_s64(simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_s64(a); + #else + simde_float16x8_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f16_s64 + #define vreinterpretq_f16_s64(a) simde_vreinterpretq_f16_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_s8(simde_int8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_s8(a); + #else + simde_float16x8_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f16_s8 + #define vreinterpretq_f16_s8(a) simde_vreinterpretq_f16_s8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_u32(simde_uint32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_u32(a); + #else + simde_float16x8_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f16_u32 + #define vreinterpretq_f16_u32(a) simde_vreinterpretq_f16_u32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_u64(simde_uint64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_u64(a); + #else + simde_float16x8_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f16_u64 + #define vreinterpretq_f16_u64(a) simde_vreinterpretq_f16_u64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_u8(simde_uint8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_u8(a); + #else + simde_float16x8_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f16_u8 + #define vreinterpretq_f16_u8(a) simde_vreinterpretq_f16_u8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_f64(a); + #else + simde_float16x4_private r_; + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f16_f64 + #define vreinterpret_f16_f64 simde_vreinterpret_f16_f64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_f64(a); + #else + simde_float16x8_private r_; + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f16_f64 + #define vreinterpretq_f16_f64(a) simde_vreinterpretq_f16_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES 
+simde_float32x2_t +simde_vreinterpret_f32_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f32_f16(a); + #else + simde_float32x2_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f32_f16 + #define vreinterpret_f32_f16 simde_vreinterpret_f32_f16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vreinterpretq_f32_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f32_f16(a); + #else + simde_float32x4_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f32_f16 + #define vreinterpretq_f32_f16 simde_vreinterpretq_f32_f16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vreinterpret_f64_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f64_f16(a); + #else + simde_float64x1_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f64_f16 + #define vreinterpret_f64_f16 simde_vreinterpret_f64_f16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vreinterpretq_f64_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f64_f16(a); + #else + simde_float64x2_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f64_f16 + #define vreinterpretq_f64_f16 simde_vreinterpretq_f64_f16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vreinterpret_u8_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_u8_f16(a); + #else + simde_uint8x8_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u8_f16 + #define vreinterpret_u8_f16(a) simde_vreinterpret_u8_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vreinterpretq_u8_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_u8_f16(a); + #else + simde_uint8x16_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u8_f16 + #define vreinterpretq_u8_f16(a) simde_vreinterpretq_u8_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vreinterpret_s8_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_s8_f16(a); + #else + simde_int8x8_private r_; + 
simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s8_f16 + #define vreinterpret_s8_f16(a) simde_vreinterpret_s8_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vreinterpretq_s8_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_s8_f16(a); + #else + simde_int8x16_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s8_f16 + #define vreinterpretq_s8_f16(a) simde_vreinterpretq_s8_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vreinterpret_s16_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_s16_f16(a); + #else + simde_int16x4_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s16_f16 + #define vreinterpret_s16_f16(a) simde_vreinterpret_s16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vreinterpretq_s16_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_s16_f16(a); + #else + simde_int16x8_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s16_f16 + #define vreinterpretq_s16_f16(a) simde_vreinterpretq_s16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vreinterpret_s32_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_s32_f16(a); + #else + simde_int32x2_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s32_f16 + #define vreinterpret_s32_f16(a) simde_vreinterpret_s32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vreinterpretq_s32_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_s32_f16(a); + #else + simde_int32x4_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s32_f16 + #define vreinterpretq_s32_f16(a) simde_vreinterpretq_s32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vreinterpret_s64_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_s64_f16(a); + #else + simde_int64x1_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef 
vreinterpret_s64_f16 + #define vreinterpret_s64_f16(a) simde_vreinterpret_s64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vreinterpretq_s64_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_s64_f16(a); + #else + simde_int64x2_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s64_f16 + #define vreinterpretq_s64_f16(a) simde_vreinterpretq_s64_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vreinterpret_u32_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_u32_f16(a); + #else + simde_uint32x2_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u32_f16 + #define vreinterpret_u32_f16(a) simde_vreinterpret_u32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vreinterpretq_u32_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_u32_f16(a); + #else + simde_uint32x4_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u32_f16 + #define vreinterpretq_u32_f16(a) simde_vreinterpretq_u32_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vreinterpretq_u64_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_u64_f16(a); + #else + simde_uint64x2_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u64_f16 + #define vreinterpretq_u64_f16 simde_vreinterpretq_u64_f16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_s8(simde_int8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_s8(a); + #else + simde_poly8x8_private r_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_s8 + #define vreinterpret_p8_s8 simde_vreinterpret_p8_s8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_s16(simde_int16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_s16(a); + #else + simde_poly8x8_private r_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_s16 + #define vreinterpret_p8_s16 simde_vreinterpret_p8_s16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_s32(simde_int32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_s32(a); + #else + simde_poly8x8_private r_; + 
simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_s32 + #define vreinterpret_p8_s32 simde_vreinterpret_p8_s32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_s64(simde_int64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_s64(a); + #else + simde_poly8x8_private r_; + simde_int64x1_private a_ = simde_int64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_s64 + #define vreinterpret_p8_s64 simde_vreinterpret_p8_s64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_p16(a); + #else + simde_poly8x8_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_p16 + #define vreinterpret_p8_p16 simde_vreinterpret_p8_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p8_p64(a); + #else + simde_poly8x8_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_p64 + #define vreinterpret_p8_p64 simde_vreinterpret_p8_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_f32(a); + #else + simde_poly8x8_private r_; + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_f32 + #define vreinterpret_p8_f32 simde_vreinterpret_p8_f32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpret_p8_f64(a); + #else + simde_poly8x8_private r_; + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_f64 + #define vreinterpret_p8_f64 simde_vreinterpret_p8_f64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_s8(simde_int8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_s8(a); + #else + simde_poly8x16_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_s8 + #define vreinterpretq_p8_s8(a) simde_vreinterpretq_p8_s8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_s16(simde_int16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_s16(a); + #else + 
simde_poly8x16_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_s16 + #define vreinterpretq_p8_s16(a) simde_vreinterpretq_p8_s16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_s32(simde_int32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_s32(a); + #else + simde_poly8x16_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_s32 + #define vreinterpretq_p8_s32(a) simde_vreinterpretq_p8_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_s64(simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_s64(a); + #else + simde_poly8x16_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_s64 + #define vreinterpretq_p8_s64(a) simde_vreinterpretq_p8_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_p16(a); + #else + simde_poly8x16_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_p16 + #define vreinterpretq_p8_p16(a) simde_vreinterpretq_p8_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p8_p64(a); + #else + simde_poly8x16_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_p64 + #define vreinterpretq_p8_p64(a) simde_vreinterpretq_p8_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_f32(a); + #else + simde_poly8x16_private r_; + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_f32 + #define vreinterpretq_p8_f32(a) simde_vreinterpretq_p8_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpretq_p8_f64(a); + #else + simde_poly8x16_private r_; + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_f64 + #define vreinterpretq_p8_f64(a) simde_vreinterpretq_p8_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t 
+simde_vreinterpret_p16_s8(simde_int8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_s8(a); + #else + simde_poly16x4_private r_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_s8 + #define vreinterpret_p16_s8 simde_vreinterpret_p16_s8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_s16(simde_int16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_s16(a); + #else + simde_poly16x4_private r_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_s16 + #define vreinterpret_p16_s16 simde_vreinterpret_p16_s16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_s32(simde_int32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_s32(a); + #else + simde_poly16x4_private r_; + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_s32 + #define vreinterpret_p16_s32 simde_vreinterpret_p16_s32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_s64(simde_int64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_s64(a); + #else + simde_poly16x4_private r_; + simde_int64x1_private a_ = simde_int64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_s64 + #define vreinterpret_p16_s64 simde_vreinterpret_p16_s64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_p8(a); + #else + simde_poly16x4_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_p8 + #define vreinterpret_p16_p8 simde_vreinterpret_p16_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p16_p64(a); + #else + simde_poly16x4_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_p64 + #define vreinterpret_p16_p64 simde_vreinterpret_p16_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_p16_f16(a); + #else + simde_poly16x4_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_f16 + #define vreinterpret_p16_f16(a) 
simde_vreinterpret_p16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_f32(a); + #else + simde_poly16x4_private r_; + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_f32 + #define vreinterpret_p16_f32 simde_vreinterpret_p16_f32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpret_p16_f64(a); + #else + simde_poly16x4_private r_; + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_f64 + #define vreinterpret_p16_f64 simde_vreinterpret_p16_f64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_s8(simde_int8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_s8(a); + #else + simde_poly16x8_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_s8 + #define vreinterpretq_p16_s8(a) simde_vreinterpretq_p16_s8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_s16(simde_int16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_s16(a); + #else + simde_poly16x8_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_s16 + #define vreinterpretq_p16_s16(a) simde_vreinterpretq_p16_s16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_s32(simde_int32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_s32(a); + #else + simde_poly16x8_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_s32 + #define vreinterpretq_p16_s32(a) simde_vreinterpretq_p16_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_s64(simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_s64(a); + #else + simde_poly16x8_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_s64 + #define vreinterpretq_p16_s64(a) simde_vreinterpretq_p16_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_p8(a); + #else + simde_poly16x8_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_p8 + #define vreinterpretq_p16_p8(a) simde_vreinterpretq_p16_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p16_p64(a); + #else + simde_poly16x8_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_p64 + #define vreinterpretq_p16_p64(a) simde_vreinterpretq_p16_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_f32(a); + #else + simde_poly16x8_private r_; + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_f32 + #define vreinterpretq_p16_f32(a) simde_vreinterpretq_p16_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpretq_p16_f64(a); + #else + simde_poly16x8_private r_; + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_f64 + #define vreinterpretq_p16_f64(a) simde_vreinterpretq_p16_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_p16_f16(a); + #else + simde_poly16x8_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_f16 + #define vreinterpretq_p16_f16(a) simde_vreinterpretq_p16_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_s8(simde_int8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_s8(a); + #else + simde_poly64x1_private r_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_s8 + #define vreinterpret_p64_s8 simde_vreinterpret_p64_s8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_s16(simde_int16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_s16(a); + #else + simde_poly64x1_private r_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_s16 + #define vreinterpret_p64_s16 simde_vreinterpret_p64_s16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_s32(simde_int32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_s32(a); + #else + simde_poly64x1_private r_; + simde_int32x2_private 
a_ = simde_int32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_s32 + #define vreinterpret_p64_s32 simde_vreinterpret_p64_s32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_p8(a); + #else + simde_poly64x1_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_p8 + #define vreinterpret_p64_p8 simde_vreinterpret_p64_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_p16(a); + #else + simde_poly64x1_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_p16 + #define vreinterpret_p64_p16 simde_vreinterpret_p64_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_p64_f16(a); + #else + simde_poly64x1_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_f16 + #define vreinterpret_p64_f16 simde_vreinterpret_p64_f16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_f32(a); + #else + simde_poly64x1_private r_; + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_f32 + #define vreinterpret_p64_f32 simde_vreinterpret_p64_f32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpret_p64_f64(a); + #else + simde_poly64x1_private r_; + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_f64 + #define vreinterpret_p64_f64 simde_vreinterpret_p64_f64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_s8(simde_int8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_s8(a); + #else + simde_poly64x2_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_s8 + #define vreinterpretq_p64_s8(a) simde_vreinterpretq_p64_s8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_s16(simde_int16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + 
return vreinterpretq_p64_s16(a); + #else + simde_poly64x2_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_s16 + #define vreinterpretq_p64_s16(a) simde_vreinterpretq_p64_s16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_s32(simde_int32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_s32(a); + #else + simde_poly64x2_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_s32 + #define vreinterpretq_p64_s32(a) simde_vreinterpretq_p64_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_s64(simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_s64(a); + #else + simde_poly64x2_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_s64 + #define vreinterpretq_p64_s64(a) simde_vreinterpretq_p64_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_p8(a); + #else + simde_poly64x2_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_p8 + #define vreinterpretq_p64_p8(a) simde_vreinterpretq_p64_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_p16(a); + #else + simde_poly64x2_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_p16 + #define vreinterpretq_p64_p16(a) simde_vreinterpretq_p64_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_f32(a); + #else + simde_poly64x2_private r_; + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_f32 + #define vreinterpretq_p64_f32(a) simde_vreinterpretq_p64_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpretq_p64_f64(a); + #else + simde_poly64x2_private r_; + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_f64 + #define vreinterpretq_p64_f64(a) simde_vreinterpretq_p64_f64(a) +#endif + 
+SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_p8_f16(a); + #else + simde_poly8x8_private r_; + simde_float16x4_private a_ = simde_float16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_f16 + #define vreinterpret_p8_f16(a) simde_vreinterpret_p8_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_p8_f16(a); + #else + simde_poly8x16_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_f16 + #define vreinterpretq_p8_f16(a) simde_vreinterpretq_p8_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_p64_f16(a); + #else + simde_poly64x2_private r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_f16 + #define vreinterpretq_p64_f16 simde_vreinterpretq_p64_f16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vreinterpret_s8_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s8_p8(a); + #else + simde_int8x8_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s8_p8 + #define vreinterpret_s8_p8 simde_vreinterpret_s8_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vreinterpret_s8_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s8_p16(a); + #else + simde_int8x8_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s8_p16 + #define vreinterpret_s8_p16 simde_vreinterpret_s8_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vreinterpret_s8_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_s8_p64(a); + #else + simde_int8x8_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s8_p64 + #define vreinterpret_s8_p64 simde_vreinterpret_s8_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vreinterpretq_s8_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s8_p8(a); + #else + simde_int8x16_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x16_from_private(r_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s8_p8 + #define vreinterpretq_s8_p8(a) simde_vreinterpretq_s8_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vreinterpretq_s8_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s8_p16(a); + #else + simde_int8x16_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s8_p16 + #define vreinterpretq_s8_p16(a) simde_vreinterpretq_s8_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vreinterpretq_s8_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_s8_p64(a); + #else + simde_int8x16_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s8_p64 + #define vreinterpretq_s8_p64(a) simde_vreinterpretq_s8_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vreinterpret_s16_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s16_p8(a); + #else + simde_int16x4_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s16_p8 + #define vreinterpret_s16_p8 simde_vreinterpret_s16_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vreinterpret_s16_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s16_p16(a); + #else + simde_int16x4_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s16_p16 + #define vreinterpret_s16_p16 simde_vreinterpret_s16_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vreinterpret_s16_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_s16_p64(a); + #else + simde_int16x4_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s16_p64 + #define vreinterpret_s16_p64 simde_vreinterpret_s16_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vreinterpretq_s16_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s16_p8(a); + #else + simde_int16x8_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s16_p8 + #define vreinterpretq_s16_p8(a) simde_vreinterpretq_s16_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vreinterpretq_s16_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s16_p16(a); + #else + simde_int16x8_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + 
return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s16_p16 + #define vreinterpretq_s16_p16(a) simde_vreinterpretq_s16_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vreinterpretq_s16_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_s16_p64(a); + #else + simde_int16x8_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s16_p64 + #define vreinterpretq_s16_p64(a) simde_vreinterpretq_s16_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vreinterpret_s32_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s32_p8(a); + #else + simde_int32x2_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s32_p8 + #define vreinterpret_s32_p8 simde_vreinterpret_s32_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vreinterpret_s32_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s32_p16(a); + #else + simde_int32x2_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s32_p16 + #define vreinterpret_s32_p16 simde_vreinterpret_s32_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vreinterpret_s32_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_s32_p64(a); + #else + simde_int32x2_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s32_p64 + #define vreinterpret_s32_p64 simde_vreinterpret_s32_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vreinterpretq_s32_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s32_p8(a); + #else + simde_int32x4_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s32_p8 + #define vreinterpretq_s32_p8(a) simde_vreinterpretq_s32_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vreinterpretq_s32_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s32_p16(a); + #else + simde_int32x4_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s32_p16 + #define vreinterpretq_s32_p16(a) simde_vreinterpretq_s32_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vreinterpretq_s32_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_s32_p64(a); + #else + simde_int32x4_private r_; + simde_poly64x2_private a_ = 
simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s32_p64 + #define vreinterpretq_s32_p64(a) simde_vreinterpretq_s32_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vreinterpret_s64_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s64_p8(a); + #else + simde_int64x1_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s64_p8 + #define vreinterpret_s64_p8 simde_vreinterpret_s64_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vreinterpret_s64_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_s64_p16(a); + #else + simde_int64x1_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s64_p16 + #define vreinterpret_s64_p16 simde_vreinterpret_s64_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vreinterpret_s64_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_s64_p64(a); + #else + simde_int64x1_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s64_p64 + #define vreinterpret_s64_p64 simde_vreinterpret_s64_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vreinterpretq_s64_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s64_p8(a); + #else + simde_int64x2_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s64_p8 + #define vreinterpretq_s64_p8(a) simde_vreinterpretq_s64_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vreinterpretq_s64_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_s64_p16(a); + #else + simde_int64x2_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s64_p16 + #define vreinterpretq_s64_p16(a) simde_vreinterpretq_s64_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vreinterpretq_s64_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_s64_p64(a); + #else + simde_int64x2_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s64_p64 + #define vreinterpretq_s64_p64(a) simde_vreinterpretq_s64_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vreinterpret_f32_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_f32_p8(a); + 
#else + simde_float32x2_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f32_p8 + #define vreinterpret_f32_p8 simde_vreinterpret_f32_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vreinterpret_f32_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_f32_p16(a); + #else + simde_float32x2_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f32_p16 + #define vreinterpret_f32_p16 simde_vreinterpret_f32_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_p16(a); + #else + simde_float16x4_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f16_p16 + #define vreinterpret_f16_p16(a) simde_vreinterpret_f16_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vreinterpretq_f32_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_f32_p8(a); + #else + simde_float32x4_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f32_p8 + #define vreinterpretq_f32_p8(a) simde_vreinterpretq_f32_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vreinterpretq_f32_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_f32_p16(a); + #else + simde_float32x4_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f32_p16 + #define vreinterpretq_f32_p16(a) simde_vreinterpretq_f32_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_p16(a); + #else + simde_float16x8_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f16_p16 + #define vreinterpretq_f16_p16(a) simde_vreinterpretq_f16_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vreinterpret_f64_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpret_f64_p8(a); + #else + simde_float64x1_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f64_p8 + #define vreinterpret_f64_p8 simde_vreinterpret_f64_p8 +#endif + 
+SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vreinterpret_f64_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpret_f64_p16(a); + #else + simde_float64x1_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f64_p16 + #define vreinterpret_f64_p16 simde_vreinterpret_f64_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vreinterpret_f64_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpret_f64_p64(a); + #else + simde_float64x1_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f64_p64 + #define vreinterpret_f64_p64 simde_vreinterpret_f64_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vreinterpretq_f64_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpretq_f64_p8(a); + #else + simde_float64x2_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f64_p8 + #define vreinterpretq_f64_p8(a) simde_vreinterpretq_f64_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vreinterpretq_f64_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpretq_f64_p16(a); + #else + simde_float64x2_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f64_p16 + #define vreinterpretq_f64_p16(a) simde_vreinterpretq_f64_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vreinterpretq_f64_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vreinterpretq_f64_p64(a); + #else + simde_float64x2_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f64_p64 + #define vreinterpretq_f64_p64(a) simde_vreinterpretq_f64_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_p64(a); + #else + simde_float16x4_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f16_p64 + #define vreinterpret_f16_p64 simde_vreinterpret_f16_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vreinterpret_f16_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpret_f16_p8(a); + #else + simde_float16x4_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x4_from_private(r_); + 
#endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f16_p8 + #define vreinterpret_f16_p8 simde_vreinterpret_f16_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_p64(a); + #else + simde_float16x8_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f16_p64 + #define vreinterpretq_f16_p64(a) simde_vreinterpretq_f16_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vreinterpretq_f16_p8(a); + #else + simde_float16x8_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f16_p8 + #define vreinterpretq_f16_p8(a) simde_vreinterpretq_f16_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vreinterpret_u8_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u8_p16(a); + #else + simde_uint8x8_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u8_p16 + #define vreinterpret_u8_p16 simde_vreinterpret_u8_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vreinterpret_u8_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_u8_p64(a); + #else + simde_uint8x8_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u8_p64 + #define vreinterpret_u8_p64 simde_vreinterpret_u8_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vreinterpretq_u8_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_u8_p16(a); + #else + simde_uint8x16_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u8_p16 + #define vreinterpretq_u8_p16(a) simde_vreinterpretq_u8_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vreinterpretq_u8_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_u8_p64(a); + #else + simde_uint8x16_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u8_p64 + #define vreinterpretq_u8_p64(a) simde_vreinterpretq_u8_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vreinterpret_u16_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u16_p8(a); + #else + simde_uint16x4_private r_; + 
simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u16_p8 + #define vreinterpret_u16_p8 simde_vreinterpret_u16_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vreinterpret_u16_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_u16_p64(a); + #else + simde_uint16x4_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u16_p64 + #define vreinterpret_u16_p64 simde_vreinterpret_u16_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vreinterpretq_u16_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_u16_p8(a); + #else + simde_uint16x8_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u16_p8 + #define vreinterpretq_u16_p8(a) simde_vreinterpretq_u16_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vreinterpretq_u16_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_u16_p64(a); + #else + simde_uint16x8_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u16_p64 + #define vreinterpretq_u16_p64(a) simde_vreinterpretq_u16_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vreinterpret_u32_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u32_p8(a); + #else + simde_uint32x2_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u32_p8 + #define vreinterpret_u32_p8 simde_vreinterpret_u32_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vreinterpretq_u32_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_u32_p8(a); + #else + simde_uint32x4_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u32_p8 + #define vreinterpretq_u32_p8(a) simde_vreinterpretq_u32_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vreinterpret_u32_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u32_p16(a); + #else + simde_uint32x2_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u32_p16 + #define vreinterpret_u32_p16 simde_vreinterpret_u32_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vreinterpretq_u32_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + 
return vreinterpretq_u32_p16(a); + #else + simde_uint32x4_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u32_p16 + #define vreinterpretq_u32_p16(a) simde_vreinterpretq_u32_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vreinterpret_u32_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_u32_p64(a); + #else + simde_uint32x2_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u32_p64 + #define vreinterpret_u32_p64 simde_vreinterpret_u32_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vreinterpretq_u32_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_u32_p64(a); + #else + simde_uint32x4_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u32_p64 + #define vreinterpretq_u32_p64(a) simde_vreinterpretq_u32_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vreinterpret_u64_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u64_p8(a); + #else + simde_uint64x1_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u64_p8 + #define vreinterpret_u64_p8 simde_vreinterpret_u64_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vreinterpretq_u64_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_u64_p8(a); + #else + simde_uint64x2_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u64_p8 + #define vreinterpretq_u64_p8(a) simde_vreinterpretq_u64_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vreinterpret_u64_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u64_p16(a); + #else + simde_uint64x1_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u64_p16 + #define vreinterpret_u64_p16 simde_vreinterpret_u64_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vreinterpretq_u64_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_u64_p16(a); + #else + simde_uint64x2_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u64_p16 + #define vreinterpretq_u64_p16(a) simde_vreinterpretq_u64_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t 
+simde_vreinterpret_p8_u16(simde_uint16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_u16(a); + #else + simde_poly8x8_private r_; + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_u16 + #define vreinterpret_p8_u16 simde_vreinterpret_p8_u16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_u64(simde_uint64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_u64(a); + #else + simde_poly8x8_private r_; + simde_uint64x1_private a_ = simde_uint64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_u64 + #define vreinterpret_p8_u64 simde_vreinterpret_p8_u64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_u16(simde_uint16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_u16(a); + #else + simde_poly8x16_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_u16 + #define vreinterpretq_p8_u16(a) simde_vreinterpretq_p8_u16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_u64(simde_uint64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_u64(a); + #else + simde_poly8x16_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_u64 + #define vreinterpretq_p8_u64(a) simde_vreinterpretq_p8_u64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_u32(simde_uint32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_u32(a); + #else + simde_poly8x8_private r_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_u32 + #define vreinterpret_p8_u32 simde_vreinterpret_p8_u32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_u32(simde_uint32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_u32(a); + #else + simde_poly8x16_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_u32 + #define vreinterpretq_p8_u32(a) simde_vreinterpretq_p8_u32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_u8(simde_uint8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_u8(a); + #else + simde_poly16x4_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_u8 + #define vreinterpret_p16_u8 simde_vreinterpret_p16_u8 +#endif + 
+SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_u32(simde_uint32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_u32(a); + #else + simde_poly16x4_private r_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_u32 + #define vreinterpret_p16_u32 simde_vreinterpret_p16_u32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_u64(simde_uint64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_u64(a); + #else + simde_poly16x4_private r_; + simde_uint64x1_private a_ = simde_uint64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_u64 + #define vreinterpret_p16_u64 simde_vreinterpret_p16_u64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_u8(simde_uint8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_u8(a); + #else + simde_poly16x8_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_u8 + #define vreinterpretq_p16_u8(a) simde_vreinterpretq_p16_u8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_u32(simde_uint32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_u32(a); + #else + simde_poly16x8_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_u32 + #define vreinterpretq_p16_u32(a) simde_vreinterpretq_p16_u32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_u64(simde_uint64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_u64(a); + #else + simde_poly16x8_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_u64 + #define vreinterpretq_p16_u64(a) simde_vreinterpretq_p16_u64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_u8(simde_uint8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_u8(a); + #else + simde_poly64x1_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_u8 + #define vreinterpret_p64_u8 simde_vreinterpret_p64_u8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_u16(simde_uint16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_u16(a); + #else + simde_poly64x1_private r_; + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef 
vreinterpret_p64_u16 + #define vreinterpret_p64_u16 simde_vreinterpret_p64_u16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_u32(simde_uint32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_u32(a); + #else + simde_poly64x1_private r_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_u32 + #define vreinterpret_p64_u32 simde_vreinterpret_p64_u32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_u8(simde_uint8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_u8(a); + #else + simde_poly64x2_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_u8 + #define vreinterpretq_p64_u8(a) simde_vreinterpretq_p64_u8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_u16(simde_uint16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_u16(a); + #else + simde_poly64x2_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_u16 + #define vreinterpretq_p64_u16(a) simde_vreinterpretq_p64_u16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_u32(simde_uint32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_u32(a); + #else + simde_poly64x2_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_u32 + #define vreinterpretq_p64_u32(a) simde_vreinterpretq_p64_u32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vreinterpret_u8_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u8_p8(a); + #else + simde_uint8x8_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u8_p8 + #define vreinterpret_u8_p8 simde_vreinterpret_u8_p8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vreinterpretq_u8_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_u8_p8(a); + #else + simde_uint8x16_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u8_p8 + #define vreinterpretq_u8_p8(a) simde_vreinterpretq_u8_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vreinterpret_u16_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_u16_p16(a); + #else + simde_uint16x4_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x4_from_private(r_); + 
#endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u16_p16 + #define vreinterpret_u16_p16 simde_vreinterpret_u16_p16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vreinterpretq_u16_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_u16_p16(a); + #else + simde_uint16x8_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u16_p16 + #define vreinterpretq_u16_p16(a) simde_vreinterpretq_u16_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vreinterpret_u64_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_u64_p64(a); + #else + simde_uint64x1_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u64_p64 + #define vreinterpret_u64_p64 simde_vreinterpret_u64_p64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vreinterpretq_u64_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_u64_p64(a); + #else + simde_uint64x2_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u64_p64 + #define vreinterpretq_u64_p64(a) simde_vreinterpretq_u64_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_u8(simde_uint8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p8_u8(a); + #else + simde_poly8x8_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_u8 + #define vreinterpret_p8_u8 simde_vreinterpret_p8_u8 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_u8(simde_uint8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p8_u8(a); + #else + simde_poly8x16_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_u8 + #define vreinterpretq_p8_u8(a) simde_vreinterpretq_p8_u8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_u16(simde_uint16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpret_p16_u16(a); + #else + simde_poly16x4_private r_; + simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_u16 + #define vreinterpret_p16_u16 simde_vreinterpret_p16_u16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_u16(simde_uint16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vreinterpretq_p16_u16(a); + #else + simde_poly16x8_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + 
simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_u16 + #define vreinterpretq_p16_u16(a) simde_vreinterpretq_p16_u16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_u64(simde_uint64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpret_p64_u64(a); + #else + simde_poly64x1_private r_; + simde_uint64x1_private a_ = simde_uint64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_u64 + #define vreinterpret_p64_u64 simde_vreinterpret_p64_u64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_u64(simde_uint64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vreinterpretq_p64_u64(a); + #else + simde_poly64x2_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_u64 + #define vreinterpretq_p64_u64(a) simde_vreinterpretq_p64_u64(a) +#endif + +#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_s8(simde_int8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_s8(a); + #else + simde_poly128_t r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_s8 + #define vreinterpretq_p128_s8(a) simde_vreinterpretq_p128_s8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_s16(simde_int16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_s16(a); + #else + simde_poly128_t r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_s16 + #define vreinterpretq_p128_s16(a) simde_vreinterpretq_p128_s16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_s32(simde_int32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_s32(a); + #else + simde_poly128_t r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_s32 + #define vreinterpretq_p128_s32(a) simde_vreinterpretq_p128_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_s64(simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_s64(a); + #else + simde_poly128_t r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_s64 + #define vreinterpretq_p128_s64(a) simde_vreinterpretq_p128_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_u8(simde_uint8x16_t a) { + #if 
defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_u8(a); + #else + simde_poly128_t r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_u8 + #define vreinterpretq_p128_u8(a) simde_vreinterpretq_p128_u8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_u16(simde_uint16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_u16(a); + #else + simde_poly128_t r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_u16 + #define vreinterpretq_p128_u16(a) simde_vreinterpretq_p128_u16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_u32(simde_uint32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_u32(a); + #else + simde_poly128_t r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_u32 + #define vreinterpretq_p128_u32(a) simde_vreinterpretq_p128_u32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_u64(simde_uint64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_u64(a); + #else + simde_poly128_t r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_u64 + #define vreinterpretq_p128_u64(a) simde_vreinterpretq_p128_u64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_p8(a); + #else + simde_poly128_t r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_p8 + #define vreinterpretq_p128_p8(a) simde_vreinterpretq_p128_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_p16(a); + #else + simde_poly128_t r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_p16 + #define vreinterpretq_p128_p16(a) simde_vreinterpretq_p128_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_f16(a); + #else + simde_poly128_t r_; + simde_float16x8_private a_ = simde_float16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_f16 + 
#define vreinterpretq_p128_f16(a) simde_vreinterpretq_p128_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_f32(a); + #else + simde_poly128_t r_; + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_f32 + #define vreinterpretq_p128_f32(a) simde_vreinterpretq_p128_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p128_f64(a); + #else + simde_poly128_t r_; + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_f64 + #define vreinterpretq_p128_f64(a) simde_vreinterpretq_p128_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vreinterpretq_s8_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_s8_p128(a); + #else + simde_int8x16_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s8_p128 + #define vreinterpretq_s8_p128(a) simde_vreinterpretq_s8_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vreinterpretq_s16_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_s16_p128(a); + #else + simde_int16x8_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s16_p128 + #define vreinterpretq_s16_p128(a) simde_vreinterpretq_s16_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vreinterpretq_s32_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_s32_p128(a); + #else + simde_int32x4_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s32_p128 + #define vreinterpretq_s32_p128(a) simde_vreinterpretq_s32_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vreinterpretq_s64_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_s64_p128(a); + #else + simde_int64x2_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s64_p128 + #define vreinterpretq_s64_p128(a) simde_vreinterpretq_s64_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vreinterpretq_u8_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_u8_p128(a); + #else + simde_uint8x16_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); 
+ return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u8_p128 + #define vreinterpretq_u8_p128(a) simde_vreinterpretq_u8_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vreinterpretq_u16_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_u16_p128(a); + #else + simde_uint16x8_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u16_p128 + #define vreinterpretq_u16_p128(a) simde_vreinterpretq_u16_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vreinterpretq_u32_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_u32_p128(a); + #else + simde_uint32x4_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u32_p128 + #define vreinterpretq_u32_p128(a) simde_vreinterpretq_u32_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vreinterpretq_u64_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_u64_p128(a); + #else + simde_uint64x2_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u64_p128 + #define vreinterpretq_u64_p128(a) simde_vreinterpretq_u64_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p8_p128(a); + #else + simde_poly8x16_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_p128 + #define vreinterpretq_p8_p128(a) simde_vreinterpretq_p8_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_p16_p128(a); + #else + simde_poly16x8_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_p128 + #define vreinterpretq_p16_p128(a) simde_vreinterpretq_p16_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vreinterpretq_f16_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_f16_p128(a); + #else + simde_float16x8_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f16_p128 + #define vreinterpretq_f16_p128(a) simde_vreinterpretq_f16_p128(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vreinterpretq_f64_p128(simde_poly128_t a) { + #if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + return vreinterpretq_f64_p128(a); + #else + simde_float64x2_private r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f64_p128 + #define vreinterpretq_f64_p128(a) simde_vreinterpretq_f64_p128(a) +#endif + +#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */ + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_s8(simde_int8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_s8(a); + #else + simde_bfloat16x4_private r_; + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_bf16_s8 + #define vreinterpret_bf16_s8(a) simde_vreinterpret_bf16_s8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_s16(simde_int16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_s16(a); + #else + simde_bfloat16x4_private r_; + simde_int16x4_private a_ = simde_int16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_bf16_s16 + #define vreinterpret_bf16_s16(a) simde_vreinterpret_bf16_s16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_s32(simde_int32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_s32(a); + #else + simde_bfloat16x4_private r_; + simde_int32x2_private a_ = simde_int32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_bf16_s32 + #define vreinterpret_bf16_s32(a) simde_vreinterpret_bf16_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_s64(simde_int64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_s64(a); + #else + simde_bfloat16x4_private r_; + simde_int64x1_private a_ = simde_int64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_bf16_s64 + #define vreinterpret_bf16_s64(a) simde_vreinterpret_bf16_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_u8(simde_uint8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_u8(a); + #else + simde_bfloat16x4_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_bf16_u8 + #define vreinterpret_bf16_u8(a) simde_vreinterpret_bf16_u8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_u16(simde_uint16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_u16(a); + #else + simde_bfloat16x4_private r_; + 
simde_uint16x4_private a_ = simde_uint16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_bf16_u16 + #define vreinterpret_bf16_u16(a) simde_vreinterpret_bf16_u16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_u32(simde_uint32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_u32(a); + #else + simde_bfloat16x4_private r_; + simde_uint32x2_private a_ = simde_uint32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_bf16_u32 + #define vreinterpret_bf16_u32(a) simde_vreinterpret_bf16_u32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_u64(simde_uint64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_u64(a); + #else + simde_bfloat16x4_private r_; + simde_uint64x1_private a_ = simde_uint64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_bf16_u64 + #define vreinterpret_bf16_u64(a) simde_vreinterpret_bf16_u64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_f32(a); + #else + simde_bfloat16x4_private r_; + simde_float32x2_private a_ = simde_float32x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_bf16_f32 + #define vreinterpret_bf16_f32 simde_vreinterpret_bf16_f32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_f64(a); + #else + simde_bfloat16x4_private r_; + simde_float64x1_private a_ = simde_float64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_bf16_f64 + #define vreinterpret_bf16_f64 simde_vreinterpret_bf16_f64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_s8(simde_int8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_s8(a); + #else + simde_bfloat16x8_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_bf16_s8 + #define vreinterpretq_bf16_s8(a) simde_vreinterpretq_bf16_s8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_s16(simde_int16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_s16(a); + #else + simde_bfloat16x8_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_bf16_s16 + #define vreinterpretq_bf16_s16(a) simde_vreinterpretq_bf16_s16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_s32(simde_int32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_s32(a); + #else + simde_bfloat16x8_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_bf16_s32 + #define vreinterpretq_bf16_s32(a) simde_vreinterpretq_bf16_s32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_s64(simde_int64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_s64(a); + #else + simde_bfloat16x8_private r_; + simde_int64x2_private a_ = simde_int64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_bf16_s64 + #define vreinterpretq_bf16_s64(a) simde_vreinterpretq_bf16_s64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_u8(simde_uint8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_u8(a); + #else + simde_bfloat16x8_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_bf16_u8 + #define vreinterpretq_bf16_u8(a) simde_vreinterpretq_bf16_u8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_u16(simde_uint16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_u16(a); + #else + simde_bfloat16x8_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_bf16_u16 + #define vreinterpretq_bf16_u16(a) simde_vreinterpretq_bf16_u16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_u32(simde_uint32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_u32(a); + #else + simde_bfloat16x8_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_bf16_u32 + #define vreinterpretq_bf16_u32(a) simde_vreinterpretq_bf16_u32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_u64(simde_uint64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_u64(a); + #else + simde_bfloat16x8_private r_; + simde_uint64x2_private a_ = simde_uint64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_bf16_u64 + #define vreinterpretq_bf16_u64(a) 
simde_vreinterpretq_bf16_u64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_f32(a); + #else + simde_bfloat16x8_private r_; + simde_float32x4_private a_ = simde_float32x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_bf16_f32 + #define vreinterpretq_bf16_f32 simde_vreinterpretq_bf16_f32 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_f64(a); + #else + simde_bfloat16x8_private r_; + simde_float64x2_private a_ = simde_float64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_bf16_f64 + #define vreinterpretq_bf16_f64 simde_vreinterpretq_bf16_f64 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vreinterpret_s8_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_s8_bf16(a); + #else + simde_int8x8_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s8_bf16 + #define vreinterpret_s8_bf16(a) simde_vreinterpret_s8_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vreinterpret_s16_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_s16_bf16(a); + #else + simde_int16x4_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s16_bf16 + #define vreinterpret_s16_bf16(a) simde_vreinterpret_s16_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vreinterpret_s32_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_s32_bf16(a); + #else + simde_int32x2_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s32_bf16 + #define vreinterpret_s32_bf16(a) simde_vreinterpret_s32_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x1_t +simde_vreinterpret_s64_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_s64_bf16(a); + #else + simde_int64x1_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_s64_bf16 + #define vreinterpret_s64_bf16(a) simde_vreinterpret_s64_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vreinterpret_u8_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) 
&& defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_u8_bf16(a); + #else + simde_uint8x8_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u8_bf16 + #define vreinterpret_u8_bf16(a) simde_vreinterpret_u8_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vreinterpret_u16_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_u16_bf16(a); + #else + simde_uint16x4_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u16_bf16 + #define vreinterpret_u16_bf16(a) simde_vreinterpret_u16_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vreinterpret_u32_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_u32_bf16(a); + #else + simde_uint32x2_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u32_bf16 + #define vreinterpret_u32_bf16(a) simde_vreinterpret_u32_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vreinterpret_u64_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_u64_bf16(a); + #else + simde_uint64x1_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_u64_bf16 + #define vreinterpret_u64_bf16(a) simde_vreinterpret_u64_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vreinterpret_f32_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_f32_bf16(a); + #else + simde_float32x2_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f32_bf16 + #define vreinterpret_f32_bf16 simde_vreinterpret_f32_bf16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vreinterpret_f64_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_f64_bf16(a); + #else + simde_float64x1_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_f64_bf16 + #define vreinterpret_f64_bf16 simde_vreinterpret_f64_bf16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x16_t +simde_vreinterpretq_s8_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_s8_bf16(a); + #else + simde_int8x16_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + 
simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s8_bf16 + #define vreinterpretq_s8_bf16(a) simde_vreinterpretq_s8_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vreinterpretq_s16_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_s16_bf16(a); + #else + simde_int16x8_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s16_bf16 + #define vreinterpretq_s16_bf16(a) simde_vreinterpretq_s16_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vreinterpretq_s32_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_s32_bf16(a); + #else + simde_int32x4_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s32_bf16 + #define vreinterpretq_s32_bf16(a) simde_vreinterpretq_s32_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vreinterpretq_s64_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_s64_bf16(a); + #else + simde_int64x2_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_int64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_s64_bf16 + #define vreinterpretq_s64_bf16(a) simde_vreinterpretq_s64_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vreinterpretq_u8_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_u8_bf16(a); + #else + simde_uint8x16_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u8_bf16 + #define vreinterpretq_u8_bf16(a) simde_vreinterpretq_u8_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vreinterpretq_u16_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_u16_bf16(a); + #else + simde_uint16x8_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u16_bf16 + #define vreinterpretq_u16_bf16(a) simde_vreinterpretq_u16_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vreinterpretq_u32_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_u32_bf16(a); + #else + simde_uint32x4_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + 
#undef vreinterpretq_u32_bf16 + #define vreinterpretq_u32_bf16(a) simde_vreinterpretq_u32_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vreinterpretq_u64_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_u64_bf16(a); + #else + simde_uint64x2_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_u64_bf16 + #define vreinterpretq_u64_bf16(a) simde_vreinterpretq_u64_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vreinterpretq_f32_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_f32_bf16(a); + #else + simde_float32x4_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f32_bf16 + #define vreinterpretq_f32_bf16 simde_vreinterpretq_f32_bf16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vreinterpretq_f64_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_f64_bf16(a); + #else + simde_float64x2_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_f64_bf16 + #define vreinterpretq_f64_bf16 simde_vreinterpretq_f64_bf16 +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_p8(a); + #else + simde_bfloat16x4_private r_; + simde_poly8x8_private a_ = simde_poly8x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_bf16_p8 + #define vreinterpret_bf16_p8(a) simde_vreinterpret_bf16_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_p16(a); + #else + simde_bfloat16x4_private r_; + simde_poly16x4_private a_ = simde_poly16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_bf16_p16 + #define vreinterpret_bf16_p16(a) simde_vreinterpret_bf16_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vreinterpret_bf16_p64(simde_poly64x1_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_bf16_p64(a); + #else + simde_bfloat16x4_private r_; + simde_poly64x1_private a_ = simde_poly64x1_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_bf16_p64 + #define vreinterpret_bf16_p64(a) simde_vreinterpret_bf16_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t 
+simde_vreinterpretq_bf16_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_p8(a); + #else + simde_bfloat16x8_private r_; + simde_poly8x16_private a_ = simde_poly8x16_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_bf16_p8 + #define vreinterpretq_bf16_p8(a) simde_vreinterpretq_bf16_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_p16(a); + #else + simde_bfloat16x8_private r_; + simde_poly16x8_private a_ = simde_poly16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_bf16_p16 + #define vreinterpretq_bf16_p16(a) simde_vreinterpretq_bf16_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_p64(simde_poly64x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_p64(a); + #else + simde_bfloat16x8_private r_; + simde_poly64x2_private a_ = simde_poly64x2_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_bf16_p64 + #define vreinterpretq_bf16_p64(a) simde_vreinterpretq_bf16_p64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vreinterpret_p8_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_p8_bf16(a); + #else + simde_poly8x8_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p8_bf16 + #define vreinterpret_p8_bf16(a) simde_vreinterpret_p8_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vreinterpret_p16_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_p16_bf16(a); + #else + simde_poly16x4_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p16_bf16 + #define vreinterpret_p16_bf16(a) simde_vreinterpret_p16_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vreinterpret_p64_bf16(simde_bfloat16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpret_p64_bf16(a); + #else + simde_poly64x1_private r_; + simde_bfloat16x4_private a_ = simde_bfloat16x4_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpret_p64_bf16 + #define vreinterpret_p64_bf16(a) simde_vreinterpret_p64_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vreinterpretq_p8_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_p8_bf16(a); + 
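+    /* The native intrinsic is used only when both ARMv8 NEON and the bf16
+     * extension were detected; on real hardware it typically costs nothing,
+     * since the same 128-bit register is merely viewed as poly8x16.
+     * Otherwise the byte-copy fallback below is used. */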
#else + simde_poly8x16_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p8_bf16 + #define vreinterpretq_p8_bf16(a) simde_vreinterpretq_p8_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vreinterpretq_p16_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_p16_bf16(a); + #else + simde_poly16x8_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p16_bf16 + #define vreinterpretq_p16_bf16(a) simde_vreinterpretq_p16_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vreinterpretq_p64_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_p64_bf16(a); + #else + simde_poly64x2_private r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p64_bf16 + #define vreinterpretq_p64_bf16(a) simde_vreinterpretq_p64_bf16(a) +#endif + +#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly128_t +simde_vreinterpretq_p128_bf16(simde_bfloat16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_p128_bf16(a); + #else + simde_poly128_t r_; + simde_bfloat16x8_private a_ = simde_bfloat16x8_to_private(a); + simde_memcpy(&r_, &a_, sizeof(r_)); + return r_; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_p128_bf16 + #define vreinterpretq_p128_bf16(a) simde_vreinterpretq_p128_bf16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vreinterpretq_bf16_p128(simde_poly128_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vreinterpretq_bf16_p128(a); + #else + simde_bfloat16x8_t r_; + simde_poly128_t a_ = a; + simde_memcpy(&r_, &a_, sizeof(r_)); + return simde_bfloat16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vreinterpretq_bf16_p128 + #define vreinterpretq_bf16_p128(a) simde_vreinterpretq_bf16_p128(a) +#endif + +#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */ + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/rev16.h b/lib/simd_wrapper/simde/arm/neon/rev16.h index 55fe38c2e23..5ad0bffd1c0 100644 --- a/lib/simd_wrapper/simde/arm/neon/rev16.h +++ b/lib/simd_wrapper/simde/arm/neon/rev16.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_REV16_H) @@ -129,6 +130,34 @@ simde_vrev16q_u8(simde_uint8x16_t a) { #define vrev16q_u8(a) simde_vrev16q_u8(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vrev16_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev16_p8(a); + #else + return simde_vreinterpret_p8_s8(simde_vrev16_s8(simde_vreinterpret_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef 
vrev16_p8 + #define vrev16_p8(a) simde_vrev16_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vrev16q_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev16q_p8(a); + #else + return simde_vreinterpretq_p8_s8(simde_vrev16q_s8(simde_vreinterpretq_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev16q_p8 + #define vrev16q_p8(a) simde_vrev16q_p8(a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/rev32.h b/lib/simd_wrapper/simde/arm/neon/rev32.h index 3fac26505bf..172c38cd4ac 100644 --- a/lib/simd_wrapper/simde/arm/neon/rev32.h +++ b/lib/simd_wrapper/simde/arm/neon/rev32.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_REV32_H) @@ -226,6 +227,62 @@ simde_vrev32q_u16(simde_uint16x8_t a) { #define vrev32q_u16(a) simde_vrev32q_u16(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vrev32_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev32_p8(a); + #else + return simde_vreinterpret_p8_s8(simde_vrev32_s8(simde_vreinterpret_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev32_p8 + #define vrev32_p8(a) simde_vrev32_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vrev32_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev32_p16(a); + #else + return simde_vreinterpret_p16_s16(simde_vrev32_s16(simde_vreinterpret_s16_p16(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev32_p16 + #define vrev32_p16(a) simde_vrev32_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vrev32q_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev32q_p8(a); + #else + return simde_vreinterpretq_p8_s8(simde_vrev32q_s8(simde_vreinterpretq_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev32q_p8 + #define vrev32q_p8(a) simde_vrev32q_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vrev32q_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev32q_p16(a); + #else + return simde_vreinterpretq_p16_s16(simde_vrev32q_s16(simde_vreinterpretq_s16_p16(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev32q_p16 + #define vrev32q_p16(a) simde_vrev32q_p16(a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/rev64.h b/lib/simd_wrapper/simde/arm/neon/rev64.h index 274f0812657..4e3af9c5113 100644 --- a/lib/simd_wrapper/simde/arm/neon/rev64.h +++ b/lib/simd_wrapper/simde/arm/neon/rev64.h @@ -23,11 +23,9 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ -/* N.B. CM: vrev64_f16 and vrev64q_f16 are omitted as - * SIMDe has no 16-bit floating point support. 
*/ - #if !defined(SIMDE_ARM_NEON_REV64_H) #define SIMDE_ARM_NEON_REV64_H @@ -167,6 +165,20 @@ simde_vrev64_u32(simde_uint32x2_t a) { #define vrev64_u32(a) simde_vrev64_u32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrev64_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrev64_f16(a); + #else + return simde_vreinterpret_f16_s16(simde_vrev64_s16(simde_vreinterpret_s16_f16(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev64_f16 + #define vrev64_f16(a) simde_vrev64_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrev64_f32(simde_float32x2_t a) { @@ -334,6 +346,20 @@ simde_vrev64q_u32(simde_uint32x4_t a) { #define vrev64q_u32(a) simde_vrev64q_u32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrev64q_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrev64q_f16(a); + #else + return simde_vreinterpretq_f16_s16(simde_vrev64q_s16(simde_vreinterpretq_s16_f16(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev64q_f16 + #define vrev64q_f16(a) simde_vrev64q_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrev64q_f32(simde_float32x4_t a) { @@ -348,6 +374,62 @@ simde_vrev64q_f32(simde_float32x4_t a) { #define vrev64q_f32(a) simde_vrev64q_f32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vrev64_p8(simde_poly8x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev64_p8(a); + #else + return simde_vreinterpret_p8_s8(simde_vrev64_s8(simde_vreinterpret_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev64_p8 + #define vrev64_p8(a) simde_vrev64_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vrev64_p16(simde_poly16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev64_p16(a); + #else + return simde_vreinterpret_p16_s16(simde_vrev64_s16(simde_vreinterpret_s16_p16(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev64_p16 + #define vrev64_p16(a) simde_vrev64_p16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vrev64q_p8(simde_poly8x16_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev64q_p8(a); + #else + return simde_vreinterpretq_p8_s8(simde_vrev64q_s8(simde_vreinterpretq_s8_p8(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev64q_p8 + #define vrev64q_p8(a) simde_vrev64q_p8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vrev64q_p16(simde_poly16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrev64q_p16(a); + #else + return simde_vreinterpretq_p16_s16(simde_vrev64q_s16(simde_vreinterpretq_s16_p16(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrev64q_p16 + #define vrev64q_p16(a) simde_vrev64q_p16(a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/rnd.h b/lib/simd_wrapper/simde/arm/neon/rnd.h index 9a007b77c03..c663cdd9055 100644 --- a/lib/simd_wrapper/simde/arm/neon/rnd.h +++ b/lib/simd_wrapper/simde/arm/neon/rnd.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RND_H) @@ -33,6 +34,43 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t 
+simde_vrndh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndh_f16(a); + #else + return simde_float16_from_float32(simde_math_truncf(simde_float16_to_float32(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndh_f16 + #define vrndh_f16(a) simde_vrndh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrnd_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrnd_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndh_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrnd_f16 + #define vrnd_f16(a) simde_vrnd_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrnd_f32(simde_float32x2_t a) { @@ -79,6 +117,29 @@ simde_vrnd_f64(simde_float64x1_t a) { #define vrnd_f64(a) simde_vrnd_f64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrndq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndh_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndq_f16 + #define vrndq_f16(a) simde_vrndq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrndq_f32(simde_float32x4_t a) { @@ -125,7 +186,7 @@ simde_vrndq_f64(simde_float64x2_t a) { #if defined(SIMDE_X86_SSE4_1_NATIVE) r_.m128d = _mm_round_pd(a_.m128d, _MM_FROUND_TO_ZERO); #elif defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - r_.m128d = _mm_trunc_ps(a_.m128d); + r_.m128d = _mm_trunc_pd(a_.m128d); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/lib/simd_wrapper/simde/arm/neon/rnd32x.h b/lib/simd_wrapper/simde/arm/neon/rnd32x.h new file mode 100644 index 00000000000..38d369aab9c --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/rnd32x.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RND32X_H) +#define SIMDE_ARM_NEON_RND32X_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +// src: https://gcc.gnu.org/legacy-ml/gcc-patches/2019-09/msg00053.html +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vrnd32x_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + return vrnd32x_f32(a); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } else { + r_.values[i] = simde_math_rintf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } + } + } + + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd32x_f32 + #define vrnd32x_f32(a) simde_vrnd32x_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vrnd32x_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + return vrnd32x_f64(a); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } else { + r_.values[i] = simde_math_rint(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } + } + } + + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd32x_f64 + #define vrnd32x_f64(a) simde_vrnd32x_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vrnd32xq_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + return vrnd32xq_f32(a); + #else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } else { + r_.values[i] = simde_math_rintf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } + } + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd32xq_f32 + #define vrnd32xq_f32(a) simde_vrnd32xq_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t 
+simde_vrnd32xq_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + return vrnd32xq_f64(a); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } else { + r_.values[i] = simde_math_rint(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } + } + } + + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd32xq_f64 + #define vrnd32xq_f64(a) simde_vrnd32xq_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RND32X_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/rnd32z.h b/lib/simd_wrapper/simde/arm/neon/rnd32z.h new file mode 100644 index 00000000000..7000a128e0e --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/rnd32z.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RND32Z_H) +#define SIMDE_ARM_NEON_RND32Z_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +// src: https://gcc.gnu.org/legacy-ml/gcc-patches/2019-09/msg00053.html +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vrnd32z_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + return vrnd32z_f32(a); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } else { + r_.values[i] = simde_math_truncf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } + } + } + + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd32z_f32 + #define vrnd32z_f32(a) simde_vrnd32z_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vrnd32z_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + return vrnd32z_f64(a); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } else { + r_.values[i] = simde_math_trunc(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } + } + } + + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd32z_f64 + #define vrnd32z_f64(a) simde_vrnd32z_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vrnd32zq_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + return vrnd32zq_f32(a); + #else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } else { + r_.values[i] = simde_math_truncf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT32_MIN); + } + } + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd32zq_f32 + #define vrnd32zq_f32(a) simde_vrnd32zq_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vrnd32zq_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + return vrnd32zq_f64(a); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + + 
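+    /* FRINT32Z semantics (see the GCC patch linked at the top of this file):
+     * truncate toward zero, but NaN, infinities and results outside the
+     * int32_t range all collapse to (double)INT32_MIN. */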
SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } else { + r_.values[i] = simde_math_trunc(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT32_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT32_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT32_MIN); + } + } + } + + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd32zq_f64 + #define vrnd32zq_f64(a) simde_vrnd32zq_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RND32Z_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/rnd64x.h b/lib/simd_wrapper/simde/arm/neon/rnd64x.h new file mode 100644 index 00000000000..8464291ff80 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/rnd64x.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RND64X_H) +#define SIMDE_ARM_NEON_RND64X_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +// src: https://gcc.gnu.org/legacy-ml/gcc-patches/2019-09/msg00053.html +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vrnd64x_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + return vrnd64x_f32(a); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT64_MIN); + } else { + r_.values[i] = simde_math_rintf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT64_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT64_MIN); + } + } + } + + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd64x_f32 + #define vrnd64x_f32(a) simde_vrnd64x_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vrnd64x_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + return vrnd64x_f64(a); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } else { + r_.values[i] = simde_math_rint(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT64_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } + } + } + + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd64x_f64 + #define vrnd64x_f64(a) simde_vrnd64x_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vrnd64xq_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + return vrnd64xq_f32(a); + #else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT64_MIN); + } else { + r_.values[i] = simde_math_rintf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT64_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT64_MIN); + } + } + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd64xq_f32 + #define vrnd64xq_f32(a) simde_vrnd64xq_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vrnd64xq_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + return vrnd64xq_f64(a); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + + 
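+    /* vrnd64x rounds each lane to an integral value using the current
+     * rounding mode (simde_math_rint); NaN, infinities and values outside
+     * the int64_t range are mapped to (double)INT64_MIN. */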
SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } else { + r_.values[i] = simde_math_rint(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT64_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } + } + } + + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd64xq_f64 + #define vrnd64xq_f64(a) simde_vrnd64xq_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RND64X_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/rnd64z.h b/lib/simd_wrapper/simde/arm/neon/rnd64z.h new file mode 100644 index 00000000000..e63b5829044 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/rnd64z.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RND64Z_H) +#define SIMDE_ARM_NEON_RND64Z_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +// src: https://gcc.gnu.org/legacy-ml/gcc-patches/2019-09/msg00053.html +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vrnd64z_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + return vrnd64z_f32(a); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT64_MIN); + } else { + r_.values[i] = simde_math_truncf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT64_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT64_MIN); + } + } + } + + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd64z_f32 + #define vrnd64z_f32(a) simde_vrnd64z_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vrnd64z_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + return vrnd64z_f64(a); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } else { + r_.values[i] = simde_math_trunc(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT64_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } + } + } + + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd64z_f64 + #define vrnd64z_f64(a) simde_vrnd64z_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vrnd64zq_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) + return vrnd64zq_f32(a); + #else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnanf(a_.values[i]) || simde_math_isinff(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT64_MIN); + } else { + r_.values[i] = simde_math_truncf(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(float, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(float, INT64_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(float, INT64_MIN); + } + } + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd64zq_f32 + #define vrnd64zq_f32(a) simde_vrnd64zq_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vrnd64zq_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_FRINT) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(18, 0, 0)) + return vrnd64zq_f64(a); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + + 
SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + if (simde_math_isnan(a_.values[i]) || simde_math_isinf(a_.values[i])) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } else { + r_.values[i] = simde_math_trunc(a_.values[i]); + if (r_.values[i] > HEDLEY_STATIC_CAST(double, INT64_MAX) || r_.values[i] < HEDLEY_STATIC_CAST(double, INT64_MIN)) { + r_.values[i] = HEDLEY_STATIC_CAST(double, INT64_MIN); + } + } + } + + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnd64zq_f64 + #define vrnd64zq_f64(a) simde_vrnd64zq_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RND64Z_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/rnda.h b/lib/simd_wrapper/simde/arm/neon/rnda.h new file mode 100644 index 00000000000..964e682ea7a --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/rnda.h @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RNDA_H) +#define SIMDE_ARM_NEON_RNDA_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrndah_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndah_f16(a); + #else + return simde_float16_from_float32(simde_math_roundf(simde_float16_to_float32(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndah_f16 + #define vrndah_f16(a) simde_vrndah_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrnda_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrnda_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndah_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrnda_f16 + #define vrnda_f16(a) simde_vrnda_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vrnda_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vrnda_f32(a); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_roundf(a_.values[i]); + } + + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrnda_f32 + #define vrnda_f32(a) simde_vrnda_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vrnda_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrnda_f64(a); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_round(a_.values[i]); + } + + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrnda_f64 + #define vrnda_f64(a) simde_vrnda_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrndaq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndaq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndah_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndaq_f16 + #define vrndaq_f16(a) simde_vrndaq_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vrndaq_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vrndaq_f32(a); + #else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_roundf(a_.values[i]); + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndaq_f32 + #define vrndaq_f32(a) 
simde_vrndaq_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vrndaq_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrndaq_f64(a); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_round(a_.values[i]); + } + + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrndaq_f64 + #define vrndaq_f64(a) simde_vrndaq_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RNDA_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/rndi.h b/lib/simd_wrapper/simde/arm/neon/rndi.h index b15949b552a..6b985ed28c0 100644 --- a/lib/simd_wrapper/simde/arm/neon/rndi.h +++ b/lib/simd_wrapper/simde/arm/neon/rndi.h @@ -22,6 +22,7 @@ * * Copyright: * 2020-2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RNDI_H) @@ -33,6 +34,43 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrndih_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) + return vrndih_f16(a); + #else + return simde_float16_from_float32(simde_math_nearbyintf(simde_float16_to_float32(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndih_f16 + #define vrndih_f16(a) simde_vrndih_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrndi_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) + return vrndi_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndih_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrndi_f16 + #define vrndi_f16(a) simde_vrndi_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrndi_f32(simde_float32x2_t a) { @@ -79,6 +117,29 @@ simde_vrndi_f64(simde_float64x1_t a) { #define vrndi_f64(a) simde_vrndi_f64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrndiq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399) && defined(SIMDE_ARM_NEON_FP16) + return vrndiq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndih_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrndiq_f16 + #define vrndiq_f16(a) simde_vrndiq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrndiq_f32(simde_float32x4_t a) { diff --git a/lib/simd_wrapper/simde/arm/neon/rndm.h b/lib/simd_wrapper/simde/arm/neon/rndm.h index 386c0ecab91..33c2e00dfa8 100644 --- a/lib/simd_wrapper/simde/arm/neon/rndm.h +++ b/lib/simd_wrapper/simde/arm/neon/rndm.h @@ -22,6 +22,7 @@ * * Copyright: * 2020-2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RNDM_H) @@ -33,6 +34,43 
@@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrndmh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndmh_f16(a); + #else + return simde_float16_from_float32(simde_math_floorf(simde_float16_to_float32(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndmh_f16 + #define vrndmh_f16(a) simde_vrndmh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrndm_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndm_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndmh_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndm_f16 + #define vrndm_f16(a) simde_vrndm_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrndm_f32(simde_float32x2_t a) { @@ -79,6 +117,29 @@ simde_vrndm_f64(simde_float64x1_t a) { #define vrndm_f64(a) simde_vrndm_f64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrndmq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndmq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndmh_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndmq_f16 + #define vrndmq_f16(a) simde_vrndmq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrndmq_f32(simde_float32x4_t a) { diff --git a/lib/simd_wrapper/simde/arm/neon/rndn.h b/lib/simd_wrapper/simde/arm/neon/rndn.h index d3d07317284..c8990a10dad 100644 --- a/lib/simd_wrapper/simde/arm/neon/rndn.h +++ b/lib/simd_wrapper/simde/arm/neon/rndn.h @@ -22,6 +22,7 @@ * * Copyright: * 2020-2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RNDN_H) @@ -33,6 +34,24 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrndnh_f16(simde_float16_t a) { + #if \ + defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) && \ + (!defined(HEDLEY_GCC_VERSION) || (defined(SIMDE_ARM_NEON_A64V8_NATIVE) && HEDLEY_GCC_VERSION_CHECK(8,0,0))) && defined(SIMDE_ARM_NEON_FP16) + return vrndnh_f16(a); + #else + simde_float32_t a_ = simde_float16_to_float32(a); + return simde_float16_from_float32(simde_math_roundevenf(a_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndnh_f16 + #define vrndnh_f16(a) simde_vrndnh_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vrndns_f32(simde_float32_t a) { @@ -50,6 +69,29 @@ simde_vrndns_f32(simde_float32_t a) { #define vrndns_f32(a) simde_vrndns_f32(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrndn_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndn_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE 
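+    /* Each float16 lane is handled by the scalar helper simde_vrndnh_f16(),
+     * which widens to float32, applies round-to-nearest-even, and narrows
+     * back to float16. */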
+ for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndnh_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndn_f16 + #define vrndn_f16(a) simde_vrndn_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrndn_f32(simde_float32x2_t a) { @@ -97,6 +139,29 @@ simde_vrndn_f64(simde_float64x1_t a) { #define vrndn_f64(a) simde_vrndn_f64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrndnq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndnq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndnh_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndnq_f16 + #define vrndnq_f16(a) simde_vrndnq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrndnq_f32(simde_float32x4_t a) { diff --git a/lib/simd_wrapper/simde/arm/neon/rndp.h b/lib/simd_wrapper/simde/arm/neon/rndp.h index ee602a3f7b0..6b23136c590 100644 --- a/lib/simd_wrapper/simde/arm/neon/rndp.h +++ b/lib/simd_wrapper/simde/arm/neon/rndp.h @@ -22,6 +22,7 @@ * * Copyright: * 2020-2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RNDP_H) @@ -33,6 +34,43 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrndph_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndph_f16(a); + #else + return simde_float16_from_float32(simde_math_ceilf(simde_float16_to_float32(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndph_f16 + #define vrndph_f16(a) simde_vrndph_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrndp_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndp_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndph_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndp_f16 + #define vrndp_f16(a) simde_vrndp_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrndp_f32(simde_float32x2_t a) { @@ -79,6 +117,29 @@ simde_vrndp_f64(simde_float64x1_t a) { #define vrndp_f64(a) simde_vrndp_f64(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrndpq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndpq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndph_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndpq_f16 + #define vrndpq_f16(a) simde_vrndpq_f16(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrndpq_f32(simde_float32x4_t 
a) { diff --git a/lib/simd_wrapper/simde/arm/neon/rndx.h b/lib/simd_wrapper/simde/arm/neon/rndx.h new file mode 100644 index 00000000000..406f5e753ce --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/rndx.h @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RNDX_H) +#define SIMDE_ARM_NEON_RNDX_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrndxh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndxh_f16(a); + #else + return simde_float16_from_float32(simde_math_rintf(simde_float16_to_float32(a))); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndxh_f16 + #define vrndxh_f16(a) simde_vrndxh_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrndx_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndx_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndxh_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndx_f16 + #define vrndx_f16(a) simde_vrndx_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vrndx_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vrndx_f32(a); + #else + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_rintf(a_.values[i]); + } + + return simde_float32x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndx_f32 + #define vrndx_f32(a) simde_vrndx_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vrndx_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrndx_f64(a); + #else + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = 
simde_math_rint(a_.values[i]); + } + + return simde_float64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrndx_f64 + #define vrndx_f64(a) simde_vrndx_f64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrndxq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrndxq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrndxh_f16(a_.values[i]); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndxq_f16 + #define vrndxq_f16(a) simde_vrndxq_f16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vrndxq_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + return vrndxq_f32(a); + #else + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_rintf(a_.values[i]); + } + + return simde_float32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrndxq_f32 + #define vrndxq_f32(a) simde_vrndxq_f32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vrndxq_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vrndxq_f64(a); + #else + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_rint(a_.values[i]); + } + + return simde_float64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrndxq_f64 + #define vrndxq_f64(a) simde_vrndxq_f64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RNDX_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/rshl.h b/lib/simd_wrapper/simde/arm/neon/rshl.h index 8ffcfc66632..091a9a407f3 100644 --- a/lib/simd_wrapper/simde/arm/neon/rshl.h +++ b/lib/simd_wrapper/simde/arm/neon/rshl.h @@ -27,7 +27,7 @@ #if !defined(SIMDE_ARM_NEON_RSHL_H) #define SIMDE_ARM_NEON_RSHL_H - +#include "../../x86/avx.h" #include "types.h" /* Notes from the implementer (Christopher Moore aka rosbif) @@ -84,7 +84,9 @@ simde_vrshld_s64(int64_t a, int64_t b) { ? 0 : (b >= 0) ? (a << b) - : ((a + (INT64_C(1) << (-b - 1))) >> -b); + : (a <= 0 + ? 
((a + (INT64_C(1) << (-b - 1))) >> -b) + : HEDLEY_STATIC_CAST(int64_t, (HEDLEY_STATIC_CAST(uint64_t, (a + (INT64_C(1) << (-b - 1)))) >> -b))); #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -96,7 +98,7 @@ SIMDE_FUNCTION_ATTRIBUTES uint64_t simde_vrshld_u64(uint64_t a, int64_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vrshld_u64(a, HEDLEY_STATIC_CAST(uint64_t, b)); + return vrshld_u64(a, HEDLEY_STATIC_CAST(int64_t, b)); #else b = HEDLEY_STATIC_CAST(int8_t, b); return @@ -141,14 +143,16 @@ simde_vrshl_s8 (const simde_int8x8_t a, const simde_int8x8_t b) { _mm256_srai_epi32(_mm256_sub_epi32(a256_shr, ff), 1), _mm256_cmpgt_epi32(zero, b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400)); - r_.m64 = _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0)); + r_.m64 = _mm_set_pi32(simde_mm256_extract_epi32(r256, 4), simde_mm256_extract_epi32(r256, 0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { r_.values[i] = HEDLEY_STATIC_CAST(int8_t, (simde_math_abs(b_.values[i]) >= 8) ? 0 : (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + ((a_.values[i] <= 0) ? ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int8_t, ((HEDLEY_STATIC_CAST(uint8_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0x7FUL))))); } #endif @@ -189,7 +193,9 @@ simde_vrshl_s16 (const simde_int16x4_t a, const simde_int16x4_t b) { r_.values[i] = HEDLEY_STATIC_CAST(int16_t, (simde_math_abs(b_.values[i]) >= 16) ? 0 : (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + ((a_.values[i] <= 0) ? ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int16_t, ((HEDLEY_STATIC_CAST(uint16_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0x7FFFUL))))); } #endif @@ -230,7 +236,9 @@ simde_vrshl_s32 (const simde_int32x2_t a, const simde_int32x2_t b) { r_.values[i] = HEDLEY_STATIC_CAST(int32_t, (simde_math_abs(b_.values[i]) >= 32) ? 0 : (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + ((a_.values[i] <= 0) ? ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int32_t, ((HEDLEY_STATIC_CAST(uint32_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0x7FFFFFFFUL))))); } #endif @@ -322,7 +330,7 @@ simde_vrshl_u8 (const simde_uint8x8_t a, const simde_int8x8_t b) { _mm256_srli_epi32(_mm256_sub_epi32(a256_shr, ff), 1), _mm256_cmpgt_epi32(zero, b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400)); - r_.m64 = _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0)); + r_.m64 = _mm_set_pi32(simde_mm256_extract_epi32(r256, 4), simde_mm256_extract_epi32(r256, 0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -513,7 +521,9 @@ simde_vrshlq_s8 (const simde_int8x16_t a, const simde_int8x16_t b) { r_.values[i] = HEDLEY_STATIC_CAST(int8_t, (simde_math_abs(b_.values[i]) >= 8) ? 0 : (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + ((a_.values[i] <= 0) ? 
((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int8_t, ((HEDLEY_STATIC_CAST(uint8_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0x7FUL))))); } #endif @@ -572,7 +582,7 @@ simde_vrshlq_s16 (const simde_int16x8_t a, const simde_int16x8_t b) { _mm256_srai_epi32(_mm256_sub_epi32(a256_shr, ff), 1), _mm256_cmpgt_epi32(zero, b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100)); - r_.m128i = _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0)); + r_.m128i = _mm_set_epi64x(simde_mm256_extract_epi64(r256, 2), simde_mm256_extract_epi64(r256, 0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -580,7 +590,9 @@ simde_vrshlq_s16 (const simde_int16x8_t a, const simde_int16x8_t b) { r_.values[i] = HEDLEY_STATIC_CAST(int16_t, (simde_math_abs(b_.values[i]) >= 16) ? 0 : (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + ((a_.values[i] <= 0) ? ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int16_t, ((HEDLEY_STATIC_CAST(uint16_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0x7FFFUL))))); } #endif @@ -634,8 +646,10 @@ simde_vrshlq_s32 (const simde_int32x4_t a, const simde_int32x4_t b) { b_.values[i] = HEDLEY_STATIC_CAST(int8_t, b_.values[i]); r_.values[i] = HEDLEY_STATIC_CAST(int32_t, (simde_math_abs(b_.values[i]) >= 32) ? 0 : - (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : - ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i])); + (b_.values[i] >= 0) ? (a_.values[i] << b_.values[i]) : + ((a_.values[i] <= 0) ? ((a_.values[i] + (1 << (-b_.values[i] - 1))) >> -b_.values[i]) : + HEDLEY_STATIC_CAST(int32_t, ((HEDLEY_STATIC_CAST(uint32_t, + (a_.values[i] + (1 << (-b_.values[i] - 1)))) >> -b_.values[i]) & (0X7FFFFFFFUL))))); } #endif @@ -811,7 +825,7 @@ simde_vrshlq_u16 (const simde_uint16x8_t a, const simde_int16x8_t b) { _mm256_srli_epi32(_mm256_sub_epi32(a256_shr, ff), 1), _mm256_cmpgt_epi32(zero, b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100)); - r_.m128i = _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0)); + r_.m128i = _mm_set_epi64x(simde_mm256_extract_epi64(r256, 2), simde_mm256_extract_epi64(r256, 0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/lib/simd_wrapper/simde/arm/neon/rshr_n.h b/lib/simd_wrapper/simde/arm/neon/rshr_n.h index 1eb0c11c065..a27495536ad 100644 --- a/lib/simd_wrapper/simde/arm/neon/rshr_n.h +++ b/lib/simd_wrapper/simde/arm/neon/rshr_n.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RSHR_N_H) @@ -41,6 +42,20 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_x_vrshrh_n_s16(int16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + return (a >> ((n == 16) ? 15 : n)) + ((a & HEDLEY_STATIC_CAST(int16_t, UINT16_C(1) << (n - 1))) != 0); +} + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_x_vrshrh_n_u16(uint16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + return ((n == 16) ? 
0 : (a >> n)) + ((a & (UINT32_C(1) << (n - 1))) != 0); +} + SIMDE_FUNCTION_ATTRIBUTES int32_t simde_x_vrshrs_n_s32(int32_t a, const int n) diff --git a/lib/simd_wrapper/simde/arm/neon/rshrn_high_n.h b/lib/simd_wrapper/simde/arm/neon/rshrn_high_n.h new file mode 100644 index 00000000000..7897581a58c --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/rshrn_high_n.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RSHRN_HIGH_N_H) +#define SIMDE_ARM_NEON_RSHRN_HIGH_N_H + +#include "rshrn_n.h" +#include "combine.h" +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrshrn_high_n_s16(r, a, n) vrshrn_high_n_s16((r), (a), (n)) +#else + #define simde_vrshrn_high_n_s16(r, a, n) simde_vcombine_s8(r, simde_vrshrn_n_s16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrshrn_high_n_s16 + #define vrshrn_high_n_s16(r, a, n) simde_vrshrn_high_n_s16((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrshrn_high_n_s32(r, a, n) vrshrn_high_n_s32((r), (a), (n)) +#else + #define simde_vrshrn_high_n_s32(r, a, n) simde_vcombine_s16(r, simde_vrshrn_n_s32(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrshrn_high_n_s32 + #define vrshrn_high_n_s32(r, a, n) simde_vrshrn_high_n_s32((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrshrn_high_n_s64(r, a, n) vrshrn_high_n_s64((r), (a), (n)) +#else + #define simde_vrshrn_high_n_s64(r, a, n) simde_vcombine_s32(r, simde_vrshrn_n_s64(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrshrn_high_n_s64 + #define vrshrn_high_n_s64(r, a, n) simde_vrshrn_high_n_s64((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrshrn_high_n_u16(r, a, n) vrshrn_high_n_u16((r), (a), (n)) +#else + #define simde_vrshrn_high_n_u16(r, a, n) simde_vcombine_u8(r, simde_vrshrn_n_u16(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrshrn_high_n_u16 + #define vrshrn_high_n_u16(r, a, n) simde_vrshrn_high_n_u16((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrshrn_high_n_u32(r, a, n) vrshrn_high_n_u32((r), (a), (n)) +#else + #define 
simde_vrshrn_high_n_u32(r, a, n) simde_vcombine_u16(r, simde_vrshrn_n_u32(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrshrn_high_n_u32 + #define vrshrn_high_n_u32(r, a, n) simde_vrshrn_high_n_u32((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrshrn_high_n_u64(r, a, n) vrshrn_high_n_u64((r), (a), (n)) +#else + #define simde_vrshrn_high_n_u64(r, a, n) simde_vcombine_u32(r, simde_vrshrn_n_u64(a, n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrshrn_high_n_u64 + #define vrshrn_high_n_u64(r, a, n) simde_vrshrn_high_n_u64((r), (a), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RSHRN_HIGH_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/rsqrte.h b/lib/simd_wrapper/simde/arm/neon/rsqrte.h index 8b2adbe2adb..d3a1c5ac182 100644 --- a/lib/simd_wrapper/simde/arm/neon/rsqrte.h +++ b/lib/simd_wrapper/simde/arm/neon/rsqrte.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RSQRTE_H) @@ -34,6 +35,27 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrsqrteh_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrsqrteh_f16(a); + #else + #if defined(simde_math_sqrtf) + simde_float32_t r_; + simde_float32_t a_ = simde_float16_to_float32(a); + r_ = 1.0f / simde_math_sqrtf(a_); + return simde_float16_from_float32(r_); + #else + HEDLEY_UNREACHABLE(); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrsqrteh_f16 + #define vrsqrteh_f16(a) simde_vrsqrteh_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vrsqrtes_f32(simde_float32_t a) { @@ -119,11 +141,11 @@ simde_vrsqrte_u32(simde_uint32x2_t a) { r_; for(size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[i])) ; i++) { - if(a_.values[i] < 0x3FFFFFFF) { + if (a_.values[i] < 0x3FFFFFFF) { r_.values[i] = UINT32_MAX; } else { uint32_t a_temp = (a_.values[i] >> 23) & 511; - if(a_temp < 256) { + if (a_temp < 256) { a_temp = a_temp * 2 + 1; } else { a_temp = (a_temp >> 1) << 1; @@ -144,6 +166,33 @@ simde_vrsqrte_u32(simde_uint32x2_t a) { #define vrsqrte_u32(a) simde_vrsqrte_u32((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrsqrte_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrsqrte_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + #if defined(simde_math_sqrtf) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrsqrteh_f16(a_.values[i]); + } + #else + HEDLEY_UNREACHABLE(); + #endif + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrsqrte_f16 + #define vrsqrte_f16(a) simde_vrsqrte_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrsqrte_f32(simde_float32x2_t a) { @@ -254,11 +303,11 @@ simde_vrsqrteq_u32(simde_uint32x4_t a) { r_; for(size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[i])) ; i++) { - if(a_.values[i] < 0x3FFFFFFF) { + if (a_.values[i] < 0x3FFFFFFF) { r_.values[i] = UINT32_MAX; } else { uint32_t a_temp = (a_.values[i] >> 23) & 511; - if(a_temp < 256) { + if (a_temp < 256) { a_temp 
= a_temp * 2 + 1; } else { a_temp = (a_temp >> 1) << 1; @@ -279,6 +328,33 @@ simde_vrsqrteq_u32(simde_uint32x4_t a) { #define vrsqrteq_u32(a) simde_vrsqrteq_u32((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrsqrteq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrsqrteq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + + #if defined(simde_math_sqrtf) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vrsqrteh_f16(a_.values[i]); + } + #else + HEDLEY_UNREACHABLE(); + #endif + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrsqrteq_f16 + #define vrsqrteq_f16(a) simde_vrsqrteq_f16((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrsqrteq_f32(simde_float32x4_t a) { diff --git a/lib/simd_wrapper/simde/arm/neon/rsqrts.h b/lib/simd_wrapper/simde/arm/neon/rsqrts.h index 3c7f720bb2a..633ad3aaf4b 100644 --- a/lib/simd_wrapper/simde/arm/neon/rsqrts.h +++ b/lib/simd_wrapper/simde/arm/neon/rsqrts.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_RSQRTS_H) @@ -37,6 +38,26 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16_t +simde_vrsqrtsh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrsqrtsh_f16(a, b); + #else + return + simde_vmulh_f16( + simde_vsubh_f16( + SIMDE_FLOAT16_VALUE(3.0), + simde_vmulh_f16(a, b)), + SIMDE_FLOAT16_VALUE(0.5) + ); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrsqrtsh_f16 + #define vrsqrtsh_f16(a, b) simde_vrsqrtsh_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32_t simde_vrsqrtss_f32(simde_float32_t a, simde_float32_t b) { @@ -65,6 +86,26 @@ simde_vrsqrtsd_f64(simde_float64_t a, simde_float64_t b) { #define vrsqrtsd_f64(a, b) simde_vrsqrtsd_f64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vrsqrts_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrsqrts_f16(a, b); + #else + return + simde_vmul_n_f16( + simde_vsub_f16( + simde_vdup_n_f16(SIMDE_FLOAT16_VALUE(3.0)), + simde_vmul_f16(a, b)), + SIMDE_FLOAT16_VALUE(0.5) + ); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrsqrts_f16 + #define vrsqrts_f16(a, b) simde_vrsqrts_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vrsqrts_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -107,6 +148,26 @@ simde_vrsqrts_f64(simde_float64x1_t a, simde_float64x1_t b) { #define vrsqrts_f64(a, b) simde_vrsqrts_f64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vrsqrtsq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vrsqrtsq_f16(a, b); + #else + return + simde_vmulq_n_f16( + simde_vsubq_f16( + simde_vdupq_n_f16(SIMDE_FLOAT16_VALUE(3.0)), + simde_vmulq_f16(a, b)), + SIMDE_FLOAT16_VALUE(0.5) + ); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vrsqrtsq_f16 + #define vrsqrtsq_f16(a, b) simde_vrsqrtsq_f16((a), (b)) +#endif + 
SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vrsqrtsq_f32(simde_float32x4_t a, simde_float32x4_t b) { diff --git a/lib/simd_wrapper/simde/arm/neon/rsubhn.h b/lib/simd_wrapper/simde/arm/neon/rsubhn.h new file mode 100644 index 00000000000..2d6a15da46a --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/rsubhn.h @@ -0,0 +1,182 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RSUBHN_H) +#define SIMDE_ARM_NEON_RSUBHN_H + +#include "sub.h" +#include "shr_n.h" +#include "movn.h" + +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int8x8_t +simde_vrsubhn_s16(simde_int16x8_t a, simde_int16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrsubhn_s16(a, b); + #else + simde_int16x8_private + r_, + a_ = simde_int16x8_to_private(a), + b_ = simde_int16x8_to_private(b); + int16_t round_cast = 1 << 7; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int16_t, a_.values[i] - b_.values[i] + round_cast); + } + return simde_vmovn_s16(simde_vshrq_n_s16(simde_int16x8_from_private(r_), 8)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_s16 + #define vrsubhn_s16(a, b) simde_vrsubhn_s16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x4_t +simde_vrsubhn_s32(simde_int32x4_t a, simde_int32x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrsubhn_s32(a, b); + #else + simde_int32x4_private + r_, + a_ = simde_int32x4_to_private(a), + b_ = simde_int32x4_to_private(b); + int round_cast = 1 << 15; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] - b_.values[i] + round_cast; + } + return simde_vmovn_s32(simde_vshrq_n_s32(simde_int32x4_from_private(r_), 16)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_s32 + #define vrsubhn_s32(a, b) simde_vrsubhn_s32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vrsubhn_s64(simde_int64x2_t a, simde_int64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrsubhn_s64(a, b); + #else + simde_int64x2_private + r_, + a_ = simde_int64x2_to_private(a), + b_ = simde_int64x2_to_private(b); + int64_t round_cast = 1ll 
<< 31; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] - b_.values[i] + round_cast) >> 32); + } + return simde_vmovn_s64(simde_int64x2_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_s64 + #define vrsubhn_s64(a, b) simde_vrsubhn_s64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vrsubhn_u16(simde_uint16x8_t a, simde_uint16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrsubhn_u16(a, b); + #else + simde_uint16x8_private + r_, + a_ = simde_uint16x8_to_private(a), + b_ = simde_uint16x8_to_private(b); + uint16_t round_cast = 1 << 7; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, a_.values[i] - b_.values[i] + round_cast); + } + return simde_vmovn_u16(simde_vshrq_n_u16(simde_uint16x8_from_private(r_), 8)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_u16 + #define vrsubhn_u16(a, b) simde_vrsubhn_u16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x4_t +simde_vrsubhn_u32(simde_uint32x4_t a, simde_uint32x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrsubhn_u32(a, b); + #else + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b); + uint32_t round_cast = 1 << 15; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = a_.values[i] - b_.values[i] + round_cast; + } + return simde_vmovn_u32(simde_vshrq_n_u32(simde_uint32x4_from_private(r_), 16)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_u32 + #define vrsubhn_u32(a, b) simde_vrsubhn_u32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x2_t +simde_vrsubhn_u64(simde_uint64x2_t a, simde_uint64x2_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vrsubhn_u64(a, b); + #else + simde_uint64x2_private + r_, + a_ = simde_uint64x2_to_private(a), + b_ = simde_uint64x2_to_private(b); + uint64_t round_cast = 1ull << 31; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] - b_.values[i] + round_cast) >> 32); + } + return simde_vmovn_u64(simde_uint64x2_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_u64 + #define vrsubhn_u64(a, b) simde_vrsubhn_u64((a), (b)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RSUBHN_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/rsubhn_high.h b/lib/simd_wrapper/simde/arm/neon/rsubhn_high.h new file mode 100644 index 00000000000..d7b19849e02 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/rsubhn_high.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_RSUBHN_HIGH_H) +#define SIMDE_ARM_NEON_RSUBHN_HIGH_H + +#include "rsubhn.h" +#include "combine.h" + +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrsubhn_high_s16(r, a, b) vrsubhn_high_s16((r), (a), (b)) +#else + #define simde_vrsubhn_high_s16(r, a, b) simde_vcombine_s8(r, simde_vrsubhn_s16(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_high_s16 + #define vrsubhn_high_s16(r, a, b) simde_vrsubhn_high_s16((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrsubhn_high_s32(r, a, b) vrsubhn_high_s32((r), (a), (b)) +#else + #define simde_vrsubhn_high_s32(r, a, b) simde_vcombine_s16(r, simde_vrsubhn_s32(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_high_s32 + #define vrsubhn_high_s32(r, a, b) simde_vrsubhn_high_s32((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrsubhn_high_s64(r, a, b) vrsubhn_high_s64((r), (a), (b)) +#else + #define simde_vrsubhn_high_s64(r, a, b) simde_vcombine_s32(r, simde_vrsubhn_s64(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_high_s64 + #define vrsubhn_high_s64(r, a, b) simde_vrsubhn_high_s64((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrsubhn_high_u16(r, a, b) vrsubhn_high_u16((r), (a), (b)) +#else + #define simde_vrsubhn_high_u16(r, a, b) simde_vcombine_u8(r, simde_vrsubhn_u16(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_high_u16 + #define vrsubhn_high_u16(r, a, b) simde_vrsubhn_high_u16((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrsubhn_high_u32(r, a, b) vrsubhn_high_u32((r), (a), (b)) +#else + #define simde_vrsubhn_high_u32(r, a, b) simde_vcombine_u16(r, simde_vrsubhn_u32(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_high_u32 + #define vrsubhn_high_u32(r, a, b) simde_vrsubhn_high_u32((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vrsubhn_high_u64(r, a, b) vrsubhn_high_u64((r), (a), (b)) +#else + #define simde_vrsubhn_high_u64(r, a, b) simde_vcombine_u32(r, simde_vrsubhn_u64(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vrsubhn_high_u64 + #define vrsubhn_high_u64(r, a, b) simde_vrsubhn_high_u64((r), (a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_RSUBHN_HIGH_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/set_lane.h b/lib/simd_wrapper/simde/arm/neon/set_lane.h index 70291143af5..1c230f39b37 100644 --- a/lib/simd_wrapper/simde/arm/neon/set_lane.h +++ b/lib/simd_wrapper/simde/arm/neon/set_lane.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher 
(Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_SET_LANE_H) @@ -33,6 +34,25 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vset_lane_f16(simde_float16_t a, simde_float16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4_t r; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_4_(vset_lane_f16, r, (HEDLEY_UNREACHABLE(), v), lane, a, v); + #else + simde_float16x4_private v_ = simde_float16x4_to_private(v); + v_.values[lane] = a; + r = simde_float16x4_from_private(v_); + #endif + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vset_lane_f16 + #define vset_lane_f16(a, b, c) simde_vset_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vset_lane_f32(simde_float32_t a, simde_float32x2_t v, const int lane) @@ -226,6 +246,25 @@ simde_vset_lane_u64(uint64_t a, simde_uint64x1_t v, const int lane) #define vset_lane_u64(a, b, c) simde_vset_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vsetq_lane_f16(simde_float16_t a, simde_float16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_float16x8_t r; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_8_(vsetq_lane_f16, r, (HEDLEY_UNREACHABLE(), v), lane, a, v); + #else + simde_float16x8_private v_ = simde_float16x8_to_private(v); + v_.values[lane] = a; + r = simde_float16x8_from_private(v_); + #endif + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsetq_lane_f16 + #define vsetq_lane_f16(a, b, c) simde_vsetq_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vsetq_lane_f32(simde_float32_t a, simde_float32x4_t v, const int lane) @@ -416,6 +455,152 @@ simde_vsetq_lane_u64(uint64_t a, simde_uint64x2_t v, const int lane) #define vsetq_lane_u64(a, b, c) simde_vsetq_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vset_lane_p8(simde_poly8_t a, simde_poly8x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly8x8_t r; + simde_poly8x8_private v_ = simde_poly8x8_to_private(v); + v_.values[lane] = a; + r = simde_poly8x8_from_private(v_); + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vset_lane_p8(a, b, c) vset_lane_p8((a), (b), (c)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vset_lane_p8 + #define vset_lane_p8(a, b, c) simde_vset_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vset_lane_p16(simde_poly16_t a, simde_poly16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_poly16x4_t r; + simde_poly16x4_private v_ = simde_poly16x4_to_private(v); + v_.values[lane] = a; + r = simde_poly16x4_from_private(v_); + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vset_lane_p16(a, b, c) vset_lane_p16((a), (b), (c)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vset_lane_p16 + #define vset_lane_p16(a, b, c) simde_vset_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x1_t +simde_vset_lane_p64(simde_poly64_t a, simde_poly64x1_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + 
simde_poly64x1_t r; + simde_poly64x1_private v_ = simde_poly64x1_to_private(v); + v_.values[lane] = a; + r = simde_poly64x1_from_private(v_); + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vset_lane_p64(a, b, c) vset_lane_p64((a), (b), (c)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vset_lane_p64 + #define vset_lane_p64(a, b, c) simde_vset_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vsetq_lane_p8(simde_poly8_t a, simde_poly8x16_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + simde_poly8x16_t r; + simde_poly8x16_private v_ = simde_poly8x16_to_private(v); + v_.values[lane] = a; + r = simde_poly8x16_from_private(v_); + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vsetq_lane_p8(a, b, c) vsetq_lane_p8((a), (b), (c)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsetq_lane_p8 + #define vsetq_lane_p8(a, b, c) simde_vsetq_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vsetq_lane_p16(simde_poly16_t a, simde_poly16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_poly16x8_t r; + simde_poly16x8_private v_ = simde_poly16x8_to_private(v); + v_.values[lane] = a; + r = simde_poly16x8_from_private(v_); + return r; +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vsetq_lane_p16(a, b, c) vsetq_lane_p16((a), (b), (c)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsetq_lane_p16 + #define vsetq_lane_p16(a, b, c) simde_vsetq_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vsetq_lane_p64(simde_poly64_t a, simde_poly64x2_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_poly64x2_t r; + simde_poly64x2_private v_ = simde_poly64x2_to_private(v); + v_.values[lane] = a; + r = simde_poly64x2_from_private(v_); + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_CLANG_71362) + #define simde_vsetq_lane_p64(a, b, c) vsetq_lane_p64((a), (b), (c)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsetq_lane_p64 + #define vsetq_lane_p64(a, b, c) simde_vsetq_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x4_t +simde_vset_lane_bf16(simde_bfloat16_t a, simde_bfloat16x4_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_bfloat16x4_t r; + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_(vset_lane_bf16, r, (HEDLEY_UNREACHABLE(), v), lane, a, v); + #else + simde_bfloat16x4_private v_ = simde_bfloat16x4_to_private(v); + v_.values[lane] = a; + r = simde_bfloat16x4_from_private(v_); + #endif + return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vset_lane_bf16 + #define vset_lane_bf16(a, b, c) simde_vset_lane_bf16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_bfloat16x8_t +simde_vsetq_lane_bf16(simde_bfloat16_t a, simde_bfloat16x8_t v, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + simde_bfloat16x8_t r; + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_8_(vsetq_lane_bf16, r, (HEDLEY_UNREACHABLE(), v), lane, a, v); + #else + simde_bfloat16x8_private v_ = simde_bfloat16x8_to_private(v); + v_.values[lane] = a; + r = simde_bfloat16x8_from_private(v_); + #endif + 
return r; +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsetq_lane_bf16 + #define vsetq_lane_bf16(a, b, c) simde_vsetq_lane_bf16((a), (b), (c)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/sha1.h b/lib/simd_wrapper/simde/arm/neon/sha1.h new file mode 100644 index 00000000000..73b7988e316 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/sha1.h @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SHA1_H) +#define SIMDE_ARM_NEON_SHA1_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#define ROL(operand, N, shift) (((operand) >> (N-shift)) | ((operand) << (shift))) + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_vsha1h_u32(uint32_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha1h_u32(a); + #else + return ROL(a, 32, 30); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsha1h_u32 + #define vsha1h_u32(a) simde_vsha1h_u32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha1cq_u32(simde_uint32x4_t hash_abcd, uint32_t hash_e, simde_uint32x4_t wk) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha1cq_u32(hash_abcd, hash_e, wk); + #else + simde_uint32x4_private + x_ = simde_uint32x4_to_private(hash_abcd), + w_ = simde_uint32x4_to_private(wk); + uint32_t y_ = hash_e; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(x_.values) / sizeof(x_.values[0])) ; i++) { + uint32_t t = (((x_.values[2] ^ x_.values[3]) & x_.values[1]) ^ x_.values[3]); + y_ = y_ + ROL(x_.values[0], 32, 5) + t + w_.values[i]; + x_.values[1] = ROL(x_.values[1], 32, 30); + uint32_t tmp = y_; + y_ = 0x0 | x_.values[3]; + x_.values[3] = 0x0 | x_.values[2]; + x_.values[2] = 0x0 | x_.values[1]; + x_.values[1] = 0x0 | x_.values[0]; + x_.values[0] = tmp | 0x0; + } + return simde_uint32x4_from_private(x_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsha1cq_u32 + #define vsha1cq_u32(hash_abcd, hash_e, wk) simde_vsha1cq_u32((hash_abcd), (hash_e), (wk)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha1mq_u32(simde_uint32x4_t hash_abcd, uint32_t hash_e, simde_uint32x4_t wk) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && 
defined(SIMDE_ARCH_ARM_SHA2) + return vsha1mq_u32(hash_abcd, hash_e, wk); + #else + simde_uint32x4_private + x_ = simde_uint32x4_to_private(hash_abcd), + w_ = simde_uint32x4_to_private(wk); + uint32_t y_ = hash_e; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(x_.values) / sizeof(x_.values[0])) ; i++) { + uint32_t t = ((x_.values[1] & x_.values[2]) | ((x_.values[1] | x_.values[2]) & x_.values[3])); + y_ = y_ + ROL(x_.values[0], 32, 5) + t + w_.values[i]; + x_.values[1] = ROL(x_.values[1], 32, 30); + uint32_t tmp = y_; + y_ = 0x0 | x_.values[3]; + x_.values[3] = 0x0 | x_.values[2]; + x_.values[2] = 0x0 | x_.values[1]; + x_.values[1] = 0x0 | x_.values[0]; + x_.values[0] = tmp | 0x0; + } + return simde_uint32x4_from_private(x_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsha1mq_u32 + #define vsha1mq_u32(hash_abcd, hash_e, wk) simde_vsha1mq_u32((hash_abcd), (hash_e), (wk)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha1pq_u32(simde_uint32x4_t hash_abcd, uint32_t hash_e, simde_uint32x4_t wk) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha1pq_u32(hash_abcd, hash_e, wk); + #else + simde_uint32x4_private + x_ = simde_uint32x4_to_private(hash_abcd), + w_ = simde_uint32x4_to_private(wk); + uint32_t y_ = hash_e; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(x_.values) / sizeof(x_.values[0])) ; i++) { + uint32_t t = (x_.values[1] ^ x_.values[2] ^ x_.values[3]); + y_ = y_ + ROL(x_.values[0], 32, 5) + t + w_.values[i]; + x_.values[1] = ROL(x_.values[1], 32, 30); + uint32_t tmp = y_; + y_ = 0x0 | x_.values[3]; + x_.values[3] = 0x0 | x_.values[2]; + x_.values[2] = 0x0 | x_.values[1]; + x_.values[1] = 0x0 | x_.values[0]; + x_.values[0] = tmp | 0x0; + } + return simde_uint32x4_from_private(x_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsha1pq_u32 + #define vsha1pq_u32(hash_abcd, hash_e, wk) simde_vsha1pq_u32((hash_abcd), (hash_e), (wk)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha1su0q_u32(simde_uint32x4_t w0_3, simde_uint32x4_t w4_7, simde_uint32x4_t w8_11) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha1su0q_u32(w0_3, w4_7, w8_11); + #else + simde_uint32x4_private + r_, + x_ = simde_uint32x4_to_private(w0_3), + y_ = simde_uint32x4_to_private(w4_7), + z_ = simde_uint32x4_to_private(w8_11); + r_.values[3] = y_.values[1]; + r_.values[2] = y_.values[0]; + r_.values[1] = x_.values[3]; + r_.values[0] = x_.values[2]; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(x_.values) / sizeof(x_.values[0])) ; i++) { + r_.values[i] = r_.values[i] ^ x_.values[i] ^ z_.values[i]; + } + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsha1su0q_u32 + #define vsha1su0q_u32(w0_3, w4_7, w8_11) simde_vsha1su0q_u32((w0_3), (w4_7), (w8_11)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha1su1q_u32(simde_uint32x4_t tw0_3, simde_uint32x4_t tw12_15) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha1su1q_u32(tw0_3, tw12_15); + #else + simde_uint32x4_private + r_, + T_, + x_ = simde_uint32x4_to_private(tw0_3), + y_ = simde_uint32x4_to_private(tw12_15); + T_.values[0] = x_.values[0] ^ y_.values[1]; + T_.values[1] = x_.values[1] ^ y_.values[2]; + T_.values[2] = x_.values[2] ^ y_.values[3]; + T_.values[3] = x_.values[3] ^ 0x0; + r_.values[0] = ROL(T_.values[0], 32, 1); + r_.values[1] = 
ROL(T_.values[1], 32, 1); + r_.values[2] = ROL(T_.values[2], 32, 1); + r_.values[3] = ROL(T_.values[3], 32, 1) ^ ROL(T_.values[0], 32, 2); + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsha1su1q_u32 + #define vsha1su1q_u32(tw0_3, tw12_15) simde_vsha1su1q_u32((tw0_3), (tw12_15)) +#endif + +#undef ROL + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SHA1_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/sha256.h b/lib/simd_wrapper/simde/arm/neon/sha256.h new file mode 100644 index 00000000000..7012f4b0269 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/sha256.h @@ -0,0 +1,193 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SHA256_H) +#define SIMDE_ARM_NEON_SHA256_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#define ROR32(operand, shift) (((operand) >> (shift)) | ((operand) << (32-shift))) +#define ROL32(operand, shift) (((operand) >> (32-shift)) | ((operand) << (shift))) +#define LSR(operand, shift) ((operand) >> (shift)) +#define LSL(operand, shift) ((operand) << (shift)) + +static uint32_t simde_SHAchoose(uint32_t x, uint32_t y, uint32_t z) { + return (((y ^ z) & x) ^ z); +} + +static uint32_t simde_SHAmajority(uint32_t x, uint32_t y, uint32_t z) { + return ((x & y) | ((x | y) & z)); +} + +static uint32_t simde_SHAhashSIGMA0(uint32_t x) { + return ROR32(x, 2) ^ ROR32(x, 13) ^ ROR32(x, 22); +} + +static uint32_t simde_SHAhashSIGMA1(uint32_t x) { + return ROR32(x, 6) ^ ROR32(x, 11) ^ ROR32(x, 25); +} + +static simde_uint32x4_t +x_simde_sha256hash(simde_uint32x4_t x, simde_uint32x4_t y, simde_uint32x4_t w, int part1) { + uint32_t chs, maj, t; + simde_uint32x4_private + x_ = simde_uint32x4_to_private(x), + y_ = simde_uint32x4_to_private(y), + w_ = simde_uint32x4_to_private(w); + + for(int i = 0; i < 4; ++i) { + chs = simde_SHAchoose(y_.values[0], y_.values[1], y_.values[2]); + maj = simde_SHAmajority(x_.values[0], x_.values[1], x_.values[2]); + t = y_.values[3] + simde_SHAhashSIGMA1(y_.values[0]) + chs + w_.values[i]; + x_.values[3] = t + x_.values[3]; + y_.values[3] = t + simde_SHAhashSIGMA0(x_.values[0]) + maj; + uint32_t tmp = y_.values[3]; + y_.values[3] = 0x0 | y_.values[2]; + y_.values[2] = 0x0 | y_.values[1]; + y_.values[1] = 0x0 | y_.values[0]; + y_.values[0] = 0x0 | x_.values[3]; + x_.values[3] = 0x0 | x_.values[2]; + x_.values[2] = 0x0 | x_.values[1]; + x_.values[1] = 0x0 | x_.values[0]; + x_.values[0] = tmp | 0x0; + } + return (part1 == 1) ? 
simde_uint32x4_from_private(x_) : simde_uint32x4_from_private(y_); +} + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha256hq_u32(simde_uint32x4_t hash_efgh, simde_uint32x4_t hash_abcd, simde_uint32x4_t wk) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha256hq_u32(hash_efgh, hash_abcd, wk); + #else + return x_simde_sha256hash(hash_efgh, hash_abcd, wk, 1); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsha256hq_u32 + #define vsha256hq_u32(hash_efgh, hash_abcd, wk) simde_vsha256hq_u32((hash_efgh), (hash_abcd), (wk)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha256h2q_u32(simde_uint32x4_t hash_efgh, simde_uint32x4_t hash_abcd, simde_uint32x4_t wk) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha256h2q_u32(hash_efgh, hash_abcd, wk); + #else + return x_simde_sha256hash(hash_abcd, hash_efgh, wk, 0); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsha256h2q_u32 + #define vsha256h2q_u32(hash_efgh, hash_abcd, wk) simde_vsha256h2q_u32((hash_efgh), (hash_abcd), (wk)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha256su0q_u32(simde_uint32x4_t w0_3, simde_uint32x4_t w4_7) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha256su0q_u32(w0_3, w4_7); + #else + simde_uint32x4_private + r_, + T_, + x_ = simde_uint32x4_to_private(w0_3), + y_ = simde_uint32x4_to_private(w4_7); + T_.values[3] = y_.values[0]; + T_.values[2] = x_.values[3]; + T_.values[1] = x_.values[2]; + T_.values[0] = x_.values[1]; + uint32_t elt; + for(int i = 0; i < 4; ++i) { + elt = T_.values[i]; + elt = ROR32(elt, 7) ^ ROR32(elt, 18) ^ LSR(elt, 3); + r_.values[i] = elt + x_.values[i]; + } + return simde_uint32x4_from_private(r_); + + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsha256su0q_u32 + #define vsha256su0q_u32(w0_3, w4_7) simde_vsha256su0q_u32((w0_3), (w4_7)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsha256su1q_u32(simde_uint32x4_t tw0_3, simde_uint32x4_t w8_11, simde_uint32x4_t w12_15) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA2) + return vsha256su1q_u32(tw0_3, w8_11, w12_15); + #else + simde_uint32x4_private + r_, + T0_, + x_ = simde_uint32x4_to_private(tw0_3), + y_ = simde_uint32x4_to_private(w8_11), + z_ = simde_uint32x4_to_private(w12_15); + simde_uint32x2_private T1_; + T0_.values[3] = z_.values[0]; + T0_.values[2] = y_.values[3]; + T0_.values[1] = y_.values[2]; + T0_.values[0] = y_.values[1]; + uint32_t elt; + T1_.values[1] = z_.values[3]; + T1_.values[0] = z_.values[2]; + for(int i = 0; i < 2; ++i) { + elt = T1_.values[i]; + elt = ROR32(elt, 17) ^ ROR32(elt, 19) ^ LSR(elt, 10); + elt = elt + x_.values[i] + T0_.values[i]; + r_.values[i] = elt; + } + T1_.values[1] = r_.values[1]; + T1_.values[0] = r_.values[0]; + for(int i = 2; i < 4; ++i) { + elt = T1_.values[i-2]; + elt = ROR32(elt, 17) ^ ROR32(elt, 19) ^ LSR(elt, 10); + elt = elt + x_.values[i] + T0_.values[i]; + r_.values[i] = elt; + } + return simde_uint32x4_from_private(r_); + + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsha256su1q_u32 + #define vsha256su1q_u32(tw0_3, w8_11, w12_15) simde_vsha256su1q_u32((tw0_3), (w8_11), (w12_15)) +#endif + +#undef ROR32 +#undef ROL32 +#undef LSR +#undef LSL + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SHA256_H) */ diff --git 
a/lib/simd_wrapper/simde/arm/neon/sha512.h b/lib/simd_wrapper/simde/arm/neon/sha512.h new file mode 100644 index 00000000000..e3b202cadcb --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/sha512.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SHA512_H) +#define SIMDE_ARM_NEON_SHA512_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#define ROR64(operand, shift) (((operand) >> (shift)) | ((operand) << (64-shift))) +#define ROL64(operand, shift) (((operand) >> (64-shift)) | ((operand) << (shift))) +#define LSR(operand, shift) ((operand) >> (shift)) +#define LSL(operand, shift) ((operand) << (shift)) + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vsha512hq_u64(simde_uint64x2_t w, simde_uint64x2_t x, simde_uint64x2_t y) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA512) + return vsha512hq_u64(w, x, y); + #else + simde_uint64x2_private + r_, + w_ = simde_uint64x2_to_private(w), + x_ = simde_uint64x2_to_private(x), + y_ = simde_uint64x2_to_private(y); + uint64_t Msigma1; + uint64_t tmp; + Msigma1 = ROR64(y_.values[1], 14) ^ ROR64(y_.values[1], 18) ^ ROR64(y_.values[1], 41); + r_.values[1] = (y_.values[1] & x_.values[0]) ^ (~(y_.values[1]) & x_.values[1]); + r_.values[1] = (r_.values[1] + Msigma1 + w_.values[1]); + tmp = r_.values[1] + y_.values[0]; + Msigma1 = ROR64(tmp, 14) ^ ROR64(tmp, 18) ^ ROR64(tmp, 41); + r_.values[0] = (tmp & y_.values[1]) ^ (~(tmp) & x_.values[0]); + r_.values[0] = (r_.values[0] + Msigma1 + w_.values[0]); + return simde_uint64x2_from_private(r_); + + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsha512hq_u64 + #define vsha512hq_u64(w, x, y) simde_vsha512hq_u64((w), (x), (y)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vsha512h2q_u64(simde_uint64x2_t w, simde_uint64x2_t x, simde_uint64x2_t y) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA512) + return vsha512h2q_u64(w, x, y); + #else + simde_uint64x2_private + r_, + w_ = simde_uint64x2_to_private(w), + x_ = simde_uint64x2_to_private(x), + y_ = simde_uint64x2_to_private(y); + uint64_t Msigma0; + Msigma0 = ROR64(y_.values[0], 28) ^ ROR64(y_.values[0], 34) ^ ROR64(y_.values[0], 39); + r_.values[1] = (y_.values[1] & x_.values[0]) ^ (y_.values[0] & 
x_.values[0]) ^ (y_.values[1] & y_.values[0]); + r_.values[1] = (r_.values[1] + Msigma0 + w_.values[1]); + Msigma0 = ROR64(r_.values[1], 28) ^ ROR64(r_.values[1], 34) ^ ROR64(r_.values[1], 39); + r_.values[0] = (r_.values[1] & y_.values[0]) ^ (r_.values[1] & y_.values[1]) ^ (y_.values[1] & y_.values[0]); + r_.values[0] = (r_.values[0] + Msigma0 + w_.values[0]); + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsha512h2q_u64 + #define vsha512h2q_u64(w, x, y) simde_vsha512h2q_u64((w), (x), (y)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vsha512su0q_u64(simde_uint64x2_t w, simde_uint64x2_t x) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA512) + return vsha512su0q_u64(w, x); + #else + simde_uint64x2_private + r_, + w_ = simde_uint64x2_to_private(w), + x_ = simde_uint64x2_to_private(x); + uint64_t sig0; + sig0 = ROR64(w_.values[1], 1) ^ ROR64(w_.values[1], 8) ^ (w_.values[1] >> 7); + r_.values[0] = w_.values[0] + sig0; + sig0 = ROR64(x_.values[0], 1) ^ ROR64(x_.values[0], 8) ^ (x_.values[0] >> 7); + r_.values[1] = w_.values[1] + sig0; + return simde_uint64x2_from_private(r_); + + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsha512su0q_u64 + #define vsha512su0q_u64(w, x) simde_vsha512su0q_u64((w), (x)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vsha512su1q_u64(simde_uint64x2_t w, simde_uint64x2_t x, simde_uint64x2_t y) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA512) + return vsha512su1q_u64(w, x, y); + #else + simde_uint64x2_private + r_, + w_ = simde_uint64x2_to_private(w), + x_ = simde_uint64x2_to_private(x), + y_ = simde_uint64x2_to_private(y); + uint64_t sig1; + sig1 = ROR64(x_.values[1], 19) ^ ROR64(x_.values[1], 61) ^ (x_.values[1] >> 6); + r_.values[1] = w_.values[1] + sig1 + y_.values[1]; + sig1 = ROR64(x_.values[0], 19) ^ ROR64(x_.values[0], 61) ^ (x_.values[0] >> 6); + r_.values[0] = w_.values[0] + sig1 + y_.values[0]; + return simde_uint64x2_from_private(r_); + + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsha512su1q_u64 + #define vsha512su1q_u64(w, x, y) simde_vsha512su1q_u64((w), (x), (y)) +#endif + +#undef ROR64 +#undef ROL64 +#undef LSR +#undef LSL + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SHA512_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/shl.h b/lib/simd_wrapper/simde/arm/neon/shl.h index 3799fbab657..b4fd842562e 100644 --- a/lib/simd_wrapper/simde/arm/neon/shl.h +++ b/lib/simd_wrapper/simde/arm/neon/shl.h @@ -29,6 +29,7 @@ #define SIMDE_ARM_NEON_SHL_H #include "types.h" +#include "../../x86/avx.h" /* Notes from the implementer (Christopher Moore aka rosbif) * @@ -99,7 +100,7 @@ SIMDE_FUNCTION_ATTRIBUTES uint64_t simde_vshld_u64 (const uint64_t a, const int64_t b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vshld_u64(a, HEDLEY_STATIC_CAST(uint64_t, b)); + return vshld_u64(a, HEDLEY_STATIC_CAST(int64_t, b)); #else int8_t b_ = HEDLEY_STATIC_CAST(int8_t, b); return @@ -140,7 +141,7 @@ simde_vshl_s8 (const simde_int8x8_t a, const simde_int8x8_t b) { _mm256_srav_epi32(a256, _mm256_abs_epi32(b256)), _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400)); - r_.m64 = _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0)); + r_.m64 = _mm_set_pi32(simde_mm256_extract_epi32(r256, 4), simde_mm256_extract_epi32(r256, 0)); #else 
SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -305,7 +306,7 @@ simde_vshl_u8 (const simde_uint8x8_t a, const simde_int8x8_t b) { _mm256_srlv_epi32(a256, _mm256_abs_epi32(b256)), _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi32(0x0C080400)); - r_.m64 = _mm_set_pi32(_mm256_extract_epi32(r256, 4), _mm256_extract_epi32(r256, 0)); + r_.m64 = _mm_set_pi32(simde_mm256_extract_epi32(r256, 4), simde_mm256_extract_epi32(r256, 0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -533,7 +534,7 @@ simde_vshlq_s16 (const simde_int16x8_t a, const simde_int16x8_t b) { _mm256_srav_epi32(a256, _mm256_abs_epi32(b256)), _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100)); - r_.m128i = _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0)); + r_.m128i = _mm_set_epi64x(simde_mm256_extract_epi64(r256, 2), simde_mm256_extract_epi64(r256, 0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { @@ -743,7 +744,7 @@ simde_vshlq_u16 (const simde_uint16x8_t a, const simde_int16x8_t b) { _mm256_srlv_epi32(a256, _mm256_abs_epi32(b256)), _mm256_cmpgt_epi32(_mm256_setzero_si256(), b256)); r256 = _mm256_shuffle_epi8(r256, _mm256_set1_epi64x(0x0D0C090805040100)); - r_.m128i = _mm_set_epi64x(_mm256_extract_epi64(r256, 2), _mm256_extract_epi64(r256, 0)); + r_.m128i = _mm_set_epi64x(simde_mm256_extract_epi64(r256, 2), simde_mm256_extract_epi64(r256, 0)); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { diff --git a/lib/simd_wrapper/simde/arm/neon/shll_high_n.h b/lib/simd_wrapper/simde/arm/neon/shll_high_n.h new file mode 100644 index 00000000000..962d409a511 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/shll_high_n.h @@ -0,0 +1,180 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SHLL_HIGH_N_H) +#define SIMDE_ARM_NEON_SHLL_HIGH_N_H + +#include "types.h" + +/* + * The constant range requirements for the shift amount *n* looks strange. + * The ARM Neon Intrinsics Reference states that for *_s8, 0 << n << 7. 
This + * does not match the actual instruction decoding in the ARM Reference manual, + * which states that the shift amount "must be equal to the source element width + * in bits" (ARM DDI 0487F.b C7-1959). So for *_s8 instructions, *n* must be 8, + * for *_s16, it must be 16, and *_s32 must be 32 (similarly for unsigned). + */ + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int16x8_t +simde_vshll_high_n_s8 (const simde_int8x16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 7) { + simde_int16x8_private r_; + simde_int8x16_private a_ = simde_int8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int16_t, HEDLEY_STATIC_CAST(int16_t, a_.values[i+(sizeof(r_.values) / sizeof(r_.values[0]))]) << n); + } + + return simde_int16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshll_high_n_s8(a, n) vshll_high_n_s8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshll_high_n_s8 + #define vshll_high_n_s8(a, n) simde_vshll_high_n_s8((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vshll_high_n_s16 (const simde_int16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 15) { + simde_int32x4_private r_; + simde_int16x8_private a_ = simde_int16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int32_t, a_.values[i+(sizeof(r_.values) / sizeof(r_.values[0]))]) << n; + } + + return simde_int32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshll_high_n_s16(a, n) vshll_high_n_s16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshll_high_n_s16 + #define vshll_high_n_s16(a, n) simde_vshll_high_n_s16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int64x2_t +simde_vshll_high_n_s32 (const simde_int32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 31) { + simde_int64x2_private r_; + simde_int32x4_private a_ = simde_int32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(int64_t, a_.values[i+(sizeof(r_.values) / sizeof(r_.values[0]))]) << n; + } + + return simde_int64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshll_high_n_s32(a, n) vshll_high_n_s32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshll_high_n_s32 + #define vshll_high_n_s32(a, n) simde_vshll_high_n_s32((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint16x8_t +simde_vshll_high_n_u8 (const simde_uint8x16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 7) { + simde_uint16x8_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint16_t, a_.values[i+(sizeof(r_.values) / sizeof(r_.values[0]))]) << n); + } + + return simde_uint16x8_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshll_high_n_u8(a, n) vshll_high_n_u8((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshll_high_n_u8 + #define vshll_high_n_u8(a, n) simde_vshll_high_n_u8((a), (n)) +#endif + 
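/*
 * A minimal, standalone sketch of the semantics the vshll_high_n_* emulations
 * above implement (illustrative only; not part of SIMDe or of this patch):
 * take the high half of the input vector, widen each lane, then shift left by
 * n. The helper name and the sample values below are arbitrary choices made
 * for this example.
 */
#include <stdint.h>
#include <stdio.h>

/* Scalar model of vshll_high_n_s8(a, n): lanes 8..15 of a 16-lane int8
 * vector are widened to int16 and shifted left by n. */
static void shll_high_n_s8_model(const int8_t a[16], int n, int16_t r[8]) {
  for (int i = 0; i < 8; ++i) {
    r[i] = (int16_t) ((int16_t) a[i + 8] << n);
  }
}

int main(void) {
  int8_t a[16] = {0};
  int16_t r[8];
  a[8]  = -3;                      /* only the high half (lanes 8..15) is used */
  a[15] = 100;
  shll_high_n_s8_model(a, 4, r);   /* n = 4 lies in the 1..7 range required above */
  printf("%d %d\n", r[0], r[7]);   /* prints: -48 1600 */
  return 0;
}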
+SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vshll_high_n_u16 (const simde_uint16x8_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 15) { + simde_uint32x4_private r_; + simde_uint16x8_private a_ = simde_uint16x8_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint32_t, a_.values[i+(sizeof(r_.values) / sizeof(r_.values[0]))]) << n; + } + + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshll_high_n_u16(a, n) vshll_high_n_u16((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshll_high_n_u16 + #define vshll_high_n_u16(a, n) simde_vshll_high_n_u16((a), (n)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vshll_high_n_u32 (const simde_uint32x4_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 31) { + simde_uint64x2_private r_; + simde_uint32x4_private a_ = simde_uint32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = HEDLEY_STATIC_CAST(uint64_t, a_.values[i+(sizeof(r_.values) / sizeof(r_.values[0]))]) << n; + } + + return simde_uint64x2_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshll_high_n_u32(a, n) vshll_high_n_u32((a), (n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshll_high_n_u32 + #define vshll_high_n_u32(a, n) simde_vshll_high_n_u32((a), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SHLL_HIGH_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/shll_n.h b/lib/simd_wrapper/simde/arm/neon/shll_n.h index 36fb96eaa0d..898e307ed6e 100644 --- a/lib/simd_wrapper/simde/arm/neon/shll_n.h +++ b/lib/simd_wrapper/simde/arm/neon/shll_n.h @@ -46,7 +46,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_int16x8_t simde_vshll_n_s8 (const simde_int8x8_t a, const int n) - SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 7) { + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 8) { simde_int16x8_private r_; simde_int8x8_private a_ = simde_int8x8_to_private(a); @@ -68,7 +68,7 @@ simde_vshll_n_s8 (const simde_int8x8_t a, const int n) SIMDE_FUNCTION_ATTRIBUTES simde_int32x4_t simde_vshll_n_s16 (const simde_int16x4_t a, const int n) - SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 15) { + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 16) { simde_int32x4_private r_; simde_int16x4_private a_ = simde_int16x4_to_private(a); @@ -90,7 +90,7 @@ simde_vshll_n_s16 (const simde_int16x4_t a, const int n) SIMDE_FUNCTION_ATTRIBUTES simde_int64x2_t simde_vshll_n_s32 (const simde_int32x2_t a, const int n) - SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 31) { + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 32) { simde_int64x2_private r_; simde_int32x2_private a_ = simde_int32x2_to_private(a); @@ -112,7 +112,7 @@ simde_vshll_n_s32 (const simde_int32x2_t a, const int n) SIMDE_FUNCTION_ATTRIBUTES simde_uint16x8_t simde_vshll_n_u8 (const simde_uint8x8_t a, const int n) - SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 7) { + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 8) { simde_uint16x8_private r_; simde_uint8x8_private a_ = simde_uint8x8_to_private(a); @@ -134,7 +134,7 @@ simde_vshll_n_u8 (const simde_uint8x8_t a, const int n) SIMDE_FUNCTION_ATTRIBUTES simde_uint32x4_t simde_vshll_n_u16 (const simde_uint16x4_t a, const int n) - SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 15) { + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 16) { simde_uint32x4_private r_; simde_uint16x4_private a_ = simde_uint16x4_to_private(a); @@ -156,7 
+156,7 @@ simde_vshll_n_u16 (const simde_uint16x4_t a, const int n) SIMDE_FUNCTION_ATTRIBUTES simde_uint64x2_t simde_vshll_n_u32 (const simde_uint32x2_t a, const int n) - SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 31) { + SIMDE_REQUIRE_CONSTANT_RANGE(n, 0, 32) { simde_uint64x2_private r_; simde_uint32x2_private a_ = simde_uint32x2_to_private(a); diff --git a/lib/simd_wrapper/simde/arm/neon/shr_n.h b/lib/simd_wrapper/simde/arm/neon/shr_n.h index 5c912571ef6..10f77d78683 100644 --- a/lib/simd_wrapper/simde/arm/neon/shr_n.h +++ b/lib/simd_wrapper/simde/arm/neon/shr_n.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_SHR_N_H) @@ -34,6 +35,20 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +int16_t +simde_x_vshrh_n_s16(int16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + return a >> ((n == 16) ? 15 : n); +} + +SIMDE_FUNCTION_ATTRIBUTES +uint16_t +simde_x_vshrh_n_u16(uint16_t a, const int n) + SIMDE_REQUIRE_CONSTANT_RANGE(n, 1, 16) { + return (n == 16) ? 0 : a >> n; +} + SIMDE_FUNCTION_ATTRIBUTES int32_t simde_x_vshrs_n_s32(int32_t a, const int n) diff --git a/lib/simd_wrapper/simde/arm/neon/shrn_high_n.h b/lib/simd_wrapper/simde/arm/neon/shrn_high_n.h new file mode 100644 index 00000000000..bb45c37c808 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/shrn_high_n.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SHRN_HIGH_N_H) +#define SIMDE_ARM_NEON_SHRN_HIGH_N_H + +#include "types.h" +#include "reinterpret.h" +#include "combine.h" +#include "shrn_n.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshrn_high_n_s16(r, a, n) vshrn_high_n_s16((r), (a), (n)) +#else + #define simde_vshrn_high_n_s16(r, a, n) \ + simde_vcombine_s8((r), simde_vshrn_n_s16((a), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshrn_high_n_s16 + #define vshrn_high_n_s16(r, a, n) simde_vshrn_high_n_s16((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshrn_high_n_s32(r, a, n) vshrn_high_n_s32((r), (a), (n)) +#else + #define simde_vshrn_high_n_s32(r, a, n) \ + simde_vcombine_s16((r), simde_vshrn_n_s32((a), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshrn_high_n_s32 + #define vshrn_high_n_s32(r, a, n) simde_vshrn_high_n_s32((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshrn_high_n_s64(r, a, n) vshrn_high_n_s64((r), (a), (n)) +#else + #define simde_vshrn_high_n_s64(r, a, n) \ + simde_vcombine_s32((r), simde_vshrn_n_s64((a), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshrn_high_n_s64 + #define vshrn_high_n_s64(r, a, n) simde_vshrn_high_n_s64((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshrn_high_n_u16(r, a, n) vshrn_high_n_u16((r), (a), (n)) +#else + #define simde_vshrn_high_n_u16(r, a, n) \ + simde_vreinterpretq_u8_s8( \ + simde_vcombine_s8(simde_vreinterpret_s8_u8(r), \ + simde_vshrn_n_s16(simde_vreinterpretq_s16_u16(a), (n)))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshrn_high_n_u16 + #define vshrn_high_n_u16(r, a, n) simde_vshrn_high_n_u16((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshrn_high_n_u32(r, a, n) vshrn_high_n_u32((r), (a), (n)) +#else + #define simde_vshrn_high_n_u32(r, a, n) \ + simde_vreinterpretq_u16_s16( \ + simde_vcombine_s16(simde_vreinterpret_s16_u16(r), \ + simde_vshrn_n_s32(simde_vreinterpretq_s32_u32(a), (n)))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshrn_high_n_u32 + #define vshrn_high_n_u32(r, a, n) simde_vshrn_high_n_u32((r), (a), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vshrn_high_n_u64(r, a, n) vshrn_high_n_u64((r), (a), (n)) +#else + #define simde_vshrn_high_n_u64(r, a, n) \ + simde_vreinterpretq_u32_s32( \ + simde_vcombine_s32(simde_vreinterpret_s32_u32(r), \ + simde_vshrn_n_s64(simde_vreinterpretq_s64_u64(a), (n)))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vshrn_high_n_u64 + #define vshrn_high_n_u64(r, a, n) simde_vshrn_high_n_u64((r), (a), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SHRN_HIGH_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/shrn_n.h b/lib/simd_wrapper/simde/arm/neon/shrn_n.h index 6e890b43177..bba58a8ec20 100644 --- a/lib/simd_wrapper/simde/arm/neon/shrn_n.h +++ b/lib/simd_wrapper/simde/arm/neon/shrn_n.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_SHRN_N_H) @@ -107,40 +108,36 @@ 
simde_vshrn_n_s64 (const simde_int64x2_t a, const int n) #define vshrn_n_s64(a, n) simde_vshrn_n_s64((a), (n)) #endif -#define simde_vshrn_n_u16(a, n) \ - simde_vreinterpret_u8_s8( \ - simde_vshrn_n_s16(simde_vreinterpretq_s16_u16(a), (n))) - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #undef simde_vshrn_n_u16 #define simde_vshrn_n_u16(a, n) vshrn_n_u16((a), (n)) +#else + #define simde_vshrn_n_u16(a, n) \ + simde_vreinterpret_u8_s8( \ + simde_vshrn_n_s16(simde_vreinterpretq_s16_u16(a), (n))) #endif - #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vshrn_n_u16 #define vshrn_n_u16(a, n) simde_vshrn_n_u16((a), (n)) #endif -#define simde_vshrn_n_u32(a, n) \ - simde_vreinterpret_u16_s16( \ - simde_vshrn_n_s32(simde_vreinterpretq_s32_u32(a), (n))) - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #undef simde_vshrn_n_u32 #define simde_vshrn_n_u32(a, n) vshrn_n_u32((a), (n)) +#else + #define simde_vshrn_n_u32(a, n) \ + simde_vreinterpret_u16_s16( \ + simde_vshrn_n_s32(simde_vreinterpretq_s32_u32(a), (n))) #endif #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vshrn_n_u32 #define vshrn_n_u32(a, n) simde_vshrn_n_u32((a), (n)) #endif -#define simde_vshrn_n_u64(a, n) \ - simde_vreinterpret_u32_s32( \ - simde_vshrn_n_s64(simde_vreinterpretq_s64_u64(a), (n))) - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #undef simde_vshrn_n_u64 #define simde_vshrn_n_u64(a, n) vshrn_n_u64((a), (n)) +#else + #define simde_vshrn_n_u64(a, n) \ + simde_vreinterpret_u32_s32( \ + simde_vshrn_n_s64(simde_vreinterpretq_s64_u64(a), (n))) #endif #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vshrn_n_u64 diff --git a/lib/simd_wrapper/simde/arm/neon/sli_n.h b/lib/simd_wrapper/simde/arm/neon/sli_n.h new file mode 100644 index 00000000000..1fff37abd57 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/sli_n.h @@ -0,0 +1,343 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SLI_N_H) +#define SIMDE_ARM_NEON_SLI_N_H + +#include "types.h" +#include "shl_n.h" +#include "dup_n.h" +#include "and.h" +#include "orr.h" +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vslid_n_s64(a, b, n) vslid_n_s64(a, b, n) +#else + #define simde_vslid_n_s64(a, b, n) \ + HEDLEY_STATIC_CAST(int64_t, \ + simde_vslid_n_u64(HEDLEY_STATIC_CAST(uint64_t, a), HEDLEY_STATIC_CAST(uint64_t, b), n)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vslid_n_s64 + #define vslid_n_s64(a, b, n) simde_vslid_n_s64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vslid_n_u64(a, b, n) vslid_n_u64(a, b, n) +#else +#define simde_vslid_n_u64(a, b, n) \ + (((a & (UINT64_C(0xffffffffffffffff) >> (64 - n))) | simde_vshld_n_u64((b), (n)))) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vslid_n_u64 + #define vslid_n_u64(a, b, n) simde_vslid_n_u64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_s8(a, b, n) vsli_n_s8((a), (b), (n)) +#else + #define simde_vsli_n_s8(a, b, n) \ + simde_vreinterpret_s8_u8(simde_vsli_n_u8( \ + simde_vreinterpret_u8_s8((a)), simde_vreinterpret_u8_s8((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_s8 + #define vsli_n_s8(a, b, n) simde_vsli_n_s8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_u8(a, b, n) vsli_n_u8((a), (b), (n)) +#else + #define simde_vsli_n_u8(a, b, n) \ + simde_vorr_u8( \ + simde_vand_u8((a), simde_vdup_n_u8((UINT8_C(0xff) >> (8 - n)))), \ + simde_vshl_n_u8((b), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_u8 + #define vsli_n_u8(a, b, n) simde_vsli_n_u8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_s16(a, b, n) vsli_n_s16((a), (b), (n)) +#else + #define simde_vsli_n_s16(a, b, n) \ + simde_vreinterpret_s16_u16(simde_vsli_n_u16( \ + simde_vreinterpret_u16_s16((a)), simde_vreinterpret_u16_s16((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_s16 + #define vsli_n_s16(a, b, n) simde_vsli_n_s16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_u16(a, b, n) vsli_n_u16((a), (b), (n)) +#else + #define simde_vsli_n_u16(a, b, n) \ + simde_vorr_u16( \ + simde_vand_u16((a), simde_vdup_n_u16((UINT16_C(0xffff) >> (16 - n)))), \ + simde_vshl_n_u16((b), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_u16 + #define vsli_n_u16(a, b, n) simde_vsli_n_u16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_s32(a, b, n) vsli_n_s32((a), (b), (n)) +#else + #define simde_vsli_n_s32(a, b, n) \ + simde_vreinterpret_s32_u32(simde_vsli_n_u32( \ + simde_vreinterpret_u32_s32((a)), simde_vreinterpret_u32_s32((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_s32 + #define vsli_n_s32(a, b, n) simde_vsli_n_s32((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_u32(a, b, n) vsli_n_u32((a), (b), (n)) +#else + #define simde_vsli_n_u32(a, b, n) \ + simde_vorr_u32( \ + simde_vand_u32((a), \ + simde_vdup_n_u32((UINT32_C(0xffffffff) >> (32 - n)))), 
\ + simde_vshl_n_u32((b), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_u32 + #define vsli_n_u32(a, b, n) simde_vsli_n_u32((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_s64(a, b, n) vsli_n_s64((a), (b), (n)) +#else + #define simde_vsli_n_s64(a, b, n) \ + simde_vreinterpret_s64_u64(simde_vsli_n_u64( \ + simde_vreinterpret_u64_s64((a)), simde_vreinterpret_u64_s64((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_s64 + #define vsli_n_s64(a, b, n) simde_vsli_n_s64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_u64(a, b, n) vsli_n_u64((a), (b), (n)) +#else +#define simde_vsli_n_u64(a, b, n) \ + simde_vorr_u64( \ + simde_vand_u64((a), simde_vdup_n_u64( \ + (UINT64_C(0xffffffffffffffff) >> (64 - n)))), \ + simde_vshl_n_u64((b), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_u64 + #define vsli_n_u64(a, b, n) simde_vsli_n_u64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_s8(a, b, n) vsliq_n_s8((a), (b), (n)) +#else + #define simde_vsliq_n_s8(a, b, n) \ + simde_vreinterpretq_s8_u8(simde_vsliq_n_u8( \ + simde_vreinterpretq_u8_s8((a)), simde_vreinterpretq_u8_s8((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_s8 + #define vsliq_n_s8(a, b, n) simde_vsliq_n_s8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_u8(a, b, n) vsliq_n_u8((a), (b), (n)) +#else + #define simde_vsliq_n_u8(a, b, n) \ + simde_vorrq_u8( \ + simde_vandq_u8((a), simde_vdupq_n_u8((UINT8_C(0xff) >> (8 - n)))), \ + simde_vshlq_n_u8((b), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_u8 + #define vsliq_n_u8(a, b, n) simde_vsliq_n_u8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_s16(a, b, n) vsliq_n_s16((a), (b), (n)) +#else + #define simde_vsliq_n_s16(a, b, n) \ + simde_vreinterpretq_s16_u16(simde_vsliq_n_u16( \ + simde_vreinterpretq_u16_s16((a)), simde_vreinterpretq_u16_s16((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_s16 + #define vsliq_n_s16(a, b, n) simde_vsliq_n_s16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_u16(a, b, n) vsliq_n_u16((a), (b), (n)) +#else + #define simde_vsliq_n_u16(a, b, n) \ + simde_vorrq_u16( \ + simde_vandq_u16((a), simde_vdupq_n_u16((UINT16_C(0xffff) >> (16 - n)))), \ + simde_vshlq_n_u16((b), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_u16 + #define vsliq_n_u16(a, b, n) simde_vsliq_n_u16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_s32(a, b, n) vsliq_n_s32((a), (b), (n)) +#else + #define simde_vsliq_n_s32(a, b, n) \ + simde_vreinterpretq_s32_u32(simde_vsliq_n_u32( \ + simde_vreinterpretq_u32_s32((a)), simde_vreinterpretq_u32_s32((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_s32 + #define vsliq_n_s32(a, b, n) simde_vsliq_n_s32((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_u32(a, b, n) vsliq_n_u32((a), (b), (n)) +#else + #define simde_vsliq_n_u32(a, b, n) \ + simde_vorrq_u32( \ + simde_vandq_u32((a), \ + simde_vdupq_n_u32((UINT32_C(0xffffffff) >> (32 - n)))), \ + simde_vshlq_n_u32((b), (n))) +#endif +#if 
defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_u32 + #define vsliq_n_u32(a, b, n) simde_vsliq_n_u32((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_s64(a, b, n) vsliq_n_s64((a), (b), (n)) +#else + #define simde_vsliq_n_s64(a, b, n) \ + simde_vreinterpretq_s64_u64(simde_vsliq_n_u64( \ + simde_vreinterpretq_u64_s64((a)), simde_vreinterpretq_u64_s64((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_s64 + #define vsliq_n_s64(a, b, n) simde_vsliq_n_s64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_u64(a, b, n) vsliq_n_u64((a), (b), (n)) +#else +#define simde_vsliq_n_u64(a, b, n) \ + simde_vorrq_u64( \ + simde_vandq_u64((a), simde_vdupq_n_u64( \ + (UINT64_C(0xffffffffffffffff) >> (64 - n)))), \ + simde_vshlq_n_u64((b), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_u64 + #define vsliq_n_u64(a, b, n) simde_vsliq_n_u64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_p8(a, b, n) vsli_n_p8((a), (b), (n)) +#else + #define simde_vsli_n_p8(a, b, n) \ + simde_vreinterpret_p8_u8(simde_vsli_n_u8( \ + simde_vreinterpret_u8_p8((a)), simde_vreinterpret_u8_p8((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_p8 + #define vsli_n_p8(a, b, n) simde_vsli_n_p8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsli_n_p16(a, b, n) vsli_n_p16((a), (b), (n)) +#else + #define simde_vsli_n_p16(a, b, n) \ + simde_vreinterpret_p16_u16(simde_vsli_n_u16( \ + simde_vreinterpret_u16_p16((a)), simde_vreinterpret_u16_p16((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsli_n_p16 + #define vsli_n_p16(a, b, n) simde_vsli_n_p16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vsli_n_p64(a, b, n) vsli_n_p64((a), (b), (n)) +#else + #define simde_vsli_n_p64(a, b, n) \ + simde_vreinterpret_p64_u64(simde_vsli_n_u64( \ + simde_vreinterpret_u64_p64((a)), simde_vreinterpret_u64_p64((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsli_n_p64 + #define vsli_n_p64(a, b, n) simde_vsli_n_p64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_p8(a, b, n) vsliq_n_p8((a), (b), (n)) +#else + #define simde_vsliq_n_p8(a, b, n) \ + simde_vreinterpretq_p8_u8(simde_vsliq_n_u8( \ + simde_vreinterpretq_u8_p8((a)), simde_vreinterpretq_u8_p8((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_p8 + #define vsliq_n_p8(a, b, n) simde_vsliq_n_p8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsliq_n_p16(a, b, n) vsliq_n_p16((a), (b), (n)) +#else + #define simde_vsliq_n_p16(a, b, n) \ + simde_vreinterpretq_p16_u16(simde_vsliq_n_u16( \ + simde_vreinterpretq_u16_p16((a)), simde_vreinterpretq_u16_p16((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_p16 + #define vsliq_n_p16(a, b, n) simde_vsliq_n_p16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vsliq_n_p64(a, b, n) vsliq_n_p64((a), (b), (n)) +#else + #define simde_vsliq_n_p64(a, b, n) \ + simde_vreinterpretq_p64_u64(simde_vsliq_n_u64( \ + simde_vreinterpretq_u64_p64((a)), simde_vreinterpretq_u64_p64((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsliq_n_p64 + 
#define vsliq_n_p64(a, b, n) simde_vsliq_n_p64((a), (b), (n)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SLI_N_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/sm3.h b/lib/simd_wrapper/simde/arm/neon/sm3.h new file mode 100644 index 00000000000..64f02c9d84d --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/sm3.h @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SM3_H) +#define SIMDE_ARM_NEON_SM3_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#define ROR32(operand, shift) (((operand) >> (shift)) | ((operand) << (32-shift))) +#define ROL32(operand, shift) (((operand) >> (32-shift)) | ((operand) << (shift))) +#define LSR(operand, shift) ((operand) >> (shift)) +#define LSL(operand, shift) ((operand) << (shift)) + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm3ss1q_u32(simde_uint32x4_t n, simde_uint32x4_t m, simde_uint32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) + return vsm3ss1q_u32(n, m, a); + #else + simde_uint32x4_private + r_, + n_ = simde_uint32x4_to_private(n), + m_ = simde_uint32x4_to_private(m), + a_ = simde_uint32x4_to_private(a); + r_.values[3] = ROL32((ROL32(n_.values[3], 12) + m_.values[3] + a_.values[3]), 7); + r_.values[2] = 0; + r_.values[1] = 0; + r_.values[0] = 0; + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsm3ss1q_u32 + #define vsm3ss1q_u32(n, m, a) simde_vsm3ss1q_u32((n), (m), (a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm3tt1aq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c, const int imm2) + SIMDE_REQUIRE_CONSTANT_RANGE(imm2, 0, 3) +{ + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + uint32_t WjPrime, TT1, SS2; + + WjPrime = c_.values[imm2]; + SS2 = b_.values[3] ^ ROL32(a_.values[3], 12); + TT1 = a_.values[1] ^ (a_.values[3] ^ a_.values[2]); + TT1 = (TT1 + a_.values[0] + SS2 + WjPrime); + r_.values[0] = a_.values[1]; + r_.values[1] = ROL32(a_.values[2], 9); + r_.values[2] = a_.values[3]; + r_.values[3] = TT1; + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && 
defined(SIMDE_ARCH_ARM_SM3) + #define simde_vsm3tt1aq_u32(a, b, c, imm2) vsm3tt1aq_u32((a), (b), (c), (imm2)); +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsm3tt1aq_u32 + #define vsm3tt1aq_u32(a, b, c, imm2) simde_vsm3tt1aq_u32((a), (b), (c), (imm2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm3tt1bq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c, const int imm2) + SIMDE_REQUIRE_CONSTANT_RANGE(imm2, 0, 3) +{ + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + uint32_t WjPrime, TT1, SS2; + + WjPrime = c_.values[imm2]; + SS2 = b_.values[3] ^ ROL32(a_.values[3], 12); + TT1 = (a_.values[3] & a_.values[1]) | (a_.values[3] & a_.values[2]) | (a_.values[1] & a_.values[2]); + TT1 = (TT1 + a_.values[0] + SS2 + WjPrime); + r_.values[0] = a_.values[1]; + r_.values[1] = ROL32(a_.values[2], 9); + r_.values[2] = a_.values[3]; + r_.values[3] = TT1; + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) + #define simde_vsm3tt1bq_u32(a, b, c, imm2) vsm3tt1bq_u32((a), (b), (c), (imm2)); +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsm3tt1bq_u32 + #define vsm3tt1bq_u32(a, b, c, imm2) simde_vsm3tt1bq_u32((a), (b), (c), (imm2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm3tt2aq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c, const int imm2) + SIMDE_REQUIRE_CONSTANT_RANGE(imm2, 0, 3) +{ + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + uint32_t Wj, TT2; + + Wj = c_.values[imm2]; + TT2 = a_.values[1] ^ (a_.values[3] ^ a_.values[2]); + TT2 = (TT2 + a_.values[0] + b_.values[3] + Wj); + r_.values[0] = a_.values[1]; + r_.values[1] = ROL32(a_.values[2], 19); + r_.values[2] = a_.values[3]; + r_.values[3] = TT2 ^ ROL32(TT2, 9) ^ ROL32(TT2, 17); + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) + #define simde_vsm3tt2aq_u32(a, b, c, imm2) vsm3tt2aq_u32((a), (b), (c), (imm2)); +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsm3tt2aq_u32 + #define vsm3tt2aq_u32(a, b, c, imm2) simde_vsm3tt2aq_u32((a), (b), (c), (imm2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm3tt2bq_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c, const int imm2) + SIMDE_REQUIRE_CONSTANT_RANGE(imm2, 0, 3) +{ + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + uint32_t Wj, TT2; + + Wj = c_.values[imm2]; + TT2 = (a_.values[3] & a_.values[2]) | (~(a_.values[3]) & a_.values[1]); + TT2 = (TT2 + a_.values[0] + b_.values[3] + Wj); + r_.values[0] = a_.values[1]; + r_.values[1] = ROL32(a_.values[2], 19); + r_.values[2] = a_.values[3]; + r_.values[3] = TT2 ^ ROL32(TT2, 9) ^ ROL32(TT2, 17); + return simde_uint32x4_from_private(r_); +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) + #define simde_vsm3tt2bq_u32(a, b, c, imm2) vsm3tt2bq_u32((a), (b), (c), (imm2)); +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsm3tt2bq_u32 + #define vsm3tt2bq_u32(a, b, c, imm2) simde_vsm3tt2bq_u32((a), (b), (c), (imm2)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm3partw1q_u32(simde_uint32x4_t a, simde_uint32x4_t b, 
simde_uint32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) + return vsm3partw1q_u32(a, b, c); + #else + simde_uint32x4_private + r_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + r_.values[2] = (a_.values[2] ^ b_.values[2]) ^ (ROL32(c_.values[3], 15)); + r_.values[1] = (a_.values[1] ^ b_.values[1]) ^ (ROL32(c_.values[2], 15)); + r_.values[0] = (a_.values[0] ^ b_.values[0]) ^ (ROL32(c_.values[1], 15)); + for(int i = 0; i < 4; ++i) { + if (i == 3) { + r_.values[3] = (a_.values[3] ^ b_.values[3]) ^ (ROL32(r_.values[0], 15)); + } + r_.values[i] = r_.values[i] ^ ROL32(r_.values[i], 15) ^ ROL32(r_.values[i], 23); + } + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsm3partw1q_u32 + #define vsm3partw1q_u32(a, b, c) simde_vsm3partw1q_u32((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm3partw2q_u32(simde_uint32x4_t a, simde_uint32x4_t b, simde_uint32x4_t c) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM3) + return vsm3partw2q_u32(a, b, c); + #else + simde_uint32x4_private + r_, + tmp_, + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b), + c_ = simde_uint32x4_to_private(c); + uint32_t tmp2; + tmp_.values[3] = b_.values[3] ^ (ROL32(c_.values[3], 7)); + tmp_.values[2] = b_.values[2] ^ (ROL32(c_.values[2], 7)); + tmp_.values[1] = b_.values[1] ^ (ROL32(c_.values[1], 7)); + tmp_.values[0] = b_.values[0] ^ (ROL32(c_.values[0], 7)); + r_.values[3] = a_.values[3] ^ tmp_.values[3]; + r_.values[2] = a_.values[2] ^ tmp_.values[2]; + r_.values[1] = a_.values[1] ^ tmp_.values[1]; + r_.values[0] = a_.values[0] ^ tmp_.values[0]; + tmp2 = ROL32(tmp_.values[0], 15); + tmp2 = tmp2 ^ ROL32(tmp2, 15) ^ ROL32(tmp2, 23); + r_.values[3] = r_.values[3] ^ tmp2; + + return simde_uint32x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsm3partw2q_u32 + #define vsm3partw2q_u32(a, b, c) simde_vsm3partw2q_u32((a), (b), (c)) +#endif + +#undef ROR32 +#undef ROL32 +#undef LSR +#undef LSL + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SM3_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/sm4.h b/lib/simd_wrapper/simde/arm/neon/sm4.h new file mode 100644 index 00000000000..d1b36a6fecb --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/sm4.h @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SM4_H) +#define SIMDE_ARM_NEON_SM4_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#define ROR32(operand, shift) (((operand) >> (shift)) | ((operand) << (32-shift))) +#define ROL32(operand, shift) (((operand) >> (32-shift)) | ((operand) << (shift))) +#define LSR(operand, shift) ((operand) >> (shift)) +#define LSL(operand, shift) ((operand) << (shift)) + +static const uint8_t simde_sbox_sm4[256] = { + 0xd6,0x90,0xe9,0xfe,0xcc,0xe1,0x3d,0xb7,0x16,0xb6,0x14,0xc2,0x28,0xfb,0x2c,0x05, + 0x2b,0x67,0x9a,0x76,0x2a,0xbe,0x04,0xc3,0xaa,0x44,0x13,0x26,0x49,0x86,0x06,0x99, + 0x9c,0x42,0x50,0xf4,0x91,0xef,0x98,0x7a,0x33,0x54,0x0b,0x43,0xed,0xcf,0xac,0x62, + 0xe4,0xb3,0x1c,0xa9,0xc9,0x08,0xe8,0x95,0x80,0xdf,0x94,0xfa,0x75,0x8f,0x3f,0xa6, + 0x47,0x07,0xa7,0xfc,0xf3,0x73,0x17,0xba,0x83,0x59,0x3c,0x19,0xe6,0x85,0x4f,0xa8, + 0x68,0x6b,0x81,0xb2,0x71,0x64,0xda,0x8b,0xf8,0xeb,0x0f,0x4b,0x70,0x56,0x9d,0x35, + 0x1e,0x24,0x0e,0x5e,0x63,0x58,0xd1,0xa2,0x25,0x22,0x7c,0x3b,0x01,0x21,0x78,0x87, + 0xd4,0x00,0x46,0x57,0x9f,0xd3,0x27,0x52,0x4c,0x36,0x02,0xe7,0xa0,0xc4,0xc8,0x9e, + 0xea,0xbf,0x8a,0xd2,0x40,0xc7,0x38,0xb5,0xa3,0xf7,0xf2,0xce,0xf9,0x61,0x15,0xa1, + 0xe0,0xae,0x5d,0xa4,0x9b,0x34,0x1a,0x55,0xad,0x93,0x32,0x30,0xf5,0x8c,0xb1,0xe3, + 0x1d,0xf6,0xe2,0x2e,0x82,0x66,0xca,0x60,0xc0,0x29,0x23,0xab,0x0d,0x53,0x4e,0x6f, + 0xd5,0xdb,0x37,0x45,0xde,0xfd,0x8e,0x2f,0x03,0xff,0x6a,0x72,0x6d,0x6c,0x5b,0x51, + 0x8d,0x1b,0xaf,0x92,0xbb,0xdd,0xbc,0x7f,0x11,0xd9,0x5c,0x41,0x1f,0x10,0x5a,0xd8, + 0x0a,0xc1,0x31,0x88,0xa5,0xcd,0x7b,0xbd,0x2d,0x74,0xd0,0x12,0xb8,0xe5,0xb4,0xb0, + 0x89,0x69,0x97,0x4a,0x0c,0x96,0x77,0x7e,0x65,0xb9,0xf1,0x09,0xc5,0x6e,0xc6,0x84, + 0x18,0xf0,0x7d,0xec,0x3a,0xdc,0x4d,0x20,0x79,0xee,0x5f,0x3e,0xd7,0xcb,0x39,0x48 +}; + +static void simde_u32_to_u8x4(uint32_t src, uint8_t* dst) { + for(int i = 0; i < 4; ++i) { + *(dst + i) = HEDLEY_STATIC_CAST(uint8_t, ((src << (i * 8)) >> 24)); + } +} + +static void simde_u32_from_u8x4(uint8_t* src, uint32_t* dst) { + *dst = 0; + for(int i = 0; i < 4; ++i) { + *dst = *dst | (HEDLEY_STATIC_CAST(uint32_t, src[i]) << (24 - i * 8)); + } +} + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm4eq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM4) + return vsm4eq_u32(a, b); + #else + simde_uint32x4_private + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b); + uint32_t intval, roundkey; + uint8_t _intval[4]; + for(int index = 0; index < 4; ++index) { + roundkey = b_.values[index]; + + intval = a_.values[3] ^ a_.values[2] ^ a_.values[1] ^ roundkey; + + simde_u32_to_u8x4(intval, _intval); + for(int i = 0; i < 4; ++i) { + _intval[i] = simde_sbox_sm4[_intval[i]]; + } + simde_u32_from_u8x4(_intval, &intval); + intval = intval ^ ROL32(intval, 2) ^ ROL32(intval, 10) ^ ROL32(intval, 18) ^ ROL32(intval, 24); + intval = intval ^ a_.values[0]; + + a_.values[0] = a_.values[1]; + a_.values[1] = a_.values[2]; + a_.values[2] = a_.values[3]; + a_.values[3] = intval; + } + return simde_uint32x4_from_private(a_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsm4eq_u32 + #define vsm4eq_u32(a, b) simde_vsm4eq_u32((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint32x4_t +simde_vsm4ekeyq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SM4) + return vsm4ekeyq_u32(a, b); + #else + simde_uint32x4_private + a_ = simde_uint32x4_to_private(a), + b_ = simde_uint32x4_to_private(b); + uint32_t intval, constval; + uint8_t _intval[4]; + for(int index = 0; index < 4; ++index) { + constval = b_.values[index]; + + intval = a_.values[3] ^ a_.values[2] ^ a_.values[1] ^ constval; + + simde_u32_to_u8x4(intval, _intval); + for(int i = 0; i < 4; ++i) { + _intval[i] = simde_sbox_sm4[_intval[i]]; + } + simde_u32_from_u8x4(_intval, &intval); + intval = intval ^ ROL32(intval, 13) ^ ROL32(intval, 23); + intval = intval ^ a_.values[0]; + + a_.values[0] = a_.values[1]; + a_.values[1] = a_.values[2]; + a_.values[2] = a_.values[3]; + a_.values[3] = intval; + } + return simde_uint32x4_from_private(a_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsm4ekeyq_u32 + #define vsm4ekeyq_u32(a, b) simde_vsm4ekeyq_u32((a), (b)) +#endif + +#undef ROR32 +#undef ROL32 +#undef LSR +#undef LSL + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SM4_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/sqadd.h b/lib/simd_wrapper/simde/arm/neon/sqadd.h index 6e1b7e25c46..7e39cf10fdd 100644 --- a/lib/simd_wrapper/simde/arm/neon/sqadd.h +++ b/lib/simd_wrapper/simde/arm/neon/sqadd.h @@ -30,6 +30,20 @@ #include "types.h" #include +// Workaround on ARM64 windows due to windows SDK bug +// https://developercommunity.visualstudio.com/t/In-arm64_neonh-vsqaddb_u8-vsqaddh_u16/10271747?sort=newest +#if (defined _MSC_VER) && (defined SIMDE_ARM_NEON_A64V8_NATIVE) && (_MSC_VER < 1938) +#pragma message ("Due to msvc bug, current version of msvc is supported by workaround. Recommend to update msvc") +#undef vsqaddb_u8 +#define vsqaddb_u8(src1, src2) neon_usqadds8(__uint8ToN8_v(src1), __int8ToN8_v(src2)).n8_u8[0] +#undef vsqaddh_u16 +#define vsqaddh_u16(src1, src2) neon_usqadds16(__uint16ToN16_v(src1), __int16ToN16_v(src2)).n16_u16[0] +#undef vsqadds_u32 +#define vsqadds_u32(src1, src2) _CopyUInt32FromFloat(neon_usqadds32(_CopyFloatFromUInt32(src1), _CopyFloatFromInt32(src2))) +#undef vsqaddd_u64 +#define vsqaddd_u64(src1, src2) neon_usqadds64(__uint64ToN64_v(src1), __int64ToN64_v(src2)).n64_u64[0] +#endif + HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ diff --git a/lib/simd_wrapper/simde/arm/neon/sqrt.h b/lib/simd_wrapper/simde/arm/neon/sqrt.h new file mode 100644 index 00000000000..0ddbc349456 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/sqrt.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SQRT_H) +#define SIMDE_ARM_NEON_SQRT_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16 +simde_vsqrth_f16(simde_float16_t a) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vsqrth_f16(a); + #elif defined(simde_math_sqrtf) + simde_float32 af = simde_float16_to_float32(a); + return simde_float16_from_float32(simde_math_sqrtf(af)); + #else + HEDLEY_UNREACHABLE(); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsqrth_f16 + #define vsqrth_f16(a) simde_vsqrth_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vsqrt_f16(simde_float16x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vsqrt_f16(a); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqrth_f16(a_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsqrt_f16 + #define vsqrt_f16(a) simde_vsqrt_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x2_t +simde_vsqrt_f32(simde_float32x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vsqrt_f32(a); + #elif defined(simde_math_sqrtf) + simde_float32x2_private + r_, + a_ = simde_float32x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_sqrtf(a_.values[i]); + } + + return simde_float32x2_from_private(r_); + #else + HEDLEY_UNREACHABLE(); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsqrt_f32 + #define vsqrt_f32(a) simde_vsqrt_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x1_t +simde_vsqrt_f64(simde_float64x1_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vsqrt_f64(a); + #elif defined(simde_math_sqrt) + simde_float64x1_private + r_, + a_ = simde_float64x1_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_sqrt(a_.values[i]); + } + + return simde_float64x1_from_private(r_); + #else + HEDLEY_UNREACHABLE(); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsqrt_f64 + #define vsqrt_f64(a) simde_vsqrt_f64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vsqrtq_f16(simde_float16x8_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vsqrtq_f16(a); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsqrth_f16(a_.values[i]); + } + + return 
simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsqrtq_f16 + #define vsqrtq_f16(a) simde_vsqrtq_f16((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32x4_t +simde_vsqrtq_f32(simde_float32x4_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vsqrtq_f32(a); + #elif defined(simde_math_sqrtf) + simde_float32x4_private + r_, + a_ = simde_float32x4_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_sqrtf(a_.values[i]); + } + + return simde_float32x4_from_private(r_); + #else + HEDLEY_UNREACHABLE(); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsqrtq_f32 + #define vsqrtq_f32(a) simde_vsqrtq_f32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64x2_t +simde_vsqrtq_f64(simde_float64x2_t a) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vsqrtq_f64(a); + #elif defined(simde_math_sqrt) + simde_float64x2_private + r_, + a_ = simde_float64x2_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_math_sqrt(a_.values[i]); + } + + return simde_float64x2_from_private(r_); + #else + HEDLEY_UNREACHABLE(); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsqrtq_f64 + #define vsqrtq_f64(a) simde_vsqrtq_f64((a)) +#endif + + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP +#endif /* !defined(SIMDE_ARM_NEON_SQRT_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/sri_n.h b/lib/simd_wrapper/simde/arm/neon/sri_n.h index f2b337703e5..d0213c2c969 100644 --- a/lib/simd_wrapper/simde/arm/neon/sri_n.h +++ b/lib/simd_wrapper/simde/arm/neon/sri_n.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) * 2021 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_SRI_N_H) @@ -266,6 +267,78 @@ SIMDE_BEGIN_DECLS_ #define vsriq_n_u64(a, b, n) simde_vsriq_n_u64((a), (b), (n)) #endif +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsri_n_p8(a, b, n) vsri_n_p8((a), (b), (n)) +#else + #define simde_vsri_n_p8(a, b, n) \ + simde_vreinterpret_p8_u8(simde_vsri_n_u8( \ + simde_vreinterpret_u8_p8((a)), simde_vreinterpret_u8_p8((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsri_n_p8 + #define vsri_n_p8(a, b, n) simde_vsri_n_p8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsri_n_p16(a, b, n) vsri_n_p16((a), (b), (n)) +#else + #define simde_vsri_n_p16(a, b, n) \ + simde_vreinterpret_p16_u16(simde_vsri_n_u16( \ + simde_vreinterpret_u16_p16((a)), simde_vreinterpret_u16_p16((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsri_n_p16 + #define vsri_n_p16(a, b, n) simde_vsri_n_p16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vsri_n_p64(a, b, n) vsri_n_p64((a), (b), (n)) +#else + #define simde_vsri_n_p64(a, b, n) \ + simde_vreinterpret_p64_u64(simde_vsri_n_u64( \ + simde_vreinterpret_u64_p64((a)), simde_vreinterpret_u64_p64((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsri_n_p64 + #define vsri_n_p64(a, b, n) simde_vsri_n_p64((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsriq_n_p8(a, b, n) vsriq_n_p8((a), (b), (n)) +#else + #define simde_vsriq_n_p8(a, b, n) \ + simde_vreinterpretq_p8_u8(simde_vsriq_n_u8( \ + 
simde_vreinterpretq_u8_p8((a)), simde_vreinterpretq_u8_p8((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsriq_n_p8 + #define vsriq_n_p8(a, b, n) simde_vsriq_n_p8((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #define simde_vsriq_n_p16(a, b, n) vsriq_n_p16((a), (b), (n)) +#else + #define simde_vsriq_n_p16(a, b, n) \ + simde_vreinterpretq_p16_u16(simde_vsriq_n_u16( \ + simde_vreinterpretq_u16_p16((a)), simde_vreinterpretq_u16_p16((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vsriq_n_p16 + #define vsriq_n_p16(a, b, n) simde_vsriq_n_p16((a), (b), (n)) +#endif + +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + #define simde_vsriq_n_p64(a, b, n) vsriq_n_p64((a), (b), (n)) +#else + #define simde_vsriq_n_p64(a, b, n) \ + simde_vreinterpretq_p64_u64(simde_vsriq_n_u64( \ + simde_vreinterpretq_u64_p64((a)), simde_vreinterpretq_u64_p64((b)), (n))) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsriq_n_p64 + #define vsriq_n_p64(a, b, n) simde_vsriq_n_p64((a), (b), (n)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/st1.h b/lib/simd_wrapper/simde/arm/neon/st1.h index 6d5901aac86..2e9b912a762 100644 --- a/lib/simd_wrapper/simde/arm/neon/st1.h +++ b/lib/simd_wrapper/simde/arm/neon/st1.h @@ -22,6 +22,8 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST1_H) @@ -40,7 +42,11 @@ simde_vst1_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x4_t val vst1_f16(ptr, val); #else simde_float16x4_private val_ = simde_float16x4_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + __riscv_vse16_v_f16m1((_Float16 *)ptr , val_.sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -55,7 +61,11 @@ simde_vst1_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float32x2_t val vst1_f32(ptr, val); #else simde_float32x2_private val_ = simde_float32x2_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_f32m1(ptr , val_.sv64 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -70,7 +80,11 @@ simde_vst1_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(1)], simde_float64x1_t val vst1_f64(ptr, val); #else simde_float64x1_private val_ = simde_float64x1_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_f64m1(ptr , val_.sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -85,7 +99,11 @@ simde_vst1_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int8x8_t val) { vst1_s8(ptr, val); #else simde_int8x8_private val_ = simde_int8x8_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_i8m1(ptr , val_.sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -100,7 +118,11 @@ simde_vst1_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int16x4_t val) { vst1_s16(ptr, val); #else simde_int16x4_private val_ = simde_int16x4_to_private(val); - simde_memcpy(ptr, &val_, 
sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_i16m1(ptr , val_.sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -115,7 +137,11 @@ simde_vst1_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_int32x2_t val) { vst1_s32(ptr, val); #else simde_int32x2_private val_ = simde_int32x2_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_i32m1(ptr , val_.sv64 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -130,7 +156,11 @@ simde_vst1_s64(int64_t ptr[HEDLEY_ARRAY_PARAM(1)], simde_int64x1_t val) { vst1_s64(ptr, val); #else simde_int64x1_private val_ = simde_int64x1_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_i64m1(ptr , val_.sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -145,7 +175,11 @@ simde_vst1_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint8x8_t val) { vst1_u8(ptr, val); #else simde_uint8x8_private val_ = simde_uint8x8_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_.sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -160,7 +194,11 @@ simde_vst1_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint16x4_t val) { vst1_u16(ptr, val); #else simde_uint16x4_private val_ = simde_uint16x4_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_.sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -175,7 +213,11 @@ simde_vst1_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint32x2_t val) { vst1_u32(ptr, val); #else simde_uint32x2_private val_ = simde_uint32x2_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_u32m1(ptr , val_.sv64 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -190,7 +232,11 @@ simde_vst1_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(1)], simde_uint64x1_t val) { vst1_u64(ptr, val); #else simde_uint64x1_private val_ = simde_uint64x1_to_private(val); - simde_memcpy(ptr, &val_, sizeof(val_)); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_.sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -208,6 +254,8 @@ simde_vst1q_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_float16x8_t va #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(SIMDE_ARCH_RISCV_ZVFH) + __riscv_vse16_v_f16m1((_Float16 *)ptr , val_.sv128 , 8); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -223,13 +271,13 @@ void simde_vst1q_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1q_f32(ptr, val); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - vec_st(val, 0, ptr); #else simde_float32x4_private val_ = simde_float32x4_to_private(val); #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif 
defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_f32m1(ptr , val_.sv128 , 4); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -250,6 +298,8 @@ simde_vst1q_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float64x2_t va #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_f64m1(ptr , val_.sv128 , 2); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -270,6 +320,8 @@ simde_vst1q_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int8x16_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_i8m1(ptr , val_.sv128 , 16); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -290,6 +342,8 @@ simde_vst1q_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int16x8_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_i16m1(ptr , val_.sv128 , 8); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -310,6 +364,8 @@ simde_vst1q_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int32x4_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_i32m1(ptr , val_.sv128 , 4); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -330,6 +386,8 @@ simde_vst1q_s64(int64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_int64x2_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_i64m1(ptr , val_.sv128 , 2); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -350,6 +408,8 @@ simde_vst1q_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint8x16_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_.sv128 , 16); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -370,6 +430,8 @@ simde_vst1q_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint16x8_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_.sv128 , 8); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -390,6 +452,8 @@ simde_vst1q_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint32x4_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse32_v_u32m1(ptr , val_.sv128 , 4); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -410,6 +474,8 @@ simde_vst1q_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x2_t val) { #if defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(ptr, val_.v128); + #elif defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_.sv128 , 2); #else simde_memcpy(ptr, &val_, sizeof(val_)); #endif @@ -420,6 +486,166 @@ simde_vst1q_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x2_t val) { #define vst1q_u64(a, b) simde_vst1q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_poly8x8_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1_p8(ptr, val); + #else + simde_poly8x8_private val_ = simde_poly8x8_to_private(val); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_.sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_p8 + #define vst1_p8(a, b) 
simde_vst1_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly16x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1_p16(ptr, val); + #else + simde_poly16x4_private val_ = simde_poly16x4_to_private(val); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_.sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_p16 + #define vst1_p16(a, b) simde_vst1_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(1)], simde_poly64x1_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + vst1_p64(ptr, val); + #else + simde_poly64x1_private val_ = simde_poly64x1_to_private(val); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_.sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1_p64 + #define vst1_p64(a, b) simde_vst1_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly8x16_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1q_p8(ptr, val); + #else + simde_poly8x16_private val_ = simde_poly8x16_to_private(val); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_.sv128 , 16); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_p8 + #define vst1q_p8(a, b) simde_vst1q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_poly16x8_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1q_p16(ptr, val); + #else + simde_poly16x8_private val_ = simde_poly16x8_to_private(val); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_.sv128 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_p16 + #define vst1q_p16(a, b) simde_vst1q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + vst1q_p64(ptr, val); + #else + simde_poly64x2_private val_ = simde_poly64x2_to_private(val); + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_.sv128 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_p64 + #define vst1q_p64(a, b) simde_vst1q_p64((a), (b)) +#endif + +#if !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vstrq_p128(simde_poly128_t ptr[HEDLEY_ARRAY_PARAM(1)], simde_poly128_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + vstrq_p128(ptr, val); + #else + simde_memcpy(ptr, &val, sizeof(val)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vstrq_p128 + #define vstrq_p128(a, b) simde_vstrq_p128((a), (b)) +#endif +#endif /* !defined(SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE) */ + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_bfloat16x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1_bf16(ptr, val); + #else + simde_bfloat16x4_private val_ = 
simde_bfloat16x4_to_private(val); + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1_bf16 + #define vst1_bf16(a, b) simde_vst1_bf16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_bfloat16x8_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1q_bf16(ptr, val); + #else + simde_bfloat16x8_private val_ = simde_bfloat16x8_to_private(val); + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_bf16 + #define vst1q_bf16(a, b) simde_vst1q_bf16((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/st1_lane.h b/lib/simd_wrapper/simde/arm/neon/st1_lane.h index f0e78365cfb..8facb2e778e 100644 --- a/lib/simd_wrapper/simde/arm/neon/st1_lane.h +++ b/lib/simd_wrapper/simde/arm/neon/st1_lane.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ST1_LANE_H) @@ -33,6 +34,22 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_lane_f16(simde_float16_t *ptr, simde_float16x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_4_NO_RESULT_(vst1_lane_f16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_float16x4_private val_ = simde_float16x4_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_lane_f16 + #define vst1_lane_f16(a, b, c) simde_vst1_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst1_lane_f32(simde_float32_t *ptr, simde_float32x2_t val, const int lane) @@ -196,6 +213,22 @@ simde_vst1_lane_u64(uint64_t *ptr, simde_uint64x1_t val, const int lane) #define vst1_lane_u64(a, b, c) simde_vst1_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_lane_f16(simde_float16_t *ptr, simde_float16x8_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst1q_lane_f16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_float16x8_private val_ = simde_float16x8_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_lane_f16 + #define vst1q_lane_f16(a, b, c) simde_vst1q_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst1q_lane_f32(simde_float32_t *ptr, simde_float32x4_t val, const int lane) @@ -356,6 +389,135 @@ simde_vst1q_lane_u64(uint64_t *ptr, simde_uint64x2_t val, const int lane) #define vst1q_lane_u64(a, b, c) simde_vst1q_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_lane_p8(simde_poly8_t *ptr, simde_poly8x8_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_8_NO_RESULT_(vst1_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x8_private val_ = simde_poly8x8_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_lane_p8 + #define vst1_lane_p8(a, b, c) 
simde_vst1_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_lane_p16(simde_poly16_t *ptr, simde_poly16x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_4_NO_RESULT_(vst1_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x4_private val_ = simde_poly16x4_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_lane_p16 + #define vst1_lane_p16(a, b, c) simde_vst1_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_lane_p64(simde_poly64_t *ptr, simde_poly64x1_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + (void) lane; + vst1_lane_p64(ptr, val, 0); + #else + simde_poly64x1_private val_ = simde_poly64x1_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1_lane_p64 + #define vst1_lane_p64(a, b, c) simde_vst1_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_lane_p8(simde_poly8_t *ptr, simde_poly8x16_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_16_NO_RESULT_(vst1q_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x16_private val_ = simde_poly8x16_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_lane_p8 + #define vst1q_lane_p8(a, b, c) simde_vst1q_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_lane_p16(simde_poly16_t *ptr, simde_poly16x8_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_8_NO_RESULT_(vst1q_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x8_private val_ = simde_poly16x8_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_lane_p16 + #define vst1q_lane_p16(a, b, c) simde_vst1q_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_lane_p64(simde_poly64_t *ptr, simde_poly64x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + SIMDE_CONSTIFY_2_NO_RESULT_(vst1q_lane_p64, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly64x2_private val_ = simde_poly64x2_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_lane_p64 + #define vst1q_lane_p64(a, b, c) simde_vst1q_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_lane_bf16(simde_bfloat16_t *ptr, simde_bfloat16x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_NO_RESULT_(vst1_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x4_private val_ = simde_bfloat16x4_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1_lane_bf16 + #define vst1_lane_bf16(a, b, c) simde_vst1_lane_bf16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_lane_bf16(simde_bfloat16_t *ptr, simde_bfloat16x8_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { 
+ #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst1q_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x8_private val_ = simde_bfloat16x8_to_private(val); + *ptr = val_.values[lane]; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_lane_bf16 + #define vst1q_lane_bf16(a, b, c) simde_vst1q_lane_bf16((a), (b), (c)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/st1_x2.h b/lib/simd_wrapper/simde/arm/neon/st1_x2.h new file mode 100644 index 00000000000..2b9f94c96bf --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/st1_x2.h @@ -0,0 +1,311 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2020 Evan Nemerson + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_ST1_X2_H) +#define SIMDE_ARM_NEON_ST1_X2_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f16_x2(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_float16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_f16_x2(ptr, val); + #else + simde_float16x4_private a_[2] = {simde_float16x4_to_private(val.val[0]), + simde_float16x4_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a_[0].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+4 , a_[1].sv64 , 4); + #else + simde_float16_t buf[8]; + for (size_t i = 0; i < 8; i++) { + buf[i] = a_[i / 4].values[i % 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_f16_x2 + #define vst1_f16_x2(ptr, val) simde_vst1_f16_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f32_x2(simde_float32 ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_f32_x2(ptr, val); + #else + simde_vst1_f32(ptr, val.val[0]); + simde_vst1_f32(ptr+2, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_f32_x2 + #define vst1_f32_x2(ptr, val) simde_vst1_f32_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f64_x2(simde_float64 ptr[HEDLEY_ARRAY_PARAM(2)], simde_float64x1x2_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst1_f64_x2(ptr, val); + #else + simde_vst1_f64(ptr, val.val[0]); + simde_vst1_f64(ptr+1, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst1_f64_x2 + #define vst1_f64_x2(ptr, val) simde_vst1_f64_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s8_x2(int8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int8x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s8_x2(ptr, val); + #else + simde_vst1_s8(ptr, val.val[0]); + simde_vst1_s8(ptr+8, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_s8_x2 + #define vst1_s8_x2(ptr, val) simde_vst1_s8_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s16_x2(int16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s16_x2(ptr, val); + #else + simde_vst1_s16(ptr, val.val[0]); + simde_vst1_s16(ptr+4, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_s16_x2 + #define vst1_s16_x2(ptr, val) simde_vst1_s16_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s32_x2(int32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int32x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s32_x2(ptr, val); + #else + simde_vst1_s32(ptr, val.val[0]); + simde_vst1_s32(ptr+2, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_s32_x2 + #define vst1_s32_x2(ptr, val) 
simde_vst1_s32_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s64_x2(int64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_int64x1x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s64_x2(ptr, val); + #else + simde_vst1_s64(ptr, val.val[0]); + simde_vst1_s64(ptr+1, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_s64_x2 + #define vst1_s64_x2(ptr, val) simde_vst1_s64_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u8_x2(uint8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint8x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u8_x2(ptr, val); + #else + simde_vst1_u8(ptr, val.val[0]); + simde_vst1_u8(ptr+8, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_u8_x2 + #define vst1_u8_x2(ptr, val) simde_vst1_u8_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u16_x2(uint16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u16_x2(ptr, val); + #else + simde_vst1_u16(ptr, val.val[0]); + simde_vst1_u16(ptr+4, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_u16_x2 + #define vst1_u16_x2(ptr, val) simde_vst1_u16_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u32_x2(uint32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint32x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u32_x2(ptr, val); + #else + simde_vst1_u32(ptr, val.val[0]); + simde_vst1_u32(ptr+2, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_u32_x2 + #define vst1_u32_x2(ptr, val) simde_vst1_u32_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u64_x2(uint64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x1x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u64_x2(ptr, val); + #else + simde_vst1_u64(ptr, val.val[0]); + simde_vst1_u64(ptr+1, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_u64_x2 + #define vst1_u64_x2(ptr, val) simde_vst1_u64_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p8_x2(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly8x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1_p8_x2(ptr, val); + #else + simde_poly8x8_private val_[2]; + for (size_t i = 0; i < 2; i++) { + val_[i] = simde_poly8x8_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+8 , val_[1].sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_p8_x2 + #define vst1_p8_x2(a, b) simde_vst1_p8_x2((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p16_x2(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_poly16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1_p16_x2(ptr, val); + #else + simde_poly16x4_private val_[2]; + for (size_t i = 0; i < 2; i++) { + val_[i] = 
simde_poly16x4_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+4 , val_[1].sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_p16_x2 + #define vst1_p16_x2(a, b) simde_vst1_p16_x2((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p64_x2(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x1x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1_p64_x2(ptr, val); + #else + simde_poly64x1_private val_[2]; + for (size_t i = 0; i < 2; i++) { + val_[i] = simde_poly64x1_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+1 , val_[1].sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1_p64_x2 + #define vst1_p64_x2(a, b) simde_vst1_p64_x2((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_bf16_x2(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_bfloat16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1_bf16_x2(ptr, val); + #else + simde_bfloat16x4_private val_[2]; + for (size_t i = 0; i < 2; i++) { + val_[i] = simde_bfloat16x4_to_private(val.val[i]); + } + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1_bf16_x2 + #define vst1_bf16_x2(a, b) simde_vst1_bf16_x2((a), (b)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ST1_X2_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/st1_x3.h b/lib/simd_wrapper/simde/arm/neon/st1_x3.h new file mode 100644 index 00000000000..510c9d67eda --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/st1_x3.h @@ -0,0 +1,326 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2020 Evan Nemerson + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_ST1_X3_H) +#define SIMDE_ARM_NEON_ST1_X3_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f16_x3(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst1_f16_x3(ptr, val); + #else + simde_float16x4_private a[3] = { simde_float16x4_to_private(val.val[0]), + simde_float16x4_to_private(val.val[1]), + simde_float16x4_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a[0].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+4 , a[1].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+8 , a[2].sv64 , 4); + #else + simde_float16_t buf[12]; + for (size_t i = 0; i < 12 ; i++) { + buf[i] = a[i / 4].values[i % 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_f16_x3 + #define vst1_f16_x3(a, b) simde_vst1_f16_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f32_x3(simde_float32 ptr[HEDLEY_ARRAY_PARAM(6)], simde_float32x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_f32_x3(ptr, val); + #else + simde_vst1_f32(ptr, val.val[0]); + simde_vst1_f32(ptr+2, val.val[1]); + simde_vst1_f32(ptr+4, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_f32_x3 + #define vst1_f32_x3(ptr, val) simde_vst1_f32_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f64_x3(simde_float64 ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x1x3_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst1_f64_x3(ptr, val); + #else + simde_vst1_f64(ptr, val.val[0]); + simde_vst1_f64(ptr+1, val.val[1]); + simde_vst1_f64(ptr+2, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst1_f64_x3 + #define vst1_f64_x3(ptr, val) simde_vst1_f64_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s8_x3(int8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int8x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s8_x3(ptr, val); + #else + simde_vst1_s8(ptr, val.val[0]); + simde_vst1_s8(ptr+8, val.val[1]); + simde_vst1_s8(ptr+16, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_s8_x3 + #define vst1_s8_x3(ptr, val) simde_vst1_s8_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s16_x3(int16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s16_x3(ptr, val); + #else + simde_vst1_s16(ptr, val.val[0]); + simde_vst1_s16(ptr+4, val.val[1]); + simde_vst1_s16(ptr+8, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_s16_x3 + #define vst1_s16_x3(ptr, val) simde_vst1_s16_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s32_x3(int32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int32x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s32_x3(ptr, val); + 
#else + simde_vst1_s32(ptr, val.val[0]); + simde_vst1_s32(ptr+2, val.val[1]); + simde_vst1_s32(ptr+4, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_s32_x3 + #define vst1_s32_x3(ptr, val) simde_vst1_s32_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s64_x3(int64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_int64x1x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s64_x3(ptr, val); + #else + simde_vst1_s64(ptr, val.val[0]); + simde_vst1_s64(ptr+1, val.val[1]); + simde_vst1_s64(ptr+2, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_s64_x3 + #define vst1_s64_x3(ptr, val) simde_vst1_s64_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u8_x3(uint8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint8x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u8_x3(ptr, val); + #else + simde_vst1_u8(ptr, val.val[0]); + simde_vst1_u8(ptr+8, val.val[1]); + simde_vst1_u8(ptr+16, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_u8_x3 + #define vst1_u8_x3(ptr, val) simde_vst1_u8_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u16_x3(uint16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u16_x3(ptr, val); + #else + simde_vst1_u16(ptr, val.val[0]); + simde_vst1_u16(ptr+4, val.val[1]); + simde_vst1_u16(ptr+8, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_u16_x3 + #define vst1_u16_x3(ptr, val) simde_vst1_u16_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u32_x3(uint32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint32x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u32_x3(ptr, val); + #else + simde_vst1_u32(ptr, val.val[0]); + simde_vst1_u32(ptr+2, val.val[1]); + simde_vst1_u32(ptr+4, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_u32_x3 + #define vst1_u32_x3(ptr, val) simde_vst1_u32_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u64_x3(uint64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x1x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u64_x3(ptr, val); + #else + simde_vst1_u64(ptr, val.val[0]); + simde_vst1_u64(ptr+1, val.val[1]); + simde_vst1_u64(ptr+2, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_u64_x3 + #define vst1_u64_x3(ptr, val) simde_vst1_u64_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p8_x3(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_poly8x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1_p8_x3(ptr, val); + #else + simde_poly8x8_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = simde_poly8x8_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+8 , val_[1].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+16 , val_[2].sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_p8_x3 + #define 
vst1_p8_x3(a, b) simde_vst1_p8_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p16_x3(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_poly16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1_p16_x3(ptr, val); + #else + simde_poly16x4_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = simde_poly16x4_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+4 , val_[1].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+8 , val_[2].sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_p16_x3 + #define vst1_p16_x3(a, b) simde_vst1_p16_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p64_x3(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x1x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1_p64_x3(ptr, val); + #else + simde_poly64x1_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = simde_poly64x1_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+1 , val_[1].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+2 , val_[2].sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1_p64_x3 + #define vst1_p64_x3(a, b) simde_vst1_p64_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_bf16_x3(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_bfloat16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1_bf16_x3(ptr, val); + #else + simde_bfloat16x4_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = simde_bfloat16x4_to_private(val.val[i]); + } + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1_bf16_x3 + #define vst1_bf16_x3(a, b) simde_vst1_bf16_x3((a), (b)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ST1_X3_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/st1_x4.h b/lib/simd_wrapper/simde/arm/neon/st1_x4.h new file mode 100644 index 00000000000..41f6db6e1ce --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/st1_x4.h @@ -0,0 +1,339 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2020 Evan Nemerson + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_ST1_X4_H) +#define SIMDE_ARM_NEON_ST1_X4_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f16_x4(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_float16x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst1_f16_x4(ptr, val); + #else + simde_float16x4_private a_[4] = { simde_float16x4_to_private(val.val[0]), simde_float16x4_to_private(val.val[1]), + simde_float16x4_to_private(val.val[2]), simde_float16x4_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a_[0].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+4 , a_[1].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+8 , a_[2].sv64 , 4); + __riscv_vse16_v_f16m1((_Float16 *)ptr+12 , a_[3].sv64 , 4); + #else + simde_float16_t buf[16]; + for (size_t i = 0; i < 16 ; i++) { + buf[i] = a_[i / 4].values[i % 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_f16_x4 + #define vst1_f16_x4(a, b) simde_vst1_f16_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f32_x4(simde_float32 ptr[HEDLEY_ARRAY_PARAM(8)], simde_float32x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_f32_x4(ptr, val); + #else + simde_vst1_f32(ptr, val.val[0]); + simde_vst1_f32(ptr+2, val.val[1]); + simde_vst1_f32(ptr+4, val.val[2]); + simde_vst1_f32(ptr+6, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_f32_x4 + #define vst1_f32_x4(ptr, val) simde_vst1_f32_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_f64_x4(simde_float64 ptr[HEDLEY_ARRAY_PARAM(4)], simde_float64x1x4_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst1_f64_x4(ptr, val); + #else + simde_vst1_f64(ptr, val.val[0]); + simde_vst1_f64(ptr+1, val.val[1]); + simde_vst1_f64(ptr+2, val.val[2]); + simde_vst1_f64(ptr+3, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst1_f64_x4 + #define vst1_f64_x4(ptr, val) simde_vst1_f64_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s8_x4(int8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_int8x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s8_x4(ptr, val); + #else + simde_vst1_s8(ptr, val.val[0]); + simde_vst1_s8(ptr+8, val.val[1]); + simde_vst1_s8(ptr+16, val.val[2]); + simde_vst1_s8(ptr+24, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_s8_x4 + #define vst1_s8_x4(ptr, val) simde_vst1_s8_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s16_x4(int16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int16x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s16_x4(ptr, val); + #else + 
simde_vst1_s16(ptr, val.val[0]); + simde_vst1_s16(ptr+4, val.val[1]); + simde_vst1_s16(ptr+8, val.val[2]); + simde_vst1_s16(ptr+12, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_s16_x4 + #define vst1_s16_x4(ptr, val) simde_vst1_s16_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s32_x4(int32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int32x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s32_x4(ptr, val); + #else + simde_vst1_s32(ptr, val.val[0]); + simde_vst1_s32(ptr+2, val.val[1]); + simde_vst1_s32(ptr+4, val.val[2]); + simde_vst1_s32(ptr+6, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_s32_x4 + #define vst1_s32_x4(ptr, val) simde_vst1_s32_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_s64_x4(int64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int64x1x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_s64_x4(ptr, val); + #else + simde_vst1_s64(ptr, val.val[0]); + simde_vst1_s64(ptr+1, val.val[1]); + simde_vst1_s64(ptr+2, val.val[2]); + simde_vst1_s64(ptr+3, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_s64_x4 + #define vst1_s64_x4(ptr, val) simde_vst1_s64_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u8_x4(uint8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_uint8x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u8_x4(ptr, val); + #else + simde_vst1_u8(ptr, val.val[0]); + simde_vst1_u8(ptr+8, val.val[1]); + simde_vst1_u8(ptr+16, val.val[2]); + simde_vst1_u8(ptr+24, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_u8_x4 + #define vst1_u8_x4(ptr, val) simde_vst1_u8_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u16_x4(uint16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint16x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u16_x4(ptr, val); + #else + simde_vst1_u16(ptr, val.val[0]); + simde_vst1_u16(ptr+4, val.val[1]); + simde_vst1_u16(ptr+8, val.val[2]); + simde_vst1_u16(ptr+12, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_u16_x4 + #define vst1_u16_x4(ptr, val) simde_vst1_u16_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u32_x4(uint32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint32x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u32_x4(ptr, val); + #else + simde_vst1_u32(ptr, val.val[0]); + simde_vst1_u32(ptr+2, val.val[1]); + simde_vst1_u32(ptr+4, val.val[2]); + simde_vst1_u32(ptr+6, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_u32_x4 + #define vst1_u32_x4(ptr, val) simde_vst1_u32_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_u64_x4(uint64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x1x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1_u64_x4(ptr, val); + #else + simde_vst1_u64(ptr, val.val[0]); + simde_vst1_u64(ptr+1, val.val[1]); + simde_vst1_u64(ptr+2, val.val[2]); + simde_vst1_u64(ptr+3, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_u64_x4 + #define vst1_u64_x4(ptr, val) simde_vst1_u64_x4((ptr), (val)) +#endif + 
+SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p8_x4(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_poly8x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1_p8_x4(ptr, val); + #else + simde_poly8x8_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_poly8x8_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+8 , val_[1].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+16 , val_[2].sv64 , 8); + __riscv_vse8_v_u8m1(ptr+24 , val_[3].sv64 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_p8_x4 + #define vst1_p8_x4(a, b) simde_vst1_p8_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p16_x4(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly16x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1_p16_x4(ptr, val); + #else + simde_poly16x4_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_poly16x4_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+4 , val_[1].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+8 , val_[2].sv64 , 4); + __riscv_vse16_v_u16m1(ptr+12 , val_[3].sv64 , 4); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1_p16_x4 + #define vst1_p16_x4(a, b) simde_vst1_p16_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_p64_x4(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x1x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1_p64_x4(ptr, val); + #else + simde_poly64x1_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_poly64x1_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+1 , val_[1].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+2 , val_[2].sv64 , 1); + __riscv_vse64_v_u64m1(ptr+3 , val_[3].sv64 , 1); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1_p64_x4 + #define vst1_p64_x4(a, b) simde_vst1_p64_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1_bf16_x4(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_bfloat16x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1_bf16_x4(ptr, val); + #else + simde_bfloat16x4_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_bfloat16x4_to_private(val.val[i]); + } + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1_bf16_x4 + #define vst1_bf16_x4(a, b) simde_vst1_bf16_x4((a), (b)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ST1_X4_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/st1q_x2.h b/lib/simd_wrapper/simde/arm/neon/st1q_x2.h new file mode 100644 index 00000000000..4e96191afba --- /dev/null +++ 
b/lib/simd_wrapper/simde/arm/neon/st1q_x2.h @@ -0,0 +1,309 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_ST1Q_X2_H) +#define SIMDE_ARM_NEON_ST1Q_X2_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f16_x2(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_float16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst1q_f16_x2(ptr, val); + #else + simde_float16x8_private a_[2] = {simde_float16x8_to_private(val.val[0]), + simde_float16x8_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a_[0].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+8 , a_[1].sv128 , 8); + #else + simde_float16_t buf[16]; + for (size_t i = 0; i < 16; i++) { + buf[i] = a_[i / 8].values[i % 8]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_f16_x2 + #define vst1q_f16_x2(a, b) simde_vst1q_f16_x2((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f32_x2(simde_float32 ptr[HEDLEY_ARRAY_PARAM(8)], simde_float32x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_f32_x2(ptr, val); + #else + simde_vst1q_f32(ptr, val.val[0]); + simde_vst1q_f32(ptr+4, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_f32_x2 + #define vst1q_f32_x2(ptr, val) simde_vst1q_f32_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f64_x2(simde_float64 ptr[HEDLEY_ARRAY_PARAM(4)], simde_float64x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst1q_f64_x2(ptr, val); + #else + simde_vst1q_f64(ptr, val.val[0]); + simde_vst1q_f64(ptr+2, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_f64_x2 + #define vst1q_f64_x2(ptr, val) simde_vst1q_f64_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s8_x2(int8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_int8x16x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s8_x2(ptr, val); + #else + 
simde_vst1q_s8(ptr, val.val[0]); + simde_vst1q_s8(ptr+16, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_s8_x2 + #define vst1q_s8_x2(ptr, val) simde_vst1q_s8_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s16_x2(int16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s16_x2(ptr, val); + #else + simde_vst1q_s16(ptr, val.val[0]); + simde_vst1q_s16(ptr+8, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_s16_x2 + #define vst1q_s16_x2(ptr, val) simde_vst1q_s16_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s32_x2(int32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int32x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s32_x2(ptr, val); + #else + simde_vst1q_s32(ptr, val.val[0]); + simde_vst1q_s32(ptr+4, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_s32_x2 + #define vst1q_s32_x2(ptr, val) simde_vst1q_s32_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s64_x2(int64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_int64x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s64_x2(ptr, val); + #else + simde_vst1q_s64(ptr, val.val[0]); + simde_vst1q_s64(ptr+2, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_s64_x2 + #define vst1q_s64_x2(ptr, val) simde_vst1q_s64_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u8_x2(uint8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_uint8x16x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u8_x2(ptr, val); + #else + simde_vst1q_u8(ptr, val.val[0]); + simde_vst1q_u8(ptr+16, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_u8_x2 + #define vst1q_u8_x2(ptr, val) simde_vst1q_u8_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u16_x2(uint16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u16_x2(ptr, val); + #else + simde_vst1q_u16(ptr, val.val[0]); + simde_vst1q_u16(ptr+8, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_u16_x2 + #define vst1q_u16_x2(ptr, val) simde_vst1q_u16_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u32_x2(uint32_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint32x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u32_x2(ptr, val); + #else + simde_vst1q_u32(ptr, val.val[0]); + simde_vst1q_u32(ptr+4, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_u32_x2 + #define vst1q_u32_x2(ptr, val) simde_vst1q_u32_x2((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u64_x2(uint64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u64_x2(ptr, val); + #else + simde_vst1q_u64(ptr, val.val[0]); + simde_vst1q_u64(ptr+2, val.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_u64_x2 + #define vst1q_u64_x2(ptr, val) simde_vst1q_u64_x2((ptr), (val)) +#endif + 
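+/* The non-native path of every *_x2 store above is simply two back-to-back
+ * q-register stores: val.val[0] covers ptr[0..N-1] and val.val[1] covers
+ * ptr[N..2N-1], where N is the lane count of one vector.  For example,
+ * simde_vst1q_s32_x2(out, v) writes v.val[0] to out[0..3] and v.val[1] to
+ * out[4..7].  The poly and bf16 variants below convert to the *_private
+ * representation first so they can use unit-stride RISC-V stores or a
+ * single memcpy of the whole private array, which has the same contiguous
+ * layout. */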
+SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p8_x2(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_poly8x16x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p8_x2(ptr, val); + #else + simde_poly8x16_private val_[2]; + for (size_t i = 0; i < 2; i++) { + val_[i] = simde_poly8x16_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+16 , val_[1].sv128 , 16); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_p8_x2 + #define vst1q_p8_x2(a, b) simde_vst1q_p8_x2((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p16_x2(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_poly16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p16_x2(ptr, val); + #else + simde_poly16x8_private val_[2]; + for (size_t i = 0; i < 2; i++) { + val_[i] = simde_poly16x8_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+8 , val_[1].sv128 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_p16_x2 + #define vst1q_p16_x2(a, b) simde_vst1q_p16_x2((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p64_x2(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p64_x2(ptr, val); + #else + simde_poly64x2_private val_[2]; + for (size_t i = 0; i < 2; i++) { + val_[i] = simde_poly64x2_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+2 , val_[1].sv128 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_p64_x2 + #define vst1q_p64_x2(a, b) simde_vst1q_p64_x2((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_bf16_x2(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_bfloat16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1q_bf16_x2(ptr, val); + #else + simde_bfloat16x8_private val_[2]; + for (size_t i = 0; i < 2; i++) { + val_[i] = simde_bfloat16x8_to_private(val.val[i]); + } + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_bf16_x2 + #define vst1q_bf16_x2(a, b) simde_vst1q_bf16_x2((a), (b)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ST1Q_X2_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/st1q_x3.h b/lib/simd_wrapper/simde/arm/neon/st1q_x3.h new file mode 100644 index 00000000000..04beeb2c84e --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/st1q_x3.h @@ -0,0 +1,324 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * 
restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_ST1Q_X3_H) +#define SIMDE_ARM_NEON_ST1Q_X3_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f16_x3(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_float16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst1q_f16_x3(ptr, val); + #else + simde_float16x8_private a[3] = { simde_float16x8_to_private(val.val[0]), + simde_float16x8_to_private(val.val[1]), + simde_float16x8_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a[0].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+8 , a[1].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+16 , a[2].sv128 , 8); + #else + simde_float16_t buf[24]; + for (size_t i = 0; i < 24 ; i++) { + buf[i] = a[i / 8].values[i % 8]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_f16_x3 + #define vst1q_f16_x3(a, b) simde_vst1q_f16_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f32_x3(simde_float32 ptr[HEDLEY_ARRAY_PARAM(12)], simde_float32x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_f32_x3(ptr, val); + #else + simde_vst1q_f32(ptr, val.val[0]); + simde_vst1q_f32(ptr+4, val.val[1]); + simde_vst1q_f32(ptr+8, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_f32_x3 + #define vst1q_f32_x3(ptr, val) simde_vst1q_f32_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f64_x3(simde_float64 ptr[HEDLEY_ARRAY_PARAM(6)], simde_float64x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst1q_f64_x3(ptr, val); + #else + simde_vst1q_f64(ptr, val.val[0]); + simde_vst1q_f64(ptr+2, val.val[1]); + simde_vst1q_f64(ptr+4, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_f64_x3 + #define vst1q_f64_x3(ptr, val) simde_vst1q_f64_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s8_x3(int8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_int8x16x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s8_x3(ptr, val); + #else + simde_vst1q_s8(ptr, val.val[0]); + simde_vst1q_s8(ptr+16, val.val[1]); + simde_vst1q_s8(ptr+32, val.val[2]); + 
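+    /* The three 16-byte stores above cover the full 48-byte destination. */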
#endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_s8_x3 + #define vst1q_s8_x3(ptr, val) simde_vst1q_s8_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s16_x3(int16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s16_x3(ptr, val); + #else + simde_vst1q_s16(ptr, val.val[0]); + simde_vst1q_s16(ptr+8, val.val[1]); + simde_vst1q_s16(ptr+16, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_s16_x3 + #define vst1q_s16_x3(ptr, val) simde_vst1q_s16_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s32_x3(int32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int32x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s32_x3(ptr, val); + #else + simde_vst1q_s32(ptr, val.val[0]); + simde_vst1q_s32(ptr+4, val.val[1]); + simde_vst1q_s32(ptr+8, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_s32_x3 + #define vst1q_s32_x3(ptr, val) simde_vst1q_s32_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s64_x3(int64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int64x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s64_x3(ptr, val); + #else + simde_vst1q_s64(ptr, val.val[0]); + simde_vst1q_s64(ptr+2, val.val[1]); + simde_vst1q_s64(ptr+4, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_s64_x3 + #define vst1q_s64_x3(ptr, val) simde_vst1q_s64_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u8_x3(uint8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_uint8x16x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u8_x3(ptr, val); + #else + simde_vst1q_u8(ptr, val.val[0]); + simde_vst1q_u8(ptr+16, val.val[1]); + simde_vst1q_u8(ptr+32, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_u8_x3 + #define vst1q_u8_x3(ptr, val) simde_vst1q_u8_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u16_x3(uint16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u16_x3(ptr, val); + #else + simde_vst1q_u16(ptr, val.val[0]); + simde_vst1q_u16(ptr+8, val.val[1]); + simde_vst1q_u16(ptr+16, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_u16_x3 + #define vst1q_u16_x3(ptr, val) simde_vst1q_u16_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u32_x3(uint32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint32x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u32_x3(ptr, val); + #else + simde_vst1q_u32(ptr, val.val[0]); + simde_vst1q_u32(ptr+4, val.val[1]); + simde_vst1q_u32(ptr+8, val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_u32_x3 + #define vst1q_u32_x3(ptr, val) simde_vst1q_u32_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u64_x3(uint64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint64x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u64_x3(ptr, val); + #else + simde_vst1q_u64(ptr, val.val[0]); + simde_vst1q_u64(ptr+2, val.val[1]); + simde_vst1q_u64(ptr+4, 
val.val[2]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_u64_x3 + #define vst1q_u64_x3(ptr, val) simde_vst1q_u64_x3((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p8_x3(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_poly8x16x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p8_x3(ptr, val); + #else + simde_poly8x16_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = simde_poly8x16_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+16 , val_[1].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+32 , val_[2].sv128 , 16); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_p8_x3 + #define vst1q_p8_x3(a, b) simde_vst1q_p8_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p16_x3(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_poly16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p16_x3(ptr, val); + #else + simde_poly16x8_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = simde_poly16x8_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+8 , val_[1].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+16 , val_[2].sv128 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_p16_x3 + #define vst1q_p16_x3(a, b) simde_vst1q_p16_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p64_x3(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_poly64x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p64_x3(ptr, val); + #else + simde_poly64x2_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = simde_poly64x2_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+2 , val_[1].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+4 , val_[2].sv128 , 2); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_p64_x3 + #define vst1q_p64_x3(a, b) simde_vst1q_p64_x3((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_bf16_x3(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_bfloat16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1q_bf16_x3(ptr, val); + #else + simde_bfloat16x8_private val_[3]; + for (size_t i = 0; i < 3; i++) { + val_[i] = simde_bfloat16x8_to_private(val.val[i]); + } + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_bf16_x3 + #define vst1q_bf16_x3(a, b) simde_vst1q_bf16_x3((a), (b)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ST1Q_X3_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/st1q_x4.h b/lib/simd_wrapper/simde/arm/neon/st1q_x4.h new file mode 100644 index 
00000000000..9b91c632e25 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/st1q_x4.h @@ -0,0 +1,339 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2020 Evan Nemerson + * 2021 Décio Luiz Gazzoni Filho + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) + */ + +#if !defined(SIMDE_ARM_NEON_ST1Q_X4_H) +#define SIMDE_ARM_NEON_ST1Q_X4_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if !defined(SIMDE_BUG_INTEL_857088) + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f16_x4(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_float16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst1q_f16_x4(ptr, val); + #else + simde_float16x8_private a_[4] = { simde_float16x8_to_private(val.val[0]), simde_float16x8_to_private(val.val[1]), + simde_float16x8_to_private(val.val[2]), simde_float16x8_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + __riscv_vse16_v_f16m1((_Float16 *)ptr , a_[0].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+8 , a_[1].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+16 , a_[2].sv128 , 8); + __riscv_vse16_v_f16m1((_Float16 *)ptr+24 , a_[3].sv128 , 8); + #else + simde_float16_t buf[32]; + for (size_t i = 0; i < 32 ; i++) { + buf[i] = a_[i / 8].values[i % 8]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_f16_x4 + #define vst1q_f16_x4(a, b) simde_vst1q_f16_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f32_x4(simde_float32 ptr[HEDLEY_ARRAY_PARAM(16)], simde_float32x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_f32_x4(ptr, val); + #else + simde_vst1q_f32(ptr, val.val[0]); + simde_vst1q_f32(ptr+4, val.val[1]); + simde_vst1q_f32(ptr+8, val.val[2]); + simde_vst1q_f32(ptr+12, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_f32_x4 + #define vst1q_f32_x4(ptr, val) simde_vst1q_f32_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_f64_x4(simde_float64 ptr[HEDLEY_ARRAY_PARAM(8)], simde_float64x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst1q_f64_x4(ptr, val); + #else + simde_vst1q_f64(ptr, val.val[0]); + simde_vst1q_f64(ptr+2, val.val[1]); + 
simde_vst1q_f64(ptr+4, val.val[2]); + simde_vst1q_f64(ptr+6, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_f64_x4 + #define vst1q_f64_x4(ptr, val) simde_vst1q_f64_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s8_x4(int8_t ptr[HEDLEY_ARRAY_PARAM(64)], simde_int8x16x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s8_x4(ptr, val); + #else + simde_vst1q_s8(ptr, val.val[0]); + simde_vst1q_s8(ptr+16, val.val[1]); + simde_vst1q_s8(ptr+32, val.val[2]); + simde_vst1q_s8(ptr+48, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_s8_x4 + #define vst1q_s8_x4(ptr, val) simde_vst1q_s8_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s16_x4(int16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_int16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s16_x4(ptr, val); + #else + simde_vst1q_s16(ptr, val.val[0]); + simde_vst1q_s16(ptr+8, val.val[1]); + simde_vst1q_s16(ptr+16, val.val[2]); + simde_vst1q_s16(ptr+24, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_s16_x4 + #define vst1q_s16_x4(ptr, val) simde_vst1q_s16_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s32_x4(int32_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_int32x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s32_x4(ptr, val); + #else + simde_vst1q_s32(ptr, val.val[0]); + simde_vst1q_s32(ptr+4, val.val[1]); + simde_vst1q_s32(ptr+8, val.val[2]); + simde_vst1q_s32(ptr+12, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_s32_x4 + #define vst1q_s32_x4(ptr, val) simde_vst1q_s32_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_s64_x4(int64_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_int64x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_s64_x4(ptr, val); + #else + simde_vst1q_s64(ptr, val.val[0]); + simde_vst1q_s64(ptr+2, val.val[1]); + simde_vst1q_s64(ptr+4, val.val[2]); + simde_vst1q_s64(ptr+6, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_s64_x4 + #define vst1q_s64_x4(ptr, val) simde_vst1q_s64_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u8_x4(uint8_t ptr[HEDLEY_ARRAY_PARAM(64)], simde_uint8x16x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u8_x4(ptr, val); + #else + simde_vst1q_u8(ptr, val.val[0]); + simde_vst1q_u8(ptr+16, val.val[1]); + simde_vst1q_u8(ptr+32, val.val[2]); + simde_vst1q_u8(ptr+48, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_u8_x4 + #define vst1q_u8_x4(ptr, val) simde_vst1q_u8_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u16_x4(uint16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_uint16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u16_x4(ptr, val); + #else + simde_vst1q_u16(ptr, val.val[0]); + simde_vst1q_u16(ptr+8, val.val[1]); + simde_vst1q_u16(ptr+16, val.val[2]); + simde_vst1q_u16(ptr+24, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_u16_x4 + #define vst1q_u16_x4(ptr, val) simde_vst1q_u16_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void 
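+/* The *_x4 stores continue the same scheme with four contiguous q-register
+ * stores: val.val[k] is written at ptr + k*4 for 32-bit lanes, ptr + k*2
+ * for 64-bit lanes and ptr + k*16 for 8-bit lanes. */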
+simde_vst1q_u32_x4(uint32_t ptr[HEDLEY_ARRAY_PARAM(16)], simde_uint32x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u32_x4(ptr, val); + #else + simde_vst1q_u32(ptr, val.val[0]); + simde_vst1q_u32(ptr+4, val.val[1]); + simde_vst1q_u32(ptr+8, val.val[2]); + simde_vst1q_u32(ptr+12, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_u32_x4 + #define vst1q_u32_x4(ptr, val) simde_vst1q_u32_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_u64_x4(uint64_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_uint64x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_GCC_REV_260989) + vst1q_u64_x4(ptr, val); + #else + simde_vst1q_u64(ptr, val.val[0]); + simde_vst1q_u64(ptr+2, val.val[1]); + simde_vst1q_u64(ptr+4, val.val[2]); + simde_vst1q_u64(ptr+6, val.val[3]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_u64_x4 + #define vst1q_u64_x4(ptr, val) simde_vst1q_u64_x4((ptr), (val)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p8_x4(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(64)], simde_poly8x16x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p8_x4(ptr, val); + #else + simde_poly8x16_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_poly8x16_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse8_v_u8m1(ptr , val_[0].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+16 , val_[1].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+32 , val_[2].sv128 , 16); + __riscv_vse8_v_u8m1(ptr+48 , val_[3].sv128 , 16); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_p8_x4 + #define vst1q_p8_x4(a, b) simde_vst1q_p8_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p16_x4(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_poly16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p16_x4(ptr, val); + #else + simde_poly16x8_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_poly16x8_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse16_v_u16m1(ptr , val_[0].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+8 , val_[1].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+16 , val_[2].sv128 , 8); + __riscv_vse16_v_u16m1(ptr+24 , val_[3].sv128 , 8); + #else + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst1q_p16_x4 + #define vst1q_p16_x4(a, b) simde_vst1q_p16_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_p64_x4(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(8)], simde_poly64x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && \ + (!defined(HEDLEY_GCC_VERSION) || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && defined(SIMDE_ARM_NEON_A64V8_NATIVE))) + vst1q_p64_x4(ptr, val); + #else + simde_poly64x2_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_poly64x2_to_private(val.val[i]); + } + #if defined(SIMDE_RISCV_V_NATIVE) + __riscv_vse64_v_u64m1(ptr , val_[0].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+2 , val_[1].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+4 , val_[2].sv128 , 2); + __riscv_vse64_v_u64m1(ptr+6 , val_[3].sv128 , 2); + #else + 
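+      /* Without RVV, the private array already holds the four vectors
+       * contiguously, so a single memcpy of sizeof(val_) (64 bytes here)
+       * writes them all at once. */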
simde_memcpy(ptr, &val_, sizeof(val_)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_p64_x4 + #define vst1q_p64_x4(a, b) simde_vst1q_p64_x4((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst1q_bf16_x4(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(32)], simde_bfloat16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst1q_bf16_x4(ptr, val); + #else + simde_bfloat16x8_private val_[4]; + for (size_t i = 0; i < 4; i++) { + val_[i] = simde_bfloat16x8_to_private(val.val[i]); + } + simde_memcpy(ptr, &val_, sizeof(val_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst1q_bf16_x4 + #define vst1q_bf16_x4(a, b) simde_vst1q_bf16_x4((a), (b)) +#endif + +#endif /* !defined(SIMDE_BUG_INTEL_857088) */ + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_ST1Q_X4_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/st2.h b/lib/simd_wrapper/simde/arm/neon/st2.h index 9dcaef6335e..157123cf71b 100644 --- a/lib/simd_wrapper/simde/arm/neon/st2.h +++ b/lib/simd_wrapper/simde/arm/neon/st2.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST2_H) @@ -37,19 +39,53 @@ SIMDE_BEGIN_DECLS_ #if !defined(SIMDE_BUG_INTEL_857088) +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_f16(simde_float16_t *ptr, simde_float16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst2_f16(ptr, val); + #else + simde_float16x4_private a_[2] = {simde_float16x4_to_private(val.val[0]), + simde_float16x4_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x2_t dest = __riscv_vlseg2e16_v_f16m1x2((_Float16 *)ptr, 4); + dest = __riscv_vset_v_f16m1_f16m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f16m1_f16m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e16_v_f16m1x2 ((_Float16 *)ptr, dest, 4); + #else + simde_float16_t buf[8]; + for (size_t i = 0; i < 8 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2_f16 + #define vst2_f16(a, b) simde_vst2_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst2_f32(simde_float32_t *ptr, simde_float32x2x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_f32(ptr, val); #else - simde_float32_t buf[4]; simde_float32x2_private a_[2] = {simde_float32x2_to_private(val.val[0]), simde_float32x2_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x2_t dest = __riscv_vlseg2e32_v_f32m1x2(ptr, 2); + dest = __riscv_vset_v_f32m1_f32m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f32m1_f32m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e32_v_f32m1x2 (ptr, dest, 2); + #else + simde_float32_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -63,13 +99,20 @@ simde_vst2_f64(simde_float64_t *ptr, simde_float64x1x2_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) 
vst2_f64(ptr, val); #else - simde_float64_t buf[2]; simde_float64x1_private a_[2] = {simde_float64x1_to_private(val.val[0]), simde_float64x1_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x2_t dest = __riscv_vlseg2e64_v_f64m1x2(ptr, 1); + dest = __riscv_vset_v_f64m1_f64m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f64m1_f64m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e64_v_f64m1x2 (ptr, dest, 1); + #else + simde_float64_t buf[2]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -83,13 +126,20 @@ simde_vst2_s8(int8_t *ptr, simde_int8x8x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_s8(ptr, val); #else - int8_t buf[16]; simde_int8x8_private a_[2] = {simde_int8x8_to_private(val.val[0]), simde_int8x8_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x2_t dest = __riscv_vlseg2e8_v_i8m1x2(ptr, 8); + dest = __riscv_vset_v_i8m1_i8m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i8m1_i8m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e8_v_i8m1x2 (ptr, dest, 8); + #else + int8_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -103,13 +153,20 @@ simde_vst2_s16(int16_t *ptr, simde_int16x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_s16(ptr, val); #else - int16_t buf[8]; simde_int16x4_private a_[2] = {simde_int16x4_to_private(val.val[0]), simde_int16x4_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x2_t dest = __riscv_vlseg2e16_v_i16m1x2(ptr, 4); + dest = __riscv_vset_v_i16m1_i16m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i16m1_i16m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e16_v_i16m1x2 (ptr, dest, 4); + #else + int16_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -123,13 +180,20 @@ simde_vst2_s32(int32_t *ptr, simde_int32x2x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_s32(ptr, val); #else - int32_t buf[4]; simde_int32x2_private a_[2] = {simde_int32x2_to_private(val.val[0]), simde_int32x2_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x2_t dest = __riscv_vlseg2e32_v_i32m1x2(ptr, 2); + dest = __riscv_vset_v_i32m1_i32m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i32m1_i32m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e32_v_i32m1x2 (ptr, dest, 2); + #else + int32_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, 
sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -143,13 +207,20 @@ simde_vst2_s64(int64_t *ptr, simde_int64x1x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_s64(ptr, val); #else - int64_t buf[2]; simde_int64x1_private a_[2] = {simde_int64x1_to_private(val.val[0]), simde_int64x1_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x2_t dest = __riscv_vlseg2e64_v_i64m1x2(ptr, 1); + dest = __riscv_vset_v_i64m1_i64m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i64m1_i64m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e64_v_i64m1x2 (ptr, dest, 1); + #else + int64_t buf[2]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -163,13 +234,20 @@ simde_vst2_u8(uint8_t *ptr, simde_uint8x8x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_u8(ptr, val); #else - uint8_t buf[16]; simde_uint8x8_private a_[2] = {simde_uint8x8_to_private(val.val[0]), simde_uint8x8_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e8_v_u8m1x2 (ptr, dest, 8); + #else + uint8_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -183,13 +261,20 @@ simde_vst2_u16(uint16_t *ptr, simde_uint16x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_u16(ptr, val); #else - uint16_t buf[8]; simde_uint16x4_private a_[2] = {simde_uint16x4_to_private(val.val[0]), simde_uint16x4_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e16_v_u16m1x2 (ptr, dest, 4); + #else + uint16_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -203,13 +288,20 @@ simde_vst2_u32(uint32_t *ptr, simde_uint32x2x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_u32(ptr, val); #else - uint32_t buf[4]; simde_uint32x2_private a_[2] = {simde_uint32x2_to_private(val.val[0]), simde_uint32x2_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1x2_t dest = __riscv_vlseg2e32_v_u32m1x2(ptr, 2); + dest = __riscv_vset_v_u32m1_u32m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u32m1_u32m1x2 (dest, 1, a_[1].sv64); + 
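+      /* The vlseg2 load above only materializes a tuple value; both of its
+       * fields are overwritten by vset before the interleaving vsseg2
+       * store below writes ptr[0]=v0[0], ptr[1]=v1[0], ptr[2]=v0[1],
+       * ptr[3]=v1[1]. */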
__riscv_vsseg2e32_v_u32m1x2 (ptr, dest, 2); + #else + uint32_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -223,13 +315,20 @@ simde_vst2_u64(uint64_t *ptr, simde_uint64x1x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2_u64(ptr, val); #else - uint64_t buf[2]; simde_uint64x1_private a_[2] = {simde_uint64x1_to_private(val.val[0]), simde_uint64x1_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e64_v_u64m1x2 (ptr, dest, 1); + #else + uint64_t buf[2]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -237,11 +336,41 @@ simde_vst2_u64(uint64_t *ptr, simde_uint64x1x2_t val) { #define vst2_u64(a, b) simde_vst2_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_f16(simde_float16_t *ptr, simde_float16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst2q_f16(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + simde_float16x8_private a_[2] = {simde_float16x8_to_private(val.val[0]), + simde_float16x8_to_private(val.val[1])}; + vfloat16m1x2_t dest = __riscv_vlseg2e16_v_f16m1x2((_Float16 *)ptr, 8); + dest = __riscv_vset_v_f16m1_f16m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f16m1_f16m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e16_v_f16m1x2 ((_Float16 *)ptr, dest, 8); + #else + simde_float16x8x2_t r = simde_vzipq_f16(val.val[0], val.val[1]); + simde_vst1q_f16(ptr, r.val[0]); + simde_vst1q_f16(ptr+8, r.val[1]); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2q_f16 + #define vst2q_f16(a, b) simde_vst2q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst2q_f32(simde_float32_t *ptr, simde_float32x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_f32(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_float32x4_private a_[2] = {simde_float32x4_to_private(val.val[0]), + simde_float32x4_to_private(val.val[1])}; + vfloat32m1x2_t dest = __riscv_vlseg2e32_v_f32m1x2(ptr, 4); + dest = __riscv_vset_v_f32m1_f32m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f32m1_f32m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e32_v_f32m1x2 (ptr, dest, 4); #else simde_float32x4x2_t r = simde_vzipq_f32(val.val[0], val.val[1]); simde_vst1q_f32(ptr, r.val[0]); @@ -259,13 +388,20 @@ simde_vst2q_f64(simde_float64_t *ptr, simde_float64x2x2_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst2q_f64(ptr, val); #else - simde_float64_t buf[4]; simde_float64x2_private a_[2] = {simde_float64x2_to_private(val.val[0]), simde_float64x2_to_private(val.val[1])}; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { - buf[i] = a_[i % 2].values[i / 2]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x2_t dest = __riscv_vlseg2e64_v_f64m1x2(ptr, 2); + dest = 
__riscv_vset_v_f64m1_f64m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f64m1_f64m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e64_v_f64m1x2 (ptr, dest, 2); + #else + simde_float64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -278,6 +414,13 @@ void simde_vst2q_s8(int8_t *ptr, simde_int8x16x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_s8(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int8x16_private a_[2] = {simde_int8x16_to_private(val.val[0]), + simde_int8x16_to_private(val.val[1])}; + vint8m1x2_t dest = __riscv_vlseg2e8_v_i8m1x2(ptr, 16); + dest = __riscv_vset_v_i8m1_i8m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i8m1_i8m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e8_v_i8m1x2 (ptr, dest, 16); #else simde_int8x16x2_t r = simde_vzipq_s8(val.val[0], val.val[1]); simde_vst1q_s8(ptr, r.val[0]); @@ -294,6 +437,13 @@ void simde_vst2q_s16(int16_t *ptr, simde_int16x8x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_s16(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int16x8_private a_[2] = {simde_int16x8_to_private(val.val[0]), + simde_int16x8_to_private(val.val[1])}; + vint16m1x2_t dest = __riscv_vlseg2e16_v_i16m1x2(ptr, 8); + dest = __riscv_vset_v_i16m1_i16m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i16m1_i16m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e16_v_i16m1x2 (ptr, dest, 8); #else simde_int16x8x2_t r = simde_vzipq_s16(val.val[0], val.val[1]); simde_vst1q_s16(ptr, r.val[0]); @@ -310,6 +460,13 @@ void simde_vst2q_s32(int32_t *ptr, simde_int32x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_s32(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int32x4_private a_[2] = {simde_int32x4_to_private(val.val[0]), + simde_int32x4_to_private(val.val[1])}; + vint32m1x2_t dest = __riscv_vlseg2e32_v_i32m1x2(ptr, 4); + dest = __riscv_vset_v_i32m1_i32m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i32m1_i32m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e32_v_i32m1x2 (ptr, dest, 4); #else simde_int32x4x2_t r = simde_vzipq_s32(val.val[0], val.val[1]); simde_vst1q_s32(ptr, r.val[0]); @@ -326,6 +483,13 @@ void simde_vst2q_s64(int64_t *ptr, simde_int64x2x2_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst2q_s64(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_int64x2_private a_[2] = {simde_int64x2_to_private(val.val[0]), + simde_int64x2_to_private(val.val[1])}; + vint64m1x2_t dest = __riscv_vlseg2e64_v_i64m1x2(ptr, 2); + dest = __riscv_vset_v_i64m1_i64m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i64m1_i64m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e64_v_i64m1x2 (ptr, dest, 2); #else int64_t buf[4]; simde_int64x2_private a_[2] = {simde_int64x2_to_private(val.val[0]), @@ -346,6 +510,13 @@ void simde_vst2q_u8(uint8_t *ptr, simde_uint8x16x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_u8(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint8x16_private a_[2] = {simde_uint8x16_to_private(val.val[0]), + simde_uint8x16_to_private(val.val[1])}; + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e8_v_u8m1x2 (ptr, dest, 16); #else simde_uint8x16x2_t r = simde_vzipq_u8(val.val[0], val.val[1]); simde_vst1q_u8(ptr, r.val[0]); @@ -362,6 +533,13 @@ void 
simde_vst2q_u16(uint16_t *ptr, simde_uint16x8x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_u16(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint16x8_private a_[2] = {simde_uint16x8_to_private(val.val[0]), + simde_uint16x8_to_private(val.val[1])}; + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e16_v_u16m1x2 (ptr, dest, 8); #else simde_uint16x8x2_t r = simde_vzipq_u16(val.val[0], val.val[1]); simde_vst1q_u16(ptr, r.val[0]); @@ -378,6 +556,13 @@ void simde_vst2q_u32(uint32_t *ptr, simde_uint32x4x2_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst2q_u32(ptr, val); + #elif defined(SIMDE_RISCV_V_NATIVE) + simde_uint32x4_private a_[2] = {simde_uint32x4_to_private(val.val[0]), + simde_uint32x4_to_private(val.val[1])}; + vuint32m1x2_t dest = __riscv_vlseg2e32_v_u32m1x2(ptr, 4); + dest = __riscv_vset_v_u32m1_u32m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u32m1_u32m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e32_v_u32m1x2 (ptr, dest, 4); #else simde_uint32x4x2_t r = simde_vzipq_u32(val.val[0], val.val[1]); simde_vst1q_u32(ptr, r.val[0]); @@ -395,18 +580,227 @@ simde_vst2q_u64(uint64_t *ptr, simde_uint64x2x2_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst2q_u64(ptr, val); #else - uint64_t buf[4]; simde_uint64x2_private a_[2] = {simde_uint64x2_to_private(val.val[0]), simde_uint64x2_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e64_v_u64m1x2 (ptr, dest, 2); + #else + uint64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst2q_u64 + #define vst2q_u64(a, b) simde_vst2q_u64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_p8(simde_poly8_t *ptr, simde_poly8x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst2_p8(ptr, val); + #else + simde_poly8x8_private a_[2] = {simde_poly8x8_to_private(val.val[0]), + simde_poly8x8_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e8_v_u8m1x2 (ptr, dest, 8); + #else + simde_poly8_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2_p8 + #define vst2_p8(a, b) simde_vst2_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_p16(simde_poly16_t *ptr, simde_poly16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst2_p16(ptr, val); + #else + simde_poly16x4_private a_[2] = {simde_poly16x4_to_private(val.val[0]), + simde_poly16x4_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e16_v_u16m1x2 (ptr, dest, 4); + #else + 
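+      /* Scalar fallback: buf[i] = a_[i % 2].values[i / 2] interleaves the
+       * two source vectors element by element (v0[0], v1[0], v0[1],
+       * v1[1], ...), which is exactly the vst2 memory layout. */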
simde_poly16_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2_p16 + #define vst2_p16(a, b) simde_vst2_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_p64(simde_poly64_t *ptr, simde_poly64x1x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + vst2_p64(ptr, val); + #else + simde_poly64x1_private a_[2] = {simde_poly64x1_to_private(val.val[0]), + simde_poly64x1_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 1, a_[1].sv64); + __riscv_vsseg2e64_v_u64m1x2 (ptr, dest, 1); + #else + simde_poly64_t buf[2]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst2_p64 + #define vst2_p64(a, b) simde_vst2_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_p8(simde_poly8_t *ptr, simde_poly8x16x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst2q_p8(ptr, val); + #else + simde_poly8x16_private a_[2] = {simde_poly8x16_to_private(val.val[0]), + simde_poly8x16_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x2_t dest = __riscv_vlseg2e8_v_u8m1x2(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e8_v_u8m1x2 (ptr, dest, 16); + #else + simde_poly8_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2q_p8 + #define vst2q_p8(a, b) simde_vst2q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_p16(simde_poly16_t *ptr, simde_poly16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst2q_p16(ptr, val); + #else + simde_poly16x8_private a_[2] = {simde_poly16x8_to_private(val.val[0]), + simde_poly16x8_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x2_t dest = __riscv_vlseg2e16_v_u16m1x2(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e16_v_u16m1x2 (ptr, dest, 8); + #else + simde_poly16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2q_p16 + #define vst2q_p16(a, b) simde_vst2q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_p64(simde_poly64_t *ptr, simde_poly64x2x2_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst2q_p64(ptr, val); + #else + simde_poly64x2_private a_[2] = {simde_poly64x2_to_private(val.val[0]), + simde_poly64x2_to_private(val.val[1])}; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x2_t dest = __riscv_vlseg2e64_v_u64m1x2(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x2 (dest, 1, a_[1].sv128); + __riscv_vsseg2e64_v_u64m1x2 (ptr, dest, 
2); + #else + simde_poly64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst2q_p64 + #define vst2q_p64(a, b) simde_vst2q_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_bf16(simde_bfloat16_t *ptr, simde_bfloat16x4x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst2_bf16(ptr, val); + #else + simde_bfloat16x4_private a_[2] = {simde_bfloat16x4_to_private(val.val[0]), + simde_bfloat16x4_to_private(val.val[1])}; + simde_bfloat16_t buf[8]; for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { buf[i] = a_[i % 2].values[i / 2]; } simde_memcpy(ptr, buf, sizeof(buf)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) - #undef vst2q_u64 - #define vst2q_u64(a, b) simde_vst2q_u64((a), (b)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst2_bf16 + #define vst2_bf16(a, b) simde_vst2_bf16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_bf16(simde_bfloat16_t *ptr, simde_bfloat16x8x2_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst2q_bf16(ptr, val); + #else + simde_bfloat16x8_private a_[2] = {simde_bfloat16x8_to_private(val.val[0]), + simde_bfloat16x8_to_private(val.val[1])}; + simde_bfloat16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 2 ; i++) { + buf[i] = a_[i % 2].values[i / 2]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst2q_bf16 + #define vst2q_bf16(a, b) simde_vst2q_bf16((a), (b)) #endif #endif /* !defined(SIMDE_BUG_INTEL_857088) */ diff --git a/lib/simd_wrapper/simde/arm/neon/st2_lane.h b/lib/simd_wrapper/simde/arm/neon/st2_lane.h index 0eee6a8a416..3be5c8e799f 100644 --- a/lib/simd_wrapper/simde/arm/neon/st2_lane.h +++ b/lib/simd_wrapper/simde/arm/neon/st2_lane.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ST2_LANE_H) @@ -189,6 +190,25 @@ simde_vst2_lane_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x1x2_t val, #define vst2_lane_u64(a, b, c) simde_vst2_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float16x4x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_4_NO_RESULT_(vst2_lane_f16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_float16x4_private r; + for (size_t i = 0 ; i < 2 ; i ++) { + r = simde_float16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2_lane_f16 + #define vst2_lane_f16(a, b, c) simde_vst2_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst2_lane_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float32x2x2_t val, const int lane) @@ -380,6 +400,25 @@ simde_vst2q_lane_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_uint64x2x2_t val #define vst2q_lane_u64(a, b, c) simde_vst2q_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float16x8x2_t val, const int lane) + 
SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst2q_lane_f16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_float16x8_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_float16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2q_lane_f16 + #define vst2q_lane_f16(a, b, c) simde_vst2q_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst2q_lane_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float32x4x2_t val, const int lane) @@ -418,6 +457,159 @@ simde_vst2q_lane_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_float64x2 #define vst2q_lane_f64(a, b, c) simde_vst2q_lane_f64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_lane_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly8x8x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_8_NO_RESULT_(vst2_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x8_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_poly8x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2_lane_p8 + #define vst2_lane_p8(a, b, c) simde_vst2_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_lane_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly16x4x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_4_NO_RESULT_(vst2_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x4_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_poly16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2_lane_p16 + #define vst2_lane_p16(a, b, c) simde_vst2_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_lane_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x1x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + HEDLEY_STATIC_CAST(void, lane); + vst2_lane_p64(ptr, val, 0); + #else + simde_poly64x1_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_poly64x1_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst2_lane_p64 + #define vst2_lane_p64(a, b, c) simde_vst2_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_lane_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly8x16x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 16) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + SIMDE_CONSTIFY_16_NO_RESULT_(vst2q_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x16_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_poly8x16_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst2q_lane_p8 + #define vst2q_lane_p8(a, b, c) simde_vst2q_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_lane_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly16x8x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + 
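+    /* lane is not a compile-time constant inside this wrapper, so
+     * SIMDE_CONSTIFY_8_NO_RESULT_ expands to a switch that invokes
+     * vst2q_lane_p16 with each literal lane value 0..7. */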
SIMDE_CONSTIFY_8_NO_RESULT_(vst2q_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x8_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_poly16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst2q_lane_p16 + #define vst2q_lane_p16(a, b, c) simde_vst2q_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_lane_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_poly64x2x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + SIMDE_CONSTIFY_2_NO_RESULT_(vst2q_lane_p64, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly64x2_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_poly64x2_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst2q_lane_p64 + #define vst2q_lane_p64(a, b, c) simde_vst2q_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_bfloat16x4x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_NO_RESULT_(vst2_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x4_private r; + for (size_t i = 0 ; i < 2 ; i ++) { + r = simde_bfloat16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst2_lane_bf16 + #define vst2_lane_bf16(a, b, c) simde_vst2_lane_bf16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst2q_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(2)], simde_bfloat16x8x2_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst2q_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x8_private r; + for (size_t i = 0 ; i < 2 ; i++) { + r = simde_bfloat16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst2q_lane_bf16 + #define vst2q_lane_bf16(a, b, c) simde_vst2q_lane_bf16((a), (b), (c)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/arm/neon/st3.h b/lib/simd_wrapper/simde/arm/neon/st3.h index 2a3616d424a..29301ae6c34 100644 --- a/lib/simd_wrapper/simde/arm/neon/st3.h +++ b/lib/simd_wrapper/simde/arm/neon/st3.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST3_H) @@ -37,6 +39,35 @@ SIMDE_BEGIN_DECLS_ #if !defined(SIMDE_BUG_INTEL_857088) +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst3_f16(ptr, val); + #else + simde_float16x4_private a[3] = { simde_float16x4_to_private(val.val[0]), + simde_float16x4_to_private(val.val[1]), + simde_float16x4_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x3_t dest = __riscv_vlseg3e16_v_f16m1x3((_Float16 *)ptr, 4); + dest = 
__riscv_vset_v_f16m1_f16m1x3 (dest, 0, a[0].sv64); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 1, a[1].sv64); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 2, a[2].sv64); + __riscv_vsseg3e16_v_f16m1x3 ((_Float16 *)ptr, dest, 4); + #else + simde_float16_t buf[12]; + for (size_t i = 0; i < 12 ; i++) { + buf[i] = a[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3_f16 + #define vst3_f16(a, b) simde_vst3_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst3_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_float32x2x3_t val) { @@ -46,7 +77,13 @@ simde_vst3_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_float32x2x3_t v simde_float32x2_private a[3] = { simde_float32x2_to_private(val.val[0]), simde_float32x2_to_private(val.val[1]), simde_float32x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x3_t dest = __riscv_vlseg3e32_v_f32m1x3(ptr, 2); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 0, a[0].sv64); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 1, a[1].sv64); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 2, a[2].sv64); + __riscv_vsseg3e32_v_f32m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[1].values, a[2].values, 1, 3); @@ -55,7 +92,7 @@ simde_vst3_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_float32x2x3_t v simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else simde_float32_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -76,9 +113,17 @@ simde_vst3_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x1x3_t v simde_float64x1_private a_[3] = { simde_float64x1_to_private(val.val[0]), simde_float64x1_to_private(val.val[1]), simde_float64x1_to_private(val.val[2]) }; - simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); - simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); - simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x3_t dest = __riscv_vlseg3e64_v_f64m1x3(ptr, 1); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e64_v_f64m1x3(ptr, dest, 1); + #else + simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); + simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); + simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -95,7 +140,13 @@ simde_vst3_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int8x8x3_t val) { simde_int8x8_private a_[3] = { simde_int8x8_to_private(val.val[0]), simde_int8x8_to_private(val.val[1]), simde_int8x8_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x3_t dest = __riscv_vlseg3e8_v_i8m1x3(ptr, 8); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 2, a_[2].sv64); + 
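/*
 * Every new SIMDE_RISCV_V_NATIVE branch in st3.h follows the pattern above: the
 * three member vectors are packed into an m1x3 segment tuple with
 * __riscv_vset_v_* and stored interleaved by __riscv_vsseg3e*.  The leading
 * __riscv_vlseg3e* load only materializes a tuple object to fill; every element
 * of it is overwritten before the store.  A scalar model of what the segment
 * store writes, assuming `vl` lanes per member vector (helper name is
 * hypothetical):
 */
#include <stddef.h>
static void sseg3_scalar_model(float *dst, const float *x, const float *y,
                               const float *z, size_t vl) {
  for (size_t i = 0; i < vl; i++) {
    dst[3 * i + 0] = x[i];   /* member 0, lane i */
    dst[3 * i + 1] = y[i];   /* member 1, lane i */
    dst[3 * i + 2] = z[i];   /* member 2, lane i */
  }
}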
__riscv_vsseg3e8_v_i8m1x3(ptr, dest, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_[0].values, a_[1].values, 0, 8, 3, 1, 9, 4, 2, 10); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(8, 8, r0, a_[2].values, @@ -115,7 +166,7 @@ simde_vst3_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int8x8x3_t val) { simde_memcpy(&ptr[16], &m2, sizeof(m2)); #else int8_t buf[24]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 24 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -136,7 +187,13 @@ simde_vst3_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int16x4x3_t val) { simde_int16x4_private a_[3] = { simde_int16x4_to_private(val.val[0]), simde_int16x4_to_private(val.val[1]), simde_int16x4_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x3_t dest = __riscv_vlseg3e16_v_i16m1x3(ptr, 4); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e16_v_i16m1x3 (ptr, dest, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_[0].values, a_[1].values, 0, 4, 1, 0); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(16, 8, r0, a_[2].values, @@ -156,7 +213,7 @@ simde_vst3_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int16x4x3_t val) { simde_memcpy(&ptr[8], &m2, sizeof(m2)); #else int16_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 12 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -177,7 +234,13 @@ simde_vst3_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int32x2x3_t val) { simde_int32x2_private a[3] = { simde_int32x2_to_private(val.val[0]), simde_int32x2_to_private(val.val[1]), simde_int32x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x3_t dest = __riscv_vlseg3e32_v_i32m1x3(ptr, 2); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 0, a[0].sv64); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 1, a[1].sv64); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 2, a[2].sv64); + __riscv_vsseg3e32_v_i32m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[1].values, a[2].values, 1, 3); @@ -186,7 +249,7 @@ simde_vst3_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int32x2x3_t val) { simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else int32_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -207,12 +270,20 @@ simde_vst3_s64(int64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_int64x1x3_t val) { simde_int64x1_private a_[3] = { simde_int64x1_to_private(val.val[0]), simde_int64x1_to_private(val.val[1]), simde_int64x1_to_private(val.val[2]) }; - simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); - simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); - simde_memcpy(&ptr[2], &a_[2].values, 
sizeof(a_[2].values)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x3_t dest = __riscv_vlseg3e64_v_i64m1x3(ptr, 1); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e64_v_i64m1x3 (ptr, dest, 1); + #else + simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); + simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); + simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #endif #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vst3_s64 #define vst3_s64(a, b) simde_vst3_s64((a), (b)) #endif @@ -226,7 +297,13 @@ simde_vst3_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint8x8x3_t val) { simde_uint8x8_private a_[3] = { simde_uint8x8_to_private(val.val[0]), simde_uint8x8_to_private(val.val[1]), simde_uint8x8_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e8_v_u8m1x3 (ptr, dest, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_[0].values, a_[1].values, 0, 8, 3, 1, 9, 4, 2, 10); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(8, 8, r0, a_[2].values, @@ -246,7 +323,7 @@ simde_vst3_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint8x8x3_t val) { simde_memcpy(&ptr[16], &m2, sizeof(m2)); #else uint8_t buf[24]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 24 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -267,7 +344,13 @@ simde_vst3_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint16x4x3_t val) { simde_uint16x4_private a_[3] = { simde_uint16x4_to_private(val.val[0]), simde_uint16x4_to_private(val.val[1]), simde_uint16x4_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e16_v_u16m1x3 (ptr, dest, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_[0].values, a_[1].values, 0, 4, 1, 0); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(16, 8, r0, a_[2].values, @@ -287,7 +370,7 @@ simde_vst3_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint16x4x3_t val) { simde_memcpy(&ptr[8], &m2, sizeof(m2)); #else uint16_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 12 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -308,7 +391,13 @@ simde_vst3_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint32x2x3_t val) { simde_uint32x2_private a[3] = { simde_uint32x2_to_private(val.val[0]), simde_uint32x2_to_private(val.val[1]), simde_uint32x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1x3_t dest = 
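/*
 * The portable loops in this file all rely on the same index trick:
 * buf[i] = a[i % 3].values[i / 3] visits the output in interleaved order, so for
 * 2-lane members it yields { a0[0], a1[0], a2[0], a0[1], a1[1], a2[1] }.  The
 * patch only rewrites the loop bounds from the sizeof expression to the
 * equivalent literal element counts; the mapping itself is unchanged.  A
 * self-contained check of that claim, assuming plain 2-lane arrays:
 */
#include <assert.h>
int main(void) {
  int a[3][2] = { { 10, 11 }, { 20, 21 }, { 30, 31 } };
  int buf[6];
  for (int i = 0; i < 6; i++) {
    buf[i] = a[i % 3][i / 3];                              /* same mapping as the fallbacks */
  }
  assert(buf[0] == 10 && buf[1] == 20 && buf[2] == 30);    /* lane 0 of each member */
  assert(buf[3] == 11 && buf[4] == 21 && buf[5] == 31);    /* lane 1 of each member */
  return 0;
}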
__riscv_vlseg3e32_v_u32m1x3(ptr, 2); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 0, a[0].sv64); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 1, a[1].sv64); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 2, a[2].sv64); + __riscv_vsseg3e32_v_u32m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_BUG_GCC_100762) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(32, 8, a[1].values, a[2].values, 1, 3); @@ -317,7 +406,7 @@ simde_vst3_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint32x2x3_t val) { simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else uint32_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -338,16 +427,53 @@ simde_vst3_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x1x3_t val) { simde_uint64x1_private a_[3] = { simde_uint64x1_to_private(val.val[0]), simde_uint64x1_to_private(val.val[1]), simde_uint64x1_to_private(val.val[2]) }; - simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); - simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); - simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e64_v_u64m1x3 (ptr, dest, 1); + #else + simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); + simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); + simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #endif #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vst3_u64 #define vst3_u64(a, b) simde_vst3_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_float16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst3q_f16(ptr, val); + #else + simde_float16x8_private a_[3] = { simde_float16x8_to_private(val.val[0]), + simde_float16x8_to_private(val.val[1]), + simde_float16x8_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x3_t dest = __riscv_vlseg3e16_v_f16m1x3((_Float16 *)ptr, 8); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_f16m1_f16m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e16_v_f16m1x3 ((_Float16 *)ptr, dest, 8); + #else + simde_float16_t buf[24]; + for (size_t i = 0; i < 24 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3q_f16 + #define vst3q_f16(a, b) simde_vst3q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst3q_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float32x4x3_t val) { @@ -357,7 +483,13 @@ simde_vst3q_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float32x4x3_t simde_float32x4_private a_[3] = { simde_float32x4_to_private(val.val[0]), simde_float32x4_to_private(val.val[1]), 
simde_float32x4_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x3_t dest = __riscv_vlseg3e32_v_f32m1x3(ptr, 4); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_f32m1_f32m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e32_v_f32m1x3 (ptr, dest, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_[0].values, a_[1].values, 0, 4, 1, 0); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(32, 16, r0, a_[2].values, @@ -377,7 +509,7 @@ simde_vst3q_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_float32x4x3_t simde_memcpy(&ptr[8], &m2, sizeof(m2)); #else simde_float32_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 12 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -398,7 +530,13 @@ simde_vst3q_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_float64x2x3_t simde_float64x2_private a[3] = { simde_float64x2_to_private(val.val[0]), simde_float64x2_to_private(val.val[1]), simde_float64x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x3_t dest = __riscv_vlseg3e64_v_f64m1x3(ptr, 2); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 0, a[0].sv128); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 1, a[1].sv128); + dest = __riscv_vset_v_f64m1_f64m1x3 (dest, 2, a[2].sv128); + __riscv_vsseg3e64_v_f64m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[1].values, a[2].values, 1, 3); @@ -407,7 +545,7 @@ simde_vst3q_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_float64x2x3_t simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else simde_float64_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -428,7 +566,13 @@ simde_vst3q_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_int8x16x3_t val) { simde_int8x16_private a_[3] = { simde_int8x16_to_private(val.val[0]), simde_int8x16_to_private(val.val[1]), simde_int8x16_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x3_t dest = __riscv_vlseg3e8_v_i8m1x3(ptr, 16); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i8m1_i8m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e8_v_i8m1x3 (ptr, dest, 16); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_[0].values, a_[1].values, 0, 16, 6, 1, 17, 7, 2, 18, 8, 3, 19, 9, 4, 20, 10, 5); @@ -453,7 +597,7 @@ simde_vst3q_s8(int8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_int8x16x3_t val) { simde_memcpy(&ptr[32], &m2, sizeof(m2)); #else int8_t buf[48]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 48 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -474,7 +618,13 @@ simde_vst3q_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int16x8x3_t val) { simde_int16x8_private a_[3] = { 
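/*
 * For the two-lane q variants (f64/s64/u64) the SIMDE_SHUFFLE_VECTOR_ branch
 * builds three intermediate pairs and stores them back to back:
 *   r1 = { a0[0], a1[0] },  r2 = { a2[0], a0[1] },  r3 = { a1[1], a2[1] },
 * which concatenate to the interleaved sequence a0[0] a1[0] a2[0] a0[1] a1[1]
 * a2[1].  A scalar check of that equivalence (illustrative only):
 */
#include <assert.h>
#include <string.h>
int main(void) {
  double a0[2] = { 1, 4 }, a1[2] = { 2, 5 }, a2[2] = { 3, 6 };
  double r1[2] = { a0[0], a1[0] };          /* SHUFFLE(a0, a1, 0, 2) */
  double r2[2] = { a2[0], a0[1] };          /* SHUFFLE(a2, a0, 0, 3) */
  double r3[2] = { a1[1], a2[1] };          /* SHUFFLE(a1, a2, 1, 3) */
  double out[6];
  memcpy(&out[0], r1, sizeof(r1));
  memcpy(&out[2], r2, sizeof(r2));
  memcpy(&out[4], r3, sizeof(r3));
  for (int i = 0; i < 6; i++) assert(out[i] == (double) (i + 1));
  return 0;
}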
simde_int16x8_to_private(val.val[0]), simde_int16x8_to_private(val.val[1]), simde_int16x8_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x3_t dest = __riscv_vlseg3e16_v_i16m1x3(ptr, 8); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i16m1_i16m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e16_v_i16m1x3 (ptr, dest, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_[0].values, a_[1].values, 0, 8, 3, 1, 9, 4, 2, 10); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(16, 16, r0, a_[2].values, @@ -494,7 +644,7 @@ simde_vst3q_s16(int16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_int16x8x3_t val) { simde_memcpy(&ptr[16], &m2, sizeof(m2)); #else int16_t buf[24]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 24 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -515,7 +665,13 @@ simde_vst3q_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int32x4x3_t val) { simde_int32x4_private a_[3] = { simde_int32x4_to_private(val.val[0]), simde_int32x4_to_private(val.val[1]), simde_int32x4_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x3_t dest = __riscv_vlseg3e32_v_i32m1x3(ptr, 4); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i32m1_i32m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e32_v_i32m1x3 (ptr, dest, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_[0].values, a_[1].values, 0, 4, 1, 0); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(32, 16, r0, a_[2].values, @@ -535,7 +691,7 @@ simde_vst3q_s32(int32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_int32x4x3_t val) { simde_memcpy(&ptr[8], &m2, sizeof(m2)); #else int32_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 12 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -556,7 +712,13 @@ simde_vst3q_s64(int64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int64x2x3_t val) { simde_int64x2_private a[3] = { simde_int64x2_to_private(val.val[0]), simde_int64x2_to_private(val.val[1]), simde_int64x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x3_t dest = __riscv_vlseg3e64_v_i64m1x3(ptr, 2); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 0, a[0].sv128); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 1, a[1].sv128); + dest = __riscv_vset_v_i64m1_i64m1x3 (dest, 2, a[2].sv128); + __riscv_vsseg3e64_v_i64m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[1].values, a[2].values, 1, 3); @@ -565,7 +727,7 @@ simde_vst3q_s64(int64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_int64x2x3_t val) { simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else int64_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -618,6 +780,12 @@ simde_vst3q_u8(uint8_t 
ptr[HEDLEY_ARRAY_PARAM(48)], simde_uint8x16x3_t val) { v128_t m2 = wasm_i8x16_shuffle(r2, r1, 0, 1, 18, 3, 4, 21, 6, 7, 24, 9, 10, 27, 12, 13, 30, 15); wasm_v128_store(ptr + 32, m2); + #elif defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e8_v_u8m1x3 (ptr, dest, 16); #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_[0].values, a_[1].values, 0, 16, 6, 1, 17, 7, 2, 18, 8, 3, 19, 9, @@ -643,7 +811,7 @@ simde_vst3q_u8(uint8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_uint8x16x3_t val) { simde_memcpy(&ptr[32], &m2, sizeof(m2)); #else uint8_t buf[48]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 48 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -665,7 +833,13 @@ simde_vst3q_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint16x8x3_t val) { simde_uint16x8_to_private(val.val[1]), simde_uint16x8_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e16_v_u16m1x3 (ptr, dest, 8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_[0].values, a_[1].values, 0, 8, 3, 1, 9, 4, 2, 10); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(16, 16, r0, a_[2].values, @@ -685,7 +859,7 @@ simde_vst3q_u16(uint16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_uint16x8x3_t val) { simde_memcpy(&ptr[16], &m2, sizeof(m2)); #else uint16_t buf[24]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 24 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -707,7 +881,13 @@ simde_vst3q_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint32x4x3_t val) { simde_uint32x4_to_private(val.val[1]), simde_uint32x4_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1x3_t dest = __riscv_vlseg3e32_v_u32m1x3(ptr, 4); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u32m1_u32m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e32_v_u32m1x3 (ptr, dest, 4); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a_[0].values) r0 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_[0].values, a_[1].values, 0, 4, 1, 0); __typeof__(a_[0].values) m0 = SIMDE_SHUFFLE_VECTOR_(32, 16, r0, a_[2].values, @@ -727,7 +907,7 @@ simde_vst3q_u32(uint32_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_uint32x4x3_t val) { simde_memcpy(&ptr[8], &m2, sizeof(m2)); #else uint32_t buf[12]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 12 ; i++) { buf[i] = a_[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -748,7 +928,13 @@ simde_vst3q_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint64x2x3_t val) { simde_uint64x2_private a[3] = { simde_uint64x2_to_private(val.val[0]), simde_uint64x2_to_private(val.val[1]), simde_uint64x2_to_private(val.val[2]) }; - #if defined(SIMDE_SHUFFLE_VECTOR_) + #if 
defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 0, a[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 1, a[1].sv128); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 2, a[2].sv128); + __riscv_vsseg3e64_v_u64m1x3 (ptr, dest, 2); + #elif defined(SIMDE_SHUFFLE_VECTOR_) __typeof__(a[0].values) r1 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[0].values, a[1].values, 0, 2); __typeof__(a[0].values) r2 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[2].values, a[0].values, 0, 3); __typeof__(a[0].values) r3 = SIMDE_SHUFFLE_VECTOR_(64, 16, a[1].values, a[2].values, 1, 3); @@ -757,7 +943,7 @@ simde_vst3q_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint64x2x3_t val) { simde_memcpy(&ptr[4], &r3, sizeof(r3)); #else uint64_t buf[6]; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + for (size_t i = 0; i < 6 ; i++) { buf[i] = a[i % 3].values[i / 3]; } simde_memcpy(ptr, buf, sizeof(buf)); @@ -769,6 +955,221 @@ simde_vst3q_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_uint64x2x3_t val) { #define vst3q_u64(a, b) simde_vst3q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_poly8x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst3_p8(ptr, val); + #else + simde_poly8x8_private a_[3] = { simde_poly8x8_to_private(val.val[0]), + simde_poly8x8_to_private(val.val[1]), + simde_poly8x8_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e8_v_u8m1x3 (ptr, dest, 8); + #else + simde_poly8_t buf[24]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3_p8 + #define vst3_p8(a, b) simde_vst3_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_poly16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst3_p16(ptr, val); + #else + simde_poly16x4_private a_[3] = { simde_poly16x4_to_private(val.val[0]), + simde_poly16x4_to_private(val.val[1]), + simde_poly16x4_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e16_v_u16m1x3 (ptr, dest, 4); + #else + simde_poly16_t buf[12]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3_p16 + #define vst3_p16(a, b) simde_vst3_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x1x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + vst3_p64(ptr, val); + #else + simde_poly64x1_private a_[3] = { simde_poly64x1_to_private(val.val[0]), + simde_poly64x1_to_private(val.val[1]), + simde_poly64x1_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x3_t dest 
= __riscv_vlseg3e64_v_u64m1x3(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 2, a_[2].sv64); + __riscv_vsseg3e64_v_u64m1x3 (ptr, dest, 1); + #else + simde_memcpy(ptr, &a_[0].values, sizeof(a_[0].values)); + simde_memcpy(&ptr[1], &a_[1].values, sizeof(a_[1].values)); + simde_memcpy(&ptr[2], &a_[2].values, sizeof(a_[2].values)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst3_p64 + #define vst3_p64(a, b) simde_vst3_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(48)], simde_poly8x16x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + vst3q_p8(ptr, val); + #else + simde_poly8x16_private a_[3] = {simde_poly8x16_to_private(val.val[0]), + simde_poly8x16_to_private(val.val[1]), + simde_poly8x16_to_private(val.val[2])}; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x3_t dest = __riscv_vlseg3e8_v_u8m1x3(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u8m1_u8m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e8_v_u8m1x3 (ptr, dest, 16); + #else + simde_poly8_t buf[48]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3q_p8 + #define vst3q_p8(a, b) simde_vst3q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_poly16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst3q_p16(ptr, val); + #else + simde_poly16x8_private a_[3] = { simde_poly16x8_to_private(val.val[0]), + simde_poly16x8_to_private(val.val[1]), + simde_poly16x8_to_private(val.val[2]) }; + + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x3_t dest = __riscv_vlseg3e16_v_u16m1x3(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u16m1_u16m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e16_v_u16m1x3 (ptr, dest, 8); + #else + simde_poly16_t buf[24]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3q_p16 + #define vst3q_p16(a, b) simde_vst3q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(6)], simde_poly64x2x3_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst3q_p64(ptr, val); + #else + simde_poly64x2_private a_[3] = { simde_poly64x2_to_private(val.val[0]), + simde_poly64x2_to_private(val.val[1]), + simde_poly64x2_to_private(val.val[2]) }; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x3_t dest = __riscv_vlseg3e64_v_u64m1x3(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u64m1_u64m1x3 (dest, 2, a_[2].sv128); + __riscv_vsseg3e64_v_u64m1x3 (ptr, dest, 2); + #else + simde_poly64_t buf[6]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst3q_p64 + #define vst3q_p64(a, b) simde_vst3q_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(12)], simde_bfloat16x4x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst3_bf16(ptr, val); + #else + simde_bfloat16x4_private a[3] = { simde_bfloat16x4_to_private(val.val[0]), + simde_bfloat16x4_to_private(val.val[1]), + simde_bfloat16x4_to_private(val.val[2]) }; + simde_bfloat16_t buf[12]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst3_bf16 + #define vst3_bf16(a, b) simde_vst3_bf16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(24)], simde_bfloat16x8x3_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst3q_bf16(ptr, val); + #else + simde_bfloat16x8_private a_[3] = { simde_bfloat16x8_to_private(val.val[0]), + simde_bfloat16x8_to_private(val.val[1]), + simde_bfloat16x8_to_private(val.val[2]) }; + simde_bfloat16_t buf[24]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 3 ; i++) { + buf[i] = a_[i % 3].values[i / 3]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst3q_bf16 + #define vst3q_bf16(a, b) simde_vst3q_bf16((a), (b)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/arm/neon/st3_lane.h b/lib/simd_wrapper/simde/arm/neon/st3_lane.h index ba3283b2442..d8325b9a3f8 100644 --- a/lib/simd_wrapper/simde/arm/neon/st3_lane.h +++ b/lib/simd_wrapper/simde/arm/neon/st3_lane.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ST3_LANE_H) @@ -189,6 +190,25 @@ simde_vst3_lane_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x1x3_t val, #define vst3_lane_u64(a, b, c) simde_vst3_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float16x4x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_4_NO_RESULT_(vst3_lane_f16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_float16x4_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_float16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3_lane_f16 + #define vst3_lane_f16(a, b, c) simde_vst3_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst3_lane_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float32x2x3_t val, const int lane) @@ -380,6 +400,25 @@ simde_vst3q_lane_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_uint64x2x3_t val #define vst3q_lane_u64(a, b, c) simde_vst3q_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float16x8x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst3q_lane_f16, 
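/*
 * The #undef/#define pairs guarded by the *_ENABLE_NATIVE_ALIASES macros are
 * what let unprefixed NEON spellings compile on non-ARM targets: when the user
 * defines SIMDE_ENABLE_NATIVE_ALIASES and the corresponding ISA is unavailable,
 * vst3q_p16(...) expands to simde_vst3q_p16(...).  A usage sketch, assuming the
 * simde/ directory is on the include path (the surrounding function is
 * hypothetical):
 */
#define SIMDE_ENABLE_NATIVE_ALIASES
#include "simde/arm/neon.h"

void deinterleave_store(simde_poly16_t out[24], simde_poly16x8x3_t planes) {
  /* Portable everywhere; on real AArch32/AArch64 this is the native ST3. */
  vst3q_p16(out, planes);
}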
HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_float16x8_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_float16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3q_lane_f16 + #define vst3q_lane_f16(a, b, c) simde_vst3q_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst3q_lane_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float32x4x3_t val, const int lane) @@ -418,6 +457,159 @@ simde_vst3q_lane_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_float64x2 #define vst3q_lane_f64(a, b, c) simde_vst3q_lane_f64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_lane_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly8x8x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_8_NO_RESULT_(vst3_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x8_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_poly8x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3_lane_p8 + #define vst3_lane_p8(a, b, c) simde_vst3_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_lane_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly16x4x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_4_NO_RESULT_(vst3_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x4_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_poly16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3_lane_p16 + #define vst3_lane_p16(a, b, c) simde_vst3_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_lane_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x1x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + HEDLEY_STATIC_CAST(void, lane); + vst3_lane_p64(ptr, val, 0); + #else + simde_poly64x1_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_poly64x1_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst3_lane_p64 + #define vst3_lane_p64(a, b, c) simde_vst3_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_lane_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly8x16x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + SIMDE_CONSTIFY_16_NO_RESULT_(vst3q_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x16_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_poly8x16_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst3q_lane_p8 + #define vst3q_lane_p8(a, b, c) simde_vst3q_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_lane_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly16x8x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_8_NO_RESULT_(vst3q_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x8_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = 
simde_poly16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst3q_lane_p16 + #define vst3q_lane_p16(a, b, c) simde_vst3q_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_lane_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_poly64x2x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + SIMDE_CONSTIFY_2_NO_RESULT_(vst3q_lane_p64, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly64x2_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_poly64x2_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst3q_lane_p64 + #define vst3q_lane_p64(a, b, c) simde_vst3q_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_bfloat16x4x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_NO_RESULT_(vst3_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x4_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_bfloat16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst3_lane_bf16 + #define vst3_lane_bf16(a, b, c) simde_vst3_lane_bf16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst3q_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(3)], simde_bfloat16x8x3_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst3q_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x8_private r; + for (size_t i = 0 ; i < 3 ; i++) { + r = simde_bfloat16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst3q_lane_bf16 + #define vst3q_lane_bf16(a, b, c) simde_vst3q_lane_bf16((a), (b), (c)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/arm/neon/st4.h b/lib/simd_wrapper/simde/arm/neon/st4.h index 2ccb1c3dd89..6b0b4706bbb 100644 --- a/lib/simd_wrapper/simde/arm/neon/st4.h +++ b/lib/simd_wrapper/simde/arm/neon/st4.h @@ -23,6 +23,8 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Chi-Wei Chu (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_ST4_H) @@ -36,19 +38,57 @@ SIMDE_BEGIN_DECLS_ #if !defined(SIMDE_BUG_INTEL_857088) +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_f16(simde_float16_t *ptr, simde_float16x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst4_f16(ptr, val); + #else + simde_float16x4_private a_[4] = { simde_float16x4_to_private(val.val[0]), simde_float16x4_to_private(val.val[1]), + simde_float16x4_to_private(val.val[2]), simde_float16x4_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x4_t dest = __riscv_vlseg4e16_v_f16m1x4((_Float16 *)ptr, 4); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 2, 
a_[2].sv64); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e16_v_f16m1x4 ((_Float16 *)ptr, dest, 4); + #else + simde_float16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4_f16 + #define vst4_f16(a, b) simde_vst4_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst4_f32(simde_float32_t *ptr, simde_float32x2x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_f32(ptr, val); #else - simde_float32_t buf[8]; simde_float32x2_private a_[4] = { simde_float32x2_to_private(val.val[0]), simde_float32x2_to_private(val.val[1]), simde_float32x2_to_private(val.val[2]), simde_float32x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x4_t dest = __riscv_vlseg4e32_v_f32m1x4(ptr, 2); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e32_v_f32m1x4 (ptr, dest, 2); + #else + simde_float32_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -62,13 +102,22 @@ simde_vst4_f64(simde_float64_t *ptr, simde_float64x1x4_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst4_f64(ptr, val); #else - simde_float64_t buf[4]; simde_float64x1_private a_[4] = { simde_float64x1_to_private(val.val[0]), simde_float64x1_to_private(val.val[1]), simde_float64x1_to_private(val.val[2]), simde_float64x1_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x4_t dest = __riscv_vlseg4e64_v_f64m1x4(ptr, 1); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e64_v_f64m1x4(ptr, dest, 1); + #else + simde_float64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -82,13 +131,22 @@ simde_vst4_s8(int8_t *ptr, simde_int8x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_s8(ptr, val); #else - int8_t buf[32]; simde_int8x8_private a_[4] = { simde_int8x8_to_private(val.val[0]), simde_int8x8_to_private(val.val[1]), simde_int8x8_to_private(val.val[2]), simde_int8x8_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x4_t dest = __riscv_vlseg4e8_v_i8m1x4(ptr, 8); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 1, a_[1].sv64); + dest = 
__riscv_vset_v_i8m1_i8m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e8_v_i8m1x4(ptr, dest, 8); + #else + int8_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -102,13 +160,22 @@ simde_vst4_s16(int16_t *ptr, simde_int16x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_s16(ptr, val); #else - int16_t buf[16]; simde_int16x4_private a_[4] = { simde_int16x4_to_private(val.val[0]), simde_int16x4_to_private(val.val[1]), simde_int16x4_to_private(val.val[2]), simde_int16x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x4_t dest = __riscv_vlseg4e16_v_i16m1x4(ptr, 4); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e16_v_i16m1x4 (ptr, dest, 4); + #else + int16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -122,13 +189,22 @@ simde_vst4_s32(int32_t *ptr, simde_int32x2x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_s32(ptr, val); #else - int32_t buf[8]; simde_int32x2_private a_[4] = { simde_int32x2_to_private(val.val[0]), simde_int32x2_to_private(val.val[1]), simde_int32x2_to_private(val.val[2]), simde_int32x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x4_t dest = __riscv_vlseg4e32_v_i32m1x4(ptr, 2); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e32_v_i32m1x4 (ptr, dest, 2); + #else + int32_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -142,16 +218,25 @@ simde_vst4_s64(int64_t *ptr, simde_int64x1x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_s64(ptr, val); #else - int64_t buf[4]; simde_int64x1_private a_[4] = { simde_int64x1_to_private(val.val[0]), simde_int64x1_to_private(val.val[1]), simde_int64x1_to_private(val.val[2]), simde_int64x1_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x4_t dest = __riscv_vlseg4e64_v_i64m1x4(ptr, 1); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e64_v_i64m1x4 
(ptr, dest, 1); + #else + int64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vst4_s64 #define vst4_s64(a, b) simde_vst4_s64((a), (b)) #endif @@ -162,13 +247,22 @@ simde_vst4_u8(uint8_t *ptr, simde_uint8x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_u8(ptr, val); #else - uint8_t buf[32]; simde_uint8x8_private a_[4] = { simde_uint8x8_to_private(val.val[0]), simde_uint8x8_to_private(val.val[1]), simde_uint8x8_to_private(val.val[2]), simde_uint8x8_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e8_v_u8m1x4 (ptr, dest, 8); + #else + uint8_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -182,13 +276,22 @@ simde_vst4_u16(uint16_t *ptr, simde_uint16x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_u16(ptr, val); #else - uint16_t buf[16]; simde_uint16x4_private a_[4] = { simde_uint16x4_to_private(val.val[0]), simde_uint16x4_to_private(val.val[1]), simde_uint16x4_to_private(val.val[2]), simde_uint16x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e16_v_u16m1x4 (ptr, dest, 4); + #else + uint16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -202,13 +305,22 @@ simde_vst4_u32(uint32_t *ptr, simde_uint32x2x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_u32(ptr, val); #else - uint32_t buf[8]; simde_uint32x2_private a_[4] = { simde_uint32x2_to_private(val.val[0]), simde_uint32x2_to_private(val.val[1]), simde_uint32x2_to_private(val.val[2]), simde_uint32x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1x4_t dest = __riscv_vlseg4e32_v_u32m1x4(ptr, 2); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e32_v_u32m1x4 
(ptr, dest, 2); + #else + uint32_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -222,33 +334,80 @@ simde_vst4_u64(uint64_t *ptr, simde_uint64x1x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4_u64(ptr, val); #else - uint64_t buf[4]; simde_uint64x1_private a_[4] = { simde_uint64x1_to_private(val.val[0]), simde_uint64x1_to_private(val.val[1]), simde_uint64x1_to_private(val.val[2]), simde_uint64x1_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e64_v_u64m1x4 (ptr, dest, 1); + #else + uint64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) #undef vst4_u64 #define vst4_u64(a, b) simde_vst4_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_f16(simde_float16_t *ptr, simde_float16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + vst4q_f16(ptr, val); + #else + simde_float16x8_private a_[4] = { simde_float16x8_to_private(val.val[0]), simde_float16x8_to_private(val.val[1]), + simde_float16x8_to_private(val.val[2]), simde_float16x8_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH && (SIMDE_NATURAL_VECTOR_SIZE >= 128) + vfloat16m1x4_t dest = __riscv_vlseg4e16_v_f16m1x4((_Float16 *)ptr, 8); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_f16m1_f16m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e16_v_f16m1x4 ((_Float16 *)ptr, dest, 8); + #else + simde_float16_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4q_f16 + #define vst4q_f16(a, b) simde_vst4q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst4q_f32(simde_float32_t *ptr, simde_float32x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_f32(ptr, val); #else - simde_float32_t buf[16]; simde_float32x4_private a_[4] = { simde_float32x4_to_private(val.val[0]), simde_float32x4_to_private(val.val[1]), simde_float32x4_to_private(val.val[2]), simde_float32x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat32m1x4_t dest = __riscv_vlseg4e32_v_f32m1x4(ptr, 4); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 1, a_[1].sv128); + dest = 
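/*
 * A small usage sketch for the st4 family (function name and values are
 * illustrative, not from the patch): simde_vst4_f32 takes four 2-lane member
 * vectors and writes the 8 floats out interleaved, i.e. lane 0 of every member
 * first, then lane 1 of every member.
 */
#include "simde/arm/neon.h"

void store_xyzw(simde_float32_t out[8]) {
  simde_float32x2x4_t v;
  v.val[0] = simde_vdup_n_f32(1.0f);   /* x in both lanes */
  v.val[1] = simde_vdup_n_f32(2.0f);   /* y */
  v.val[2] = simde_vdup_n_f32(3.0f);   /* z */
  v.val[3] = simde_vdup_n_f32(4.0f);   /* w */
  simde_vst4_f32(out, v);              /* out = { 1,2,3,4, 1,2,3,4 } */
}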
__riscv_vset_v_f32m1_f32m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_f32m1_f32m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e32_v_f32m1x4 (ptr, dest, 4); + #else + simde_float32_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -262,13 +421,22 @@ simde_vst4q_f64(simde_float64_t *ptr, simde_float64x2x4_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst4q_f64(ptr, val); #else - simde_float64_t buf[8]; simde_float64x2_private a_[4] = { simde_float64x2_to_private(val.val[0]), simde_float64x2_to_private(val.val[1]), simde_float64x2_to_private(val.val[2]), simde_float64x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vfloat64m1x4_t dest = __riscv_vlseg4e64_v_f64m1x4(ptr, 2); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_f64m1_f64m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e64_v_f64m1x4 (ptr, dest, 2); + #else + simde_float64_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -282,13 +450,22 @@ simde_vst4q_s8(int8_t *ptr, simde_int8x16x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_s8(ptr, val); #else - int8_t buf[64]; simde_int8x16_private a_[4] = { simde_int8x16_to_private(val.val[0]), simde_int8x16_to_private(val.val[1]), simde_int8x16_to_private(val.val[2]), simde_int8x16_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint8m1x4_t dest = __riscv_vlseg4e8_v_i8m1x4(ptr, 16); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_i8m1_i8m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e8_v_i8m1x4 (ptr, dest, 16); + #else + int8_t buf[64]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -302,13 +479,22 @@ simde_vst4q_s16(int16_t *ptr, simde_int16x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_s16(ptr, val); #else - int16_t buf[32]; simde_int16x8_private a_[4] = { simde_int16x8_to_private(val.val[0]), simde_int16x8_to_private(val.val[1]), simde_int16x8_to_private(val.val[2]), simde_int16x8_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint16m1x4_t dest = __riscv_vlseg4e16_v_i16m1x4(ptr, 8); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i16m1_i16m1x4 (dest, 2, a_[2].sv128); + dest = 
__riscv_vset_v_i16m1_i16m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e16_v_i16m1x4 (ptr, dest, 8); + #else + int16_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -322,13 +508,22 @@ simde_vst4q_s32(int32_t *ptr, simde_int32x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_s32(ptr, val); #else - int32_t buf[16]; simde_int32x4_private a_[4] = { simde_int32x4_to_private(val.val[0]), simde_int32x4_to_private(val.val[1]), simde_int32x4_to_private(val.val[2]), simde_int32x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint32m1x4_t dest = __riscv_vlseg4e32_v_i32m1x4(ptr, 4); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_i32m1_i32m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e32_v_i32m1x4 (ptr, dest, 4); + #else + int32_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -342,13 +537,22 @@ simde_vst4q_s64(int64_t *ptr, simde_int64x2x4_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst4q_s64(ptr, val); #else - int64_t buf[8]; simde_int64x2_private a_[4] = { simde_int64x2_to_private(val.val[0]), simde_int64x2_to_private(val.val[1]), simde_int64x2_to_private(val.val[2]), simde_int64x2_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vint64m1x4_t dest = __riscv_vlseg4e64_v_i64m1x4(ptr, 2); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_i64m1_i64m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e64_v_i64m1x4 (ptr, dest, 2); + #else + int64_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) @@ -363,13 +567,22 @@ simde_vst4q_u8(uint8_t *ptr, simde_uint8x16x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_u8(ptr, val); #else - uint8_t buf[64]; simde_uint8x16_private a_[4] = { simde_uint8x16_to_private(val.val[0]), simde_uint8x16_to_private(val.val[1]), simde_uint8x16_to_private(val.val[2]), simde_uint8x16_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e8_v_u8m1x4 (ptr, dest, 16); + #else + uint8_t 
buf[64]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -383,13 +596,22 @@ simde_vst4q_u16(uint16_t *ptr, simde_uint16x8x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_u16(ptr, val); #else - uint16_t buf[32]; simde_uint16x8_private a_[4] = { simde_uint16x8_to_private(val.val[0]), simde_uint16x8_to_private(val.val[1]), simde_uint16x8_to_private(val.val[2]), simde_uint16x8_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e16_v_u16m1x4 (ptr, dest, 8); + #else + uint16_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -403,13 +625,22 @@ simde_vst4q_u32(uint32_t *ptr, simde_uint32x4x4_t val) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst4q_u32(ptr, val); #else - uint32_t buf[16]; simde_uint32x4_private a_[4] = { simde_uint32x4_to_private(val.val[0]), simde_uint32x4_to_private(val.val[1]), simde_uint32x4_to_private(val.val[2]), simde_uint32x4_to_private(val.val[3]) }; - for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { - buf[i] = a_[i % 4].values[i / 4]; - } - simde_memcpy(ptr, buf, sizeof(buf)); + #if defined(SIMDE_RISCV_V_NATIVE) + vuint32m1x4_t dest = __riscv_vlseg4e32_v_u32m1x4(ptr, 4); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u32m1_u32m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e32_v_u32m1x4 (ptr, dest, 4); + #else + uint32_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif #endif } #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) @@ -423,18 +654,241 @@ simde_vst4q_u64(uint64_t *ptr, simde_uint64x2x4_t val) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) vst4q_u64(ptr, val); #else - uint64_t buf[8]; simde_uint64x2_private a_[4] = { simde_uint64x2_to_private(val.val[0]), simde_uint64x2_to_private(val.val[1]), simde_uint64x2_to_private(val.val[2]), simde_uint64x2_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e64_v_u64m1x4 (ptr, dest, 2); + #else + uint64_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) 
+ #undef vst4q_u64 + #define vst4q_u64(a, b) simde_vst4q_u64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_p8(simde_poly8_t *ptr, simde_poly8x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst4_p8(ptr, val); + #else + simde_poly8x8_private a_[4] = { simde_poly8x8_to_private(val.val[0]), simde_poly8x8_to_private(val.val[1]), + simde_poly8x8_to_private(val.val[2]), simde_poly8x8_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(ptr, 8); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e8_v_u8m1x4 (ptr, dest, 8); + #else + simde_poly8_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4_p8 + #define vst4_p8(a, b) simde_vst4_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_p16(simde_poly16_t *ptr, simde_poly16x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst4_p16(ptr, val); + #else + simde_poly16x4_private a_[4] = { simde_poly16x4_to_private(val.val[0]), simde_poly16x4_to_private(val.val[1]), + simde_poly16x4_to_private(val.val[2]), simde_poly16x4_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(ptr, 4); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e16_v_u16m1x4 (ptr, dest, 4); + #else + simde_poly16_t buf[16]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4_p16 + #define vst4_p16(a, b) simde_vst4_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_p64(simde_poly64_t *ptr, simde_poly64x1x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + vst4_p64(ptr, val); + #else + simde_poly64x1_private a_[4] = { simde_poly64x1_to_private(val.val[0]), simde_poly64x1_to_private(val.val[1]), + simde_poly64x1_to_private(val.val[2]), simde_poly64x1_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(ptr, 1); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 0, a_[0].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 1, a_[1].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 2, a_[2].sv64); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 3, a_[3].sv64); + __riscv_vsseg4e64_v_u64m1x4 (ptr, dest, 1); + #else + simde_poly64_t buf[4]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst4_p64 + #define vst4_p64(a, b) simde_vst4_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_p8(simde_poly8_t *ptr, simde_poly8x16x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst4q_p8(ptr, val); + #else + simde_poly8x16_private a_[4] = { 
simde_poly8x16_to_private(val.val[0]), simde_poly8x16_to_private(val.val[1]), + simde_poly8x16_to_private(val.val[2]), simde_poly8x16_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint8m1x4_t dest = __riscv_vlseg4e8_v_u8m1x4(ptr, 16); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u8m1_u8m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e8_v_u8m1x4 (ptr, dest, 16); + #else + simde_poly8_t buf[64]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4q_p8 + #define vst4q_p8(a, b) simde_vst4q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_p16(simde_poly16_t *ptr, simde_poly16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst4q_p16(ptr, val); + #else + simde_poly16x8_private a_[4] = { simde_poly16x8_to_private(val.val[0]), simde_poly16x8_to_private(val.val[1]), + simde_poly16x8_to_private(val.val[2]), simde_poly16x8_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint16m1x4_t dest = __riscv_vlseg4e16_v_u16m1x4(ptr, 8); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u16m1_u16m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e16_v_u16m1x4 (ptr, dest, 8); + #else + simde_poly16_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4q_p16 + #define vst4q_p16(a, b) simde_vst4q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_p64(simde_poly64_t *ptr, simde_poly64x2x4_t val) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + vst4q_p64(ptr, val); + #else + simde_poly64x2_private a_[4] = { simde_poly64x2_to_private(val.val[0]), simde_poly64x2_to_private(val.val[1]), + simde_poly64x2_to_private(val.val[2]), simde_poly64x2_to_private(val.val[3]) }; + #if defined(SIMDE_RISCV_V_NATIVE) + vuint64m1x4_t dest = __riscv_vlseg4e64_v_u64m1x4(ptr, 2); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 0, a_[0].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 1, a_[1].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 2, a_[2].sv128); + dest = __riscv_vset_v_u64m1_u64m1x4 (dest, 3, a_[3].sv128); + __riscv_vsseg4e64_v_u64m1x4 (ptr, dest, 2); + #else + simde_poly64_t buf[8]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst4q_p64 + #define vst4q_p64(a, b) simde_vst4q_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_bf16(simde_bfloat16_t *ptr, simde_bfloat16x4x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst4_bf16(ptr, val); + #else + simde_bfloat16x4_private a_[4] = { simde_bfloat16x4_to_private(val.val[0]), simde_bfloat16x4_to_private(val.val[1]), + simde_bfloat16x4_to_private(val.val[2]), simde_bfloat16x4_to_private(val.val[3]) }; + simde_bfloat16_t buf[16]; for (size_t i = 0; i < 
(sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { buf[i] = a_[i % 4].values[i / 4]; } simde_memcpy(ptr, buf, sizeof(buf)); #endif } -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) - #undef vst4q_u64 - #define vst4q_u64(a, b) simde_vst4q_u64((a), (b)) +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst4_bf16 + #define vst4_bf16(a, b) simde_vst4_bf16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_bf16(simde_bfloat16_t *ptr, simde_bfloat16x8x4_t val) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + vst4q_bf16(ptr, val); + #else + simde_bfloat16x8_private a_[4] = { simde_bfloat16x8_to_private(val.val[0]), simde_bfloat16x8_to_private(val.val[1]), + simde_bfloat16x8_to_private(val.val[2]), simde_bfloat16x8_to_private(val.val[3]) }; + simde_bfloat16_t buf[32]; + for (size_t i = 0; i < (sizeof(val.val[0]) / sizeof(*ptr)) * 4 ; i++) { + buf[i] = a_[i % 4].values[i / 4]; + } + simde_memcpy(ptr, buf, sizeof(buf)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst4q_bf16 + #define vst4q_bf16(a, b) simde_vst4q_bf16((a), (b)) #endif #endif /* !defined(SIMDE_BUG_INTEL_857088) */ diff --git a/lib/simd_wrapper/simde/arm/neon/st4_lane.h b/lib/simd_wrapper/simde/arm/neon/st4_lane.h index e5101e46de2..4f8a5b655a9 100644 --- a/lib/simd_wrapper/simde/arm/neon/st4_lane.h +++ b/lib/simd_wrapper/simde/arm/neon/st4_lane.h @@ -23,6 +23,7 @@ * Copyright: * 2021 Evan Nemerson * 2021 Zhi An Ng (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ST4_LANE_H) @@ -190,6 +191,24 @@ simde_vst4_lane_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x1x4_t val, #define vst4_lane_u64(a, b, c) simde_vst4_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x4x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_float16x4_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_float16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } +} +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + #define simde_vst4_lane_f16(a, b, c) vst4_lane_f16((a), (b), (c)) +#endif +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4_lane_f16 + #define vst4_lane_f16(a, b, c) simde_vst4_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst4_lane_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x2x4_t val, const int lane) @@ -381,6 +400,25 @@ simde_vst4q_lane_u64(uint64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_uint64x2x4_t val #define vst4q_lane_u64(a, b, c) simde_vst4q_lane_u64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_lane_f16(simde_float16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float16x8x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst4q_lane_f16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_float16x8_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_float16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4q_lane_f16 + #define vst4q_lane_f16(a, b, c) simde_vst4q_lane_f16((a), (b), (c)) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_vst4q_lane_f32(simde_float32_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float32x4x4_t val, const 
int lane) @@ -420,6 +458,159 @@ simde_vst4q_lane_f64(simde_float64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_float64x2 #define vst4q_lane_f64(a, b, c) simde_vst4q_lane_f64((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_lane_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly8x8x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_8_NO_RESULT_(vst4_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x8_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_poly8x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4_lane_p8 + #define vst4_lane_p8(a, b, c) simde_vst4_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_lane_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly16x4x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_4_NO_RESULT_(vst4_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x4_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_poly16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4_lane_p16 + #define vst4_lane_p16(a, b, c) simde_vst4_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_lane_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x1x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 0) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + (void) lane; + vst4_lane_p64(ptr, val, 0); + #else + simde_poly64x1_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_poly64x1_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst4_lane_p64 + #define vst4_lane_p64(a, b, c) simde_vst4_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_lane_p8(simde_poly8_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly8x16x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 15) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + SIMDE_CONSTIFY_16_NO_RESULT_(vst4q_lane_p8, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly8x16_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_poly8x16_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst4q_lane_p8 + #define vst4q_lane_p8(a, b, c) simde_vst4q_lane_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_lane_p16(simde_poly16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly16x8x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + SIMDE_CONSTIFY_8_NO_RESULT_(vst4q_lane_p16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly16x8_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_poly16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vst4q_lane_p16 + #define vst4q_lane_p16(a, b, c) simde_vst4q_lane_p16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_lane_p64(simde_poly64_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_poly64x2x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + 
SIMDE_CONSTIFY_2_NO_RESULT_(vst4q_lane_p64, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_poly64x2_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_poly64x2_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vst4q_lane_p64 + #define vst4q_lane_p64(a, b, c) simde_vst4q_lane_p64((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_bfloat16x4x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_4_NO_RESULT_(vst4_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x4_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_bfloat16x4_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst4_lane_bf16 + #define vst4_lane_bf16(a, b, c) simde_vst4_lane_bf16((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_vst4q_lane_bf16(simde_bfloat16_t ptr[HEDLEY_ARRAY_PARAM(4)], simde_bfloat16x8x4_t val, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 7) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + SIMDE_CONSTIFY_8_NO_RESULT_(vst4q_lane_bf16, HEDLEY_UNREACHABLE(), lane, ptr, val); + #else + simde_bfloat16x8_private r; + for (size_t i = 0 ; i < 4 ; i++) { + r = simde_bfloat16x8_to_private(val.val[i]); + ptr[i] = r.values[lane]; + } + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vst4q_lane_bf16 + #define vst4q_lane_bf16(a, b, c) simde_vst4q_lane_bf16((a), (b), (c)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/arm/neon/sub.h b/lib/simd_wrapper/simde/arm/neon/sub.h index 85a9d501719..d540950445b 100644 --- a/lib/simd_wrapper/simde/arm/neon/sub.h +++ b/lib/simd_wrapper/simde/arm/neon/sub.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_SUB_H) @@ -33,6 +34,22 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16 +simde_vsubh_f16(simde_float16_t a, simde_float16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vsubh_f16(a, b); + #else + simde_float32 af = simde_float16_to_float32(a); + simde_float32 bf = simde_float16_to_float32(b); + return simde_float16_from_float32(af - bf); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsubh_f16 + #define vsubh_f16(a, b) simde_vsubh_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES int64_t simde_vsubd_s64(int64_t a, int64_t b) { @@ -61,6 +78,30 @@ simde_vsubd_u64(uint64_t a, uint64_t b) { #define vsubd_u64(a, b) simde_vsubd_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vsub_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vsub_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = simde_vsubh_f16(a_.values[i], b_.values[i]); + } + + return simde_float16x4_from_private(r_); + #endif +} +#if 
defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsub_f16 + #define vsub_f16(a, b) simde_vsub_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vsub_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -353,6 +394,32 @@ simde_vsub_u64(simde_uint64x1_t a, simde_uint64x1_t b) { #define vsub_u64(a, b) simde_vsub_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vsubq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vsubq_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + simde_float32_t tmp_a_ = simde_float16_to_float32(a_.values[i]); + simde_float32_t tmp_b_ = simde_float16_to_float32(b_.values[i]); + r_.values[i] = simde_float16_from_float32(tmp_a_ - tmp_b_); + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsubq_f16 + #define vsubq_f16(a, b) simde_vsubq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vsubq_f32(simde_float32x4_t a, simde_float32x4_t b) { diff --git a/lib/simd_wrapper/simde/arm/neon/subhn_high.h b/lib/simd_wrapper/simde/arm/neon/subhn_high.h new file mode 100644 index 00000000000..4a14749a105 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/subhn_high.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SUBHN_HIGH_H) +#define SIMDE_ARM_NEON_SUBHN_HIGH_H + +#include "subhn.h" +#include "combine.h" + +#include "reinterpret.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vsubhn_high_s16(r, a, b) vsubhn_high_s16((r), (a), (b)) +#else + #define simde_vsubhn_high_s16(r, a, b) simde_vcombine_s8(r, simde_vsubhn_s16(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsubhn_high_s16 + #define vsubhn_high_s16(r, a, b) simde_vsubhn_high_s16((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vsubhn_high_s32(r, a, b) vsubhn_high_s32((r), (a), (b)) +#else + #define simde_vsubhn_high_s32(r, a, b) simde_vcombine_s16(r, simde_vsubhn_s32(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsubhn_high_s32 + #define vsubhn_high_s32(r, a, b) simde_vsubhn_high_s32((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vsubhn_high_s64(r, a, b) vsubhn_high_s64((r), (a), (b)) +#else + #define simde_vsubhn_high_s64(r, a, b) simde_vcombine_s32(r, simde_vsubhn_s64(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsubhn_high_s64 + #define vsubhn_high_s64(r, a, b) simde_vsubhn_high_s64((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vsubhn_high_u16(r, a, b) vsubhn_high_u16((r), (a), (b)) +#else + #define simde_vsubhn_high_u16(r, a, b) simde_vcombine_u8(r, simde_vsubhn_u16(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsubhn_high_u16 + #define vsubhn_high_u16(r, a, b) simde_vsubhn_high_u16((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vsubhn_high_u32(r, a, b) vsubhn_high_u32((r), (a), (b)) +#else + #define simde_vsubhn_high_u32(r, a, b) simde_vcombine_u16(r, simde_vsubhn_u32(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsubhn_high_u32 + #define vsubhn_high_u32(r, a, b) simde_vsubhn_high_u32((r), (a), (b)) +#endif + +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + #define simde_vsubhn_high_u64(r, a, b) vsubhn_high_u64((r), (a), (b)) +#else + #define simde_vsubhn_high_u64(r, a, b) simde_vcombine_u32(r, simde_vsubhn_u64(a, b)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsubhn_high_u64 + #define vsubhn_high_u64(r, a, b) simde_vsubhn_high_u64((r), (a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SUBHN_HIGH_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/sudot_lane.h b/lib/simd_wrapper/simde/arm/neon/sudot_lane.h new file mode 100644 index 00000000000..6d3844bce78 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/sudot_lane.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions 
of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_SUDOT_LANE_H) +#define SIMDE_ARM_NEON_SUDOT_LANE_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vsudot_lane_s32(simde_int32x2_t r, simde_int8x8_t a, simde_uint8x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x2_t result; + simde_int32x2_private r_ = simde_int32x2_to_private(r); + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x2_from_private(r_); + + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + #define simde_vsudot_lane_s32(r, a, b, lane) vsudot_lane_s32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsudot_lane_s32 + #define vsudot_lane_s32(r, a, b, lane) simde_vsudot_lane_s32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vsudot_laneq_s32(simde_int32x2_t r, simde_int8x8_t a, simde_uint8x16_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x2_t result; + simde_int32x2_private r_ = simde_int32x2_to_private(r); + simde_int8x8_private a_ = simde_int8x8_to_private(a); + simde_uint8x16_private b_ = simde_uint8x16_to_private(b); + + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x2_from_private(r_); + + return result; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + #define simde_vsudot_laneq_s32(r, a, b, lane) vsudot_laneq_s32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsudot_laneq_s32 + #define vsudot_laneq_s32(r, a, b, lane) simde_vsudot_laneq_s32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vsudotq_laneq_s32(simde_int32x4_t r, simde_int8x16_t a, simde_uint8x16_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4_t result; + simde_int32x4_private r_ = simde_int32x4_to_private(r); + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_uint8x16_private b_ = simde_uint8x16_to_private(b); + + for(int i = 0 ; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 
; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x4_from_private(r_); + return result; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + #define simde_vsudotq_laneq_s32(r, a, b, lane) vsudotq_laneq_s32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vsudotq_laneq_s32 + #define vsudotq_laneq_s32(r, a, b, lane) simde_vsudotq_laneq_s32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vsudotq_lane_s32(simde_int32x4_t r, simde_int8x16_t a, simde_uint8x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x4_t result; + simde_int32x4_private r_ = simde_int32x4_to_private(r); + simde_int8x16_private a_ = simde_int8x16_to_private(a); + simde_uint8x8_private b_ = simde_uint8x8_to_private(b); + + for(int i = 0 ; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x4_from_private(r_); + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + #define simde_vsudotq_lane_s32(r, a, b, lane) vsudotq_lane_s32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vsudotq_lane_s32 + #define vsudotq_lane_s32(r, a, b, lane) simde_vsudotq_lane_s32((r), (a), (b), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_SUDOT_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/tbl.h b/lib/simd_wrapper/simde/arm/neon/tbl.h index 224e86d7ce5..3d0d841d65c 100644 --- a/lib/simd_wrapper/simde/arm/neon/tbl.h +++ b/lib/simd_wrapper/simde/arm/neon/tbl.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_TBL_H) @@ -235,6 +236,68 @@ simde_vtbl4_s8(simde_int8x8x4_t a, simde_int8x8_t b) { #define vtbl4_s8(a, b) simde_vtbl4_s8((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbl1_p8(simde_poly8x8_t a, simde_uint8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbl1_p8(a, b); + #else + return simde_vreinterpret_p8_u8(simde_vtbl1_u8(simde_vreinterpret_u8_p8(a), b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbl1_p8 + #define vtbl1_p8(a, b) simde_vtbl1_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbl2_p8(simde_poly8x8x2_t a, simde_uint8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbl2_p8(a, b); + #else + simde_uint8x8x2_t a_; + simde_memcpy(&a_, &a, sizeof(a_)); + return simde_vreinterpret_p8_u8(simde_vtbl2_u8(a_, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbl2_p8 + #define vtbl2_p8(a, b) simde_vtbl2_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbl3_p8(simde_poly8x8x3_t a, simde_uint8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbl3_p8(a, b); + #else + simde_uint8x8x3_t a_; + simde_memcpy(&a_, &a, sizeof(a_)); + return simde_vreinterpret_p8_u8(simde_vtbl3_u8(a_, b)); + 
#endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbl3_p8 + #define vtbl3_p8(a, b) simde_vtbl3_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbl4_p8(simde_poly8x8x4_t a, simde_uint8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbl4_p8(a, b); + #else + simde_uint8x8x4_t a_; + simde_memcpy(&a_, &a, sizeof(a_)); + return simde_vreinterpret_p8_u8(simde_vtbl4_u8(a_, b)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbl4_p8 + #define vtbl4_p8(a, b) simde_vtbl4_p8((a), (b)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/arm/neon/tbx.h b/lib/simd_wrapper/simde/arm/neon/tbx.h index 4e2c639f094..0b2cae22267 100644 --- a/lib/simd_wrapper/simde/arm/neon/tbx.h +++ b/lib/simd_wrapper/simde/arm/neon/tbx.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_TBX_H) @@ -247,6 +248,74 @@ simde_vtbx4_s8(simde_int8x8_t a, simde_int8x8x4_t b, simde_int8x8_t c) { #define vtbx4_s8(a, b, c) simde_vtbx4_s8((a), (b), (c)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbx1_p8(simde_poly8x8_t a, simde_poly8x8_t b, simde_uint8x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbx1_p8(a, b, c); + #else + return simde_vreinterpret_p8_u8(simde_vtbx1_u8(simde_vreinterpret_u8_p8(a), simde_vreinterpret_u8_p8(b), c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbx1_p8 + #define vtbx1_p8(a, b, c) simde_vtbx1_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbx2_p8(simde_poly8x8_t a, simde_poly8x8x2_t b, simde_uint8x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbx2_p8(a, b, c); + #else + simde_uint8x8x2_t b_; + simde_memcpy(&b_, &b, sizeof(b_)); + return simde_vreinterpret_p8_u8(simde_vtbx2_u8(simde_vreinterpret_u8_p8(a), + b_, + c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbx2_p8 + #define vtbx2_p8(a, b, c) simde_vtbx2_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbx3_p8(simde_poly8x8_t a, simde_poly8x8x3_t b, simde_uint8x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbx3_p8(a, b, c); + #else + simde_uint8x8x3_t b_; + simde_memcpy(&b_, &b, sizeof(b_)); + return simde_vreinterpret_p8_u8(simde_vtbx3_u8(simde_vreinterpret_u8_p8(a), + b_, + c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbx3_p8 + #define vtbx3_p8(a, b, c) simde_vtbx3_p8((a), (b), (c)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtbx4_p8(simde_poly8x8_t a, simde_poly8x8x4_t b, simde_uint8x8_t c) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtbx4_p8(a, b, c); + #else + simde_uint8x8x4_t b_; + simde_memcpy(&b_, &b, sizeof(b_)); + return simde_vreinterpret_p8_u8(simde_vtbx4_u8(simde_vreinterpret_u8_p8(a), + b_, + c)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtbx4_p8 + #define vtbx4_p8(a, b, c) simde_vtbx4_p8((a), (b), (c)) +#endif + #endif /* !defined(SIMDE_BUG_INTEL_857088) */ SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/arm/neon/trn.h b/lib/simd_wrapper/simde/arm/neon/trn.h index 9f9184849cd..8dfdf0efbf5 100644 --- a/lib/simd_wrapper/simde/arm/neon/trn.h +++ b/lib/simd_wrapper/simde/arm/neon/trn.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned 
by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_TRN_H) && !defined(SIMDE_BUG_INTEL_857088) @@ -36,6 +37,21 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x2_t +simde_vtrn_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vtrn_f16(a, b); + #else + simde_float16x4x2_t r = { { simde_vtrn1_f16(a, b), simde_vtrn2_f16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtrn_f16 + #define vtrn_f16(a, b) simde_vtrn_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2x2_t simde_vtrn_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -141,6 +157,21 @@ simde_vtrn_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vtrn_u32(a, b) simde_vtrn_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x2_t +simde_vtrnq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vtrnq_f16(a, b); + #else + simde_float16x8x2_t r = { { simde_vtrn1q_f16(a, b), simde_vtrn2q_f16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtrnq_f16 + #define vtrnq_f16(a, b) simde_vtrnq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4x2_t simde_vtrnq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -246,6 +277,66 @@ simde_vtrnq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #define vtrnq_u32(a, b) simde_vtrnq_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x2_t +simde_vtrn_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtrn_p8(a, b); + #else + simde_poly8x8x2_t r = { { simde_vtrn1_p8(a, b), simde_vtrn2_p8(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtrn_p8 + #define vtrn_p8(a, b) simde_vtrn_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x2_t +simde_vtrn_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtrn_p16(a, b); + #else + simde_poly16x4x2_t r = { { simde_vtrn1_p16(a, b), simde_vtrn2_p16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtrn_p16 + #define vtrn_p16(a, b) simde_vtrn_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x2_t +simde_vtrnq_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtrnq_p8(a, b); + #else + simde_poly8x16x2_t r = { { simde_vtrn1q_p8(a, b), simde_vtrn2q_p8(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtrnq_p8 + #define vtrnq_p8(a, b) simde_vtrnq_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x2_t +simde_vtrnq_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtrnq_p16(a, b); + #else + simde_poly16x8x2_t r = { { simde_vtrn1q_p16(a, b), simde_vtrn2q_p16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtrnq_p16 + #define vtrnq_p16(a, b) simde_vtrnq_p16((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/trn1.h b/lib/simd_wrapper/simde/arm/neon/trn1.h index f3b1521aa7b..47d35ceafc6 100644 --- a/lib/simd_wrapper/simde/arm/neon/trn1.h +++ 
b/lib/simd_wrapper/simde/arm/neon/trn1.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_TRN1_H) @@ -34,6 +35,33 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vtrn1_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vtrn1_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx]; + r_.values[idx | 1] = b_.values[idx]; + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn1_f16 + #define vtrn1_f16(a, b) simde_vtrn1_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vtrn1_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -223,6 +251,33 @@ simde_vtrn1_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vtrn1_u32(a, b) simde_vtrn1_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vtrn1q_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vtrn1q_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx]; + r_.values[idx | 1] = b_.values[idx]; + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn1q_f16 + #define vtrn1q_f16(a, b) simde_vtrn1q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vtrn1q_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -494,6 +549,141 @@ simde_vtrn1q_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vtrn1q_u64(a, b) simde_vtrn1q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtrn1_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn1_p8(a, b); + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx]; + r_.values[idx | 1] = b_.values[idx]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn1_p8 + #define vtrn1_p8(a, b) simde_vtrn1_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vtrn1_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn1_p16(a, b); + #else + simde_poly16x4_private + r_, + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; 
+ r_.values[idx] = a_.values[idx]; + r_.values[idx | 1] = b_.values[idx]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn1_p16 + #define vtrn1_p16(a, b) simde_vtrn1_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vtrn1q_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn1q_p8(a, b); + #else + simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx]; + r_.values[idx | 1] = b_.values[idx]; + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn1q_p8 + #define vtrn1q_p8(a, b) simde_vtrn1q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vtrn1q_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn1q_p16(a, b); + #else + simde_poly16x8_private + r_, + a_ = simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx]; + r_.values[idx | 1] = b_.values[idx]; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn1q_p16 + #define vtrn1q_p16(a, b) simde_vtrn1q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vtrn1q_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn1q_p64(a, b); + #else + simde_poly64x2_private + r_, + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx]; + r_.values[idx | 1] = b_.values[idx]; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn1q_p64 + #define vtrn1q_p64(a, b) simde_vtrn1q_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/trn2.h b/lib/simd_wrapper/simde/arm/neon/trn2.h index 31bd7dc4e77..e2d06304003 100644 --- a/lib/simd_wrapper/simde/arm/neon/trn2.h +++ b/lib/simd_wrapper/simde/arm/neon/trn2.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_TRN2_H) @@ -34,6 +35,33 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vtrn2_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vtrn2_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] 
= a_.values[idx | 1]; + r_.values[idx | 1] = b_.values[idx | 1]; + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn2_f16 + #define vtrn2_f16(a, b) simde_vtrn2_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vtrn2_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -223,6 +251,33 @@ simde_vtrn2_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vtrn2_u32(a, b) simde_vtrn2_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vtrn2q_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vtrn2q_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx | 1]; + r_.values[idx | 1] = b_.values[idx | 1]; + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn2q_f16 + #define vtrn2q_f16(a, b) simde_vtrn2q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vtrn2q_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -493,6 +548,141 @@ simde_vtrn2q_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vtrn2q_u64(a, b) simde_vtrn2q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vtrn2_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn2_p8(a, b); + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx | 1]; + r_.values[idx | 1] = b_.values[idx | 1]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn2_p8 + #define vtrn2_p8(a, b) simde_vtrn2_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vtrn2_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn2_p16(a, b); + #else + simde_poly16x4_private + r_, + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx | 1]; + r_.values[idx | 1] = b_.values[idx | 1]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn2_p16 + #define vtrn2_p16(a, b) simde_vtrn2_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vtrn2q_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn2q_p8(a, b); + #else + simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx | 1]; + r_.values[idx | 1] = b_.values[idx | 1]; + } + + 
return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn2q_p8 + #define vtrn2q_p8(a, b) simde_vtrn2q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vtrn2q_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn2q_p16(a, b); + #else + simde_poly16x8_private + r_, + a_ = simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx | 1]; + r_.values[idx | 1] = b_.values[idx | 1]; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn2q_p16 + #define vtrn2q_p16(a, b) simde_vtrn2q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vtrn2q_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtrn2q_p64(a, b); + #else + simde_poly64x2_private + r_, + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[idx] = a_.values[idx | 1]; + r_.values[idx | 1] = b_.values[idx | 1]; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtrn2q_p64 + #define vtrn2q_p64(a, b) simde_vtrn2q_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/tst.h b/lib/simd_wrapper/simde/arm/neon/tst.h index 2434446229f..fdc146d2730 100644 --- a/lib/simd_wrapper/simde/arm/neon/tst.h +++ b/lib/simd_wrapper/simde/arm/neon/tst.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_TST_H) @@ -562,6 +563,102 @@ simde_vtst_u64(simde_uint64x1_t a, simde_uint64x1_t b) { #define vtst_u64(a, b) simde_vtst_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x8_t +simde_vtst_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtst_p8(a, b); + #else + simde_poly8x8_private + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + simde_uint8x8_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] & b_.values[i]) != 0) ? UINT8_MAX : 0; + } + + return simde_uint8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtst_p8 + #define vtst_p8(a, b) simde_vtst_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x1_t +simde_vtst_p64(simde_poly64x1_t a, simde_poly64x1_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtst_p64(a, b); + #else + simde_poly64x1_private + a_ = simde_poly64x1_to_private(a), + b_ = simde_poly64x1_to_private(b); + simde_uint64x1_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] & b_.values[i]) != 0) ? 
UINT64_MAX : 0; + } + + return simde_uint64x1_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtst_p64 + #define vtst_p64(a, b) simde_vtst_p64((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint8x16_t +simde_vtstq_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vtstq_p8(a, b); + #else + simde_poly8x16_private + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + simde_uint8x16_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] & b_.values[i]) != 0) ? UINT8_MAX : 0; + } + + return simde_uint8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vtstq_p8 + #define vtstq_p8(a, b) simde_vtstq_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_uint64x2_t +simde_vtstq_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vtstq_p64(a, b); + #else + simde_poly64x2_private + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + simde_uint64x2_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) { + r_.values[i] = ((a_.values[i] & b_.values[i]) != 0) ? UINT64_MAX : 0; + } + + return simde_uint64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vtstq_p64 + #define vtstq_p64(a, b) simde_vtstq_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/types.h b/lib/simd_wrapper/simde/arm/neon/types.h index 12bce8b8730..5a5954ac930 100644 --- a/lib/simd_wrapper/simde/arm/neon/types.h +++ b/lib/simd_wrapper/simde/arm/neon/types.h @@ -22,6 +22,8 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_ARM_NEON_TYPES_H) @@ -29,6 +31,7 @@ #include "../../simde-common.h" #include "../../simde-f16.h" +#include "../../simde-bf16.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -46,6 +49,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint8m1_t sv64; + #endif + } simde_int8x8_private; typedef union { @@ -54,6 +62,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint16m1_t sv64; + #endif + } simde_int16x4_private; typedef union { @@ -62,6 +75,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint32m1_t sv64; + #endif + } simde_int32x2_private; typedef union { @@ -70,6 +88,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint64m1_t sv64; + #endif + } simde_int64x1_private; typedef union { @@ -78,6 +101,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint8m1_t sv64; + #endif + } simde_uint8x8_private; typedef union { @@ -86,6 +114,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint16m1_t sv64; + #endif + } simde_uint16x4_private; typedef union { @@ -94,6 +127,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + 
fixed_vuint32m1_t sv64; + #endif + } simde_uint32x2_private; typedef union { @@ -102,6 +140,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint64m1_t sv64; + #endif + } simde_uint64x1_private; typedef union { @@ -114,6 +157,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + fixed_vfloat16m1_t sv64; + #endif + } simde_float16x4_private; typedef union { @@ -122,6 +170,11 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vfloat32m1_t sv64; + #endif + } simde_float32x2_private; typedef union { @@ -130,8 +183,34 @@ typedef union { #if defined(SIMDE_X86_MMX_NATIVE) __m64 m64; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vfloat64m1_t sv64; + #endif + } simde_float64x1_private; +typedef union { + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly8, values, 8); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint8m1_t sv64; + #endif +} simde_poly8x8_private; + +typedef union { + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly16, values, 8); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint16m1_t sv64; + #endif +} simde_poly16x4_private; + +typedef union { + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly64, values, 8); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint64m1_t sv64; + #endif +} simde_poly64x1_private; + typedef union { SIMDE_ARM_NEON_DECLARE_VECTOR(int8_t, values, 16); @@ -146,6 +225,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint8m1_t sv128; + #endif + } simde_int8x16_private; typedef union { @@ -162,6 +246,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint16m1_t sv128; + #endif + } simde_int16x8_private; typedef union { @@ -182,6 +271,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint32m1_t sv128; + #endif + } simde_int32x4_private; typedef union { @@ -198,6 +292,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vint64m1_t sv128; + #endif + } simde_int64x2_private; typedef union { @@ -214,6 +313,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint8m1_t sv128; + #endif + } simde_uint8x16_private; typedef union { @@ -230,6 +334,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint16m1_t sv128; + #endif + } simde_uint16x8_private; typedef union { @@ -246,6 +355,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint32m1_t sv128; + #endif + } simde_uint32x4_private; typedef union { @@ -262,6 +376,11 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint64m1_t sv128; + #endif + } simde_uint64x2_private; typedef union { @@ -271,7 +390,7 @@ typedef union { simde_float16 values[8]; #endif - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE_NATIVE) __m128 m128; #endif @@ -282,12 +401,17 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) && SIMDE_ARCH_RISCV_ZVFH + fixed_vfloat16m1_t 
sv128; + #endif + } simde_float16x8_private; typedef union { SIMDE_ARM_NEON_DECLARE_VECTOR(simde_float32, values, 16); - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE_NATIVE) __m128 m128; #endif @@ -298,12 +422,17 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vfloat32m1_t sv128; + #endif + } simde_float32x4_private; typedef union { SIMDE_ARM_NEON_DECLARE_VECTOR(simde_float64, values, 16); - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) || defined(SIMDE_X86_SVML_NATIVE) __m128d m128d; #endif @@ -314,10 +443,54 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; #endif + + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vfloat64m1_t sv128; + #endif + } simde_float64x2_private; +typedef union { + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly8, values, 16); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint8m1_t sv128; + #endif +} simde_poly8x16_private; + +typedef union { + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly16, values, 16); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint16m1_t sv128; + #endif +} simde_poly16x8_private; + +typedef union { + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_poly64, values, 16); + #if defined(SIMDE_RISCV_V_NATIVE) + fixed_vuint64m1_t sv128; + #endif +} simde_poly64x2_private; + +typedef union { + #if SIMDE_BFLOAT16_API == SIMDE_BFLOAT16_API_BF16 + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_bfloat16, values, 8); + #else + simde_bfloat16 values[4]; + #endif +} simde_bfloat16x4_private; + +typedef union { + #if SIMDE_BFLOAT16_API == SIMDE_BFLOAT16_API_BF16 + SIMDE_ARM_NEON_DECLARE_VECTOR(simde_bfloat16, values, 16); + #else + simde_bfloat16 values[8]; + #endif +} simde_bfloat16x8_private; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) typedef float32_t simde_float32_t; + typedef poly8_t simde_poly8_t; + typedef poly16_t simde_poly16_t; typedef int8x8_t simde_int8x8_t; typedef int16x4_t simde_int16x4_t; @@ -328,6 +501,8 @@ typedef union { typedef uint32x2_t simde_uint32x2_t; typedef uint64x1_t simde_uint64x1_t; typedef float32x2_t simde_float32x2_t; + typedef poly8x8_t simde_poly8x8_t; + typedef poly16x4_t simde_poly16x4_t; typedef int8x16_t simde_int8x16_t; typedef int16x8_t simde_int16x8_t; @@ -338,6 +513,8 @@ typedef union { typedef uint32x4_t simde_uint32x4_t; typedef uint64x2_t simde_uint64x2_t; typedef float32x4_t simde_float32x4_t; + typedef poly8x16_t simde_poly8x16_t; + typedef poly16x8_t simde_poly16x8_t; typedef int8x8x2_t simde_int8x8x2_t; typedef int16x4x2_t simde_int16x4x2_t; @@ -348,6 +525,8 @@ typedef union { typedef uint32x2x2_t simde_uint32x2x2_t; typedef uint64x1x2_t simde_uint64x1x2_t; typedef float32x2x2_t simde_float32x2x2_t; + typedef poly8x8x2_t simde_poly8x8x2_t; + typedef poly16x4x2_t simde_poly16x4x2_t; typedef int8x16x2_t simde_int8x16x2_t; typedef int16x8x2_t simde_int16x8x2_t; @@ -358,6 +537,8 @@ typedef union { typedef uint32x4x2_t simde_uint32x4x2_t; typedef uint64x2x2_t simde_uint64x2x2_t; typedef float32x4x2_t simde_float32x4x2_t; + typedef poly8x16x2_t simde_poly8x16x2_t; + typedef poly16x8x2_t simde_poly16x8x2_t; typedef int8x8x3_t simde_int8x8x3_t; typedef int16x4x3_t simde_int16x4x3_t; @@ -368,6 +549,8 @@ typedef union { typedef uint32x2x3_t simde_uint32x2x3_t; typedef uint64x1x3_t simde_uint64x1x3_t; typedef float32x2x3_t simde_float32x2x3_t; + typedef poly8x8x3_t simde_poly8x8x3_t; + typedef poly16x4x3_t simde_poly16x4x3_t; typedef int8x16x3_t simde_int8x16x3_t; typedef int16x8x3_t simde_int16x8x3_t; @@ -378,6 +561,8 
@@ typedef union { typedef uint32x4x3_t simde_uint32x4x3_t; typedef uint64x2x3_t simde_uint64x2x3_t; typedef float32x4x3_t simde_float32x4x3_t; + typedef poly8x16x3_t simde_poly8x16x3_t; + typedef poly16x8x3_t simde_poly16x8x3_t; typedef int8x8x4_t simde_int8x8x4_t; typedef int16x4x4_t simde_int16x4x4_t; @@ -388,6 +573,8 @@ typedef union { typedef uint32x2x4_t simde_uint32x2x4_t; typedef uint64x1x4_t simde_uint64x1x4_t; typedef float32x2x4_t simde_float32x2x4_t; + typedef poly8x8x4_t simde_poly8x8x4_t; + typedef poly16x4x4_t simde_poly16x4x4_t; typedef int8x16x4_t simde_int8x16x4_t; typedef int16x8x4_t simde_int16x8x4_t; @@ -398,6 +585,55 @@ typedef union { typedef uint32x4x4_t simde_uint32x4x4_t; typedef uint64x2x4_t simde_uint64x2x4_t; typedef float32x4x4_t simde_float32x4x4_t; + typedef poly8x16x4_t simde_poly8x16x4_t; + typedef poly16x8x4_t simde_poly16x8x4_t; + + #if defined(SIMDE_ARM_NEON_FP16) + typedef float16_t simde_float16_t; + typedef float16x4_t simde_float16x4_t; + typedef float16x4x2_t simde_float16x4x2_t; + typedef float16x4x3_t simde_float16x4x3_t; + typedef float16x4x4_t simde_float16x4x4_t; + typedef float16x8_t simde_float16x8_t; + typedef float16x8x2_t simde_float16x8x2_t; + typedef float16x8x3_t simde_float16x8x3_t; + typedef float16x8x4_t simde_float16x8x4_t; + #else + #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 + #endif + + #if defined(SIMDE_ARM_NEON_BF16) + typedef bfloat16_t simde_bfloat16_t; + typedef bfloat16x4_t simde_bfloat16x4_t; + typedef bfloat16x4x2_t simde_bfloat16x4x2_t; + typedef bfloat16x4x3_t simde_bfloat16x4x3_t; + typedef bfloat16x4x4_t simde_bfloat16x4x4_t; + typedef bfloat16x8_t simde_bfloat16x8_t; + typedef bfloat16x8x2_t simde_bfloat16x8x2_t; + typedef bfloat16x8x3_t simde_bfloat16x8x3_t; + typedef bfloat16x8x4_t simde_bfloat16x8x4_t; + #else + #define SIMDE_ARM_NEON_NEED_PORTABLE_BF16 + #endif + + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) + typedef poly64_t simde_poly64_t; + typedef poly64x1_t simde_poly64x1_t; + typedef poly64x2_t simde_poly64x2_t; + typedef poly64x1x2_t simde_poly64x1x2_t; + typedef poly64x2x2_t simde_poly64x2x2_t; + typedef poly64x1x3_t simde_poly64x1x3_t; + typedef poly64x2x3_t simde_poly64x2x3_t; + typedef poly64x1x4_t simde_poly64x1x4_t; + typedef poly64x2x4_t simde_poly64x2x4_t; + #if defined(SIMDE_ARCH_ARM_CRYPTO) + typedef poly128_t simde_poly128_t; + #else + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT + #endif + #else + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT + #endif #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) typedef float64_t simde_float64_t; @@ -417,16 +653,15 @@ typedef union { #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X2XN #endif - #if SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 - typedef float16_t simde_float16_t; - typedef float16x4_t simde_float16x4_t; - typedef float16x8_t simde_float16x8_t; - #else - #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 - #endif #elif (defined(SIMDE_X86_MMX_NATIVE) || defined(SIMDE_X86_SSE_NATIVE)) && defined(SIMDE_ARM_NEON_FORCE_NATIVE_TYPES) + #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 #define SIMDE_ARM_NEON_NEED_PORTABLE_F32 #define SIMDE_ARM_NEON_NEED_PORTABLE_F64 + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_BF16 #define SIMDE_ARM_NEON_NEED_PORTABLE_VXN #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X1XN @@ -462,7 +697,7 @@ typedef union { #define SIMDE_ARM_NEON_NEED_PORTABLE_F32X4 #endif - #if 
defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) || defined(SIMDE_X86_SVML_NATIVE) typedef __m128i simde_int8x16_t; typedef __m128i simde_int16x8_t; typedef __m128i simde_int32x4_t; @@ -483,11 +718,14 @@ typedef union { #define SIMDE_ARM_NEON_NEED_PORTABLE_U64X2 #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X2 #endif - - #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_ARM_NEON_FORCE_NATIVE_TYPES) #define SIMDE_ARM_NEON_NEED_PORTABLE_F32 #define SIMDE_ARM_NEON_NEED_PORTABLE_F64 + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_BF16 #define SIMDE_ARM_NEON_NEED_PORTABLE_64BIT @@ -507,8 +745,14 @@ typedef union { typedef v128_t simde_float32x4_t; typedef v128_t simde_float64x2_t; #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 #define SIMDE_ARM_NEON_NEED_PORTABLE_F32 #define SIMDE_ARM_NEON_NEED_PORTABLE_F64 + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_BF16 #define SIMDE_ARM_NEON_NEED_PORTABLE_64BIT #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X1XN @@ -531,9 +775,43 @@ typedef union { #define SIMDE_ARM_NEON_NEED_PORTABLE_I64X2 #define SIMDE_ARM_NEON_NEED_PORTABLE_U64X2 #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X2 - #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 #endif +#elif defined(SIMDE_RISCV_V_NATIVE) + + typedef fixed_vint8m1_t simde_int8x8_t; + typedef fixed_vint16m1_t simde_int16x4_t; + typedef fixed_vint32m1_t simde_int32x2_t; + typedef fixed_vint64m1_t simde_int64x1_t; + typedef fixed_vuint8m1_t simde_uint8x8_t; + typedef fixed_vuint16m1_t simde_uint16x4_t; + typedef fixed_vuint32m1_t simde_uint32x2_t; + typedef fixed_vuint64m1_t simde_uint64x1_t; + typedef fixed_vfloat32m1_t simde_float32x2_t; + typedef fixed_vfloat64m1_t simde_float64x1_t; + + typedef fixed_vint8m1_t simde_int8x16_t; + typedef fixed_vint16m1_t simde_int16x8_t; + typedef fixed_vint32m1_t simde_int32x4_t; + typedef fixed_vint64m1_t simde_int64x2_t; + typedef fixed_vuint8m1_t simde_uint8x16_t; + typedef fixed_vuint16m1_t simde_uint16x8_t; + typedef fixed_vuint32m1_t simde_uint32x4_t; + typedef fixed_vuint64m1_t simde_uint64x2_t; + typedef fixed_vfloat32m1_t simde_float32x4_t; + typedef fixed_vfloat64m1_t simde_float64x2_t; + #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 + #define SIMDE_ARM_NEON_NEED_PORTABLE_F32 + #define SIMDE_ARM_NEON_NEED_PORTABLE_F64 + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY + #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X1XN + #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X2XN + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_BF16 + #elif defined(SIMDE_VECTOR) typedef simde_float32 simde_float32_t; typedef simde_float64 simde_float64_t; @@ -562,14 +840,42 @@ typedef union { typedef simde_float16 simde_float16_t; typedef simde_float16_t simde_float16x4_t SIMDE_VECTOR(8); typedef simde_float16_t simde_float16x8_t SIMDE_VECTOR(16); + typedef struct simde_float16x4x2_t { + simde_float16x4_t val[2]; + } simde_float16x4x2_t; + typedef struct 
simde_float16x4x3_t { + simde_float16x4_t val[3]; + } simde_float16x4x3_t; + typedef struct simde_float16x4x4_t { + simde_float16x4_t val[4]; + } simde_float16x4x4_t; + typedef struct simde_float16x8x2_t { + simde_float16x8_t val[2]; + } simde_float16x8x2_t; + typedef struct simde_float16x8x3_t { + simde_float16x8_t val[3]; + } simde_float16x8x3_t; + typedef struct simde_float16x8x4_t { + simde_float16x8_t val[4]; + } simde_float16x8x4_t; #else #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 #endif + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_BF16 #define SIMDE_ARM_NEON_NEED_PORTABLE_VXN #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X1XN #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X2XN #else + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT + #define SIMDE_ARM_NEON_NEED_PORTABLE_POLY_VXN + #define SIMDE_ARM_NEON_NEED_PORTABLE_BF16 #define SIMDE_ARM_NEON_NEED_PORTABLE_F16 #define SIMDE_ARM_NEON_NEED_PORTABLE_F32 #define SIMDE_ARM_NEON_NEED_PORTABLE_F64 @@ -581,6 +887,114 @@ typedef union { #define SIMDE_ARM_NEON_NEED_PORTABLE_F64X2XN #endif +#if defined(SIMDE_ARM_NEON_NEED_PORTABLE_POLY) + typedef simde_poly8 simde_poly8_t; + typedef simde_poly16 simde_poly16_t; + + typedef simde_poly8x8_private simde_poly8x8_t; + typedef simde_poly16x4_private simde_poly16x4_t; + typedef simde_poly8x16_private simde_poly8x16_t; + typedef simde_poly16x8_private simde_poly16x8_t; +#endif + +#if defined(SIMDE_ARM_NEON_NEED_PORTABLE_POLY_64_BIT) + typedef simde_poly64 simde_poly64_t; + typedef simde_poly64x1_private simde_poly64x1_t; + typedef simde_poly64x2_private simde_poly64x2_t; + typedef struct simde_poly64x1x2_t { + simde_poly64x1_t val[2]; + } simde_poly64x1x2_t; + typedef struct simde_poly64x2x2_t { + simde_poly64x2_t val[2]; + } simde_poly64x2x2_t; + typedef struct simde_poly64x1x3_t { + simde_poly64x1_t val[3]; + } simde_poly64x1x3_t; + typedef struct simde_poly64x2x3_t { + simde_poly64x2_t val[3]; + } simde_poly64x2x3_t; + typedef struct simde_poly64x1x4_t { + simde_poly64x1_t val[4]; + } simde_poly64x1x4_t; + typedef struct simde_poly64x2x4_t { + simde_poly64x2_t val[4]; + } simde_poly64x2x4_t; +#endif + +#if defined(SIMDE_ARM_NEON_NEED_PORTABLE_POLY_128_BIT) + typedef simde_poly128 simde_poly128_t; +#endif + +#if defined(SIMDE_ARM_NEON_NEED_PORTABLE_POLY_VXN) + typedef struct simde_poly8x8x2_t { + simde_poly8x8_t val[2]; + } simde_poly8x8x2_t; + typedef struct simde_poly16x4x2_t { + simde_poly16x4_t val[2]; + } simde_poly16x4x2_t; + typedef struct simde_poly8x16x2_t { + simde_poly8x16_t val[2]; + } simde_poly8x16x2_t; + typedef struct simde_poly16x8x2_t { + simde_poly16x8_t val[2]; + } simde_poly16x8x2_t; + + typedef struct simde_poly8x8x3_t { + simde_poly8x8_t val[3]; + } simde_poly8x8x3_t; + typedef struct simde_poly16x4x3_t { + simde_poly16x4_t val[3]; + } simde_poly16x4x3_t; + typedef struct simde_poly8x16x3_t { + simde_poly8x16_t val[3]; + } simde_poly8x16x3_t; + typedef struct simde_poly16x8x3_t { + simde_poly16x8_t val[3]; + } simde_poly16x8x3_t; + + typedef struct simde_poly8x8x4_t { + simde_poly8x8_t val[4]; + } simde_poly8x8x4_t; + typedef struct simde_poly16x4x4_t { + simde_poly16x4_t val[4]; + } simde_poly16x4x4_t; + typedef struct simde_poly8x16x4_t { + simde_poly8x16_t val[4]; + } simde_poly8x16x4_t; + typedef struct 
simde_poly16x8x4_t { + simde_poly16x8_t val[4]; + } simde_poly16x8x4_t; +#endif + +#if defined(SIMDE_ARM_NEON_NEED_PORTABLE_BF16) + typedef simde_bfloat16 simde_bfloat16_t; + typedef simde_bfloat16x4_private simde_bfloat16x4_t; + typedef simde_bfloat16x8_private simde_bfloat16x8_t; + typedef struct simde_bfloat16x4x2_t { + simde_bfloat16x4_t val[2]; + } simde_bfloat16x4x2_t; + + typedef struct simde_bfloat16x8x2_t { + simde_bfloat16x8_t val[2]; + } simde_bfloat16x8x2_t; + + typedef struct simde_bfloat16x4x3_t { + simde_bfloat16x4_t val[3]; + } simde_bfloat16x4x3_t; + + typedef struct simde_bfloat16x8x3_t { + simde_bfloat16x8_t val[3]; + } simde_bfloat16x8x3_t; + + typedef struct simde_bfloat16x4x4_t { + simde_bfloat16x4_t val[4]; + } simde_bfloat16x4x4_t; + + typedef struct simde_bfloat16x8x4_t { + simde_bfloat16x8_t val[4]; + } simde_bfloat16x8x4_t; +#endif + #if defined(SIMDE_ARM_NEON_NEED_PORTABLE_I8X8) || defined(SIMDE_ARM_NEON_NEED_PORTABLE_64BIT) typedef simde_int8x8_private simde_int8x8_t; #endif @@ -647,6 +1061,25 @@ typedef union { typedef simde_float16 simde_float16_t; typedef simde_float16x4_private simde_float16x4_t; typedef simde_float16x8_private simde_float16x8_t; + + typedef struct simde_float16x4x2_t { + simde_float16x4_t val[2]; + } simde_float16x4x2_t; + typedef struct simde_float16x4x3_t { + simde_float16x4_t val[3]; + } simde_float16x4x3_t; + typedef struct simde_float16x4x4_t { + simde_float16x4_t val[4]; + } simde_float16x4x4_t; + typedef struct simde_float16x8x2_t { + simde_float16x8_t val[2]; + } simde_float16x8x2_t; + typedef struct simde_float16x8x3_t { + simde_float16x8_t val[3]; + } simde_float16x8x3_t; + typedef struct simde_float16x8x4_t { + simde_float16x8_t val[4]; + } simde_float16x8x4_t; #endif #if defined(SIMDE_ARM_NEON_NEED_PORTABLE_F32) typedef simde_float32 simde_float32_t; @@ -858,6 +1291,8 @@ typedef union { #endif #if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) typedef simde_float32_t float32_t; + typedef simde_poly8_t poly8_t; + typedef simde_poly16_t poly16_t; typedef simde_int8x8_t int8x8_t; typedef simde_int16x4_t int16x4_t; @@ -868,6 +1303,8 @@ typedef union { typedef simde_uint32x2_t uint32x2_t; typedef simde_uint64x1_t uint64x1_t; typedef simde_float32x2_t float32x2_t; + typedef simde_poly8x8_t poly8x8_t; + typedef simde_poly16x4_t poly16x4_t; typedef simde_int8x16_t int8x16_t; typedef simde_int16x8_t int16x8_t; @@ -878,6 +1315,8 @@ typedef union { typedef simde_uint32x4_t uint32x4_t; typedef simde_uint64x2_t uint64x2_t; typedef simde_float32x4_t float32x4_t; + typedef simde_poly8x16_t poly8x16_t; + typedef simde_poly16x8_t poly16x8_t; typedef simde_int8x8x2_t int8x8x2_t; typedef simde_int16x4x2_t int16x4x2_t; @@ -888,6 +1327,8 @@ typedef union { typedef simde_uint32x2x2_t uint32x2x2_t; typedef simde_uint64x1x2_t uint64x1x2_t; typedef simde_float32x2x2_t float32x2x2_t; + typedef simde_poly8x8x2_t poly8x8x2_t; + typedef simde_poly16x4x2_t poly16x4x2_t; typedef simde_int8x16x2_t int8x16x2_t; typedef simde_int16x8x2_t int16x8x2_t; @@ -898,6 +1339,8 @@ typedef union { typedef simde_uint32x4x2_t uint32x4x2_t; typedef simde_uint64x2x2_t uint64x2x2_t; typedef simde_float32x4x2_t float32x4x2_t; + typedef simde_poly8x16x2_t poly8x16x2_t; + typedef simde_poly16x8x2_t poly16x8x2_t; typedef simde_int8x8x3_t int8x8x3_t; typedef simde_int16x4x3_t int16x4x3_t; @@ -908,6 +1351,8 @@ typedef union { typedef simde_uint32x2x3_t uint32x2x3_t; typedef simde_uint64x1x3_t uint64x1x3_t; typedef simde_float32x2x3_t float32x2x3_t; + typedef simde_poly8x8x3_t 
poly8x8x3_t; + typedef simde_poly16x4x3_t poly16x4x3_t; typedef simde_int8x16x3_t int8x16x3_t; typedef simde_int16x8x3_t int16x8x3_t; @@ -918,6 +1363,8 @@ typedef union { typedef simde_uint32x4x3_t uint32x4x3_t; typedef simde_uint64x2x3_t uint64x2x3_t; typedef simde_float32x4x3_t float32x4x3_t; + typedef simde_poly8x16x3_t poly8x16x3_t; + typedef simde_poly16x8x3_t poly16x8x3_t; typedef simde_int8x8x4_t int8x8x4_t; typedef simde_int16x4x4_t int16x4x4_t; @@ -928,6 +1375,8 @@ typedef union { typedef simde_uint32x2x4_t uint32x2x4_t; typedef simde_uint64x1x4_t uint64x1x4_t; typedef simde_float32x2x4_t float32x2x4_t; + typedef simde_poly8x8x4_t poly8x8x4_t; + typedef simde_poly16x4x4_t poly16x4x4_t; typedef simde_int8x16x4_t int8x16x4_t; typedef simde_int16x8x4_t int16x8x4_t; @@ -938,6 +1387,18 @@ typedef union { typedef simde_uint32x4x4_t uint32x4x4_t; typedef simde_uint64x2x4_t uint64x2x4_t; typedef simde_float32x4x4_t float32x4x4_t; + typedef simde_poly8x16x4_t poly8x16x4_t; + typedef simde_poly16x8x4_t poly16x8x4_t; +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + typedef simde_poly64x1_t poly64x1_t; + typedef simde_poly64x2_t poly64x2_t; + typedef simde_poly64x1x2_t poly64x1x2_t; + typedef simde_poly64x2x2_t poly64x2x2_t; + typedef simde_poly64x1x3_t poly64x1x3_t; + typedef simde_poly64x2x3_t poly64x2x3_t; + typedef simde_poly64x1x4_t poly64x1x4_t; + typedef simde_poly64x2x4_t poly64x2x4_t; #endif #if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) typedef simde_float64_t float64_t; @@ -979,7 +1440,7 @@ typedef union { SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_float32x4_to_m128, __m128, simde_float32x4_t) SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_float32x4_from_m128, simde_float32x4_t, __m128) #endif -#if defined(SIMDE_X86_SSE2_NATIVE) +#if defined(SIMDE_X86_SSE2_NATIVE) || defined(SIMDE_X86_SVML_NATIVE) SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_int8x16_to_m128i, __m128i, simde_int8x16_t) SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_int16x8_to_m128i, __m128i, simde_int16x8_t) SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_int32x4_to_m128i, __m128i, simde_int32x4_t) @@ -1039,6 +1500,10 @@ SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(uint64x1) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(float16x4) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(float32x2) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(float64x1) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(poly8x8) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(poly16x4) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(poly64x1) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(bfloat16x4) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(int8x16) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(int16x8) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(int32x4) @@ -1047,9 +1512,13 @@ SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(uint8x16) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(uint16x8) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(uint32x4) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(uint64x2) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(poly8x16) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(poly16x8) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(poly64x2) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(float16x8) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(float32x4) SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(float64x2) +SIMDE_ARM_NEON_TYPE_DEFINE_CONVERSIONS_(bfloat16x8) SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/uqadd.h b/lib/simd_wrapper/simde/arm/neon/uqadd.h index 576fbb57618..42535de5e9e 100644 --- a/lib/simd_wrapper/simde/arm/neon/uqadd.h +++ b/lib/simd_wrapper/simde/arm/neon/uqadd.h @@ -33,6 +33,18 @@ 
 HEDLEY_DIAGNOSTIC_PUSH
 SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
 SIMDE_BEGIN_DECLS_
+// Workaround on ARM64 Windows due to a Windows SDK bug
+// https://developercommunity.visualstudio.com/t/In-arm64_neonh-vsqaddb_u8-vsqaddh_u16/10271747?sort=newest
+#if (defined _MSC_VER) && (defined SIMDE_ARM_NEON_A64V8_NATIVE) && (_MSC_VER < 1938)
+#pragma message ("Due to an MSVC bug, this version of MSVC is only supported through a workaround. Updating MSVC is recommended.")
+#undef vuqaddh_s16
+#define vuqaddh_s16(src1, src2) neon_suqadds16(__int16ToN16_v(src1), __uint16ToN16_v(src2)).n16_i16[0]
+#undef vuqadds_s32
+#define vuqadds_s32(src1, src2) _CopyInt32FromFloat(neon_suqadds32(_CopyFloatFromInt32(src1), _CopyFloatFromUInt32(src2)))
+#undef vuqaddd_s64
+#define vuqaddd_s64(src1, src2) neon_suqadds64(__int64ToN64_v(src1), __uint64ToN64_v(src2)).n64_i64[0]
+#endif
+
 SIMDE_FUNCTION_ATTRIBUTES
 int8_t
 simde_vuqaddb_s8(int8_t a, uint8_t b) {
diff --git a/lib/simd_wrapper/simde/arm/neon/usdot.h b/lib/simd_wrapper/simde/arm/neon/usdot.h
new file mode 100644
index 00000000000..d32769479d3
--- /dev/null
+++ b/lib/simd_wrapper/simde/arm/neon/usdot.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_USDOT_H) +#define SIMDE_ARM_NEON_USDOT_H + +#include "types.h" + +#include "add.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vusdot_s32(simde_int32x2_t r, simde_uint8x8_t a, simde_int8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + return vusdot_s32(r, a, b); + #else + simde_int32x2_private r_; + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx]); + } + r_.values[i] = acc; + } + return simde_vadd_s32(r, simde_int32x2_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vusdot_s32 + #define vusdot_s32(r, a, b) simde_vusdot_s32((r), (a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vusdotq_s32(simde_int32x4_t r, simde_uint8x16_t a, simde_int8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_MATMUL_INT8) + return vusdotq_s32(r, a, b); + #else + simde_int32x4_private r_; + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + for (int i = 0 ; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx]); + } + r_.values[i] = acc; + } + return simde_vaddq_s32(r, simde_int32x4_from_private(r_)); + #endif +} +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vusdotq_s32 + #define vusdotq_s32(r, a, b) simde_vusdotq_s32((r), (a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_USDOT_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/usdot_lane.h b/lib/simd_wrapper/simde/arm/neon/usdot_lane.h new file mode 100644 index 00000000000..6d8de889d12 --- /dev/null +++ b/lib/simd_wrapper/simde/arm/neon/usdot_lane.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_ARM_NEON_USDOT_LANE_H) +#define SIMDE_ARM_NEON_USDOT_LANE_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vusdot_lane_s32(simde_int32x2_t r, simde_uint8x8_t a, simde_int8x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x2_t result; + simde_int32x2_private r_ = simde_int32x2_to_private(r); + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x2_from_private(r_); + + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_FEATURE_MATMUL_INT8) + #define simde_vusdot_lane_s32(r, a, b, lane) vusdot_lane_s32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vusdot_lane_s32 + #define vusdot_lane_s32(r, a, b, lane) simde_vusdot_lane_s32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x2_t +simde_vusdot_laneq_s32(simde_int32x2_t r, simde_uint8x8_t a, simde_int8x16_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x2_t result; + simde_int32x2_private r_ = simde_int32x2_to_private(r); + simde_uint8x8_private a_ = simde_uint8x8_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + + for (int i = 0 ; i < 2 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for (int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x2_from_private(r_); + + return result; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_MATMUL_INT8) + #define simde_vusdot_laneq_s32(r, a, b, lane) vusdot_laneq_s32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vusdot_laneq_s32 + #define vusdot_laneq_s32(r, a, b, lane) simde_vusdot_laneq_s32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vusdotq_laneq_s32(simde_int32x4_t r, simde_uint8x16_t a, simde_int8x16_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 3) { + simde_int32x4_t result; + simde_int32x4_private r_ = simde_int32x4_to_private(r); + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_int8x16_private b_ = simde_int8x16_to_private(b); + + for(int i = 0 ; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x4_from_private(r_); + return result; +} +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_MATMUL_INT8) + #define simde_vusdotq_laneq_s32(r, a, b, lane) vusdotq_laneq_s32((r), (a), (b), (lane)) +#endif +#if 
defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vusdotq_laneq_s32 + #define vusdotq_laneq_s32(r, a, b, lane) simde_vusdotq_laneq_s32((r), (a), (b), (lane)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_int32x4_t +simde_vusdotq_lane_s32(simde_int32x4_t r, simde_uint8x16_t a, simde_int8x8_t b, const int lane) + SIMDE_REQUIRE_CONSTANT_RANGE(lane, 0, 1) { + simde_int32x4_t result; + simde_int32x4_private r_ = simde_int32x4_to_private(r); + simde_uint8x16_private a_ = simde_uint8x16_to_private(a); + simde_int8x8_private b_ = simde_int8x8_to_private(b); + + for(int i = 0 ; i < 4 ; i++) { + int32_t acc = 0; + SIMDE_VECTORIZE_REDUCTION(+:acc) + for(int j = 0 ; j < 4 ; j++) { + const int idx_b = j + (lane << 2); + const int idx_a = j + (i << 2); + acc += HEDLEY_STATIC_CAST(int32_t, a_.values[idx_a]) * HEDLEY_STATIC_CAST(int32_t, b_.values[idx_b]); + } + r_.values[i] += acc; + } + + result = simde_int32x4_from_private(r_); + return result; +} +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(__ARM_FEATURE_MATMUL_INT8) + #define simde_vusdotq_lane_s32(r, a, b, lane) vusdotq_lane_s32((r), (a), (b), (lane)) +#endif +#if defined(SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES) + #undef vusdotq_lane_s32 + #define vusdotq_lane_s32(r, a, b, lane) simde_vusdotq_lane_s32((r), (a), (b), (lane)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_ARM_NEON_USDOT_LANE_H) */ diff --git a/lib/simd_wrapper/simde/arm/neon/uzp.h b/lib/simd_wrapper/simde/arm/neon/uzp.h index b44db44774b..2788a6f53a0 100644 --- a/lib/simd_wrapper/simde/arm/neon/uzp.h +++ b/lib/simd_wrapper/simde/arm/neon/uzp.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_UZP_H) && !defined(SIMDE_BUG_INTEL_857088) @@ -36,6 +37,21 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x2_t +simde_vuzp_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vuzp_f16(a, b); + #else + simde_float16x4x2_t r = { { simde_vuzp1_f16(a, b), simde_vuzp2_f16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vuzp_f16 + #define vuzp_f16(a, b) simde_vuzp_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2x2_t simde_vuzp_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -141,6 +157,21 @@ simde_vuzp_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vuzp_u32(a, b) simde_vuzp_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x2_t +simde_vuzpq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vuzpq_f16(a, b); + #else + simde_float16x8x2_t r = { { simde_vuzp1q_f16(a, b), simde_vuzp2q_f16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vuzpq_f16 + #define vuzpq_f16(a, b) simde_vuzpq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4x2_t simde_vuzpq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -246,6 +277,66 @@ simde_vuzpq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #define vuzpq_u32(a, b) simde_vuzpq_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x2_t +simde_vuzp_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vuzp_p8(a, b); + #else + simde_poly8x8x2_t r 
= { { simde_vuzp1_p8(a, b), simde_vuzp2_p8(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vuzp_p8 + #define vuzp_p8(a, b) simde_vuzp_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x2_t +simde_vuzp_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vuzp_p16(a, b); + #else + simde_poly16x4x2_t r = { { simde_vuzp1_p16(a, b), simde_vuzp2_p16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vuzp_p16 + #define vuzp_p16(a, b) simde_vuzp_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x2_t +simde_vuzpq_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vuzpq_p8(a, b); + #else + simde_poly8x16x2_t r = { { simde_vuzp1q_p8(a, b), simde_vuzp2q_p8(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vuzpq_p8 + #define vuzpq_p8(a, b) simde_vuzpq_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x2_t +simde_vuzpq_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vuzpq_p16(a, b); + #else + simde_poly16x8x2_t r = { { simde_vuzp1q_p16(a, b), simde_vuzp2q_p16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vuzpq_p16 + #define vuzpq_p16(a, b) simde_vuzpq_p16((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/uzp1.h b/lib/simd_wrapper/simde/arm/neon/uzp1.h index 6cf65a78201..fbc41218e9f 100644 --- a/lib/simd_wrapper/simde/arm/neon/uzp1.h +++ b/lib/simd_wrapper/simde/arm/neon/uzp1.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_UZP1_H) @@ -34,6 +35,33 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vuzp1_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vuzp1_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx]; + r_.values[i + halfway_point] = b_.values[idx]; + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp1_f16 + #define vuzp1_f16(a, b) simde_vuzp1_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vuzp1_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -272,6 +300,36 @@ simde_vuzp1_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vuzp1_u32(a, b) simde_vuzp1_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vuzp1q_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vuzp1q_f16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + float16x8x2_t t = vuzpq_f16(a, b); + return t.val[0]; + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + const size_t halfway_point = 
sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx]; + r_.values[i + halfway_point] = b_.values[idx]; + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp1q_f16 + #define vuzp1q_f16(a, b) simde_vuzp1q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vuzp1q_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -637,6 +695,153 @@ simde_vuzp1q_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vuzp1q_u64(a, b) simde_vuzp1q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vuzp1_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp1_p8(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly8x8x2_t t = vuzp_p8(a, b); + return t.val[0]; + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx]; + r_.values[i + halfway_point] = b_.values[idx]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp1_p8 + #define vuzp1_p8(a, b) simde_vuzp1_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vuzp1_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp1_p16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly16x4x2_t t = vuzp_p16(a, b); + return t.val[0]; + #else + simde_poly16x4_private + r_, + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx]; + r_.values[i + halfway_point] = b_.values[idx]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp1_p16 + #define vuzp1_p16(a, b) simde_vuzp1_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vuzp1q_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp1q_p8(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly8x16x2_t t = vuzpq_p8(a, b); + return t.val[0]; + #else + simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx]; + r_.values[i + halfway_point] = b_.values[idx]; + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp1q_p8 + #define vuzp1q_p8(a, b) simde_vuzp1q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vuzp1q_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp1q_p16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly16x8x2_t t = vuzpq_p16(a, b); + return t.val[0]; + #else + simde_poly16x8_private + r_, + a_ = 
simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx]; + r_.values[i + halfway_point] = b_.values[idx]; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp1q_p16 + #define vuzp1q_p16(a, b) simde_vuzp1q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vuzp1q_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp1q_p64(a, b); + #else + simde_poly64x2_private + r_, + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx]; + r_.values[i + halfway_point] = b_.values[idx]; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp1q_p64 + #define vuzp1q_p64(a, b) simde_vuzp1q_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/uzp2.h b/lib/simd_wrapper/simde/arm/neon/uzp2.h index 26856ab7ef7..b2b4091042d 100644 --- a/lib/simd_wrapper/simde/arm/neon/uzp2.h +++ b/lib/simd_wrapper/simde/arm/neon/uzp2.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_UZP2_H) @@ -34,6 +35,33 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vuzp2_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vuzp2_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx | 1]; + r_.values[i + halfway_point] = b_.values[idx | 1]; + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp2_f16 + #define vuzp2_f16(a, b) simde_vuzp2_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vuzp2_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -272,6 +300,36 @@ simde_vuzp2_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vuzp2_u32(a, b) simde_vuzp2_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vuzp2q_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vuzp2q_f16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + float16x8x2_t t = vuzpq_f16(a, b); + return t.val[1]; + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx | 1]; + r_.values[i + 
halfway_point] = b_.values[idx | 1]; + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp2q_f16 + #define vuzp2q_f16(a, b) simde_vuzp2q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vuzp2q_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -641,6 +699,153 @@ simde_vuzp2q_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vuzp2q_u64(a, b) simde_vuzp2q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vuzp2_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp2_p8(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + poly8x8x2_t t = vuzp_p8(a, b); + return t.val[1]; + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx | 1]; + r_.values[i + halfway_point] = b_.values[idx | 1]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp2_p8 + #define vuzp2_p8(a, b) simde_vuzp2_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vuzp2_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp2_p16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + poly16x4x2_t t = vuzp_p16(a, b); + return t.val[1]; + #else + simde_poly16x4_private + r_, + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx | 1]; + r_.values[i + halfway_point] = b_.values[idx | 1]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp2_p16 + #define vuzp2_p16(a, b) simde_vuzp2_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vuzp2q_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp2q_p8(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + poly8x16x2_t t = vuzpq_p8(a, b); + return t.val[1]; + #else + simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx | 1]; + r_.values[i + halfway_point] = b_.values[idx | 1]; + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp2q_p8 + #define vuzp2q_p8(a, b) simde_vuzp2q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vuzp2q_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp2q_p16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + poly16x8x2_t t = vuzpq_p16(a, b); + return t.val[1]; + #else + simde_poly16x8_private + r_, + a_ = simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { 
+ const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx | 1]; + r_.values[i + halfway_point] = b_.values[idx | 1]; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp2q_p16 + #define vuzp2q_p16(a, b) simde_vuzp2q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vuzp2q_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vuzp2q_p64(a, b); + #else + simde_poly64x2_private + r_, + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + const size_t idx = i << 1; + r_.values[ i ] = a_.values[idx | 1]; + r_.values[i + halfway_point] = b_.values[idx | 1]; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vuzp2q_p64 + #define vuzp2q_p64(a, b) simde_vuzp2q_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/xar.h b/lib/simd_wrapper/simde/arm/neon/xar.h index d48db05ed09..b7b2c583640 100644 --- a/lib/simd_wrapper/simde/arm/neon/xar.h +++ b/lib/simd_wrapper/simde/arm/neon/xar.h @@ -49,10 +49,10 @@ simde_vxarq_u64(simde_uint64x2_t a, simde_uint64x2_t b, const int d) return simde_uint64x2_from_private(r_); } -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_SHA3) +#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_SHA3) #define simde_vxarq_u64(a, b, d) vxarq_u64((a), (b), (d)) #endif -#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(__ARM_FEATURE_SHA3)) +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_ARM_SHA3)) #undef vxarq_u64 #define vxarq_u64(a, b, d) simde_vxarq_u64((a), (b), (d)) #endif diff --git a/lib/simd_wrapper/simde/arm/neon/zip.h b/lib/simd_wrapper/simde/arm/neon/zip.h index 830a8d4db1e..a7921cc6214 100644 --- a/lib/simd_wrapper/simde/arm/neon/zip.h +++ b/lib/simd_wrapper/simde/arm/neon/zip.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ZIP_H) && !defined(SIMDE_BUG_INTEL_857088) @@ -36,6 +37,21 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4x2_t +simde_vzip_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vzip_f16(a, b); + #else + simde_float16x4x2_t r = { { simde_vzip1_f16(a, b), simde_vzip2_f16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vzip_f16 + #define vzip_f16(a, b) simde_vzip_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2x2_t simde_vzip_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -141,6 +157,21 @@ simde_vzip_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vzip_u32(a, b) simde_vzip_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8x2_t +simde_vzipq_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vzipq_f16(a, b); + #else + simde_float16x8x2_t r = { { simde_vzip1q_f16(a, b), 
simde_vzip2q_f16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vzipq_f16 + #define vzipq_f16(a, b) simde_vzipq_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4x2_t simde_vzipq_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -246,6 +277,66 @@ simde_vzipq_u32(simde_uint32x4_t a, simde_uint32x4_t b) { #define vzipq_u32(a, b) simde_vzipq_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8x2_t +simde_vzip_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vzip_p8(a, b); + #else + simde_poly8x8x2_t r = { { simde_vzip1_p8(a, b), simde_vzip2_p8(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vzip_p8 + #define vzip_p8(a, b) simde_vzip_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4x2_t +simde_vzip_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vzip_p16(a, b); + #else + simde_poly16x4x2_t r = { { simde_vzip1_p16(a, b), simde_vzip2_p16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vzip_p16 + #define vzip_p16(a, b) simde_vzip_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16x2_t +simde_vzipq_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vzipq_p8(a, b); + #else + simde_poly8x16x2_t r = { { simde_vzip1q_p8(a, b), simde_vzip2q_p8(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vzipq_p8 + #define vzipq_p8(a, b) simde_vzipq_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8x2_t +simde_vzipq_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vzipq_p16(a, b); + #else + simde_poly16x8x2_t r = { { simde_vzip1q_p16(a, b), simde_vzip2q_p16(a, b) } }; + return r; + #endif +} +#if defined(SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES) + #undef vzipq_p16 + #define vzipq_p16(a, b) simde_vzipq_p16((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/zip1.h b/lib/simd_wrapper/simde/arm/neon/zip1.h index b0298be4f06..ea7794359ad 100644 --- a/lib/simd_wrapper/simde/arm/neon/zip1.h +++ b/lib/simd_wrapper/simde/arm/neon/zip1.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ZIP1_H) @@ -34,6 +35,32 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vzip1_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vzip1_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[2 * i ] = a_.values[i]; + r_.values[2 * i + 1] = b_.values[i]; + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip1_f16 + #define vzip1_f16(a, b) simde_vzip1_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vzip1_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -279,6 +306,32 @@ 
simde_vzip1_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vzip1_u32(a, b) simde_vzip1_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vzip1q_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vzip1q_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[2 * i ] = a_.values[i]; + r_.values[2 * i + 1] = b_.values[i]; + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip1q_f16 + #define vzip1q_f16(a, b) simde_vzip1q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vzip1q_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -661,6 +714,148 @@ simde_vzip1q_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vzip1q_u64(a, b) simde_vzip1q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vzip1_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip1_p8(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly8x8x2_t tmp = vzip_p8(a, b); + return tmp.val[0]; + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[2 * i ] = a_.values[i]; + r_.values[2 * i + 1] = b_.values[i]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip1_p8 + #define vzip1_p8(a, b) simde_vzip1_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vzip1_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip1_p16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly16x4x2_t tmp = vzip_p16(a, b); + return tmp.val[0]; + #else + simde_poly16x4_private + r_, + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[2 * i ] = a_.values[i]; + r_.values[2 * i + 1] = b_.values[i]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip1_p16 + #define vzip1_p16(a, b) simde_vzip1_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vzip1q_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip1q_p8(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly8x8x2_t tmp = vzip_p8(vget_low_p8(a), vget_low_p8(b)); + return vcombine_p8(tmp.val[0], tmp.val[1]); + #else + simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[2 * i ] = a_.values[i]; + r_.values[2 * i + 1] = b_.values[i]; + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip1q_p8 + #define vzip1q_p8(a, b) 
simde_vzip1q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vzip1q_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip1q_p16(a, b); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde_poly16x4x2_t tmp = vzip_p16(vget_low_p16(a), vget_low_p16(b)); + return vcombine_p16(tmp.val[0], tmp.val[1]); + #else + simde_poly16x8_private + r_, + a_ = simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[2 * i ] = a_.values[i]; + r_.values[2 * i + 1] = b_.values[i]; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip1q_p16 + #define vzip1q_p16(a, b) simde_vzip1q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vzip1q_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip1q_p64(a, b); + #else + simde_poly64x2_private + r_, + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[2 * i ] = a_.values[i]; + r_.values[2 * i + 1] = b_.values[i]; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip1q_p64 + #define vzip1q_p64(a, b) simde_vzip1q_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/neon/zip2.h b/lib/simd_wrapper/simde/arm/neon/zip2.h index bf78b1201a4..0cd2150ac3f 100644 --- a/lib/simd_wrapper/simde/arm/neon/zip2.h +++ b/lib/simd_wrapper/simde/arm/neon/zip2.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Sean Maher (Copyright owned by Google, LLC) + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ #if !defined(SIMDE_ARM_NEON_ZIP2_H) @@ -34,6 +35,32 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x4_t +simde_vzip2_f16(simde_float16x4_t a, simde_float16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vzip2_f16(a, b); + #else + simde_float16x4_private + r_, + a_ = simde_float16x4_to_private(a), + b_ = simde_float16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[(2 * i) ] = a_.values[halfway_point + i]; + r_.values[(2 * i) + 1] = b_.values[halfway_point + i]; + } + + return simde_float16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip2_f16 + #define vzip2_f16(a, b) simde_vzip2_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x2_t simde_vzip2_f32(simde_float32x2_t a, simde_float32x2_t b) { @@ -258,6 +285,32 @@ simde_vzip2_u32(simde_uint32x2_t a, simde_uint32x2_t b) { #define vzip2_u32(a, b) simde_vzip2_u32((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_float16x8_t +simde_vzip2q_f16(simde_float16x8_t a, simde_float16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARM_NEON_FP16) + return vzip2q_f16(a, b); + #else + simde_float16x8_private + r_, + a_ = simde_float16x8_to_private(a), + b_ = simde_float16x8_to_private(b); + + const size_t halfway_point 
= sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[(2 * i) ] = a_.values[halfway_point + i]; + r_.values[(2 * i) + 1] = b_.values[halfway_point + i]; + } + + return simde_float16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip2q_f16 + #define vzip2q_f16(a, b) simde_vzip2q_f16((a), (b)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32x4_t simde_vzip2q_f32(simde_float32x4_t a, simde_float32x4_t b) { @@ -619,6 +672,136 @@ simde_vzip2q_u64(simde_uint64x2_t a, simde_uint64x2_t b) { #define vzip2q_u64(a, b) simde_vzip2q_u64((a), (b)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x8_t +simde_vzip2_p8(simde_poly8x8_t a, simde_poly8x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip2_p8(a, b); + #else + simde_poly8x8_private + r_, + a_ = simde_poly8x8_to_private(a), + b_ = simde_poly8x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[(2 * i) ] = a_.values[halfway_point + i]; + r_.values[(2 * i) + 1] = b_.values[halfway_point + i]; + } + + return simde_poly8x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip2_p8 + #define vzip2_p8(a, b) simde_vzip2_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x4_t +simde_vzip2_p16(simde_poly16x4_t a, simde_poly16x4_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip2_p16(a, b); + #else + simde_poly16x4_private + r_, + a_ = simde_poly16x4_to_private(a), + b_ = simde_poly16x4_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[(2 * i) ] = a_.values[halfway_point + i]; + r_.values[(2 * i) + 1] = b_.values[halfway_point + i]; + } + + return simde_poly16x4_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip2_p16 + #define vzip2_p16(a, b) simde_vzip2_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly8x16_t +simde_vzip2q_p8(simde_poly8x16_t a, simde_poly8x16_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip2q_p8(a, b); + #else + simde_poly8x16_private + r_, + a_ = simde_poly8x16_to_private(a), + b_ = simde_poly8x16_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[(2 * i) ] = a_.values[halfway_point + i]; + r_.values[(2 * i) + 1] = b_.values[halfway_point + i]; + } + + return simde_poly8x16_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip2q_p8 + #define vzip2q_p8(a, b) simde_vzip2q_p8((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly16x8_t +simde_vzip2q_p16(simde_poly16x8_t a, simde_poly16x8_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip2q_p16(a, b); + #else + simde_poly16x8_private + r_, + a_ = simde_poly16x8_to_private(a), + b_ = simde_poly16x8_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[(2 * i) ] = a_.values[halfway_point + i]; + r_.values[(2 * i) + 1] = b_.values[halfway_point + i]; + } + + return simde_poly16x8_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef 
vzip2q_p16 + #define vzip2q_p16(a, b) simde_vzip2q_p16((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_poly64x2_t +simde_vzip2q_p64(simde_poly64x2_t a, simde_poly64x2_t b) { + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return vzip2q_p64(a, b); + #else + simde_poly64x2_private + r_, + a_ = simde_poly64x2_to_private(a), + b_ = simde_poly64x2_to_private(b); + + const size_t halfway_point = sizeof(r_.values) / sizeof(r_.values[0]) / 2; + SIMDE_VECTORIZE + for (size_t i = 0 ; i < halfway_point ; i++) { + r_.values[(2 * i) ] = a_.values[halfway_point + i]; + r_.values[(2 * i) + 1] = b_.values[halfway_point + i]; + } + + return simde_poly64x2_from_private(r_); + #endif +} +#if defined(SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES) + #undef vzip2q_p64 + #define vzip2q_p64(a, b) simde_vzip2q_p64((a), (b)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/arm/sve/and.h b/lib/simd_wrapper/simde/arm/sve/and.h index 76b37d20bd0..12d3f63bcad 100644 --- a/lib/simd_wrapper/simde/arm/sve/and.h +++ b/lib/simd_wrapper/simde/arm/sve/and.h @@ -316,7 +316,8 @@ simde_svint32_t simde_svand_s32_z(simde_svbool_t pg, simde_svint32_t op1, simde_svint32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svand_s32_z(pg, op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svint32_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 @@ -340,7 +341,8 @@ simde_svint32_t simde_svand_s32_m(simde_svbool_t pg, simde_svint32_t op1, simde_svint32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svand_s32_m(pg, op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svint32_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 @@ -452,7 +454,8 @@ simde_svint64_t simde_svand_s64_z(simde_svbool_t pg, simde_svint64_t op1, simde_svint64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svand_s64_z(pg, op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svint64_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 @@ -476,7 +479,8 @@ simde_svint64_t simde_svand_s64_m(simde_svbool_t pg, simde_svint64_t op1, simde_svint64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svand_s64_m(pg, op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svint64_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 diff --git a/lib/simd_wrapper/simde/arm/sve/cmplt.h b/lib/simd_wrapper/simde/arm/sve/cmplt.h index fe400c4dd57..5df0f8441d6 100644 --- a/lib/simd_wrapper/simde/arm/sve/cmplt.h +++ b/lib/simd_wrapper/simde/arm/sve/cmplt.h @@ -40,9 +40,11 @@ 
simde_svcmplt_s8(simde_svbool_t pg, simde_svint8_t op1, simde_svint8_t op2) { #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask64(_mm512_mask_cmplt_epi8_mask(simde_svbool_to_mmask64(pg), op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask32(_mm256_mask_cmplt_epi8_mask(simde_svbool_to_mmask32(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon_i8 = vandq_s8(pg.neon_i8, vreinterpretq_s8_u8(vcltq_s8(op1.neon, op2.neon))); @@ -81,9 +83,11 @@ simde_svcmplt_s16(simde_svbool_t pg, simde_svint16_t op1, simde_svint16_t op2) { #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask32(_mm512_mask_cmplt_epi16_mask(simde_svbool_to_mmask32(pg), op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask16(_mm256_mask_cmplt_epi16_mask(simde_svbool_to_mmask16(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon_i16 = vandq_s16(pg.neon_i16, vreinterpretq_s16_u16(vcltq_s16(op1.neon, op2.neon))); @@ -122,9 +126,11 @@ simde_svcmplt_s32(simde_svbool_t pg, simde_svint32_t op1, simde_svint32_t op2) { #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask16(_mm512_mask_cmplt_epi32_mask(simde_svbool_to_mmask16(pg), op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask8(_mm256_mask_cmplt_epi32_mask(simde_svbool_to_mmask8(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon_i32 = vandq_s32(pg.neon_i32, vreinterpretq_s32_u32(vcltq_s32(op1.neon, op2.neon))); @@ -163,9 +169,11 @@ simde_svcmplt_s64(simde_svbool_t pg, simde_svint64_t op1, simde_svint64_t op2) { #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask8(_mm512_mask_cmplt_epi64_mask(simde_svbool_to_mmask8(pg), op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || 
HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask4(_mm256_mask_cmplt_epi64_mask(simde_svbool_to_mmask4(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r.neon_i64 = vandq_s64(pg.neon_i64, vreinterpretq_s64_u64(vcltq_s64(op1.neon, op2.neon))); @@ -200,9 +208,11 @@ simde_svcmplt_u8(simde_svbool_t pg, simde_svuint8_t op1, simde_svuint8_t op2) { #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask64(_mm512_mask_cmplt_epu8_mask(simde_svbool_to_mmask64(pg), op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask32(_mm256_mask_cmplt_epu8_mask(simde_svbool_to_mmask32(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon_u8 = vandq_u8(pg.neon_u8, vcltq_u8(op1.neon, op2.neon)); @@ -237,9 +247,11 @@ simde_svcmplt_u16(simde_svbool_t pg, simde_svuint16_t op1, simde_svuint16_t op2) #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask32(_mm512_mask_cmplt_epu16_mask(simde_svbool_to_mmask32(pg), op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask16(_mm256_mask_cmplt_epu16_mask(simde_svbool_to_mmask16(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon_u16 = vandq_u16(pg.neon_u16, vcltq_u16(op1.neon, op2.neon)); @@ -274,9 +286,11 @@ simde_svcmplt_u32(simde_svbool_t pg, simde_svuint32_t op1, simde_svuint32_t op2) #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask16(_mm512_mask_cmplt_epu32_mask(simde_svbool_to_mmask16(pg), op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask8(_mm256_mask_cmplt_epu32_mask(simde_svbool_to_mmask8(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon_u32 = vandq_u32(pg.neon_u32, vcltq_u32(op1.neon, op2.neon)); @@ -311,9 +325,11 @@ simde_svcmplt_u64(simde_svbool_t pg, simde_svuint64_t op1, simde_svuint64_t op2) #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask8(_mm512_mask_cmplt_epu64_mask(simde_svbool_to_mmask8(pg), 
op1.m512i, op2.m512i)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask4(_mm256_mask_cmplt_epu64_mask(simde_svbool_to_mmask4(pg), op1.m256i[0], op2.m256i[0])); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r.neon_u64 = vandq_u64(pg.neon_u64, vcltq_u64(op1.neon, op2.neon)); @@ -348,9 +364,11 @@ simde_svcmplt_f32(simde_svbool_t pg, simde_svfloat32_t op1, simde_svfloat32_t op #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask16(_mm512_mask_cmp_ps_mask(simde_svbool_to_mmask16(pg), op1.m512, op2.m512, _CMP_LT_OQ)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask8(_mm256_mask_cmp_ps_mask(simde_svbool_to_mmask8(pg), op1.m256[0], op2.m256[0], _CMP_LT_OQ)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon_u32 = vandq_u32(pg.neon_u32, vcltq_f32(op1.neon, op2.neon)); @@ -389,9 +407,11 @@ simde_svcmplt_f64(simde_svbool_t pg, simde_svfloat64_t op1, simde_svfloat64_t op #else simde_svbool_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask8(_mm512_mask_cmp_pd_mask(simde_svbool_to_mmask8(pg), op1.m512d, op2.m512d, _CMP_LT_OQ)); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r = simde_svbool_from_mmask4(_mm256_mask_cmp_pd_mask(simde_svbool_to_mmask4(pg), op1.m256d[0], op2.m256d[0], _CMP_LT_OQ)); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r.neon_u64 = vandq_u64(pg.neon_u64, vcltq_f64(op1.neon, op2.neon)); diff --git a/lib/simd_wrapper/simde/arm/sve/ld1.h b/lib/simd_wrapper/simde/arm/sve/ld1.h index 607c3be40c6..8008ad60a0d 100644 --- a/lib/simd_wrapper/simde/arm/sve/ld1.h +++ b/lib/simd_wrapper/simde/arm/sve/ld1.h @@ -51,9 +51,11 @@ simde_svld1_s8(simde_svbool_t pg, const int8_t * base) { #else simde_svint8_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_loadu_epi8(simde_svbool_to_mmask64(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_loadu_epi8(simde_svbool_to_mmask32(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntb()) ; i++) { @@ -77,9 +79,11 @@ simde_svld1_s16(simde_svbool_t pg, const int16_t * base) { #else simde_svint16_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && 
(SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_loadu_epi16(simde_svbool_to_mmask32(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_loadu_epi16(simde_svbool_to_mmask16(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcnth()) ; i++) { @@ -103,9 +107,11 @@ simde_svld1_s32(simde_svbool_t pg, const int32_t * base) { #else simde_svint32_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_loadu_epi32(simde_svbool_to_mmask16(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_loadu_epi32(simde_svbool_to_mmask8(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntw()) ; i++) { @@ -129,9 +135,11 @@ simde_svld1_s64(simde_svbool_t pg, const int64_t * base) { #else simde_svint64_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_loadu_epi64(simde_svbool_to_mmask8(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_loadu_epi64(simde_svbool_to_mmask4(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntd()) ; i++) { @@ -155,9 +163,11 @@ simde_svld1_u8(simde_svbool_t pg, const uint8_t * base) { #else simde_svuint8_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_loadu_epi8(simde_svbool_to_mmask64(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_loadu_epi8(simde_svbool_to_mmask32(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntb()) ; i++) { @@ -181,9 +191,11 @@ simde_svld1_u16(simde_svbool_t pg, const uint16_t * base) { #else simde_svuint16_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_loadu_epi16(simde_svbool_to_mmask32(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif 
defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_loadu_epi16(simde_svbool_to_mmask16(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcnth()) ; i++) { @@ -207,9 +219,11 @@ simde_svld1_u32(simde_svbool_t pg, const uint32_t * base) { #else simde_svuint32_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_loadu_epi32(simde_svbool_to_mmask16(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_loadu_epi32(simde_svbool_to_mmask8(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntw()) ; i++) { @@ -233,9 +247,11 @@ simde_svld1_u64(simde_svbool_t pg, const uint64_t * base) { #else simde_svuint64_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_loadu_epi64(simde_svbool_to_mmask8(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_loadu_epi64(simde_svbool_to_mmask4(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntd()) ; i++) { @@ -259,9 +275,11 @@ simde_svld1_f32(simde_svbool_t pg, const simde_float32 * base) { #else simde_svfloat32_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512 = _mm512_maskz_loadu_ps(simde_svbool_to_mmask16(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256[0] = _mm256_maskz_loadu_ps(simde_svbool_to_mmask8(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntw()) ; i++) { @@ -285,9 +303,11 @@ simde_svld1_f64(simde_svbool_t pg, const simde_float64 * base) { #else simde_svfloat64_t r; - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512d = _mm512_maskz_loadu_pd(simde_svbool_to_mmask8(pg), base); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256d[0] = _mm256_maskz_loadu_pd(simde_svbool_to_mmask4(pg), base); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntd()) ; i++) { diff --git a/lib/simd_wrapper/simde/arm/sve/ptest.h 
b/lib/simd_wrapper/simde/arm/sve/ptest.h index 5e6adb8b4c6..3046331156f 100644 --- a/lib/simd_wrapper/simde/arm/sve/ptest.h +++ b/lib/simd_wrapper/simde/arm/sve/ptest.h @@ -37,7 +37,7 @@ simde_bool simde_svptest_first(simde_svbool_t pg, simde_svbool_t op) { #if defined(SIMDE_ARM_SVE_NATIVE) return svptest_first(pg, op); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_LIKELY(pg.value & 1)) return op.value & 1; diff --git a/lib/simd_wrapper/simde/arm/sve/ptrue.h b/lib/simd_wrapper/simde/arm/sve/ptrue.h index b894b1e01ec..064b96ace97 100644 --- a/lib/simd_wrapper/simde/arm/sve/ptrue.h +++ b/lib/simd_wrapper/simde/arm/sve/ptrue.h @@ -37,7 +37,7 @@ simde_svbool_t simde_svptrue_b8(void) { #if defined(SIMDE_ARM_SVE_NATIVE) return svptrue_b8(); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svbool_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 @@ -67,7 +67,7 @@ simde_svbool_t simde_svptrue_b16(void) { #if defined(SIMDE_ARM_SVE_NATIVE) return svptrue_b16(); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svbool_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 @@ -97,7 +97,7 @@ simde_svbool_t simde_svptrue_b32(void) { #if defined(SIMDE_ARM_SVE_NATIVE) return svptrue_b32(); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svbool_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 @@ -127,7 +127,7 @@ simde_svbool_t simde_svptrue_b64(void) { #if defined(SIMDE_ARM_SVE_NATIVE) return svptrue_b64(); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svbool_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 diff --git a/lib/simd_wrapper/simde/arm/sve/sel.h b/lib/simd_wrapper/simde/arm/sve/sel.h index eb9b9f3cc2e..a5e79b5673c 100644 --- a/lib/simd_wrapper/simde/arm/sve/sel.h +++ b/lib/simd_wrapper/simde/arm/sve/sel.h @@ -43,9 +43,11 @@ simde_x_svsel_s8_z(simde_svbool_t pg, simde_svint8_t op1) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vandq_s8(pg.neon_i8, op1.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_mov_epi8(simde_svbool_to_mmask64(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_mov_epi8(simde_svbool_to_mmask32(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / sizeof(r.m256i[0])) ; i++) { @@ -84,9 +86,11 @@ simde_svsel_s8(simde_svbool_t pg, simde_svint8_t op1, simde_svint8_t op2) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vbslq_s8(pg.neon_u8, op1.neon, op2.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) 
\ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_mask_mov_epi8(op2.m512i, simde_svbool_to_mmask64(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_mask_mov_epi8(op2.m256i[0], simde_svbool_to_mmask32(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / sizeof(r.m256i[0])) ; i++) { @@ -131,9 +135,11 @@ simde_x_svsel_s16_z(simde_svbool_t pg, simde_svint16_t op1) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vandq_s16(pg.neon_i16, op1.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_mov_epi16(simde_svbool_to_mmask32(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_mov_epi16(simde_svbool_to_mmask16(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / sizeof(r.m256i[0])) ; i++) { @@ -172,9 +178,11 @@ simde_svsel_s16(simde_svbool_t pg, simde_svint16_t op1, simde_svint16_t op2) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vbslq_s16(pg.neon_u16, op1.neon, op2.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_mask_mov_epi16(op2.m512i, simde_svbool_to_mmask32(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_mask_mov_epi16(op2.m256i[0], simde_svbool_to_mmask16(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / sizeof(r.m256i[0])) ; i++) { @@ -219,9 +227,11 @@ simde_x_svsel_s32_z(simde_svbool_t pg, simde_svint32_t op1) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vandq_s32(pg.neon_i32, op1.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_mov_epi32(simde_svbool_to_mmask16(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_mov_epi32(simde_svbool_to_mmask8(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / sizeof(r.m256i[0])) ; i++) { @@ -260,9 +270,11 @@ simde_svsel_s32(simde_svbool_t pg, simde_svint32_t op1, simde_svint32_t op2) { 
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vbslq_s32(pg.neon_u32, op1.neon, op2.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_mask_mov_epi32(op2.m512i, simde_svbool_to_mmask16(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_mask_mov_epi32(op2.m256i[0], simde_svbool_to_mmask8(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / sizeof(r.m256i[0])) ; i++) { @@ -307,9 +319,11 @@ simde_x_svsel_s64_z(simde_svbool_t pg, simde_svint64_t op1) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vandq_s64(pg.neon_i64, op1.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_maskz_mov_epi64(simde_svbool_to_mmask8(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_maskz_mov_epi64(simde_svbool_to_mmask4(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / sizeof(r.m256i[0])) ; i++) { @@ -348,9 +362,11 @@ simde_svsel_s64(simde_svbool_t pg, simde_svint64_t op1, simde_svint64_t op2) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r.neon = vbslq_s64(pg.neon_u64, op1.neon, op2.neon); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m512i = _mm512_mask_mov_epi64(op2.m512i, simde_svbool_to_mmask8(pg), op1.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) r.m256i[0] = _mm256_mask_mov_epi64(op2.m256i[0], simde_svbool_to_mmask4(pg), op1.m256i[0]); #elif defined(SIMDE_X86_AVX2_NATIVE) for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, sizeof(r.m256i) / sizeof(r.m256i[0])) ; i++) { @@ -390,7 +406,8 @@ simde_svuint8_t simde_x_svsel_u8_z(simde_svbool_t pg, simde_svuint8_t op1) { #if defined(SIMDE_ARM_SVE_NATIVE) return svand_u8_z(pg, op1, op1); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svuint8_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 @@ -410,7 +427,8 @@ simde_svuint8_t simde_svsel_u8(simde_svbool_t pg, simde_svuint8_t op1, simde_svuint8_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svsel_u8(pg, op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && 
((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && ((SIMDE_ARM_SVE_VECTOR_SIZE >= 512) || defined(SIMDE_X86_AVX512VL_NATIVE)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) simde_svuint8_t r; #if SIMDE_ARM_SVE_VECTOR_SIZE >= 512 diff --git a/lib/simd_wrapper/simde/arm/sve/st1.h b/lib/simd_wrapper/simde/arm/sve/st1.h index 39f5c4c7952..e3c6230d8f8 100644 --- a/lib/simd_wrapper/simde/arm/sve/st1.h +++ b/lib/simd_wrapper/simde/arm/sve/st1.h @@ -37,9 +37,11 @@ void simde_svst1_s8(simde_svbool_t pg, int8_t * base, simde_svint8_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_s8(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) _mm512_mask_storeu_epi8(base, simde_svbool_to_mmask64(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) _mm256_mask_storeu_epi8(base, simde_svbool_to_mmask32(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntb()) ; i++) { @@ -59,10 +61,12 @@ void simde_svst1_s16(simde_svbool_t pg, int16_t * base, simde_svint16_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_s16(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_epi16(base, simde_svbool_to_mmask32(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_epi16(base, simde_svbool_to_mmask16(pg), data.m256i[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_epi16(base, simde_svbool_to_mmask32(pg), data.m512i); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_epi16(base, simde_svbool_to_mmask16(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcnth()) ; i++) { if (pg.values_i16[i]) { @@ -81,10 +85,12 @@ void simde_svst1_s32(simde_svbool_t pg, int32_t * base, simde_svint32_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_s32(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_epi32(base, simde_svbool_to_mmask16(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_epi32(base, simde_svbool_to_mmask8(pg), data.m256i[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_epi32(base, simde_svbool_to_mmask16(pg), data.m512i); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_epi32(base, simde_svbool_to_mmask8(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntw()) ; i++) { if (pg.values_i32[i]) { @@ -103,10 +109,12 @@ void simde_svst1_s64(simde_svbool_t pg, 
int64_t * base, simde_svint64_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_s64(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_epi64(base, simde_svbool_to_mmask8(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_epi64(base, simde_svbool_to_mmask4(pg), data.m256i[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_epi64(base, simde_svbool_to_mmask8(pg), data.m512i); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_epi64(base, simde_svbool_to_mmask4(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntd()) ; i++) { if (pg.values_i64[i]) { @@ -125,10 +133,12 @@ void simde_svst1_u8(simde_svbool_t pg, uint8_t * base, simde_svuint8_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_u8(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_epi8(base, simde_svbool_to_mmask64(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_epi8(base, simde_svbool_to_mmask32(pg), data.m256i[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_epi8(base, simde_svbool_to_mmask64(pg), data.m512i); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_epi8(base, simde_svbool_to_mmask32(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntb()) ; i++) { if (pg.values_u8[i]) { @@ -147,10 +157,12 @@ void simde_svst1_u16(simde_svbool_t pg, uint16_t * base, simde_svuint16_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_u16(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_epi16(base, simde_svbool_to_mmask32(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_epi16(base, simde_svbool_to_mmask16(pg), data.m256i[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_epi16(base, simde_svbool_to_mmask32(pg), data.m512i); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_epi16(base, simde_svbool_to_mmask16(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcnth()) ; i++) { if (pg.values_u16[i]) { @@ -169,10 +181,12 @@ void simde_svst1_u32(simde_svbool_t pg, uint32_t * base, simde_svuint32_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_u32(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_epi32(base, simde_svbool_to_mmask16(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_epi32(base, simde_svbool_to_mmask8(pg), data.m256i[0]); + 
#elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_epi32(base, simde_svbool_to_mmask16(pg), data.m512i); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_epi32(base, simde_svbool_to_mmask8(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntw()) ; i++) { if (pg.values_u32[i]) { @@ -191,10 +205,12 @@ void simde_svst1_u64(simde_svbool_t pg, uint64_t * base, simde_svuint64_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_u64(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_epi64(base, simde_svbool_to_mmask8(pg), data.m512i); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_epi64(base, simde_svbool_to_mmask4(pg), data.m256i[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_epi64(base, simde_svbool_to_mmask8(pg), data.m512i); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_epi64(base, simde_svbool_to_mmask4(pg), data.m256i[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntd()) ; i++) { if (pg.values_u64[i]) { @@ -213,10 +229,12 @@ void simde_svst1_f32(simde_svbool_t pg, simde_float32 * base, simde_svfloat32_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_f32(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_ps(base, simde_svbool_to_mmask16(pg), data.m512); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_ps(base, simde_svbool_to_mmask8(pg), data.m256[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_ps(base, simde_svbool_to_mmask16(pg), data.m512); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_ps(base, simde_svbool_to_mmask8(pg), data.m256[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntw()) ; i++) { if (pg.values_i32[i]) { @@ -235,10 +253,12 @@ void simde_svst1_f64(simde_svbool_t pg, simde_float64 * base, simde_svfloat64_t data) { #if defined(SIMDE_ARM_SVE_NATIVE) svst1_f64(pg, base, data); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) - _mm512_mask_storeu_pd(base, simde_svbool_to_mmask8(pg), data.m512d); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_pd(base, simde_svbool_to_mmask4(pg), data.m256d[0]); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm512_mask_storeu_pd(base, simde_svbool_to_mmask8(pg), data.m512d); + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + _mm256_mask_storeu_pd(base, 
simde_svbool_to_mmask4(pg), data.m256d[0]); #else for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, simde_svcntd()) ; i++) { if (pg.values_i64[i]) { diff --git a/lib/simd_wrapper/simde/arm/sve/types.h b/lib/simd_wrapper/simde/arm/sve/types.h index ae7cbb95ee6..f0579d96c85 100644 --- a/lib/simd_wrapper/simde/arm/sve/types.h +++ b/lib/simd_wrapper/simde/arm/sve/types.h @@ -396,7 +396,7 @@ SIMDE_BEGIN_DECLS_ #endif } simde_svfloat64_t; - #if defined(SIMDE_X86_AVX512BW_NATIVE) + #if defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) typedef struct { __mmask64 value; int type; diff --git a/lib/simd_wrapper/simde/arm/sve/whilelt.h b/lib/simd_wrapper/simde/arm/sve/whilelt.h index 44e024f0169..f0e0bd2cdc8 100644 --- a/lib/simd_wrapper/simde/arm/sve/whilelt.h +++ b/lib/simd_wrapper/simde/arm/sve/whilelt.h @@ -37,7 +37,8 @@ simde_svbool_t simde_svwhilelt_b8_s32(int32_t op1, int32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b8_s32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask64(HEDLEY_STATIC_CAST(__mmask64, 0)); @@ -48,7 +49,8 @@ simde_svwhilelt_b8_s32(int32_t op1, int32_t op2) { } return simde_svbool_from_mmask64(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); @@ -82,7 +84,8 @@ simde_svbool_t simde_svwhilelt_b16_s32(int32_t op1, int32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b16_s32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); @@ -93,7 +96,8 @@ simde_svwhilelt_b16_s32(int32_t op1, int32_t op2) { } return simde_svbool_from_mmask32(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -127,7 +131,8 @@ simde_svbool_t simde_svwhilelt_b32_s32(int32_t op1, int32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b32_s32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -138,7 +143,8 @@ simde_svwhilelt_b32_s32(int32_t op1, int32_t op2) { } return simde_svbool_from_mmask16(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) 
|| HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -172,7 +178,8 @@ simde_svbool_t simde_svwhilelt_b64_s32(int32_t op1, int32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b64_s32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -183,7 +190,8 @@ simde_svwhilelt_b64_s32(int32_t op1, int32_t op2) { } return simde_svbool_from_mmask8(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask4(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -217,7 +225,8 @@ simde_svbool_t simde_svwhilelt_b8_s64(int64_t op1, int64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b8_s64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask64(HEDLEY_STATIC_CAST(__mmask64, 0)); @@ -228,7 +237,8 @@ simde_svwhilelt_b8_s64(int64_t op1, int64_t op2) { } return simde_svbool_from_mmask64(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); @@ -262,18 +272,20 @@ simde_svbool_t simde_svwhilelt_b16_s64(int64_t op1, int64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b16_s64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); int_fast64_t remaining = (HEDLEY_STATIC_CAST(int_fast64_t, op2) - HEDLEY_STATIC_CAST(int_fast64_t, op1)); - __mmask32 r = HEDLEY_STATIC_CAST(__mmask32, ~UINT64_C(0)); + __mmask32 r = HEDLEY_STATIC_CAST(__mmask32, ~UINT32_C(0)); if (HEDLEY_UNLIKELY(remaining < 32)) { r >>= 32 - remaining; } return simde_svbool_from_mmask32(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -307,7 +319,8 @@ simde_svbool_t simde_svwhilelt_b32_s64(int64_t op1, int64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b32_s64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || 
HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -318,7 +331,8 @@ simde_svwhilelt_b32_s64(int64_t op1, int64_t op2) { } return simde_svbool_from_mmask16(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -352,7 +366,8 @@ simde_svbool_t simde_svwhilelt_b64_s64(int64_t op1, int64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b64_s64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -363,7 +378,8 @@ simde_svwhilelt_b64_s64(int64_t op1, int64_t op2) { } return simde_svbool_from_mmask8(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask4(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -397,7 +413,8 @@ simde_svbool_t simde_svwhilelt_b8_u32(uint32_t op1, uint32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b8_u32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask64(HEDLEY_STATIC_CAST(__mmask64, 0)); @@ -408,7 +425,8 @@ simde_svwhilelt_b8_u32(uint32_t op1, uint32_t op2) { } return simde_svbool_from_mmask64(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); @@ -442,7 +460,8 @@ simde_svbool_t simde_svwhilelt_b16_u32(uint32_t op1, uint32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b16_u32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); @@ -453,7 +472,8 @@ simde_svwhilelt_b16_u32(uint32_t op1, uint32_t op2) { } return simde_svbool_from_mmask32(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -487,7 +507,8 @@ simde_svbool_t simde_svwhilelt_b32_u32(uint32_t op1, uint32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) 
return svwhilelt_b32_u32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -498,7 +519,8 @@ simde_svwhilelt_b32_u32(uint32_t op1, uint32_t op2) { } return simde_svbool_from_mmask16(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -532,7 +554,8 @@ simde_svbool_t simde_svwhilelt_b64_u32(uint32_t op1, uint32_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b64_u32(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -543,7 +566,8 @@ simde_svwhilelt_b64_u32(uint32_t op1, uint32_t op2) { } return simde_svbool_from_mmask8(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask4(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -577,7 +601,8 @@ simde_svbool_t simde_svwhilelt_b8_u64(uint64_t op1, uint64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b8_u64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask64(HEDLEY_STATIC_CAST(__mmask64, 0)); @@ -588,7 +613,8 @@ simde_svwhilelt_b8_u64(uint64_t op1, uint64_t op2) { } return simde_svbool_from_mmask64(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); @@ -622,18 +648,20 @@ simde_svbool_t simde_svwhilelt_b16_u64(uint64_t op1, uint64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b16_u64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask32(HEDLEY_STATIC_CAST(__mmask32, 0)); uint_fast64_t remaining = (HEDLEY_STATIC_CAST(uint_fast64_t, op2) - HEDLEY_STATIC_CAST(uint_fast64_t, op1)); - __mmask32 r = HEDLEY_STATIC_CAST(__mmask32, ~UINT64_C(0)); + __mmask32 r = HEDLEY_STATIC_CAST(__mmask32, ~UINT32_C(0)); if (HEDLEY_UNLIKELY(remaining < 32)) { r >>= 32 - remaining; } return simde_svbool_from_mmask32(r); 
- #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -667,7 +695,8 @@ simde_svbool_t simde_svwhilelt_b32_u64(uint64_t op1, uint64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b32_u64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask16(HEDLEY_STATIC_CAST(__mmask16, 0)); @@ -678,7 +707,8 @@ simde_svwhilelt_b32_u64(uint64_t op1, uint64_t op2) { } return simde_svbool_from_mmask16(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -712,7 +742,8 @@ simde_svbool_t simde_svwhilelt_b64_u64(uint64_t op1, uint64_t op2) { #if defined(SIMDE_ARM_SVE_NATIVE) return svwhilelt_b64_u64(op1, op2); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && (SIMDE_ARM_SVE_VECTOR_SIZE >= 512) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask8(HEDLEY_STATIC_CAST(__mmask8, 0)); @@ -723,7 +754,8 @@ simde_svwhilelt_b64_u64(uint64_t op1, uint64_t op2) { } return simde_svbool_from_mmask8(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) if (HEDLEY_UNLIKELY(op1 >= op2)) return simde_svbool_from_mmask4(HEDLEY_STATIC_CAST(__mmask8, 0)); diff --git a/lib/simd_wrapper/simde/check.h b/lib/simd_wrapper/simde/check.h index 8fd913eb845..7d17d2925af 100644 --- a/lib/simd_wrapper/simde/check.h +++ b/lib/simd_wrapper/simde/check.h @@ -1,5 +1,5 @@ /* Check (assertions) - * Portable Snippets - https://gitub.com/nemequ/portable-snippets + * Portable Snippets - https://github.com/nemequ/portable-snippets * Created by Evan Nemerson * * To the extent possible under law, the authors have waived all diff --git a/lib/simd_wrapper/simde/debug-trap.h b/lib/simd_wrapper/simde/debug-trap.h index 11da805d555..2d3c60f841b 100644 --- a/lib/simd_wrapper/simde/debug-trap.h +++ b/lib/simd_wrapper/simde/debug-trap.h @@ -1,5 +1,5 @@ /* Debugging assertions and traps - * Portable Snippets - https://gitub.com/nemequ/portable-snippets + * Portable Snippets - https://github.com/nemequ/portable-snippets * Created by Evan Nemerson * * To the extent possible under law, the authors have waived all diff --git a/lib/simd_wrapper/simde/hedley.h b/lib/simd_wrapper/simde/hedley.h index 41ac3022160..f064f3f4cb1 100644 --- a/lib/simd_wrapper/simde/hedley.h +++ b/lib/simd_wrapper/simde/hedley.h @@ -184,6 +184,7 @@ # undef HEDLEY_EMSCRIPTEN_VERSION #endif #if defined(__EMSCRIPTEN__) +# include <emscripten/version.h> # define HEDLEY_EMSCRIPTEN_VERSION HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__,
__EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__) #endif diff --git a/lib/simd_wrapper/simde/mips/msa/ld.h b/lib/simd_wrapper/simde/mips/msa/ld.h index 9f17dbfb88d..62662e6b65c 100644 --- a/lib/simd_wrapper/simde/mips/msa/ld.h +++ b/lib/simd_wrapper/simde/mips/msa/ld.h @@ -37,16 +37,15 @@ SIMDE_FUNCTION_ATTRIBUTES simde_v16i8 simde_msa_ld_b(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_ld_b(rs, s10); - #else - simde_v16i8 r; + simde_v16i8 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_ld_b(rs, s10) __msa_ld_b((rs), (s10)) +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_ld_b #define __msa_ld_b(rs, s10) simde_msa_ld_b((rs), (s10)) @@ -57,16 +56,15 @@ simde_v8i16 simde_msa_ld_h(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int16_t)) == 0, "`s10' must be a multiple of sizeof(int16_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_ld_h(rs, s10); - #else - simde_v8i16 r; + simde_v8i16 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_ld_h(rs, s10) __msa_ld_h((rs), (s10)) +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_ld_h #define __msa_ld_h(rs, s10) simde_msa_ld_h((rs), (s10)) @@ -77,16 +75,15 @@ simde_v4i32 simde_msa_ld_w(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int32_t)) == 0, "`s10' must be a multiple of sizeof(int32_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_ld_w(rs, s10); - #else - simde_v4i32 r; + simde_v4i32 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_ld_w(rs, s10) __msa_ld_w((rs), (s10)) +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_ld_w #define __msa_ld_w(rs, s10) simde_msa_ld_w((rs), (s10)) @@ -97,16 +94,15 @@ simde_v2i64 simde_msa_ld_d(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int64_t)) == 0, "`s10' must be a multiple of sizeof(int64_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_ld_d(rs, s10); - #else - simde_v2i64 r; + simde_v2i64 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_ld_d(rs, s10) __msa_ld_d((rs), (s10)) +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_ld_d #define __msa_ld_d(rs, s10) simde_msa_ld_d((rs), (s10)) @@ -116,96 +112,90 @@ SIMDE_FUNCTION_ATTRIBUTES simde_v16u8 simde_x_msa_ld_u_b(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return HEDLEY_REINTERPRET_CAST(simde_v16u8, __msa_ld_b(rs, s10)); - #else - simde_v16u8 r; + simde_v16u8 r; - simde_memcpy(&r, 
&(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_x_msa_ld_u_b(rs, s10) HEDLEY_REINTERPRET_CAST(simde_v16u8, __msa_ld_b((rs), (s10))) +#endif SIMDE_FUNCTION_ATTRIBUTES simde_v8u16 simde_x_msa_ld_u_h(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int16_t)) == 0, "`s10' must be a multiple of sizeof(int16_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return HEDLEY_REINTERPRET_CAST(simde_v8u16, __msa_ld_b(rs, s10)); - #else - simde_v8u16 r; + simde_v8u16 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_x_msa_ld_u_h(rs, s10) HEDLEY_REINTERPRET_CAST(simde_v8u16, __msa_ld_b((rs), (s10))) +#endif SIMDE_FUNCTION_ATTRIBUTES simde_v4u32 simde_x_msa_ld_u_w(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int32_t)) == 0, "`s10' must be a multiple of sizeof(int32_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return HEDLEY_REINTERPRET_CAST(simde_v4u32, __msa_ld_b(rs, s10)); - #else - simde_v4u32 r; + simde_v4u32 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_x_msa_ld_u_w(rs, s10) HEDLEY_REINTERPRET_CAST(simde_v4u32, __msa_ld_b((rs), (s10))) +#endif SIMDE_FUNCTION_ATTRIBUTES simde_v2u64 simde_x_msa_ld_u_d(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int64_t)) == 0, "`s10' must be a multiple of sizeof(int64_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return HEDLEY_REINTERPRET_CAST(simde_v2u64, __msa_ld_b(rs, s10)); - #else - simde_v2u64 r; + simde_v2u64 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_x_msa_ld_u_d(rs, s10) HEDLEY_REINTERPRET_CAST(simde_v2u64, __msa_ld_b((rs), (s10))) +#endif SIMDE_FUNCTION_ATTRIBUTES simde_v4f32 simde_x_msa_fld_w(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int32_t)) == 0, "`s10' must be a multiple of sizeof(int32_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return HEDLEY_REINTERPRET_CAST(simde_v4f32, __msa_ld_b(rs, s10)); - #else - simde_v4f32 r; + simde_v4f32 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_x_msa_fld_w(rs, s10) HEDLEY_REINTERPRET_CAST(simde_v4f32, __msa_ld_b((rs), (s10))) +#endif SIMDE_FUNCTION_ATTRIBUTES simde_v2f64 simde_x_msa_fld_d(const void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int64_t)) == 0, "`s10' must be a multiple of sizeof(int64_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return HEDLEY_REINTERPRET_CAST(simde_v2f64, __msa_ld_b(rs, s10)); - #else 
- simde_v2f64 r; + simde_v2f64 r; - simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); + simde_memcpy(&r, &(HEDLEY_REINTERPRET_CAST(const int8_t*, rs)[s10]), sizeof(r)); - return r; - #endif + return r; } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_x_msa_fld_d(rs, s10) HEDLEY_REINTERPRET_CAST(simde_v2f64, __msa_ld_b((rs), (s10))) +#endif SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/mips/msa/madd.h b/lib/simd_wrapper/simde/mips/msa/madd.h index 5037577a44a..61cf18e8722 100644 --- a/lib/simd_wrapper/simde/mips/msa/madd.h +++ b/lib/simd_wrapper/simde/mips/msa/madd.h @@ -38,7 +38,7 @@ simde_v4f32 simde_msa_fmadd_w(simde_v4f32 a, simde_v4f32 b, simde_v4f32 c) { #if defined(SIMDE_MIPS_MSA_NATIVE) return __msa_fmadd_w(a, b, c); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FMA) + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) return vfmaq_f32(a, c, b); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) return vmlaq_f32(a, b, c); @@ -56,7 +56,7 @@ simde_msa_fmadd_w(simde_v4f32 a, simde_v4f32 b, simde_v4f32 c) { #elif defined(SIMDE_X86_SSE_NATIVE) r_.m128 = _mm_add_ps(a_.m128, _mm_mul_ps(b_.m128, c_.m128)); #elif defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_f32x4_fma(a_.v128, b_.v128, c_.v128); + r_.v128 = wasm_f32x4_relaxed_madd(b_.v128, c_.v128, a_.v128); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f32x4_add(a_.v128, wasm_f32x4_mul(b_.v128, c_.v128)); #elif defined(SIMDE_VECTOR_SUBSCRIPT) @@ -73,7 +73,7 @@ simde_msa_fmadd_w(simde_v4f32 a, simde_v4f32 b, simde_v4f32 c) { } #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_fmadd_w - #define __msa_fmadd_w(a, b) simde_msa_fmadd_w((a), (b)) + #define __msa_fmadd_w(a, b, c) simde_msa_fmadd_w((a), (b), (c)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -97,7 +97,7 @@ simde_msa_fmadd_d(simde_v2f64 a, simde_v2f64 b, simde_v2f64 c) { #elif defined(SIMDE_X86_SSE2_NATIVE) r_.m128d = _mm_add_pd(a_.m128d, _mm_mul_pd(b_.m128d, c_.m128d)); #elif defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - r_.v128 = wasm_f64x2_fma(a_.v128, b_.v128, c_.v128); + r_.v128 = wasm_f64x2_relaxed_madd(b_.v128, c_.v128, a_.v128); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.v128 = wasm_f64x2_add(a_.v128, wasm_f64x2_mul(b_.v128, c_.v128)); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -114,7 +114,7 @@ simde_msa_fmadd_d(simde_v2f64 a, simde_v2f64 b, simde_v2f64 c) { } #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_fmadd_d - #define __msa_fmadd_d(a, b) simde_msa_fmadd_d((a), (b)) + #define __msa_fmadd_d(a, b, c) simde_msa_fmadd_d((a), (b), (c)) #endif SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/mips/msa/st.h b/lib/simd_wrapper/simde/mips/msa/st.h index 2c5b2883313..9565c84eef9 100644 --- a/lib/simd_wrapper/simde/mips/msa/st.h +++ b/lib/simd_wrapper/simde/mips/msa/st.h @@ -37,12 +37,11 @@ SIMDE_FUNCTION_ATTRIBUTES void simde_msa_st_b(simde_v16i8 a, void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_st_b(a, rs, s10); - #else - simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); - #endif + simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_st_b(a, rs, s10) __msa_st_b((a), (rs), (s10)); +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_st_b #define __msa_st_b(a, rs, s10) simde_msa_st_b((a), (rs), (s10)) @@ -53,12 +52,11 @@ void 
simde_msa_st_h(simde_v8i16 a, void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int16_t)) == 0, "`s10' must be a multiple of sizeof(int16_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_st_h(a, rs, s10); - #else - simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); - #endif + simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_st_h(a, rs, s10) __msa_st_h((a), (rs), (s10)); +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_st_h #define __msa_st_h(a, rs, s10) simde_msa_st_h((a), (rs), (s10)) @@ -69,12 +67,11 @@ void simde_msa_st_w(simde_v4i32 a, void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int32_t)) == 0, "`s10' must be a multiple of sizeof(int32_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_st_w(a, rs, s10); - #else - simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); - #endif + simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_st_w(a, rs, s10) __msa_st_w((a), (rs), (s10)); +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_st_w #define __msa_st_w(a, rs, s10) simde_msa_st_w((a), (rs), (s10)) @@ -85,12 +82,11 @@ void simde_msa_st_d(simde_v2i64 a, void * rs, const int s10) SIMDE_REQUIRE_CONSTANT_RANGE(s10, 0, 1023) HEDLEY_REQUIRE_MSG((s10 % sizeof(int64_t)) == 0, "`s10' must be a multiple of sizeof(int64_t)") { - #if defined(SIMDE_MIPS_MSA_NATIVE) - return __msa_st_d(a, rs, s10); - #else - simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); - #endif + simde_memcpy(&(HEDLEY_REINTERPRET_CAST(int8_t*, rs)[s10]), &a, sizeof(a)); } +#if defined(SIMDE_MIPS_MSA_NATIVE) + #define simde_msa_st_d(a, rs, s10) __msa_st_d((a), (rs), (s10)); +#endif #if defined(SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES) #undef __msa_st_d #define __msa_st_d(a, rs, s10) simde_msa_st_d((a), (rs), (s10)) diff --git a/lib/simd_wrapper/simde/mips/msa/types.h b/lib/simd_wrapper/simde/mips/msa/types.h index b10880c65b1..93536bc4820 100644 --- a/lib/simd_wrapper/simde/mips/msa/types.h +++ b/lib/simd_wrapper/simde/mips/msa/types.h @@ -49,7 +49,7 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int8x16_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) @@ -67,7 +67,7 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int16x8_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) @@ -85,7 +85,7 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int32x4_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) @@ -103,7 +103,7 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int64x2_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) @@ -121,8 +121,8 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) - int8x16_t neon; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint8x16_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t 
v128; @@ -139,8 +139,8 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) - int16x8_t neon; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint16x8_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; @@ -157,8 +157,8 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) - int32x4_t neon; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint32x4_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; @@ -175,8 +175,8 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128i m128i; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) - int64x2_t neon; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + uint64x2_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; @@ -193,8 +193,8 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128 m128; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) - int32x4_t neon; + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + float32x4_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; @@ -211,8 +211,8 @@ typedef union { #if defined(SIMDE_X86_SSE2_NATIVE) __m128d m128d; #endif - #if defined(SIMDE_MIPS_MSA_A32V7_NATIVE) - int64x2_t neon; + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + float64x2_t neon; #endif #if defined(SIMDE_WASM_SIMD128_NATIVE) v128_t v128; @@ -230,7 +230,7 @@ typedef union { typedef v2u64 simde_v2u64; typedef v4f32 simde_v4f32; typedef v2f64 simde_v2f64; -#elif defined(SIMDE_MIPS_MSA_A32V7_NATIVE) +#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) typedef int8x16_t simde_v16i8; typedef int16x8_t simde_v8i16; typedef int32x4_t simde_v4i32; @@ -240,7 +240,7 @@ typedef union { typedef uint32x4_t simde_v4u32; typedef uint64x2_t simde_v2u64; typedef float32x4_t simde_v4f32; - #if defined(SIMDE_MIPS_MSA_A64V8_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) typedef float64x2_t simde_v2f64; #elif defined(SIMDE_VECTOR) typedef double simde_v2f64 __attribute__((__vector_size__(16))); diff --git a/lib/simd_wrapper/simde/simde-aes.h b/lib/simd_wrapper/simde/simde-aes.h new file mode 100644 index 00000000000..ea3ef5aa40b --- /dev/null +++ b/lib/simd_wrapper/simde/simde-aes.h @@ -0,0 +1,266 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#if !defined(SIMDE_AES_H) +#define SIMDE_AES_H + +#include "simde-features.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS + +#if !(defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) && \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)) + +/* + * Number of columns (32-bit words) comprising the State. For this + * standard, Nb = 4. + */ +#define simde_x_aes_Nb 4 + +static uint8_t simde_x_aes_gmult_lookup_table[8][256] = { +{ // gmult(0x02, b); + 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, + 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, + 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e, + 0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e, + 0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e, + 0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe, + 0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde, + 0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe, + 0x1b, 0x19, 0x1f, 0x1d, 0x13, 0x11, 0x17, 0x15, 0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05, + 0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35, 0x2b, 0x29, 0x2f, 0x2d, 0x23, 0x21, 0x27, 0x25, + 0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55, 0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45, + 0x7b, 0x79, 0x7f, 0x7d, 0x73, 0x71, 0x77, 0x75, 0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65, + 0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95, 0x8b, 0x89, 0x8f, 0x8d, 0x83, 0x81, 0x87, 0x85, + 0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5, 0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5, + 0xdb, 0xd9, 0xdf, 0xdd, 0xd3, 0xd1, 0xd7, 0xd5, 0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5, + 0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5 +}, +{ // gmult(0x01, b); + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 
0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, +}, +{ // gmult(0x01, b); + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, +}, +{ // gmult(0x03, b); + 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11, + 0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21, + 0x60, 0x63, 0x66, 0x65, 0x6c, 0x6f, 0x6a, 0x69, 0x78, 0x7b, 0x7e, 0x7d, 0x74, 0x77, 0x72, 0x71, + 0x50, 0x53, 0x56, 0x55, 0x5c, 0x5f, 0x5a, 0x59, 0x48, 0x4b, 0x4e, 0x4d, 0x44, 0x47, 0x42, 0x41, + 0xc0, 0xc3, 0xc6, 0xc5, 0xcc, 0xcf, 0xca, 0xc9, 0xd8, 0xdb, 0xde, 0xdd, 0xd4, 0xd7, 0xd2, 0xd1, + 0xf0, 0xf3, 0xf6, 0xf5, 0xfc, 0xff, 0xfa, 0xf9, 0xe8, 0xeb, 0xee, 0xed, 0xe4, 0xe7, 0xe2, 0xe1, + 0xa0, 0xa3, 0xa6, 0xa5, 0xac, 0xaf, 0xaa, 0xa9, 0xb8, 0xbb, 0xbe, 0xbd, 0xb4, 0xb7, 0xb2, 0xb1, + 0x90, 0x93, 0x96, 0x95, 0x9c, 0x9f, 0x9a, 0x99, 0x88, 0x8b, 0x8e, 0x8d, 0x84, 0x87, 0x82, 0x81, + 0x9b, 0x98, 0x9d, 0x9e, 0x97, 0x94, 0x91, 0x92, 0x83, 0x80, 0x85, 0x86, 0x8f, 0x8c, 0x89, 0x8a, + 0xab, 0xa8, 0xad, 0xae, 0xa7, 0xa4, 0xa1, 0xa2, 0xb3, 0xb0, 0xb5, 0xb6, 0xbf, 0xbc, 0xb9, 0xba, + 0xfb, 0xf8, 0xfd, 0xfe, 0xf7, 0xf4, 0xf1, 0xf2, 0xe3, 0xe0, 0xe5, 0xe6, 0xef, 0xec, 0xe9, 0xea, + 0xcb, 0xc8, 0xcd, 0xce, 0xc7, 0xc4, 0xc1, 0xc2, 0xd3, 0xd0, 0xd5, 0xd6, 0xdf, 0xdc, 0xd9, 0xda, + 0x5b, 0x58, 0x5d, 0x5e, 0x57, 0x54, 0x51, 0x52, 0x43, 0x40, 0x45, 0x46, 0x4f, 0x4c, 0x49, 0x4a, + 0x6b, 0x68, 0x6d, 0x6e, 0x67, 0x64, 0x61, 0x62, 0x73, 0x70, 0x75, 0x76, 0x7f, 0x7c, 0x79, 0x7a, + 0x3b, 0x38, 0x3d, 0x3e, 0x37, 0x34, 0x31, 0x32, 0x23, 0x20, 0x25, 0x26, 0x2f, 0x2c, 0x29, 0x2a, + 0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, 0x1f, 0x1c, 0x19, 0x1a, +}, +{ // gmult(0x0e, b); + 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a, + 0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 
0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba, + 0xdb, 0xd5, 0xc7, 0xc9, 0xe3, 0xed, 0xff, 0xf1, 0xab, 0xa5, 0xb7, 0xb9, 0x93, 0x9d, 0x8f, 0x81, + 0x3b, 0x35, 0x27, 0x29, 0x03, 0x0d, 0x1f, 0x11, 0x4b, 0x45, 0x57, 0x59, 0x73, 0x7d, 0x6f, 0x61, + 0xad, 0xa3, 0xb1, 0xbf, 0x95, 0x9b, 0x89, 0x87, 0xdd, 0xd3, 0xc1, 0xcf, 0xe5, 0xeb, 0xf9, 0xf7, + 0x4d, 0x43, 0x51, 0x5f, 0x75, 0x7b, 0x69, 0x67, 0x3d, 0x33, 0x21, 0x2f, 0x05, 0x0b, 0x19, 0x17, + 0x76, 0x78, 0x6a, 0x64, 0x4e, 0x40, 0x52, 0x5c, 0x06, 0x08, 0x1a, 0x14, 0x3e, 0x30, 0x22, 0x2c, + 0x96, 0x98, 0x8a, 0x84, 0xae, 0xa0, 0xb2, 0xbc, 0xe6, 0xe8, 0xfa, 0xf4, 0xde, 0xd0, 0xc2, 0xcc, + 0x41, 0x4f, 0x5d, 0x53, 0x79, 0x77, 0x65, 0x6b, 0x31, 0x3f, 0x2d, 0x23, 0x09, 0x07, 0x15, 0x1b, + 0xa1, 0xaf, 0xbd, 0xb3, 0x99, 0x97, 0x85, 0x8b, 0xd1, 0xdf, 0xcd, 0xc3, 0xe9, 0xe7, 0xf5, 0xfb, + 0x9a, 0x94, 0x86, 0x88, 0xa2, 0xac, 0xbe, 0xb0, 0xea, 0xe4, 0xf6, 0xf8, 0xd2, 0xdc, 0xce, 0xc0, + 0x7a, 0x74, 0x66, 0x68, 0x42, 0x4c, 0x5e, 0x50, 0x0a, 0x04, 0x16, 0x18, 0x32, 0x3c, 0x2e, 0x20, + 0xec, 0xe2, 0xf0, 0xfe, 0xd4, 0xda, 0xc8, 0xc6, 0x9c, 0x92, 0x80, 0x8e, 0xa4, 0xaa, 0xb8, 0xb6, + 0x0c, 0x02, 0x10, 0x1e, 0x34, 0x3a, 0x28, 0x26, 0x7c, 0x72, 0x60, 0x6e, 0x44, 0x4a, 0x58, 0x56, + 0x37, 0x39, 0x2b, 0x25, 0x0f, 0x01, 0x13, 0x1d, 0x47, 0x49, 0x5b, 0x55, 0x7f, 0x71, 0x63, 0x6d, + 0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d, +}, +{ // gmult(0x09, b); + 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77, + 0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7, + 0x3b, 0x32, 0x29, 0x20, 0x1f, 0x16, 0x0d, 0x04, 0x73, 0x7a, 0x61, 0x68, 0x57, 0x5e, 0x45, 0x4c, + 0xab, 0xa2, 0xb9, 0xb0, 0x8f, 0x86, 0x9d, 0x94, 0xe3, 0xea, 0xf1, 0xf8, 0xc7, 0xce, 0xd5, 0xdc, + 0x76, 0x7f, 0x64, 0x6d, 0x52, 0x5b, 0x40, 0x49, 0x3e, 0x37, 0x2c, 0x25, 0x1a, 0x13, 0x08, 0x01, + 0xe6, 0xef, 0xf4, 0xfd, 0xc2, 0xcb, 0xd0, 0xd9, 0xae, 0xa7, 0xbc, 0xb5, 0x8a, 0x83, 0x98, 0x91, + 0x4d, 0x44, 0x5f, 0x56, 0x69, 0x60, 0x7b, 0x72, 0x05, 0x0c, 0x17, 0x1e, 0x21, 0x28, 0x33, 0x3a, + 0xdd, 0xd4, 0xcf, 0xc6, 0xf9, 0xf0, 0xeb, 0xe2, 0x95, 0x9c, 0x87, 0x8e, 0xb1, 0xb8, 0xa3, 0xaa, + 0xec, 0xe5, 0xfe, 0xf7, 0xc8, 0xc1, 0xda, 0xd3, 0xa4, 0xad, 0xb6, 0xbf, 0x80, 0x89, 0x92, 0x9b, + 0x7c, 0x75, 0x6e, 0x67, 0x58, 0x51, 0x4a, 0x43, 0x34, 0x3d, 0x26, 0x2f, 0x10, 0x19, 0x02, 0x0b, + 0xd7, 0xde, 0xc5, 0xcc, 0xf3, 0xfa, 0xe1, 0xe8, 0x9f, 0x96, 0x8d, 0x84, 0xbb, 0xb2, 0xa9, 0xa0, + 0x47, 0x4e, 0x55, 0x5c, 0x63, 0x6a, 0x71, 0x78, 0x0f, 0x06, 0x1d, 0x14, 0x2b, 0x22, 0x39, 0x30, + 0x9a, 0x93, 0x88, 0x81, 0xbe, 0xb7, 0xac, 0xa5, 0xd2, 0xdb, 0xc0, 0xc9, 0xf6, 0xff, 0xe4, 0xed, + 0x0a, 0x03, 0x18, 0x11, 0x2e, 0x27, 0x3c, 0x35, 0x42, 0x4b, 0x50, 0x59, 0x66, 0x6f, 0x74, 0x7d, + 0xa1, 0xa8, 0xb3, 0xba, 0x85, 0x8c, 0x97, 0x9e, 0xe9, 0xe0, 0xfb, 0xf2, 0xcd, 0xc4, 0xdf, 0xd6, + 0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46, + +}, +{ // gmult(0x0d, b); + 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b, + 0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b, + 0xbb, 0xb6, 0xa1, 0xac, 0x8f, 0x82, 0x95, 0x98, 0xd3, 0xde, 0xc9, 0xc4, 0xe7, 0xea, 0xfd, 0xf0, + 0x6b, 0x66, 0x71, 0x7c, 0x5f, 0x52, 0x45, 0x48, 0x03, 0x0e, 0x19, 0x14, 0x37, 0x3a, 0x2d, 0x20, + 0x6d, 0x60, 0x77, 0x7a, 0x59, 0x54, 0x43, 0x4e, 0x05, 0x08, 0x1f, 0x12, 0x31, 0x3c, 0x2b, 0x26, + 0xbd, 
0xb0, 0xa7, 0xaa, 0x89, 0x84, 0x93, 0x9e, 0xd5, 0xd8, 0xcf, 0xc2, 0xe1, 0xec, 0xfb, 0xf6, + 0xd6, 0xdb, 0xcc, 0xc1, 0xe2, 0xef, 0xf8, 0xf5, 0xbe, 0xb3, 0xa4, 0xa9, 0x8a, 0x87, 0x90, 0x9d, + 0x06, 0x0b, 0x1c, 0x11, 0x32, 0x3f, 0x28, 0x25, 0x6e, 0x63, 0x74, 0x79, 0x5a, 0x57, 0x40, 0x4d, + 0xda, 0xd7, 0xc0, 0xcd, 0xee, 0xe3, 0xf4, 0xf9, 0xb2, 0xbf, 0xa8, 0xa5, 0x86, 0x8b, 0x9c, 0x91, + 0x0a, 0x07, 0x10, 0x1d, 0x3e, 0x33, 0x24, 0x29, 0x62, 0x6f, 0x78, 0x75, 0x56, 0x5b, 0x4c, 0x41, + 0x61, 0x6c, 0x7b, 0x76, 0x55, 0x58, 0x4f, 0x42, 0x09, 0x04, 0x13, 0x1e, 0x3d, 0x30, 0x27, 0x2a, + 0xb1, 0xbc, 0xab, 0xa6, 0x85, 0x88, 0x9f, 0x92, 0xd9, 0xd4, 0xc3, 0xce, 0xed, 0xe0, 0xf7, 0xfa, + 0xb7, 0xba, 0xad, 0xa0, 0x83, 0x8e, 0x99, 0x94, 0xdf, 0xd2, 0xc5, 0xc8, 0xeb, 0xe6, 0xf1, 0xfc, + 0x67, 0x6a, 0x7d, 0x70, 0x53, 0x5e, 0x49, 0x44, 0x0f, 0x02, 0x15, 0x18, 0x3b, 0x36, 0x21, 0x2c, + 0x0c, 0x01, 0x16, 0x1b, 0x38, 0x35, 0x22, 0x2f, 0x64, 0x69, 0x7e, 0x73, 0x50, 0x5d, 0x4a, 0x47, + 0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97, +}, +{ // gmult(0x0b, b); + 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69, + 0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9, + 0x7b, 0x70, 0x6d, 0x66, 0x57, 0x5c, 0x41, 0x4a, 0x23, 0x28, 0x35, 0x3e, 0x0f, 0x04, 0x19, 0x12, + 0xcb, 0xc0, 0xdd, 0xd6, 0xe7, 0xec, 0xf1, 0xfa, 0x93, 0x98, 0x85, 0x8e, 0xbf, 0xb4, 0xa9, 0xa2, + 0xf6, 0xfd, 0xe0, 0xeb, 0xda, 0xd1, 0xcc, 0xc7, 0xae, 0xa5, 0xb8, 0xb3, 0x82, 0x89, 0x94, 0x9f, + 0x46, 0x4d, 0x50, 0x5b, 0x6a, 0x61, 0x7c, 0x77, 0x1e, 0x15, 0x08, 0x03, 0x32, 0x39, 0x24, 0x2f, + 0x8d, 0x86, 0x9b, 0x90, 0xa1, 0xaa, 0xb7, 0xbc, 0xd5, 0xde, 0xc3, 0xc8, 0xf9, 0xf2, 0xef, 0xe4, + 0x3d, 0x36, 0x2b, 0x20, 0x11, 0x1a, 0x07, 0x0c, 0x65, 0x6e, 0x73, 0x78, 0x49, 0x42, 0x5f, 0x54, + 0xf7, 0xfc, 0xe1, 0xea, 0xdb, 0xd0, 0xcd, 0xc6, 0xaf, 0xa4, 0xb9, 0xb2, 0x83, 0x88, 0x95, 0x9e, + 0x47, 0x4c, 0x51, 0x5a, 0x6b, 0x60, 0x7d, 0x76, 0x1f, 0x14, 0x09, 0x02, 0x33, 0x38, 0x25, 0x2e, + 0x8c, 0x87, 0x9a, 0x91, 0xa0, 0xab, 0xb6, 0xbd, 0xd4, 0xdf, 0xc2, 0xc9, 0xf8, 0xf3, 0xee, 0xe5, + 0x3c, 0x37, 0x2a, 0x21, 0x10, 0x1b, 0x06, 0x0d, 0x64, 0x6f, 0x72, 0x79, 0x48, 0x43, 0x5e, 0x55, + 0x01, 0x0a, 0x17, 0x1c, 0x2d, 0x26, 0x3b, 0x30, 0x59, 0x52, 0x4f, 0x44, 0x75, 0x7e, 0x63, 0x68, + 0xb1, 0xba, 0xa7, 0xac, 0x9d, 0x96, 0x8b, 0x80, 0xe9, 0xe2, 0xff, 0xf4, 0xc5, 0xce, 0xd3, 0xd8, + 0x7a, 0x71, 0x6c, 0x67, 0x56, 0x5d, 0x40, 0x4b, 0x22, 0x29, 0x34, 0x3f, 0x0e, 0x05, 0x18, 0x13, + 0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3, +} +}; + +/* + * S-box transformation table + */ +static uint8_t simde_x_aes_s_box[256] = { + // 0 1 2 3 4 5 6 7 8 9 a b c d e f + 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, // 0 + 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, // 1 + 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, // 2 + 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, // 3 + 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, // 4 + 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, // 5 + 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, // 6 + 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 
0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, // 7 + 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, // 8 + 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, // 9 + 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, // a + 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, // b + 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, // c + 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, // d + 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, // e + 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};// f + +/* + * Inverse S-box transformation table + */ +static uint8_t simde_x_aes_inv_s_box[256] = { + // 0 1 2 3 4 5 6 7 8 9 a b c d e f + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, // 0 + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, // 1 + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, // 2 + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, // 3 + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, // 4 + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, // 5 + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, // 6 + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, // 7 + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, // 8 + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, // 9 + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, // a + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, // b + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, // c + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, // d + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, // e + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d};// f + +/* + * Multiplication of 4 byte words + * m(x) = x4+1 + +SIMDE_FUNCTION_ATTRIBUTES +void coef_mult(uint8_t *a, uint8_t *b, uint8_t *d) { + + d[0] = gmult(a[0],b[0])^gmult(a[3],b[1])^gmult(a[2],b[2])^gmult(a[1],b[3]); + d[1] = gmult(a[1],b[0])^gmult(a[0],b[1])^gmult(a[3],b[2])^gmult(a[2],b[3]); + d[2] = gmult(a[2],b[0])^gmult(a[1],b[1])^gmult(a[0],b[2])^gmult(a[3],b[3]); + d[3] = gmult(a[3],b[0])^gmult(a[2],b[1])^gmult(a[1],b[2])^gmult(a[0],b[3]); +} +*/ + +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_coef_mult_lookup(int lookup_table_offset, uint8_t *b, uint8_t *d) { + int o = lookup_table_offset; + + #define gmultl(o,b) simde_x_aes_gmult_lookup_table[o][b] + d[0] = gmultl(o+0,b[0])^gmultl(o+3,b[1])^gmultl(o+2,b[2])^gmultl(o+1,b[3]); + d[1] = gmultl(o+1,b[0])^gmultl(o+0,b[1])^gmultl(o+3,b[2])^gmultl(o+2,b[3]); + d[2] = 
gmultl(o+2,b[0])^gmultl(o+1,b[1])^gmultl(o+0,b[2])^gmultl(o+3,b[3]); + d[3] = gmultl(o+3,b[0])^gmultl(o+2,b[1])^gmultl(o+1,b[2])^gmultl(o+0,b[3]); + #undef gmultl +} + +#endif + +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_AES_H) */ diff --git a/lib/simd_wrapper/simde/simde-align.h b/lib/simd_wrapper/simde/simde-align.h index 0c8a809eef2..2cd49e75ad5 100644 --- a/lib/simd_wrapper/simde/simde-align.h +++ b/lib/simd_wrapper/simde/simde-align.h @@ -11,7 +11,7 @@ ********************************************************************** * * This is portability layer which should help iron out some - * differences across various compilers, as well as various verisons of + * differences across various compilers, as well as various versions of * C and C++. * * It was originally developed for SIMD Everywhere @@ -55,7 +55,7 @@ #include "hedley.h" /* I know this seems a little silly, but some non-hosted compilers - * don't have stddef.h, so we try to accomodate them. */ + * don't have stddef.h, so we try to accommodate them. */ #if !defined(SIMDE_ALIGN_SIZE_T_) #if defined(__SIZE_TYPE__) #define SIMDE_ALIGN_SIZE_T_ __SIZE_TYPE__ @@ -405,7 +405,7 @@ /* SIMDE_ALIGN_ASSUME_LIKE(Pointer, Type) * - * Tihs is similar to SIMDE_ALIGN_ASSUME_TO, except that it takes a + * This is similar to SIMDE_ALIGN_ASSUME_TO, except that it takes a * type instead of a numeric value. */ #if defined(SIMDE_ALIGN_OF) && defined(SIMDE_ALIGN_ASSUME_TO) #define SIMDE_ALIGN_ASSUME_LIKE(Pointer, Type) SIMDE_ALIGN_ASSUME_TO(Pointer, SIMDE_ALIGN_OF(Type)) diff --git a/lib/simd_wrapper/simde/simde-arch.h b/lib/simd_wrapper/simde/simde-arch.h index 2d09ff77246..306974c17d2 100644 --- a/lib/simd_wrapper/simde/simde-arch.h +++ b/lib/simd_wrapper/simde/simde-arch.h @@ -42,6 +42,8 @@ #if !defined(SIMDE_ARCH_H) #define SIMDE_ARCH_H +#include "hedley.h" + /* Alpha */ #if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) @@ -119,9 +121,51 @@ # define SIMDE_ARCH_ARM_NEON SIMDE_ARCH_ARM # endif #endif -#if defined(__ARM_FEATURE_SVE) +#if defined(__ARM_FEATURE_AES) && __ARM_FEATURE_AES +# define SIMDE_ARCH_ARM_AES +#endif +#if defined(__ARM_FEATURE_COMPLEX) && __ARM_FEATURE_COMPLEX +# define SIMDE_ARCH_ARM_COMPLEX +#endif +#if defined(__ARM_FEATURE_CRYPTO) && __ARM_FEATURE_CRYPTO +# define SIMDE_ARCH_ARM_CRYPTO +#endif +#if defined(__ARM_FEATURE_DOTPROD) && __ARM_FEATURE_DOTPROD +# define SIMDE_ARCH_ARM_DOTPROD +#endif +#if defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA +# define SIMDE_ARCH_ARM_FMA +#endif +#if defined(__ARM_FEATURE_FP16_FML) && __ARM_FEATURE_FP16_FML +# define SIMDE_ARCH_ARM_FP16_FML +#endif +#if defined(__ARM_FEATURE_FRINT) && __ARM_FEATURE_FRINT +# define SIMDE_ARCH_ARM_FRINT +#endif +#if defined(__ARM_FEATURE_MATMUL_INT8) && __ARM_FEATURE_MATMUL_INT8 +# define SIMDE_ARCH_ARM_MATMUL_INT8 +#endif +#if defined(__ARM_FEATURE_SHA2) && __ARM_FEATURE_SHA2 && !defined(__APPLE_CC__) +# define SIMDE_ARCH_ARM_SHA2 +#endif +#if defined(__ARM_FEATURE_SHA3) && __ARM_FEATURE_SHA3 +# define SIMDE_ARCH_ARM_SHA3 +#endif +#if defined(__ARM_FEATURE_SHA512) && __ARM_FEATURE_SHA512 +# define SIMDE_ARCH_ARM_SHA512 +#endif +#if defined(__ARM_FEATURE_SM3) && __ARM_FEATURE_SM3 +# define SIMDE_ARCH_ARM_SM3 +#endif +#if defined(__ARM_FEATURE_SM4) && __ARM_FEATURE_SM4 +# define SIMDE_ARCH_ARM_SM4 +#endif +#if defined(__ARM_FEATURE_SVE) && __ARM_FEATURE_SVE # define SIMDE_ARCH_ARM_SVE #endif +#if defined(__ARM_FEATURE_QRDMX) && __ARM_FEATURE_QRDMX +# define SIMDE_ARCH_ARM_QRDMX +#endif /* Blackfin */ @@ -267,12 +311,15 @@ # if 
!defined(SIMDE_ARCH_X86_SSE4_1) # define SIMDE_ARCH_X86_SSE4_1 1 # endif -# if !defined(SIMDE_ARCH_X86_SSE4_1) +# if !defined(SIMDE_ARCH_X86_SSE4_2) # define SIMDE_ARCH_X86_SSE4_2 1 # endif # endif # if defined(__AVX2__) # define SIMDE_ARCH_X86_AVX2 1 +# if defined(_MSC_VER) +# define SIMDE_ARCH_X86_FMA 1 +# endif # endif # if defined(__FMA__) # define SIMDE_ARCH_X86_FMA 1 @@ -319,6 +366,9 @@ # if defined(__AVX512VL__) # define SIMDE_ARCH_X86_AVX512VL 1 # endif +# if defined(__AVX512FP16__) +# define SIMDE_ARCH_X86_AVX512FP16 1 +# endif # if defined(__GFNI__) # define SIMDE_ARCH_X86_GFNI 1 # endif @@ -328,9 +378,12 @@ # if defined(__VPCLMULQDQ__) # define SIMDE_ARCH_X86_VPCLMULQDQ 1 # endif -# if defined(__F16C__) +# if defined(__F16C__) || (defined(HEDLEY_MSVC_VERSION) && HEDLEY_MSVC_VERSION_CHECK(19,30,0) && defined(SIMDE_ARCH_X86_AVX2) ) # define SIMDE_ARCH_X86_F16C 1 # endif +# if defined(__AES__) +# define SIMDE_ARCH_X86_AES 1 +# endif #endif /* Itanium @@ -459,6 +512,42 @@ #define SIMDE_ARCH_POWER_ALTIVEC_CHECK(version) (0) #endif +/* RISC-V + */ +#if defined(__riscv) || defined(__riscv__) +# if __riscv_xlen == 64 +# define SIMDE_ARCH_RISCV64 +# elif __riscv_xlen == 32 +# define SIMDE_ARCH_RISCV32 +# endif +#endif + +/* RISC-V SIMD ISA extensions */ +#if defined(__riscv_zve32x) +# define SIMDE_ARCH_RISCV_ZVE32X 1 +#endif +#if defined(__riscv_zve32f) +# define SIMDE_ARCH_RISCV_ZVE32F 1 +#endif +#if defined(__riscv_zve64x) +# define SIMDE_ARCH_RISCV_ZVE64X 1 +#endif +#if defined(__riscv_zve64f) +# define SIMDE_ARCH_RISCV_ZVE64F 1 +#endif +#if defined(__riscv_zve64d) +# define SIMDE_ARCH_RISCV_ZVE64D 1 +#endif +#if defined(__riscv_v) +# define SIMDE_ARCH_RISCV_V 1 +#endif +#if defined(__riscv_zvfh) +# define SIMDE_ARCH_RISCV_ZVFH 1 +#endif +#if defined(__riscv_zvfhmin) +# define SIMDE_ARCH_RISCV_ZVFHMIN 1 +#endif + /* SPARC */ #if defined(__sparc_v9__) || defined(__sparcv9) @@ -557,6 +646,10 @@ # define SIMDE_ARCH_WASM_SIMD128 #endif +#if defined(SIMDE_ARCH_WASM) && defined(__wasm_relaxed_simd__) +# define SIMDE_ARCH_WASM_RELAXED_SIMD +#endif + /* Xtensa */ #if defined(__xtensa__) || defined(__XTENSA__) @@ -568,4 +661,27 @@ # define SIMDE_ARCH_ARM_NEON_FP16 #endif +/* Availability of 16-bit brain floating-point arithmetic intrinsics */ +#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) +# define SIMDE_ARCH_ARM_NEON_BF16 +#endif + +/* LoongArch + */ +#if defined(__loongarch32) +# define SIMDE_ARCH_LOONGARCH 1 +#elif defined(__loongarch64) +# define SIMDE_ARCH_LOONGARCH 2 +#endif + +/* LSX: LoongArch 128-bits SIMD extension */ +#if defined(__loongarch_sx) +# define SIMDE_ARCH_LOONGARCH_LSX 1 +#endif + +/* LASX: LoongArch 256-bits SIMD extension */ +#if defined(__loongarch_asx) +# define SIMDE_ARCH_LOONGARCH_LASX 2 +#endif + #endif /* !defined(SIMDE_ARCH_H) */ diff --git a/lib/simd_wrapper/simde/simde-bf16.h b/lib/simd_wrapper/simde/simde-bf16.h new file mode 100644 index 00000000000..7e07368549b --- /dev/null +++ b/lib/simd_wrapper/simde/simde-bf16.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright 
notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + */ + +#include "hedley.h" +#include "simde-common.h" +#include "simde-detect-clang.h" + +#if !defined(SIMDE_BFLOAT16_H) +#define SIMDE_BFLOAT16_H + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +/* This implementations is based upon simde-f16.h */ + +/* Portable version which should work on pretty much any compiler. + * Obviously you can't rely on compiler support for things like + * conversion to/from 32-bit floats, so make sure you always use the + * functions and macros in this file! + */ +#define SIMDE_BFLOAT16_API_PORTABLE 1 + +#define SIMDE_BFLOAT16_API_BF16 2 + +#if !defined(SIMDE_BFLOAT16_API) + #if defined(SIMDE_ARM_NEON_BF16) + #define SIMDE_BFLOAT16_API SIMDE_BFLOAT16_API_BF16 + #else + #define SIMDE_BFLOAT16_API SIMDE_BFLOAT16_API_PORTABLE + #endif +#endif + +#if SIMDE_BFLOAT16_API == SIMDE_BFLOAT16_API_BF16 + #include + typedef __bf16 simde_bfloat16; +#elif SIMDE_BFLOAT16_API == SIMDE_BFLOAT16_API_PORTABLE + typedef struct { uint16_t value; } simde_bfloat16; +#else + #error No 16-bit floating point API. +#endif + +/* Conversion -- convert between single-precision and brain half-precision + * floats. */ +static HEDLEY_ALWAYS_INLINE HEDLEY_CONST +simde_bfloat16 +simde_bfloat16_from_float32 (simde_float32 value) { +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvth_bf16_f32(value); +#else + simde_bfloat16 res; + char* src = HEDLEY_REINTERPRET_CAST(char*, &value); + // rounding to nearest bfloat16 + // If the 17th bit of value is 1, set the rounding to 1. 
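/* A minimal sketch of the rounding step above, assuming C99 with <stdint.h>
 * and <string.h>; variable names here are illustrative only:
 *
 *   uint32_t bits;
 *   memcpy(&bits, &value, sizeof(bits));             // reinterpret float32 bits
 *   uint32_t round_up = (bits >> 15) & UINT32_C(1);  // highest bit to be dropped
 *   uint16_t bf16 = (uint16_t) ((bits >> 16) + round_up); // keep top half, round
 *
 * The byte-wise code below applies the same increment to the low byte of the
 * kept half, choosing byte offsets according to endianness. */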
+ uint8_t rounding = 0; + + #if SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE + if (src[1] & UINT8_C(0x80)) rounding = 1; + src[2] = HEDLEY_STATIC_CAST(char, (HEDLEY_STATIC_CAST(uint8_t, src[2]) + rounding)); + simde_memcpy(&res, src+2, sizeof(res)); + #else + if (src[2] & UINT8_C(0x80)) rounding = 1; + src[1] = HEDLEY_STATIC_CAST(char, (HEDLEY_STATIC_CAST(uint8_t, src[1]) + rounding)); + simde_memcpy(&res, src, sizeof(res)); + #endif + + return res; +#endif +} + +static HEDLEY_ALWAYS_INLINE HEDLEY_CONST +simde_float32 +simde_bfloat16_to_float32 (simde_bfloat16 value) { +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) + return vcvtah_f32_bf16(value); +#else + simde_float32 res = 0.0; + char* _res = HEDLEY_REINTERPRET_CAST(char*, &res); + + #if SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE + simde_memcpy(_res+2, &value, sizeof(value)); + #else + simde_memcpy(_res, &value, sizeof(value)); + #endif + + return res; +#endif +} + +SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_uint16_as_bfloat16, simde_bfloat16, uint16_t) + +#define SIMDE_NANBF simde_uint16_as_bfloat16(0xFFC1) // a quiet Not-a-Number +#define SIMDE_INFINITYBF simde_uint16_as_bfloat16(0x7F80) +#define SIMDE_NINFINITYBF simde_uint16_as_bfloat16(0xFF80) + +#define SIMDE_BFLOAT16_VALUE(value) simde_bfloat16_from_float32(SIMDE_FLOAT32_C(value)) + +#if !defined(simde_isinfbf) && defined(simde_math_isinff) + #define simde_isinfbf(a) simde_math_isinff(simde_bfloat16_to_float32(a)) +#endif +#if !defined(simde_isnanbf) && defined(simde_math_isnanf) + #define simde_isnanbf(a) simde_math_isnanf(simde_bfloat16_to_float32(a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_BFLOAT16_H) */ diff --git a/lib/simd_wrapper/simde/simde-common.h b/lib/simd_wrapper/simde/simde-common.h index 752328224f5..5a755cca845 100644 --- a/lib/simd_wrapper/simde/simde-common.h +++ b/lib/simd_wrapper/simde/simde-common.h @@ -22,6 +22,8 @@ * * Copyright: * 2017-2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #if !defined(SIMDE_COMMON_H) @@ -30,8 +32,8 @@ #include "hedley.h" #define SIMDE_VERSION_MAJOR 0 -#define SIMDE_VERSION_MINOR 7 -#define SIMDE_VERSION_MICRO 3 +#define SIMDE_VERSION_MINOR 8 +#define SIMDE_VERSION_MICRO 2 #define SIMDE_VERSION HEDLEY_VERSION_ENCODE(SIMDE_VERSION_MAJOR, SIMDE_VERSION_MINOR, SIMDE_VERSION_MICRO) // Also update meson.build in the root directory of the repository @@ -177,11 +179,24 @@ HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ defined(_Static_assert) \ ) -# define SIMDE_STATIC_ASSERT(expr, message) _Static_assert(expr, message) + /* Sometimes _Static_assert is defined (in cdefs.h) using a symbol which + * starts with a double-underscore. This is a system header so we have no + * control over it, but since it's a macro it will emit a diagnostic which + * prevents compilation with -Werror. 
*/ + #if HEDLEY_HAS_WARNING("-Wreserved-identifier") + #define SIMDE_STATIC_ASSERT(expr, message) (__extension__({ \ + HEDLEY_DIAGNOSTIC_PUSH \ + _Pragma("clang diagnostic ignored \"-Wreserved-identifier\"") \ + _Static_assert(expr, message); \ + HEDLEY_DIAGNOSTIC_POP \ + })) + #else + #define SIMDE_STATIC_ASSERT(expr, message) _Static_assert(expr, message) + #endif #elif \ (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ HEDLEY_MSVC_VERSION_CHECK(16,0,0) -# define SIMDE_STATIC_ASSERT(expr, message) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) + #define SIMDE_STATIC_ASSERT(expr, message) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) #endif /* Statement exprs */ @@ -549,6 +564,57 @@ typedef SIMDE_FLOAT32_TYPE simde_float32; #endif typedef SIMDE_FLOAT64_TYPE simde_float64; +#if defined(SIMDE_POLY8_TYPE) +# undef SIMDE_POLY8_TYPE +#endif +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) +# define SIMDE_POLY8_TYPE poly8_t +# define SIMDE_POLY8_C(value) (HEDLEY_STATIC_CAST(poly8_t, value)) +#else +# define SIMDE_POLY8_TYPE uint8_t +# define SIMDE_POLY8_C(value) (HEDLEY_STATIC_CAST(uint8_t, value)) +#endif +typedef SIMDE_POLY8_TYPE simde_poly8; + +#if defined(SIMDE_POLY16_TYPE) +# undef SIMDE_POLY16_TYPE +#endif +#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) +# define SIMDE_POLY16_TYPE poly16_t +# define SIMDE_POLY16_C(value) (HEDLEY_STATIC_CAST(poly16_t, value)) +#else +# define SIMDE_POLY16_TYPE uint16_t +# define SIMDE_POLY16_C(value) (HEDLEY_STATIC_CAST(uint16_t, value)) +#endif +typedef SIMDE_POLY16_TYPE simde_poly16; + +#if defined(SIMDE_POLY64_TYPE) +# undef SIMDE_POLY64_TYPE +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) +# define SIMDE_POLY64_TYPE poly64_t +# define SIMDE_POLY64_C(value) (HEDLEY_STATIC_CAST(poly64_t, value ## ull)) +#else +# define SIMDE_POLY64_TYPE uint64_t +# define SIMDE_POLY64_C(value) value ## ull +#endif +typedef SIMDE_POLY64_TYPE simde_poly64; + +#if defined(SIMDE_POLY128_TYPE) +# undef SIMDE_POLY128_TYPE +#endif +#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) +# define SIMDE_POLY128_TYPE poly128_t +# define SIMDE_POLY128_C(value) value +#elif defined(__SIZEOF_INT128__) +# define SIMDE_POLY128_TYPE __int128 +# define SIMDE_POLY128_C(value) (HEDLEY_STATIC_CAST(__int128, value)) +#else +# define SIMDE_POLY128_TYPE uint64_t +# define SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE 1 +#endif +typedef SIMDE_POLY128_TYPE simde_poly128; + #if defined(__cplusplus) typedef bool simde_bool; #elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) @@ -699,6 +765,36 @@ typedef SIMDE_FLOAT64_TYPE simde_float64; #endif #endif +/*** Functions that quiet a signaling NaN ***/ + +static HEDLEY_INLINE +double +simde_math_quiet(double x) { + uint64_t tmp, mask; + if (!simde_math_isnan(x)) { + return x; + } + simde_memcpy(&tmp, &x, 8); + mask = 0x7ff80000; + mask <<= 32; + tmp |= mask; + simde_memcpy(&x, &tmp, 8); + return x; +} + +static HEDLEY_INLINE +float +simde_math_quietf(float x) { + uint32_t tmp; + if (!simde_math_isnanf(x)) { + return x; + } + simde_memcpy(&tmp, &x, 4); + tmp |= 0x7fc00000lu; + simde_memcpy(&x, &tmp, 4); + return x; +} + #if defined(FE_ALL_EXCEPT) #define SIMDE_HAVE_FENV_H #elif defined(__has_include) @@ -813,6 +909,9 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ #define SIMDE_BUILTIN_TYPE_64_ long long #endif +/* SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ */ +HEDLEY_DIAGNOSTIC_POP + #if defined(SIMDE_BUILTIN_SUFFIX_8_) #define SIMDE_BUILTIN_8_(name) 
HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_8_) #define SIMDE_BUILTIN_HAS_8_(name) HEDLEY_HAS_BUILTIN(HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_8_)) @@ -886,6 +985,9 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ # if !HEDLEY_GCC_VERSION_CHECK(4,6,0) # define SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8 /* TODO: find relevant bug or commit */ # endif +# if !HEDLEY_GCC_VERSION_CHECK(7,4,0) || (HEDLEY_GCC_VERSION_CHECK(8,0,0) && !HEDLEY_GCC_VERSION_CHECK(8,3,0)) +# define SIMDE_BUG_GCC_87467 +# endif # if !HEDLEY_GCC_VERSION_CHECK(8,0,0) # define SIMDE_BUG_GCC_REV_247851 # endif @@ -900,7 +1002,9 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ # if !HEDLEY_GCC_VERSION_CHECK(9,0,0) && defined(SIMDE_ARCH_AARCH64) # define SIMDE_BUG_GCC_BAD_VEXT_REV32 # endif -# if defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64) +# if !(HEDLEY_GCC_VERSION_CHECK(9,4,0) \ + || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && !HEDLEY_GCC_VERSION_CHECK(9,0,0)) \ + ) && defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64) # define SIMDE_BUG_GCC_94482 # endif # if (defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) || defined(SIMDE_ARCH_ZARCH) @@ -910,13 +1014,15 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ # if HEDLEY_GCC_VERSION_CHECK(4,3,0) /* -Wsign-conversion */ # define SIMDE_BUG_GCC_95144 # endif -# if !HEDLEY_GCC_VERSION_CHECK(11,0,0) +# if !HEDLEY_GCC_VERSION_CHECK(11,2,0) # define SIMDE_BUG_GCC_95483 # endif # if defined(__OPTIMIZE__) # define SIMDE_BUG_GCC_100927 # endif -# define SIMDE_BUG_GCC_98521 +# if !(HEDLEY_GCC_VERSION_CHECK(10,3,0)) +# define SIMDE_BUG_GCC_98521 +# endif # endif # if !HEDLEY_GCC_VERSION_CHECK(9,4,0) && defined(SIMDE_ARCH_AARCH64) # define SIMDE_BUG_GCC_94488 @@ -924,18 +1030,30 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ # if !HEDLEY_GCC_VERSION_CHECK(9,1,0) && defined(SIMDE_ARCH_AARCH64) # define SIMDE_BUG_GCC_REV_264019 # endif -# if defined(SIMDE_ARCH_ARM) +# if (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && !defined(SIMDE_ARCH_AARCH64)) || (!defined(SIMDE_ARCH_AARCH64) && defined(SIMDE_ARCH_ARM)) +# define SIMDE_BUG_GCC_REV_260989 +# endif +# if defined(SIMDE_ARCH_ARM) && !defined(SIMDE_ARCH_AARCH64) # define SIMDE_BUG_GCC_95399 # define SIMDE_BUG_GCC_95471 -# elif defined(SIMDE_ARCH_POWER) +# define SIMDE_BUG_GCC_111609 +# if SIMDE_ARCH_ARM_CHECK(8,0) +# define SIMDE_BUG_GCC_113065 +# endif +# endif +# if defined(SIMDE_ARCH_POWER) # define SIMDE_BUG_GCC_95227 # define SIMDE_BUG_GCC_95782 -# define SIMDE_BUG_VEC_CPSGN_REVERSED_ARGS -# elif defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) +# if !HEDLEY_GCC_VERSION_CHECK(12,0,0) +# define SIMDE_BUG_VEC_CPSGN_REVERSED_ARGS +# endif +# endif +# if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) # if !HEDLEY_GCC_VERSION_CHECK(10,2,0) && !defined(__OPTIMIZE__) # define SIMDE_BUG_GCC_96174 # endif -# elif defined(SIMDE_ARCH_ZARCH) +# endif +# if defined(SIMDE_ARCH_ZARCH) # define SIMDE_BUG_GCC_95782 # if HEDLEY_GCC_VERSION_CHECK(10,0,0) # define SIMDE_BUG_GCC_101614 @@ -943,18 +1061,30 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ # endif # if defined(SIMDE_ARCH_MIPS_MSA) # define SIMDE_BUG_GCC_97248 -# define SIMDE_BUG_GCC_100760 -# define SIMDE_BUG_GCC_100761 -# define SIMDE_BUG_GCC_100762 +# if !HEDLEY_GCC_VERSION_CHECK(12,1,0) +# define SIMDE_BUG_GCC_100760 +# define SIMDE_BUG_GCC_100761 +# define SIMDE_BUG_GCC_100762 +# endif +# endif +# if !defined(__OPTIMIZE__) && !(\ + HEDLEY_GCC_VERSION_CHECK(11,4,0) \ + || (HEDLEY_GCC_VERSION_CHECK(10,4,0) && !(HEDLEY_GCC_VERSION_CHECK(11,0,0))) \ + || 
(HEDLEY_GCC_VERSION_CHECK(9,5,0) && !(HEDLEY_GCC_VERSION_CHECK(10,0,0)))) +# define SIMDE_BUG_GCC_105339 # endif -# define SIMDE_BUG_GCC_95399 # elif defined(__clang__) # if defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_CLANG_45541 -# define SIMDE_BUG_CLANG_46844 -# define SIMDE_BUG_CLANG_48257 +# define SIMDE_BUG_CLANG_48257 // https://github.com/llvm/llvm-project/issues/47601 +# define SIMDE_BUG_CLANG_71362 // https://github.com/llvm/llvm-project/issues/71362 +# define SIMDE_BUG_CLANG_71365 // https://github.com/llvm/llvm-project/issues/71365 +# define SIMDE_BUG_CLANG_71751 // https://github.com/llvm/llvm-project/issues/71751 +# if !SIMDE_DETECT_CLANG_VERSION_CHECK(15,0,0) +# define SIMDE_BUG_CLANG_45541 +# endif # if !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) # define SIMDE_BUG_CLANG_46840 +# define SIMDE_BUG_CLANG_46844 # endif # if SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0) && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0) # define SIMDE_BUG_CLANG_BAD_VI64_OPS @@ -968,19 +1098,26 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ # if !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) # define SIMDE_BUG_CLANG_BAD_VGET_SET_LANE_TYPES # endif +# if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_ARM_NEON_A32V8_NATIVE) +# define SIMDE_BUG_CLANG_71763 // https://github.com/llvm/llvm-project/issues/71763 +# endif # endif # if defined(SIMDE_ARCH_POWER) && !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) # define SIMDE_BUG_CLANG_46770 # endif # if defined(SIMDE_ARCH_POWER) && (SIMDE_ARCH_POWER == 700) && (SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)) -# define SIMDE_BUG_CLANG_50893 -# define SIMDE_BUG_CLANG_50901 +# if !SIMDE_DETECT_CLANG_VERSION_CHECK(13,0,0) +# define SIMDE_BUG_CLANG_50893 +# define SIMDE_BUG_CLANG_50901 +# endif # endif # if defined(_ARCH_PWR9) && !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) && !defined(__OPTIMIZE__) # define SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT # endif # if defined(SIMDE_ARCH_POWER) -# define SIMDE_BUG_CLANG_50932 +# if !SIMDE_DETECT_CLANG_VERSION_CHECK(14,0,0) +# define SIMDE_BUG_CLANG_50932 +# endif # if !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) # define SIMDE_BUG_VEC_CPSGN_REVERSED_ARGS # endif @@ -1007,9 +1144,12 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ # if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0) # define SIMDE_BUG_CLANG_44589 # endif -# define SIMDE_BUG_CLANG_48673 +# define SIMDE_BUG_CLANG_48673 // https://github.com/llvm/llvm-project/issues/48017 +# endif +# define SIMDE_BUG_CLANG_45959 // https://github.com/llvm/llvm-project/issues/45304 +# if defined(SIMDE_ARCH_WASM_SIMD128) && !SIMDE_DETECT_CLANG_VERSION_CHECK(17,0,0) +# define SIMDE_BUG_CLANG_60655 # endif -# define SIMDE_BUG_CLANG_45959 # elif defined(HEDLEY_MSVC_VERSION) # if defined(SIMDE_ARCH_X86) # define SIMDE_BUG_MSVC_ROUND_EXTRACT @@ -1037,10 +1177,9 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ HEDLEY_GCC_VERSION_CHECK(4,3,0) # define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr) (__extension__ ({ \ HEDLEY_DIAGNOSTIC_PUSH \ - HEDLEY_DIAGNOSTIC_POP \ _Pragma("GCC diagnostic ignored \"-Wsign-conversion\"") \ __typeof__(expr) simde_bug_ignore_sign_conversion_v_= (expr); \ - HEDLEY_DIAGNOSTIC_PUSH \ + HEDLEY_DIAGNOSTIC_POP \ simde_bug_ignore_sign_conversion_v_; \ })) #else @@ -1057,6 +1196,34 @@ SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ #define SIMDE_CAST_VECTOR_SHIFT_COUNT(width, value) HEDLEY_STATIC_CAST(int##width##_t, (value)) #endif +/* Initial support for RISCV V extensions based on ZVE64D. 
*/ +#if defined(SIMDE_ARCH_RISCV_ZVE64D) && SIMDE_NATURAL_VECTOR_SIZE >= 64 + #define RVV_FIXED_TYPE_DEF(name, lmul) \ + typedef vint8##name##_t fixed_vint8##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vint16##name##_t fixed_vint16##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vint32##name##_t fixed_vint32##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vuint8##name##_t fixed_vuint8##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vuint16##name##_t fixed_vuint16##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vuint32##name##_t fixed_vuint32##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vfloat32##name##_t fixed_vfloat32##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); + RVV_FIXED_TYPE_DEF(mf2, 1/2); + RVV_FIXED_TYPE_DEF(m1, 1); + RVV_FIXED_TYPE_DEF(m2, 2); + #define RVV_FIXED_TYPE_DEF_64B(name, lmul) \ + typedef vint64##name##_t fixed_vint64##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vuint64##name##_t fixed_vuint64##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); \ + typedef vfloat64##name##_t fixed_vfloat64##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); + RVV_FIXED_TYPE_DEF_64B(m1, 1); + RVV_FIXED_TYPE_DEF_64B(m2, 2); + #if defined(SIMDE_ARCH_RISCV_ZVFH) + #define RVV_FIXED_TYPE_DEF_16F(name, lmul) \ + typedef vfloat16##name##_t fixed_vfloat16##name##_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen * lmul))); + RVV_FIXED_TYPE_DEF_16F(mf2, 1/2); + RVV_FIXED_TYPE_DEF_16F(m1, 1); + RVV_FIXED_TYPE_DEF_16F(m2, 2); + #endif +#endif + /* SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ */ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/simde-complex.h b/lib/simd_wrapper/simde/simde-complex.h index ce840e228ce..48ebe4cf1cd 100644 --- a/lib/simd_wrapper/simde/simde-complex.h +++ b/lib/simd_wrapper/simde/simde-complex.h @@ -26,7 +26,7 @@ /* Support for complex math. * - * We try to avoid inculding (in C++ mode) since it pulls in + * We try to avoid including (in C++ mode) since it pulls in * a *lot* of code. Unfortunately this only works for GNU modes (i.e., * -std=gnu++14 not -std=c++14) unless you pass -fext-numeric-literals, * but there is no way (AFAICT) to detect that flag so we have to rely diff --git a/lib/simd_wrapper/simde/simde-detect-clang.h b/lib/simd_wrapper/simde/simde-detect-clang.h index b281074598f..7326f02db90 100644 --- a/lib/simd_wrapper/simde/simde-detect-clang.h +++ b/lib/simd_wrapper/simde/simde-detect-clang.h @@ -54,10 +54,27 @@ * need more resolution I'm happy to accept patches that are able to * detect minor versions as well. That said, you'll probably have a * hard time with detection since AFAIK most minor releases don't add - * anything we can detect. */ + * anything we can detect. Updated based on + * https://github.com/google/highway/blob/438c705a295176b96a50336527bb3e7ea365ffac/hwy/detect_compiler_arch.h#L73 + * - would welcome patches/updates there as well. 
+ */ #if defined(__clang__) && !defined(SIMDE_DETECT_CLANG_VERSION) -# if __has_warning("-Wformat-insufficient-args") +# if __has_warning("-Wmissing-designated-field-initializers") +# define SIMDE_DETECT_CLANG_VERSION 190000 +# elif __has_warning("-Woverriding-option") +# define SIMDE_DETECT_CLANG_VERSION 180000 +# elif __has_attribute(unsafe_buffer_usage) // no new warnings in 17.0 +# define SIMDE_DETECT_CLANG_VERSION 170000 +# elif __has_attribute(nouwtable) // no new warnings in 16.0 +# define SIMDE_DETECT_CLANG_VERSION 160000 +# elif __has_warning("-Warray-parameter") +# define SIMDE_DETECT_CLANG_VERSION 150000 +# elif __has_warning("-Wbitwise-instead-of-logical") +# define SIMDE_DETECT_CLANG_VERSION 140000 +# elif __has_warning("-Waix-compat") +# define SIMDE_DETECT_CLANG_VERSION 130000 +# elif __has_warning("-Wformat-insufficient-args") # define SIMDE_DETECT_CLANG_VERSION 120000 # elif __has_warning("-Wimplicit-const-int-float-conversion") # define SIMDE_DETECT_CLANG_VERSION 110000 @@ -67,7 +84,12 @@ # define SIMDE_DETECT_CLANG_VERSION 90000 # elif __has_warning("-Wextra-semi-stmt") || __has_builtin(__builtin_rotateleft32) # define SIMDE_DETECT_CLANG_VERSION 80000 -# elif __has_warning("-Wc++98-compat-extra-semi") +// For reasons unknown, Xcode 10.3 (Apple LLVM version 10.0.1) is apparently +// based on Clang 7, but does not support the warning we test. +// See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions and +// https://trac.macports.org/wiki/XcodeVersionInfo. +# elif __has_warning("-Wc++98-compat-extra-semi") || \ + (defined(__apple_build_version__) && __apple_build_version__ >= 10010000) # define SIMDE_DETECT_CLANG_VERSION 70000 # elif __has_warning("-Wpragma-pack") # define SIMDE_DETECT_CLANG_VERSION 60000 diff --git a/lib/simd_wrapper/simde/simde-diagnostic.h b/lib/simd_wrapper/simde/simde-diagnostic.h index 95e3554760c..6c7d2e732f0 100644 --- a/lib/simd_wrapper/simde/simde-diagnostic.h +++ b/lib/simd_wrapper/simde/simde-diagnostic.h @@ -272,7 +272,7 @@ #define SIMDE_DIAGNOSTIC_DISABLE_CAST_FUNCTION_TYPE_ #endif -/* clang will emit this warning when we use C99 extensions whan not in +/* clang will emit this warning when we use C99 extensions when not in * C99 mode, even though it does support this. In such cases we check * the compiler and version first, so we know it's not a problem. */ #if HEDLEY_HAS_WARNING("-Wc99-extensions") @@ -281,6 +281,14 @@ #define SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_ #endif +/* Similar problm as above; we rely on some basic C99 support, but clang + * has started warning obut this even in C17 mode with -Weverything. */ +#if HEDLEY_HAS_WARNING("-Wdeclaration-after-statement") + #define SIMDE_DIAGNOSTIC_DISABLE_DECLARATION_AFTER_STATEMENT_ _Pragma("clang diagnostic ignored \"-Wdeclaration-after-statement\"") +#else + #define SIMDE_DIAGNOSTIC_DISABLE_DECLARATION_AFTER_STATEMENT_ +#endif + /* https://github.com/simd-everywhere/simde/issues/277 */ #if defined(HEDLEY_GCC_VERSION) && HEDLEY_GCC_VERSION_CHECK(4,6,0) && !HEDLEY_GCC_VERSION_CHECK(6,4,0) && defined(__cplusplus) #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_ _Pragma("GCC diagnostic ignored \"-Wunused-but-set-variable\"") @@ -392,6 +400,8 @@ * more elegantly, but until then... 
*/ #if defined(HEDLEY_MSVC_VERSION) #define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_ __pragma(warning(disable:4702)) +#elif defined(__clang__) + #define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_ HEDLEY_PRAGMA(clang diagnostic ignored "-Wunreachable-code") #else #define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_ #endif @@ -429,6 +439,7 @@ SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ \ SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_ \ SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_ \ + SIMDE_DIAGNOSTIC_DISABLE_DECLARATION_AFTER_STATEMENT_ \ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ \ SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ \ SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ \ diff --git a/lib/simd_wrapper/simde/simde-f16.h b/lib/simd_wrapper/simde/simde-f16.h index 79673070cd1..77a220a1bd4 100644 --- a/lib/simd_wrapper/simde/simde-f16.h +++ b/lib/simd_wrapper/simde/simde-f16.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ #include "hedley.h" @@ -57,7 +58,7 @@ SIMDE_BEGIN_DECLS_ * that on Arm since it would break compatibility with the NEON F16 * functions. */ #define SIMDE_FLOAT16_API_FP16_NO_ABI 3 -/* This is basically __fp16 as specified by Arm, where arugments and +/* This is basically __fp16 as specified by Arm, where arguments and * return values are raw __fp16 values not structs. */ #define SIMDE_FLOAT16_API_FP16 4 @@ -65,16 +66,28 @@ SIMDE_BEGIN_DECLS_ * any ideas on how to improve it. If you do, patches are definitely * welcome. */ #if !defined(SIMDE_FLOAT16_API) - #if 0 && !defined(__cplusplus) - /* I haven't found a way to detect this. It seems like defining + #if defined(__ARM_FP16_FORMAT_IEEE) && (defined(SIMDE_ARM_NEON_FP16) || defined(__ARM_FP16_ARGS)) + #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FP16 + #elif !defined(__EMSCRIPTEN__) && !(defined(__clang__) && defined(SIMDE_ARCH_POWER)) && \ + !(defined(HEDLEY_MSVC_VERSION) && defined(__clang__)) && \ + !(defined(SIMDE_ARCH_MIPS) && defined(__clang__)) && \ + !(defined(__clang__) && defined(SIMDE_ARCH_RISCV64)) && ( \ + defined(SIMDE_X86_AVX512FP16_NATIVE) || \ + (defined(SIMDE_ARCH_X86_SSE2) && HEDLEY_GCC_VERSION_CHECK(12,0,0)) || \ + (defined(SIMDE_ARCH_AARCH64) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !defined(__cplusplus)) || \ + ((defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)) && SIMDE_DETECT_CLANG_VERSION_CHECK(15,0,0)) || \ + (!(defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)) && SIMDE_DETECT_CLANG_VERSION_CHECK(6,0,0))) || \ + defined(SIMDE_ARCH_RISCV_ZVFH) + /* We haven't found a better way to detect this. It seems like defining * __STDC_WANT_IEC_60559_TYPES_EXT__, then including float.h, then * checking for defined(FLT16_MAX) should work, but both gcc and * clang will define the constants even if _Float16 is not * supported. Ideas welcome. 
*/ #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FLOAT16 - #elif defined(__ARM_FP16_FORMAT_IEEE) && defined(SIMDE_ARM_NEON_FP16) - #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FP16 - #elif defined(__FLT16_MIN__) && (defined(__clang__) && (!defined(SIMDE_ARCH_AARCH64) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0))) + #elif defined(__FLT16_MIN__) && \ + (defined(__clang__) && \ + (!defined(SIMDE_ARCH_AARCH64) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) \ + && !defined(SIMDE_ARCH_RISCV64)) #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FP16_NO_ABI #else #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_PORTABLE @@ -83,16 +96,23 @@ SIMDE_BEGIN_DECLS_ #if SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16 typedef _Float16 simde_float16; - #define SIMDE_FLOAT16_C(value) value##f16 + #define SIMDE_FLOAT16_IS_SCALAR 1 + #if !defined(__cplusplus) + #define SIMDE_FLOAT16_C(value) value##f16 + #else + #define SIMDE_FLOAT16_C(value) HEDLEY_STATIC_CAST(_Float16, (value)) + #endif #elif SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16_NO_ABI typedef struct { __fp16 value; } simde_float16; - #if defined(SIMDE_STATEMENT_EXPR_) + #if defined(SIMDE_STATEMENT_EXPR_) && !defined(SIMDE_TESTS_H) #define SIMDE_FLOAT16_C(value) (__extension__({ ((simde_float16) { HEDLEY_DIAGNOSTIC_PUSH SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_ HEDLEY_STATIC_CAST(__fp16, (value)) }); HEDLEY_DIAGNOSTIC_POP })) #else #define SIMDE_FLOAT16_C(value) ((simde_float16) { HEDLEY_STATIC_CAST(__fp16, (value)) }) + #define SIMDE_FLOAT16_IS_SCALAR 1 #endif #elif SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 typedef __fp16 simde_float16; + #define SIMDE_FLOAT16_IS_SCALAR 1 #define SIMDE_FLOAT16_C(value) HEDLEY_STATIC_CAST(__fp16, (value)) #elif SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_PORTABLE typedef struct { uint16_t value; } simde_float16; @@ -100,6 +120,13 @@ SIMDE_BEGIN_DECLS_ #error No 16-bit floating point API. #endif +#if \ + defined(SIMDE_VECTOR_OPS) && \ + (SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE) && \ + (SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI) + #define SIMDE_FLOAT16_VECTOR +#endif + /* Reinterpret -- you *generally* shouldn't need these, they're really * intended for internal use. However, on x86 half-precision floats * get stuffed into a __m128i/__m256i, so it may be useful. 
*/ @@ -107,12 +134,42 @@ SIMDE_BEGIN_DECLS_ SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_float16_as_uint16, uint16_t, simde_float16) SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_uint16_as_float16, simde_float16, uint16_t) -#define SIMDE_NANHF simde_uint16_as_float16(0x7E00) -#define SIMDE_INFINITYHF simde_uint16_as_float16(0x7C00) +#if SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_PORTABLE + #define SIMDE_NANHF simde_uint16_as_float16(0x7E00) // a quiet Not-a-Number + #define SIMDE_INFINITYHF simde_uint16_as_float16(0x7C00) + #define SIMDE_NINFINITYHF simde_uint16_as_float16(0xFC00) +#else + #if SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16_NO_ABI + #if SIMDE_MATH_BUILTIN_LIBM(nanf16) + #define SIMDE_NANHF SIMDE_FLOAT16_C(__builtin_nanf16("")) + #elif defined(SIMDE_MATH_NAN) + #define SIMDE_NANHF SIMDE_FLOAT16_C(SIMDE_MATH_NAN) + #endif + #if SIMDE_MATH_BUILTIN_LIBM(inf16) + #define SIMDE_INFINITYHF SIMDE_FLOAT16_C(__builtin_inf16()) + #define SIMDE_NINFINITYHF SIMDE_FLOAT16_C(-__builtin_inf16()) + #else + #define SIMDE_INFINITYHF SIMDE_FLOAT16_C(SIMDE_MATH_INFINITY) + #define SIMDE_NINFINITYHF SIMDE_FLOAT16_C(-SIMDE_MATH_INFINITY) + #endif + #else + #if SIMDE_MATH_BUILTIN_LIBM(nanf16) + #define SIMDE_NANHF __builtin_nanf16("") + #elif defined(SIMDE_MATH_NAN) + #define SIMDE_NANHF SIMDE_MATH_NAN + #endif + #if SIMDE_MATH_BUILTIN_LIBM(inf16) + #define SIMDE_INFINITYHF __builtin_inf16() + #define SIMDE_NINFINITYHF -(__builtin_inf16()) + #else + #define SIMDE_INFINITYHF HEDLEY_STATIC_CAST(simde_float16, SIMDE_MATH_INFINITY) + #define SIMDE_NINFINITYHF HEDLEY_STATIC_CAST(simde_float16, -SIMDE_MATH_INFINITY) + #endif + #endif +#endif /* Conversion -- convert between single-precision and half-precision * floats. */ - static HEDLEY_ALWAYS_INLINE HEDLEY_CONST simde_float16 simde_float16_from_float32 (simde_float32 value) { @@ -210,6 +267,54 @@ simde_float16_to_float32 (simde_float16 value) { #define SIMDE_FLOAT16_VALUE(value) simde_float16_from_float32(SIMDE_FLOAT32_C(value)) #endif +#if !defined(simde_isinfhf) && defined(simde_math_isinff) + #define simde_isinfhf(a) simde_math_isinff(simde_float16_to_float32(a)) +#endif +#if !defined(simde_isnanhf) && defined(simde_math_isnanf) + #define simde_isnanhf(a) simde_math_isnanf(simde_float16_to_float32(a)) +#endif +#if !defined(simde_isnormalhf) && defined(simde_math_isnormalf) + #define simde_isnormalhf(a) simde_math_isnormalf(simde_float16_to_float32(a)) +#endif +#if !defined(simde_issubnormalhf) && defined(simde_math_issubnormalf) + #define simde_issubnormalhf(a) simde_math_issubnormalf(simde_float16_to_float32(a)) +#endif + +#define simde_fpclassifyhf(a) simde_math_fpclassifyf(simde_float16_to_float32(a)) + +static HEDLEY_INLINE +uint8_t +simde_fpclasshf(simde_float16 v, const int imm8) { + uint16_t bits = simde_float16_as_uint16(v); + uint8_t negative = (bits >> 15) & 1; + uint16_t const ExpMask = 0x7C00; // [14:10] + uint16_t const MantMask = 0x03FF; // [9:0] + uint8_t exponent_all_ones = ((bits & ExpMask) == ExpMask); + uint8_t exponent_all_zeros = ((bits & ExpMask) == 0); + uint8_t mantissa_all_zeros = ((bits & MantMask) == 0); + uint8_t zero = exponent_all_zeros & mantissa_all_zeros; + uint8_t signaling_bit = (bits >> 9) & 1; + + uint8_t result = 0; + uint8_t snan = exponent_all_ones & (!mantissa_all_zeros) & (!signaling_bit); + uint8_t qnan = exponent_all_ones & (!mantissa_all_zeros) & signaling_bit; + uint8_t positive_zero = (!negative) & zero; + uint8_t negative_zero = negative & zero; + uint8_t positive_infinity = (!negative) & exponent_all_ones & 
mantissa_all_zeros; + uint8_t negative_infinity = negative & exponent_all_ones & mantissa_all_zeros; + uint8_t denormal = exponent_all_zeros & (!mantissa_all_zeros); + uint8_t finite_negative = negative & (!exponent_all_ones) & (!zero); + result = (((imm8 >> 0) & qnan) | \ + ((imm8 >> 1) & positive_zero) | \ + ((imm8 >> 2) & negative_zero) | \ + ((imm8 >> 3) & positive_infinity) | \ + ((imm8 >> 4) & negative_infinity) | \ + ((imm8 >> 5) & denormal) | \ + ((imm8 >> 6) & finite_negative) | \ + ((imm8 >> 7) & snan)); + return result; +} + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/simde-features.h b/lib/simd_wrapper/simde/simde-features.h index 88a1207ba18..7b622ead359 100644 --- a/lib/simd_wrapper/simde/simde-features.h +++ b/lib/simd_wrapper/simde/simde-features.h @@ -22,6 +22,7 @@ * * Copyright: * 2020 Evan Nemerson + * 2023 Ju-Hung Li (Copyright owned by NTHU pllab) */ /* simde-arch.h is used to determine which features are available according @@ -39,9 +40,6 @@ #define SIMDE_X86_SVML_NATIVE #endif #endif -#if defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif #if !defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && !defined(SIMDE_X86_AVX512VP2INTERSECT_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) #if defined(SIMDE_ARCH_X86_AVX512VP2INTERSECT) @@ -142,6 +140,15 @@ #define SIMDE_X86_AVX512F_NATIVE #endif +#if !defined(SIMDE_X86_AVX512FP16_NATIVE) && !defined(SIMDE_X86_AVX512FP16_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_X86_AVX512FP16) + #define SIMDE_X86_AVX512FP16_NATIVE + #endif +#endif +#if defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) + #define SIMDE_X86_AVX512F_NATIVE +#endif + #if !defined(SIMDE_X86_AVX512BF16_NATIVE) && !defined(SIMDE_X86_AVX512BF16_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) #if defined(SIMDE_ARCH_X86_AVX512BF16) #define SIMDE_X86_AVX512BF16_NATIVE @@ -183,7 +190,7 @@ #define SIMDE_X86_AVX_NATIVE #endif #endif -#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_SSE4_1_NATIVE) +#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_SSE4_2_NATIVE) #define SIMDE_X86_SSE4_2_NATIVE #endif @@ -232,6 +239,15 @@ #define SIMDE_X86_SSE2_NATIVE #endif +#if !defined(SIMDE_X86_AES_NATIVE) && !defined(SIMDE_X86_AES_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_X86_AES) + #define SIMDE_X86_AES_NATIVE + #endif +#endif +#if defined(SIMDE_X86_AES_NATIVE) && !defined(SIMDE_X86_SSE2_NATIVE) + #define SIMDE_X86_SSE2_NATIVE +#endif + #if !defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_X86_SSE2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) #if defined(SIMDE_ARCH_X86_SSE2) #define SIMDE_X86_SSE2_NATIVE @@ -278,7 +294,7 @@ #endif #if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(__INTEL_COMPILER) + #if defined(SIMDE_ARCH_X86) && (defined(__INTEL_COMPILER) || (HEDLEY_MSVC_VERSION_CHECK(14, 20, 0) && !defined(__clang__))) #define SIMDE_X86_SVML_NATIVE #endif #endif @@ -289,7 +305,7 @@ #endif #if \ - defined(SIMDE_X86_AVX_NATIVE) || defined(SIMDE_X86_GFNI_NATIVE) + defined(SIMDE_X86_AVX_NATIVE) || defined(SIMDE_X86_GFNI_NATIVE) || defined(SIMDE_X86_SVML_NATIVE) #include #elif defined(SIMDE_X86_SSE4_2_NATIVE) #include @@ -315,6 +331,10 @@ #endif #endif +#if defined(SIMDE_X86_AES_NATIVE) + #include +#endif + #if defined(HEDLEY_MSVC_VERSION) #pragma warning(pop) #endif @@ -333,6 +353,9 @@ #define SIMDE_ARM_NEON_A32V8_NATIVE #endif #endif +#if defined(__ARM_ACLE) + #include +#endif 
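/* A minimal consumer-side sketch of how the *_NATIVE macros set up in this
 * header are typically used: take an intrinsic path only when the matching
 * feature macro is defined, otherwise fall back to scalar C. The helper name
 * is hypothetical and not part of SIMDe; only the SIMDE_* macros come from
 * the header above. */
#include <stddef.h>
#if defined(SIMDE_X86_SSE2_NATIVE)
  #include <emmintrin.h>
#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
  #include <arm_neon.h>
#endif

static void add4_f32_sketch(const float a[4], const float b[4], float out[4]) {
  #if defined(SIMDE_X86_SSE2_NATIVE)
    /* SSE2 build: one 128-bit add covers all four lanes. */
    _mm_storeu_ps(out, _mm_add_ps(_mm_loadu_ps(a), _mm_loadu_ps(b)));
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    /* NEON build: same operation via the Arm intrinsics. */
    vst1q_f32(out, vaddq_f32(vld1q_f32(a), vld1q_f32(b)));
  #else
    /* Portable fallback when no native SIMD feature macro is defined. */
    for (size_t i = 0 ; i < 4 ; i++) out[i] = a[i] + b[i];
  #endif
}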
#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) #define SIMDE_ARM_NEON_A32V7_NATIVE #endif @@ -356,12 +379,27 @@ #endif #endif +#if !defined(SIMDE_RISCV_V_NATIVE) && !defined(SIMDE_RISCV_V_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_RISCV_V) + #define SIMDE_RISCV_V_NATIVE + #endif +#endif +#if defined(SIMDE_RISCV_V_NATIVE) + #include +#endif + #if !defined(SIMDE_WASM_SIMD128_NATIVE) && !defined(SIMDE_WASM_SIMD128_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) #if defined(SIMDE_ARCH_WASM_SIMD128) #define SIMDE_WASM_SIMD128_NATIVE #endif #endif -#if defined(SIMDE_WASM_SIMD128_NATIVE) + +#if !defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) && !defined(SIMDE_WASM_RELAXED_SIMD_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_WASM_RELAXED_SIMD) + #define SIMDE_WASM_RELAXED_SIMD_NATIVE + #endif +#endif +#if defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) #include #endif @@ -493,30 +531,70 @@ /* This is used to determine whether or not to fall back on a vector * function in an earlier ISA extensions, as well as whether * we expected any attempts at vectorization to be fruitful or if we - * expect to always be running serial code. */ + * expect to always be running serial code. + * + * Note that, for some architectures (okay, *one* architecture) there + * can be a split where some types are supported for one vector length + * but others only for a shorter length. Therefore, it is possible to + * provide separate values for float/int/double types. */ #if !defined(SIMDE_NATURAL_VECTOR_SIZE) #if defined(SIMDE_X86_AVX512F_NATIVE) #define SIMDE_NATURAL_VECTOR_SIZE (512) - #elif defined(SIMDE_X86_AVX_NATIVE) + #elif defined(SIMDE_X86_AVX2_NATIVE) #define SIMDE_NATURAL_VECTOR_SIZE (256) + #elif defined(SIMDE_X86_AVX_NATIVE) + #define SIMDE_NATURAL_FLOAT_VECTOR_SIZE (256) + #define SIMDE_NATURAL_INT_VECTOR_SIZE (128) + #define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE (128) #elif \ - defined(SIMDE_X86_SSE_NATIVE) || \ + defined(SIMDE_X86_SSE2_NATIVE) || \ defined(SIMDE_ARM_NEON_A32V7_NATIVE) || \ defined(SIMDE_WASM_SIMD128_NATIVE) || \ defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) || \ defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || \ defined(SIMDE_MIPS_MSA_NATIVE) #define SIMDE_NATURAL_VECTOR_SIZE (128) + #elif defined(SIMDE_X86_SSE_NATIVE) + #define SIMDE_NATURAL_FLOAT_VECTOR_SIZE (128) + #define SIMDE_NATURAL_INT_VECTOR_SIZE (64) + #define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE (0) + #elif defined(SIMDE_RISCV_V_NATIVE) && defined(__riscv_v_fixed_vlen) + //FIXME : SIMDE_NATURAL_VECTOR_SIZE == __riscv_v_fixed_vlen + #define SIMDE_NATURAL_VECTOR_SIZE (128) #endif #if !defined(SIMDE_NATURAL_VECTOR_SIZE) - #define SIMDE_NATURAL_VECTOR_SIZE (0) + #if defined(SIMDE_NATURAL_FLOAT_VECTOR_SIZE) + #define SIMDE_NATURAL_VECTOR_SIZE SIMDE_NATURAL_FLOAT_VECTOR_SIZE + #elif defined(SIMDE_NATURAL_INT_VECTOR_SIZE) + #define SIMDE_NATURAL_VECTOR_SIZE SIMDE_NATURAL_INT_VECTOR_SIZE + #elif defined(SIMDE_NATURAL_DOUBLE_VECTOR_SIZE) + #define SIMDE_NATURAL_VECTOR_SIZE SIMDE_NATURAL_DOUBLE_VECTOR_SIZE + #else + #define SIMDE_NATURAL_VECTOR_SIZE (0) + #endif + #endif + + #if !defined(SIMDE_NATURAL_FLOAT_VECTOR_SIZE) + #define SIMDE_NATURAL_FLOAT_VECTOR_SIZE SIMDE_NATURAL_VECTOR_SIZE + #endif + #if !defined(SIMDE_NATURAL_INT_VECTOR_SIZE) + #define SIMDE_NATURAL_INT_VECTOR_SIZE SIMDE_NATURAL_VECTOR_SIZE + #endif + #if !defined(SIMDE_NATURAL_DOUBLE_VECTOR_SIZE) + #define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE SIMDE_NATURAL_VECTOR_SIZE #endif #endif #define 
SIMDE_NATURAL_VECTOR_SIZE_LE(x) ((SIMDE_NATURAL_VECTOR_SIZE > 0) && (SIMDE_NATURAL_VECTOR_SIZE <= (x))) #define SIMDE_NATURAL_VECTOR_SIZE_GE(x) ((SIMDE_NATURAL_VECTOR_SIZE > 0) && (SIMDE_NATURAL_VECTOR_SIZE >= (x))) +#define SIMDE_NATURAL_FLOAT_VECTOR_SIZE_LE(x) ((SIMDE_NATURAL_FLOAT_VECTOR_SIZE > 0) && (SIMDE_NATURAL_FLOAT_VECTOR_SIZE <= (x))) +#define SIMDE_NATURAL_FLOAT_VECTOR_SIZE_GE(x) ((SIMDE_NATURAL_FLOAT_VECTOR_SIZE > 0) && (SIMDE_NATURAL_FLOAT_VECTOR_SIZE >= (x))) +#define SIMDE_NATURAL_INT_VECTOR_SIZE_LE(x) ((SIMDE_NATURAL_INT_VECTOR_SIZE > 0) && (SIMDE_NATURAL_INT_VECTOR_SIZE <= (x))) +#define SIMDE_NATURAL_INT_VECTOR_SIZE_GE(x) ((SIMDE_NATURAL_INT_VECTOR_SIZE > 0) && (SIMDE_NATURAL_INT_VECTOR_SIZE >= (x))) +#define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE_LE(x) ((SIMDE_NATURAL_DOUBLE_VECTOR_SIZE > 0) && (SIMDE_NATURAL_DOUBLE_VECTOR_SIZE <= (x))) +#define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE_GE(x) ((SIMDE_NATURAL_DOUBLE_VECTOR_SIZE > 0) && (SIMDE_NATURAL_DOUBLE_VECTOR_SIZE >= (x))) /* Native aliases */ #if defined(SIMDE_ENABLE_NATIVE_ALIASES) @@ -580,12 +658,18 @@ #if !defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) #define SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES #endif + #if !defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) + #define SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES + #endif #if !defined(SIMDE_X86_AVX512DQ_NATIVE) #define SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES #endif #if !defined(SIMDE_X86_AVX512CD_NATIVE) #define SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES #endif + #if !defined(SIMDE_X86_AVX512FP16_NATIVE) + #define SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES + #endif #if !defined(SIMDE_X86_GFNI_NATIVE) #define SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES #endif @@ -598,6 +682,12 @@ #if !defined(SIMDE_X86_F16C_NATIVE) #define SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES #endif + #if !defined(SIMDE_X86_AES_NATIVE) + #define SIMDE_X86_AES_ENABLE_NATIVE_ALIASES + #endif + #if !defined(SIMDE_X86_SVML_NATIVE) + #define SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES + #endif #if !defined(SIMDE_ARM_NEON_A32V7_NATIVE) #define SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES @@ -613,6 +703,14 @@ #define SIMDE_ARM_SVE_ENABLE_NATIVE_ALIASES #endif + #if !defined(SIMDE_RISCV_V_NATIVE) + #define SIMDE_RISCV_V_ENABLE_NATIVE_ALIASES + #endif + + #if !defined(SIMDE_MIPS_MSA_NATIVE) + #define SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES + #endif + #if !defined(SIMDE_WASM_SIMD128_NATIVE) #define SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES #endif @@ -645,4 +743,27 @@ #define SIMDE_ARM_NEON_FP16 #endif +#if defined(SIMDE_ARCH_ARM_NEON_BF16) + #define SIMDE_ARM_NEON_BF16 +#endif + +#if !defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_LOONGARCH_LASX_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_LOONGARCH_LASX) + #define SIMDE_LOONGARCH_LASX_NATIVE + #endif +#endif + +#if !defined(SIMDE_LOONGARCH_LSX_NATIVE) && !defined(SIMDE_LOONGARCH_LSX_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) + #if defined(SIMDE_ARCH_LOONGARCH_LSX) + #define SIMDE_LOONGARCH_LSX_NATIVE + #endif +#endif + +#if defined(SIMDE_LOONGARCH_LASX_NATIVE) + #include +#endif +#if defined(SIMDE_LOONGARCH_LSX_NATIVE) + #include +#endif + #endif /* !defined(SIMDE_FEATURES_H) */ diff --git a/lib/simd_wrapper/simde/simde-math.h b/lib/simd_wrapper/simde/simde-math.h index 7e15a1c0437..d18bf4f902e 100644 --- a/lib/simd_wrapper/simde/simde-math.h +++ b/lib/simd_wrapper/simde/simde-math.h @@ -22,6 +22,7 @@ * * Copyright: * 2017-2020 Evan Nemerson + * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) */ /* Attempt to find math functions. 
Functions may be in , @@ -434,6 +435,91 @@ simde_math_fpclassify(double v) { #endif } +#define SIMDE_MATH_FP_QNAN 0x01 +#define SIMDE_MATH_FP_PZERO 0x02 +#define SIMDE_MATH_FP_NZERO 0x04 +#define SIMDE_MATH_FP_PINF 0x08 +#define SIMDE_MATH_FP_NINF 0x10 +#define SIMDE_MATH_FP_DENORMAL 0x20 +#define SIMDE_MATH_FP_NEGATIVE 0x40 +#define SIMDE_MATH_FP_SNAN 0x80 + +static HEDLEY_INLINE +uint8_t +simde_math_fpclassf(float v, const int imm8) { + union { + float f; + uint32_t u; + } fu; + fu.f = v; + uint32_t bits = fu.u; + uint8_t NegNum = (bits >> 31) & 1; + uint32_t const ExpMask = 0x3F800000; // [30:23] + uint32_t const MantMask = 0x007FFFFF; // [22:0] + uint8_t ExpAllOnes = ((bits & ExpMask) == ExpMask); + uint8_t ExpAllZeros = ((bits & ExpMask) == 0); + uint8_t MantAllZeros = ((bits & MantMask) == 0); + uint8_t ZeroNumber = ExpAllZeros & MantAllZeros; + uint8_t SignalingBit = (bits >> 22) & 1; + + uint8_t result = 0; + uint8_t qNaN_res = ExpAllOnes & (!MantAllZeros) & SignalingBit; + uint8_t Pzero_res = (!NegNum) & ExpAllZeros & MantAllZeros; + uint8_t Nzero_res = NegNum & ExpAllZeros & MantAllZeros; + uint8_t Pinf_res = (!NegNum) & ExpAllOnes & MantAllZeros; + uint8_t Ninf_res = NegNum & ExpAllOnes & MantAllZeros; + uint8_t Denorm_res = ExpAllZeros & (!MantAllZeros); + uint8_t FinNeg_res = NegNum & (!ExpAllOnes) & (!ZeroNumber); + uint8_t sNaN_res = ExpAllOnes & (!MantAllZeros) & (!SignalingBit); + result = (((imm8 >> 0) & qNaN_res) | \ + ((imm8 >> 1) & Pzero_res) | \ + ((imm8 >> 2) & Nzero_res) | \ + ((imm8 >> 3) & Pinf_res) | \ + ((imm8 >> 4) & Ninf_res) | \ + ((imm8 >> 5) & Denorm_res) | \ + ((imm8 >> 6) & FinNeg_res) | \ + ((imm8 >> 7) & sNaN_res)); + return result; +} + +static HEDLEY_INLINE +uint8_t +simde_math_fpclass(double v, const int imm8) { + union { + double d; + uint64_t u; + } du; + du.d = v; + uint64_t bits = du.u; + uint8_t NegNum = (bits >> 63) & 1; + uint64_t const ExpMask = 0x3FF0000000000000; // [62:52] + uint64_t const MantMask = 0x000FFFFFFFFFFFFF; // [51:0] + uint8_t ExpAllOnes = ((bits & ExpMask) == ExpMask); + uint8_t ExpAllZeros = ((bits & ExpMask) == 0); + uint8_t MantAllZeros = ((bits & MantMask) == 0); + uint8_t ZeroNumber = ExpAllZeros & MantAllZeros; + uint8_t SignalingBit = (bits >> 51) & 1; + + uint8_t result = 0; + uint8_t qNaN_res = ExpAllOnes & (!MantAllZeros) & SignalingBit; + uint8_t Pzero_res = (!NegNum) & ExpAllZeros & MantAllZeros; + uint8_t Nzero_res = NegNum & ExpAllZeros & MantAllZeros; + uint8_t Pinf_res = (!NegNum) & ExpAllOnes & MantAllZeros; + uint8_t Ninf_res = NegNum & ExpAllOnes & MantAllZeros; + uint8_t Denorm_res = ExpAllZeros & (!MantAllZeros); + uint8_t FinNeg_res = NegNum & (!ExpAllOnes) & (!ZeroNumber); + uint8_t sNaN_res = ExpAllOnes & (!MantAllZeros) & (!SignalingBit); + result = (((imm8 >> 0) & qNaN_res) | \ + ((imm8 >> 1) & Pzero_res) | \ + ((imm8 >> 2) & Nzero_res) | \ + ((imm8 >> 3) & Pinf_res) | \ + ((imm8 >> 4) & Ninf_res) | \ + ((imm8 >> 5) & Denorm_res) | \ + ((imm8 >> 6) & FinNeg_res) | \ + ((imm8 >> 7) & sNaN_res)); + return result; +} + /*** Manipulation functions ***/ #if !defined(simde_math_nextafter) @@ -706,6 +792,20 @@ simde_math_fpclassify(double v) { #endif #endif +#if !defined(simde_math_signbit) + #if SIMDE_MATH_BUILTIN_LIBM(signbit) + #if (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) + #define simde_math_signbit(x) __builtin_signbit(x) + #else + #define simde_math_signbit(x) __builtin_signbit(HEDLEY_STATIC_CAST(double, (x))) + #endif + #elif defined(SIMDE_MATH_HAVE_CMATH) + #define 
simde_math_signbit(x) std::signbit(x) + #elif defined(SIMDE_MATH_HAVE_MATH_H) + #define simde_math_signbit(x) signbit(x) + #endif +#endif + #if !defined(simde_math_cos) #if SIMDE_MATH_BUILTIN_LIBM(cos) #define simde_math_cos(v) __builtin_cos(v) @@ -1166,7 +1266,7 @@ simde_math_fpclassify(double v) { #if !defined(simde_math_roundeven) #if \ - HEDLEY_HAS_BUILTIN(__builtin_roundeven) || \ + ((!defined(HEDLEY_EMSCRIPTEN_VERSION) || HEDLEY_EMSCRIPTEN_VERSION_CHECK(3, 1, 43)) && HEDLEY_HAS_BUILTIN(__builtin_roundeven)) || \ HEDLEY_GCC_VERSION_CHECK(10,0,0) #define simde_math_roundeven(v) __builtin_roundeven(v) #elif defined(simde_math_round) && defined(simde_math_fabs) @@ -1186,7 +1286,7 @@ simde_math_fpclassify(double v) { #if !defined(simde_math_roundevenf) #if \ - HEDLEY_HAS_BUILTIN(__builtin_roundevenf) || \ + ((!defined(HEDLEY_EMSCRIPTEN_VERSION) || HEDLEY_EMSCRIPTEN_VERSION_CHECK(3, 1, 43)) && HEDLEY_HAS_BUILTIN(__builtin_roundevenf)) || \ HEDLEY_GCC_VERSION_CHECK(10,0,0) #define simde_math_roundevenf(v) __builtin_roundevenf(v) #elif defined(simde_math_roundf) && defined(simde_math_fabsf) @@ -1264,6 +1364,16 @@ simde_math_fpclassify(double v) { #endif #endif +#if !defined(simde_math_sqrtl) + #if SIMDE_MATH_BUILTIN_LIBM(sqrtl) + #define simde_math_sqrtl(v) __builtin_sqrtl(v) + #elif defined(SIMDE_MATH_HAVE_CMATH) + #define simde_math_sqrtl(v) std::sqrt(v) + #elif defined(SIMDE_MATH_HAVE_MATH_H) + #define simde_math_sqrtl(v) sqrtl(v) + #endif +#endif + #if !defined(simde_math_tan) #if SIMDE_MATH_BUILTIN_LIBM(tan) #define simde_math_tan(v) __builtin_tan(v) @@ -1399,15 +1509,12 @@ simde_math_fpclassify(double v) { #define simde_math_cdfnormf simde_math_cdfnormf #endif -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ - #if !defined(simde_math_cdfnorminv) && defined(simde_math_log) && defined(simde_math_sqrt) /*https://web.archive.org/web/20150910081113/http://home.online.no/~pjacklam/notes/invnorm/impl/sprouse/ltqnorm.c*/ static HEDLEY_INLINE double simde_math_cdfnorminv(double p) { - static const double a[] = { + static const double a[6] = { -3.969683028665376e+01, 2.209460984245205e+02, -2.759285104469687e+02, @@ -1416,7 +1523,7 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ 2.506628277459239e+00 }; - static const double b[] = { + static const double b[5] = { -5.447609879822406e+01, 1.615858368580409e+02, -1.556989798598866e+02, @@ -1424,7 +1531,7 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ -1.328068155288572e+01 }; - static const double c[] = { + static const double c[6] = { -7.784894002430293e-03, -3.223964580411365e-01, -2.400758277161838e+00, @@ -1433,7 +1540,7 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ 2.938163982698783e+00 }; - static const double d[] = { + static const double d[4] = { 7.784695709041462e-03, 3.224671290700398e-01, 2.445134137142996e+00, @@ -1474,7 +1581,7 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ static HEDLEY_INLINE float simde_math_cdfnorminvf(float p) { - static const float a[] = { + static const float a[6] = { -3.969683028665376e+01f, 2.209460984245205e+02f, -2.759285104469687e+02f, @@ -1482,14 +1589,14 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ -3.066479806614716e+01f, 2.506628277459239e+00f }; - static const float b[] = { + static const float b[5] = { -5.447609879822406e+01f, 1.615858368580409e+02f, -1.556989798598866e+02f, 6.680131188771972e+01f, -1.328068155288572e+01f }; - static const float c[] = { + static const float c[6] = { -7.784894002430293e-03f, -3.223964580411365e-01f, -2.400758277161838e+00f, @@ -1497,7 +1604,7 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ 
4.374664141464968e+00f, 2.938163982698783e+00f }; - static const float d[] = { + static const float d[4] = { 7.784695709041462e-03f, 3.224671290700398e-01f, 2.445134137142996e+00f, @@ -1584,7 +1691,7 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ if(x >= 0.0625 && x < 2.0) { return simde_math_erfinv(1.0 - x); } else if (x < 0.0625 && x >= 1.0e-100) { - double p[6] = { + static const double p[6] = { 0.1550470003116, 1.382719649631, 0.690969348887, @@ -1592,7 +1699,7 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ 0.680544246825, -0.16444156791 }; - double q[3] = { + static const double q[3] = { 0.155024849822, 1.385228141995, 1.000000000000 @@ -1602,13 +1709,13 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ return (p[0] / t + p[1] + t * (p[2] + t * (p[3] + t * (p[4] + t * p[5])))) / (q[0] + t * (q[1] + t * (q[2]))); } else if (x < 1.0e-100 && x >= SIMDE_MATH_DBL_MIN) { - double p[4] = { + static const double p[4] = { 0.00980456202915, 0.363667889171, 0.97302949837, -0.5374947401 }; - double q[3] = { + static const double q[3] = { 0.00980451277802, 0.363699971544, 1.000000000000 @@ -1675,8 +1782,6 @@ SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ #define simde_math_erfcinvf simde_math_erfcinvf #endif -HEDLEY_DIAGNOSTIC_POP - static HEDLEY_INLINE double simde_math_rad2deg(double radians) { diff --git a/lib/simd_wrapper/simde/wasm/relaxed-simd.h b/lib/simd_wrapper/simde/wasm/relaxed-simd.h index 3bfcc902aba..b610eb08c6a 100644 --- a/lib/simd_wrapper/simde/wasm/relaxed-simd.h +++ b/lib/simd_wrapper/simde/wasm/relaxed-simd.h @@ -37,8 +37,10 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_i8x16_swizzle_relaxed (simde_v128_t a, simde_v128_t b) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) +simde_wasm_i8x16_relaxed_swizzle(simde_v128_t a, simde_v128_t b) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_i8x16_relaxed_swizzle(a, b); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_i8x16_swizzle(a, b); #else simde_v128_private @@ -71,15 +73,17 @@ simde_wasm_i8x16_swizzle_relaxed (simde_v128_t a, simde_v128_t b) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_i8x16_swizzle_relaxed(a, b) simde_wasm_i8x16_swizzle_relaxed((a), (b)) + #define wasm_i8x16_relaxed_swizzle(a, b) simde_wasm_i8x16_relaxed_swizzle((a), (b)) #endif /* Conversions */ SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_i32x4_trunc_f32x4 (simde_v128_t a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) +simde_wasm_i32x4_relaxed_trunc_f32x4 (simde_v128_t a) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_i32x4_relaxed_trunc_f32x4(a); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_i32x4_trunc_sat_f32x4(a); #else simde_v128_private @@ -107,13 +111,15 @@ simde_wasm_i32x4_trunc_f32x4 (simde_v128_t a) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_i32x4_trunc_f32x4(a) simde_wasm_i32x4_trunc_f32x4((a)) + #define wasm_i32x4_relaxed_trunc_f32x4(a) simde_wasm_i32x4_relaxed_trunc_f32x4((a)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_u32x4_trunc_f32x4 (simde_v128_t a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) +simde_wasm_u32x4_relaxed_trunc_f32x4 (simde_v128_t a) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_u32x4_relaxed_trunc_f32x4(a); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_u32x4_trunc_sat_f32x4(a); #else simde_v128_private @@ -152,13 +158,15 @@ simde_wasm_u32x4_trunc_f32x4 (simde_v128_t a) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_u32x4_trunc_f32x4(a) 
simde_wasm_u32x4_trunc_f32x4((a)) + #define wasm_u32x4_relaxed_trunc_f32x4(a) simde_wasm_u32x4_relaxed_trunc_f32x4((a)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_i32x4_trunc_f64x2_zero (simde_v128_t a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) +simde_wasm_i32x4_relaxed_trunc_f64x2_zero (simde_v128_t a) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_i32x4_relaxed_trunc_f64x2_zero(a); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_i32x4_trunc_sat_f64x2_zero(a); #else simde_v128_private @@ -209,13 +217,15 @@ simde_wasm_i32x4_trunc_f64x2_zero (simde_v128_t a) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_i32x4_trunc_f64x2_zero(a) simde_wasm_i32x4_trunc_f64x2_zero((a)) + #define wasm_i32x4_relaxed_trunc_f64x2_zero(a) simde_wasm_i32x4_relaxed_trunc_f64x2_zero((a)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_u32x4_trunc_f64x2_zero (simde_v128_t a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) +simde_wasm_u32x4_relaxed_trunc_f64x2_zero (simde_v128_t a) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_u32x4_relaxed_trunc_f64x2_zero(a); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) return wasm_u32x4_trunc_sat_f64x2_zero(a); #else simde_v128_private @@ -254,14 +264,14 @@ simde_wasm_u32x4_trunc_f64x2_zero (simde_v128_t a) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_u32x4_trunc_f64x2_zero(a) simde_wasm_u32x4_trunc_f64x2_zero((a)) + #define wasm_u32x4_relaxed_trunc_f64x2_zero(a) simde_wasm_u32x4_relaxed_trunc_f64x2_zero((a)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_i8x16_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { +simde_wasm_i8x16_relaxed_laneselect(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_i8x16_blend(a, b, mask); + return wasm_i8x16_relaxed_laneselect(a, b, mask); #elif defined(SIMDE_X86_SSE4_1_NATIVE) simde_v128_private a_ = simde_v128_to_private(a), @@ -276,15 +286,15 @@ simde_wasm_i8x16_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { return simde_wasm_v128_bitselect(a, b, mask); #endif } -#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) - #define wasm_i8x16_blend(a, b, c) simde_wasm_i8x16_blend((a), (b), (c)) +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_i8x16_relaxed_laneselect(a, b, mask) simde_wasm_i8x16_relaxed_laneselect((a), (b), (mask)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_i16x8_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { +simde_wasm_i16x8_relaxed_laneselect(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_i16x8_blend(a, b, mask); + return wasm_i16x8_relaxed_laneselect(a, b, mask); #elif defined(SIMDE_X86_SSE4_1_NATIVE) simde_v128_private a_ = simde_v128_to_private(a), @@ -299,15 +309,15 @@ simde_wasm_i16x8_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { return simde_wasm_v128_bitselect(a, b, mask); #endif } -#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) - #define wasm_i16x8_blend(a, b, c) simde_wasm_i16x8_blend((a), (b), (c)) +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_i16x8_relaxed_laneselect(a, b, mask) simde_wasm_i16x8_relaxed_laneselect((a), (b), (mask)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_i32x4_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { +simde_wasm_i32x4_relaxed_laneselect(simde_v128_t a, simde_v128_t b, simde_v128_t 
mask) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_i32x4_blend(a, b, mask); + return wasm_i32x4_relaxed_laneselect(a, b, mask); #elif defined(SIMDE_X86_SSE4_1_NATIVE) simde_v128_private a_ = simde_v128_to_private(a), @@ -322,15 +332,15 @@ simde_wasm_i32x4_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { return simde_wasm_v128_bitselect(a, b, mask); #endif } -#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) - #define wasm_i32x4_blend(a, b, c) simde_wasm_i32x4_blend((a), (b), (c)) +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_i32x4_relaxed_laneselect(a, b, c) simde_wasm_i32x4_relaxed_laneselect((a), (b), (c)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_i64x2_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { +simde_wasm_i64x2_relaxed_laneselect(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_i64x2_blend(a, b, mask); + return wasm_i64x2_relaxed_laneselect(a, b, mask); #elif defined(SIMDE_X86_SSE4_1_NATIVE) simde_v128_private a_ = simde_v128_to_private(a), @@ -345,19 +355,19 @@ simde_wasm_i64x2_blend(simde_v128_t a, simde_v128_t b, simde_v128_t mask) { return simde_wasm_v128_bitselect(a, b, mask); #endif } -#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) - #define wasm_i64x2_blend(a, b, c) simde_wasm_i64x2_blend((a), (b), (c)) +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_i64x2_relaxed_laneselect(a, b, mask) simde_wasm_i64x2_relaxed_laneselect((a), (b), (mask)) #endif /* fma */ SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_f32x4_fma (simde_v128_t a, simde_v128_t b, simde_v128_t c) { +simde_wasm_f32x4_relaxed_madd (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_f32x4_fma(a, b, c); + return wasm_f32x4_relaxed_madd(a, b, c); #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_add(a, wasm_f32x4_mul(b, c)); + return wasm_f32x4_add(wasm_f32x4_mul(a, b), c); #else simde_v128_private a_ = simde_v128_to_private(a), @@ -366,19 +376,21 @@ simde_wasm_f32x4_fma (simde_v128_t a, simde_v128_t b, simde_v128_t c) { r_; #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = vec_madd(c_.altivec_f32, b_.altivec_f32, a_.altivec_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FMA) - r_.neon_f32 = vfmaq_f32(a_.neon_f32, c_.neon_f32, b_.neon_f32); + r_.altivec_f32 = vec_madd(a_.altivec_f32, b_.altivec_f32, c_.altivec_f32); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + r_.neon_f32 = vfmaq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vmlaq_f32(a_.neon_f32, b_.neon_f32, c_.neon_f32); + r_.neon_f32 = vmlaq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); #elif defined(SIMDE_X86_FMA_NATIVE) - r_.sse_m128 = _mm_fmadd_ps(c_.sse_m128, b_.sse_m128, a_.sse_m128); + r_.sse_m128 = _mm_fmadd_ps(a_.sse_m128, b_.sse_m128, c_.sse_m128); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + r_.msa_v4f32 = __msa_fmadd_w(c_.msa_v4f32, a_.msa_v4f32, b_.msa_v4f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT) - r_.f32 = a_.f32 + (b_.f32 * c_.f32); + r_.f32 = (a_.f32 * b_.f32) + c_.f32; #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_fmaf(c_.f32[i], b_.f32[i], a_.f32[i]); + r_.f32[i] = simde_math_fmaf(a_.f32[i], b_.f32[i], c_.f32[i]); } #endif @@ -386,16 +398,16 @@ simde_wasm_f32x4_fma (simde_v128_t a, simde_v128_t b, 
simde_v128_t c) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_f32x4_fma(a, b) simde_wasm_f32x4_fma((a), (b)) + #define wasm_f32x4_relaxed_madd(a, b, c) simde_wasm_f32x4_relaxed_madd((a), (b), (c)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_f64x2_fma (simde_v128_t a, simde_v128_t b, simde_v128_t c) { +simde_wasm_f64x2_relaxed_madd (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_f64x2_fma(a, b, c); + return wasm_f64x2_relaxed_madd(a, b, c); #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_add(a, wasm_f64x2_mul(b, c)); + return wasm_f64x2_add(wasm_f64x2_mul(a, b), c); #else simde_v128_private a_ = simde_v128_to_private(a), @@ -404,17 +416,19 @@ simde_wasm_f64x2_fma (simde_v128_t a, simde_v128_t b, simde_v128_t c) { r_; #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_madd(c_.altivec_f64, b_.altivec_f64, a_.altivec_f64); + r_.altivec_f64 = vec_madd(a_.altivec_f64, b_.altivec_f64, c_.altivec_f64); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vfmaq_f64(a_.neon_f64, c_.neon_f64, b_.neon_f64); + r_.neon_f64 = vfmaq_f64(c_.neon_f64, a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_X86_FMA_NATIVE) - r_.sse_m128d = _mm_fmadd_pd(c_.sse_m128d, b_.sse_m128d, a_.sse_m128d); + r_.sse_m128d = _mm_fmadd_pd(a_.sse_m128d, b_.sse_m128d, c_.sse_m128d); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + r_.msa_v2f64 = __msa_fmadd_d(c_.msa_v2f64, a_.msa_v2f64, b_.msa_v2f64); #elif defined(SIMDE_VECTOR_SUBSCRIPT) - r_.f64 = a_.f64 + (b_.f64 * c_.f64); + r_.f64 = (a_.f64 * b_.f64) + c_.f64; #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_fma(c_.f64[i], b_.f64[i], a_.f64[i]); + r_.f64[i] = simde_math_fma(a_.f64[i], b_.f64[i], c_.f64[i]); } #endif @@ -422,18 +436,18 @@ simde_wasm_f64x2_fma (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_f64x2_fma(a, b) simde_wasm_f64x2_fma((a), (b)) + #define wasm_f64x2_relaxed_madd(a, b, c) simde_wasm_f64x2_relaxed_madd((a), (b), (c)) #endif /* fms */ SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_f32x4_fms (simde_v128_t a, simde_v128_t b, simde_v128_t c) { +simde_wasm_f32x4_relaxed_nmadd (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_f32x4_fms(a, b, c); + return wasm_f32x4_relaxed_nmadd(a, b, c); #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_sub(a, wasm_f32x4_mul(b, c)); + return wasm_f32x4_sub(c, wasm_f32x4_mul(a, b)); #else simde_v128_private a_ = simde_v128_to_private(a), @@ -442,19 +456,21 @@ simde_wasm_f32x4_fms (simde_v128_t a, simde_v128_t b, simde_v128_t c) { r_; #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = vec_nmsub(c_.altivec_f32, b_.altivec_f32, a_.altivec_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FMA) - r_.neon_f32 = vfmsq_f32(a_.neon_f32, c_.neon_f32, b_.neon_f32); + r_.altivec_f32 = vec_nmsub(a_.altivec_f32, b_.altivec_f32, c_.altivec_f32); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) + r_.neon_f32 = vfmsq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vmlsq_f32(a_.neon_f32, b_.neon_f32, c_.neon_f32); + r_.neon_f32 = vmlsq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); #elif defined(SIMDE_X86_FMA_NATIVE) - r_.sse_m128 = 
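/* Editor's note, illustration only: wasm_f32x4_fma/f64x2_fma become
 * relaxed_madd, and the emulation is corrected from a + (b * c) to
 * (a * b) + c, which is how relaxed_madd(a, b, c) is specified; the FMA
 * intrinsic operands (vfmaq_f32, _mm_fmadd_ps, vec_madd, __msa_fmadd_w) are
 * reordered to match. Scalar reference for one lane -- either a fused or an
 * unfused result is allowed by the relaxed semantics: */
#include <math.h>
#include <stdio.h>

static float relaxed_madd_lane(float a, float b, float c) {
  return fmaf(a, b, c);      /* fused (a * b) + c            */
  /* return (a * b) + c; */  /* unfused form is also allowed */
}

int main(void) {
  printf("%g\n", relaxed_madd_lane(2.0f, 3.0f, 1.0f)); /* 7 */
  return 0;
}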
_mm_fnmadd_ps(c_.sse_m128, b_.sse_m128, a_.sse_m128); + r_.sse_m128 = _mm_fnmadd_ps(a_.sse_m128, b_.sse_m128, c_.sse_m128); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + r_.msa_v4f32 = __msa_fmsub_w(c_.msa_v4f32, a_.msa_v4f32, b_.msa_v4f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT) - r_.f32 = a_.f32 - (b_.f32 * c_.f32); + r_.f32 = c_.f32 - (a_.f32 * b_.f32); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] - (b_.f32[i] * c_.f32[i]); + r_.f32[i] = c_.f32[i] - (a_.f32[i] * b_.f32[i]); } #endif @@ -462,16 +478,16 @@ simde_wasm_f32x4_fms (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_f32x4_fms(a, b) simde_wasm_f32x4_fms((a), (b)) + #define wasm_f32x4_relaxed_nmadd(a, b, c) simde_wasm_f32x4_relaxed_nmadd((a), (b), (c)) #endif SIMDE_FUNCTION_ATTRIBUTES simde_v128_t -simde_wasm_f64x2_fms (simde_v128_t a, simde_v128_t b, simde_v128_t c) { +simde_wasm_f64x2_relaxed_nmadd (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - return wasm_f64x2_fms(a, b, c); + return wasm_f64x2_relaxed_nmadd(a, b, c); #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_sub(a, wasm_f64x2_mul(b, c)); + return wasm_f64x2_sub(c, wasm_f64x2_mul(a, b)); #else simde_v128_private a_ = simde_v128_to_private(a), @@ -480,17 +496,19 @@ simde_wasm_f64x2_fms (simde_v128_t a, simde_v128_t b, simde_v128_t c) { r_; #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f64 = vec_nmsub(c_.altivec_f64, b_.altivec_f64, a_.altivec_f64); + r_.altivec_f64 = vec_nmsub(a_.altivec_f64, b_.altivec_f64, c_.altivec_f64); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vfmsq_f64(a_.neon_f64, c_.neon_f64, b_.neon_f64); + r_.neon_f64 = vfmsq_f64(c_.neon_f64, a_.neon_f64, b_.neon_f64); #elif defined(SIMDE_X86_FMA_NATIVE) - r_.sse_m128d = _mm_fnmadd_pd(c_.sse_m128d, b_.sse_m128d, a_.sse_m128d); + r_.sse_m128d = _mm_fnmadd_pd(a_.sse_m128d, b_.sse_m128d, c_.sse_m128d); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + r_.msa_v2f64 = __msa_fmsub_d(c_.msa_v2f64, a_.msa_v2f64, b_.msa_v2f64); #elif defined(SIMDE_VECTOR_SUBSCRIPT) - r_.f64 = a_.f64 - (b_.f64 * c_.f64); + r_.f64 = c_.f64 - (a_.f64 * b_.f64); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] - (b_.f64[i] * c_.f64[i]); + r_.f64[i] = c_.f64[i] - (a_.f64[i] * b_.f64[i]); } #endif @@ -498,7 +516,89 @@ simde_wasm_f64x2_fms (simde_v128_t a, simde_v128_t b, simde_v128_t c) { #endif } #if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) - #define wasm_f64x2_fms(a, b) simde_wasm_f64x2_fms((a), (b)) + #define wasm_f64x2_relaxed_nmadd(a, b, c) simde_wasm_f64x2_relaxed_nmadd((a), (b), (c)) +#endif + +/* min/max */ + +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_f32x4_relaxed_min (simde_v128_t a, simde_v128_t b) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_f32x4_relaxed_min(a, b); + #elif defined(SIMDE_X86_SSE_NATIVE) + return simde_v128_from_m128(_mm_min_ps(simde_v128_to_m128(a), simde_v128_to_m128(b))); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return simde_v128_from_neon_f32(vminq_f32(simde_v128_to_neon_f32(a), simde_v128_to_neon_f32(b))); + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) + return simde_v128_from_altivec_f32(vec_min(simde_v128_to_altivec_f32(a), simde_v128_to_altivec_f32(b))); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + return simde_v128_from_msa_v4f32(__msa_fmin_w(simde_v128_to_msa_v4f32(a), 
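/* Editor's note, illustration only: fms becomes relaxed_nmadd, i.e. the
 * negated multiply-add c - (a * b), so the fallback is rewritten from
 * a - (b * c) and the vfmsq/_mm_fnmadd/vec_nmsub operand order follows suit.
 * Scalar reference for one lane: */
#include <math.h>
#include <stdio.h>

static float relaxed_nmadd_lane(float a, float b, float c) {
  return fmaf(-a, b, c);     /* fused  -(a * b) + c          */
  /* return c - (a * b); */  /* unfused form is also allowed */
}

int main(void) {
  printf("%g\n", relaxed_nmadd_lane(2.0f, 3.0f, 10.0f)); /* 4 */
  return 0;
}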
simde_v128_to_msa_v4f32(b))); + #else + return simde_wasm_f32x4_min(a, b); + #endif +} +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_f32x4_relaxed_min(a, b) simde_wasm_f32x4_relaxed_min((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_f32x4_relaxed_max (simde_v128_t a, simde_v128_t b) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_f32x4_relaxed_max(a, b); + #elif defined(SIMDE_X86_SSE_NATIVE) + return simde_v128_from_m128(_mm_max_ps(simde_v128_to_m128(a), simde_v128_to_m128(b))); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return simde_v128_from_neon_f32(vmaxq_f32(simde_v128_to_neon_f32(a), simde_v128_to_neon_f32(b))); + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) + return simde_v128_from_altivec_f32(vec_max(simde_v128_to_altivec_f32(a), simde_v128_to_altivec_f32(b))); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + return simde_v128_from_msa_v4f32(__msa_fmax_w(simde_v128_to_msa_v4f32(a), simde_v128_to_msa_v4f32(b))); + #else + return simde_wasm_f32x4_max(a, b); + #endif +} +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_f32x4_relaxed_max(a, b) simde_wasm_f32x4_relaxed_max((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_f64x2_relaxed_min (simde_v128_t a, simde_v128_t b) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_f64x2_relaxed_min(a, b); + #elif defined(SIMDE_X86_SSE2_NATIVE) + return simde_v128_from_m128d(_mm_min_pd(simde_v128_to_m128d(a), simde_v128_to_m128d(b))); + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return simde_v128_from_neon_f64(vminq_f64(simde_v128_to_neon_f64(a), simde_v128_to_neon_f64(b))); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + return simde_v128_from_msa_v2f64(__msa_fmin_d(simde_v128_to_msa_v2f64(a), simde_v128_to_msa_v2f64(b))); + #else + return simde_wasm_f64x2_min(a, b); + #endif +} +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_f64x2_relaxed_min(a, b) simde_wasm_f64x2_relaxed_min((a), (b)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_f64x2_relaxed_max (simde_v128_t a, simde_v128_t b) { + #if defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) + return wasm_f64x2_relaxed_max(a, b); + #elif defined(SIMDE_X86_SSE2_NATIVE) + return simde_v128_from_m128d(_mm_max_pd(simde_v128_to_m128d(a), simde_v128_to_m128d(b))); + #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) + return simde_v128_from_neon_f64(vmaxq_f64(simde_v128_to_neon_f64(a), simde_v128_to_neon_f64(b))); + #elif defined(SIMDE_MIPS_MSA_NATIVE) + return simde_v128_from_msa_v2f64(__msa_fmax_d(simde_v128_to_msa_v2f64(a), simde_v128_to_msa_v2f64(b))); + #else + return simde_wasm_f64x2_max(a, b); + #endif +} +#if defined(SIMDE_WASM_RELAXED_SIMD_ENABLE_NATIVE_ALIASES) + #define wasm_f64x2_relaxed_max(a, b) simde_wasm_f64x2_relaxed_max((a), (b)) #endif SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/wasm/simd128.h b/lib/simd_wrapper/simde/wasm/simd128.h index 0433fc07188..ca0ca61cf09 100644 --- a/lib/simd_wrapper/simde/wasm/simd128.h +++ b/lib/simd_wrapper/simde/wasm/simd128.h @@ -22,6 +22,7 @@ * * Copyright: * 2021 Evan Nemerson + * 2023 Michael R. 
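/* Editor's note, illustration only: the new relaxed_min/relaxed_max helpers
 * may return either operand when an input is NaN or when the inputs are zeros
 * of opposite sign, so forwarding to the platform's native min/max
 * (_mm_min_ps, vminq_f32, vec_min, __msa_fmin_w) is a valid implementation,
 * unlike the strict wasm_f32x4_min used as the final fallback. A scalar
 * sketch of where the two are allowed to differ: */
#include <math.h>
#include <stdio.h>

/* Strict wasm f32x4.min lane: NaN-propagating, and -0 orders below +0. */
static float strict_min_lane(float x, float y) {
  if (isnan(x) || isnan(y)) return NAN;
  if (x == 0.0f && y == 0.0f) return signbit(x) ? x : y;
  return x < y ? x : y;
}

/* One result the relaxed form may give (this is what _mm_min_ps(x, y) does). */
static float relaxed_min_lane(float x, float y) {
  return x < y ? x : y;  /* hands back y when x is NaN, or when both are zeros */
}

int main(void) {
  printf("%g %g\n", strict_min_lane(NAN, 1.0f), relaxed_min_lane(NAN, 1.0f)); /* nan 1 */
  return 0;
}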
Crusoe */ #if !defined(SIMDE_WASM_SIMD128_H) @@ -91,6 +92,17 @@ typedef union { #endif #elif defined(SIMDE_WASM_SIMD128_NATIVE) SIMDE_ALIGN_TO_16 v128_t wasm_v128; + #elif defined(SIMDE_MIPS_MSA_NATIVE) + SIMDE_ALIGN_TO_16 v16i8 msa_v16i8; + SIMDE_ALIGN_TO_16 v8i16 msa_v8i16; + SIMDE_ALIGN_TO_16 v4i32 msa_v4i32; + SIMDE_ALIGN_TO_16 v2i64 msa_v2i64; + SIMDE_ALIGN_TO_16 v16u8 msa_v16u8; + SIMDE_ALIGN_TO_16 v8u16 msa_v8u16; + SIMDE_ALIGN_TO_16 v4u32 msa_v4u32; + SIMDE_ALIGN_TO_16 v2u64 msa_v2u64; + SIMDE_ALIGN_TO_16 v4f32 msa_v4f32; + SIMDE_ALIGN_TO_16 v2f64 msa_v2f64; #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8; SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16; @@ -110,13 +122,15 @@ typedef union { #if defined(SIMDE_WASM_SIMD128_NATIVE) typedef v128_t simde_v128_t; #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - typedef int32x4_t simde_v128_t; + typedef int32x4_t simde_v128_t; #elif defined(SIMDE_X86_SSE2_NATIVE) - typedef __m128i simde_v128_t; + typedef __m128i simde_v128_t; #elif defined(SIMDE_X86_SSE_NATIVE) - typedef __m128 simde_v128_t; + typedef __m128 simde_v128_t; #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - typedef SIMDE_POWER_ALTIVEC_VECTOR(signed int) simde_v128_t; + typedef SIMDE_POWER_ALTIVEC_VECTOR(signed int) simde_v128_t; +#elif defined(SIMDE_MIPS_MSA_NATIVE) + typedef v4i32 simde_v128_t; #elif defined(SIMDE_VECTOR_SUBSCRIPT) typedef int32_t simde_v128_t SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; #else @@ -151,8 +165,34 @@ HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde_v128_private) == 16, "simde_v128_priva SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(simde_v128_private, simde_v128_t, simde_v128_to_private, simde_v128_from_private) -#if defined(SIMDE_X86_SSE2_NATIVE) +#define SIMDE_WASM_SIMD128_FMIN(x, y) \ + (simde_math_isnan(x) ? SIMDE_MATH_NAN \ + : simde_math_isnan(y) ? SIMDE_MATH_NAN \ + : (((x) == 0) && ((y) == 0)) ? (simde_math_signbit(x) ? (x) : (y)) \ + : ((x) < (y) ? (x) : (y))) + +#define SIMDE_WASM_SIMD128_FMAX(x, y) \ + (simde_math_isnan(x) ? SIMDE_MATH_NAN \ + : simde_math_isnan(y) ? SIMDE_MATH_NAN \ + : (((x) == 0) && ((y) == 0)) ? (simde_math_signbit(x) ? (y) : (x)) \ + : ((x) > (y) ? (x) : (y))) + +#define SIMDE_WASM_SIMD128_FMINF(x, y) \ + (simde_math_isnanf(x) ? SIMDE_MATH_NANF \ + : simde_math_isnanf(y) ? SIMDE_MATH_NANF \ + : (((x) == 0) && ((y) == 0)) ? (simde_math_signbit(x) ? (x) : (y)) \ + : ((x) < (y) ? (x) : (y))) + +#define SIMDE_WASM_SIMD128_FMAXF(x, y) \ + (simde_math_isnanf(x) ? SIMDE_MATH_NANF \ + : simde_math_isnanf(y) ? SIMDE_MATH_NANF \ + : (((x) == 0) && ((y) == 0)) ? (simde_math_signbit(x) ? (y) : (x)) \ + : ((x) > (y) ? 
(x) : (y))) + +#if defined(SIMDE_X86_SSE_NATIVE) SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(__m128 , simde_v128_t, simde_v128_to_m128 , simde_v128_from_m128 ) +#endif +#if defined(SIMDE_X86_SSE2_NATIVE) SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(__m128i, simde_v128_t, simde_v128_to_m128i, simde_v128_from_m128i) SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(__m128d, simde_v128_t, simde_v128_to_m128d, simde_v128_from_m128d) #endif @@ -172,6 +212,19 @@ SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(simde_v128_private, simde_v128_ #endif #endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ +#if defined(SIMDE_MIPS_MSA_NATIVE) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v16i8, simde_v128_t, simde_v128_to_msa_v16i8, simde_v128_from_msa_v16i8) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v8i16, simde_v128_t, simde_v128_to_msa_v8i16, simde_v128_from_msa_v8i16) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v4i32, simde_v128_t, simde_v128_to_msa_v4i32, simde_v128_from_msa_v4i32) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v2i64, simde_v128_t, simde_v128_to_msa_v2i64, simde_v128_from_msa_v2i64) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v16u8, simde_v128_t, simde_v128_to_msa_v16u8, simde_v128_from_msa_v16u8) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v8u16, simde_v128_t, simde_v128_to_msa_v8u16, simde_v128_from_msa_v8u16) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v4u32, simde_v128_t, simde_v128_to_msa_v4u32, simde_v128_from_msa_v4u32) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v2u64, simde_v128_t, simde_v128_to_msa_v2u64, simde_v128_from_msa_v2u64) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v4f32, simde_v128_t, simde_v128_to_msa_v4f32, simde_v128_from_msa_v4f32) + SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(v2f64, simde_v128_t, simde_v128_to_msa_v2f64, simde_v128_from_msa_v2f64) +#endif + #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(SIMDE_POWER_ALTIVEC_VECTOR( signed char), simde_v128_t, simde_v128_to_altivec_i8 , simde_v128_from_altivec_i8 ) SIMDE_WASM_SIMD128_GENERATE_CONVERSION_FUNCTIONS(SIMDE_POWER_ALTIVEC_VECTOR( signed short), simde_v128_t, simde_v128_to_altivec_i16, simde_v128_from_altivec_i16) @@ -294,6 +347,55 @@ simde_wasm_i8x16_make ( (c8), (c9), (c10), (c11), (c12), (c13), (c14), (c15)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u8x16_make ( + uint8_t c0, uint8_t c1, uint8_t c2, uint8_t c3, uint8_t c4, uint8_t c5, uint8_t c6, uint8_t c7, + uint8_t c8, uint8_t c9, uint8_t c10, uint8_t c11, uint8_t c12, uint8_t c13, uint8_t c14, uint8_t c15) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return + wasm_u8x16_make( + c0, c1, c2, c3, c4, c5, c6, c7, + c8, c9, c10, c11, c12, c13, c14, c15); + #elif defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_epi8( + HEDLEY_STATIC_CAST(char, c15), HEDLEY_STATIC_CAST(char, c14), HEDLEY_STATIC_CAST(char, c13), HEDLEY_STATIC_CAST(char, c12), + HEDLEY_STATIC_CAST(char, c11), HEDLEY_STATIC_CAST(char, c10), HEDLEY_STATIC_CAST(char, c9), HEDLEY_STATIC_CAST(char, c8), + HEDLEY_STATIC_CAST(char, c7), HEDLEY_STATIC_CAST(char, c6), HEDLEY_STATIC_CAST(char, c5), HEDLEY_STATIC_CAST(char, c4), + HEDLEY_STATIC_CAST(char, c3), HEDLEY_STATIC_CAST(char, c2), HEDLEY_STATIC_CAST(char, c1), HEDLEY_STATIC_CAST(char, c0)); + #else + simde_v128_private r_; + + r_.u8[ 0] = c0; + r_.u8[ 1] = c1; + r_.u8[ 2] = c2; + r_.u8[ 3] = c3; + r_.u8[ 4] = c4; + r_.u8[ 5] = c5; + r_.u8[ 6] = c6; + r_.u8[ 7] = 
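/* Editor's note, illustration only: the new SIMDE_WASM_SIMD128_FMIN/FMAX(F)
 * macros encode the strict wasm minimum/maximum rules used by the scalar
 * fallbacks further down: any NaN input produces NaN, and equal-magnitude
 * zeros are ordered by sign. A function-style equivalent of the
 * double-precision pair, with hypothetical names, to make the nested
 * ternaries easier to read: */
#include <math.h>
#include <stdio.h>

static double wasm_fmin(double x, double y) {
  if (isnan(x) || isnan(y)) return NAN;                 /* NaN propagates    */
  if (x == 0.0 && y == 0.0) return signbit(x) ? x : y;  /* min(-0, +0) is -0 */
  return x < y ? x : y;
}

static double wasm_fmax(double x, double y) {
  if (isnan(x) || isnan(y)) return NAN;
  if (x == 0.0 && y == 0.0) return signbit(x) ? y : x;  /* max(-0, +0) is +0 */
  return x > y ? x : y;
}

int main(void) {
  printf("%d %d\n",
         signbit(wasm_fmin(-0.0, 0.0)) != 0,   /* 1 */
         signbit(wasm_fmax(-0.0, 0.0)) != 0);  /* 0 */
  return 0;
}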
c7; + r_.u8[ 8] = c8; + r_.u8[ 9] = c9; + r_.u8[10] = c10; + r_.u8[11] = c11; + r_.u8[12] = c12; + r_.u8[13] = c13; + r_.u8[14] = c14; + r_.u8[15] = c15; + + return simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define \ + wasm_u8x16_make( \ + c0, c1, c2, c3, c4, c5, c6, c7, \ + c8, c9, c10, c11, c12, c13, c14, c15) \ + simde_wasm_u8x16_make( \ + (c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7), \ + (c8), (c9), (c10), (c11), (c12), (c13), (c14), (c15)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i16x8_make ( @@ -323,6 +425,37 @@ simde_wasm_i16x8_make ( simde_wasm_i16x8_make((c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u16x8_make ( + uint16_t c0, uint16_t c1, uint16_t c2, uint16_t c3, uint16_t c4, uint16_t c5, uint16_t c6, uint16_t c7) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_u16x8_make(c0, c1, c2, c3, c4, c5, c6, c7); + #elif defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_epi16( + HEDLEY_STATIC_CAST(short, c7), HEDLEY_STATIC_CAST(short, c6), HEDLEY_STATIC_CAST(short, c5), HEDLEY_STATIC_CAST(short, c4), + HEDLEY_STATIC_CAST(short, c3), HEDLEY_STATIC_CAST(short, c2), HEDLEY_STATIC_CAST(short, c1), HEDLEY_STATIC_CAST(short, c0)); + #else + simde_v128_private r_; + + r_.u16[0] = c0; + r_.u16[1] = c1; + r_.u16[2] = c2; + r_.u16[3] = c3; + r_.u16[4] = c4; + r_.u16[5] = c5; + r_.u16[6] = c6; + r_.u16[7] = c7; + + return simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define \ + wasm_u16x8_make(c0, c1, c2, c3, c4, c5, c6, c7) \ + simde_wasm_u16x8_make((c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i32x4_make (int32_t c0, int32_t c1, int32_t c2, int32_t c3) { @@ -345,6 +478,30 @@ simde_wasm_i32x4_make (int32_t c0, int32_t c1, int32_t c2, int32_t c3) { #define wasm_i32x4_make(c0, c1, c2, c3) simde_wasm_i32x4_make((c0), (c1), (c2), (c3)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u32x4_make (uint32_t c0, uint32_t c1, uint32_t c2, uint32_t c3) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_u32x4_make(c0, c1, c2, c3); + #elif defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_epi32( + HEDLEY_STATIC_CAST(int, c3), HEDLEY_STATIC_CAST(int, c2), HEDLEY_STATIC_CAST(int, c1), HEDLEY_STATIC_CAST(int, c0)); + #else + simde_v128_private r_; + + r_.u32[0] = c0; + r_.u32[1] = c1; + r_.u32[2] = c2; + r_.u32[3] = c3; + + return simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u32x4_make(c0, c1, c2, c3) simde_wasm_u32x4_make((c0), (c1), (c2), (c3)) +#endif + + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i64x2_make (int64_t c0, int64_t c1) { @@ -355,8 +512,8 @@ simde_wasm_i64x2_make (int64_t c0, int64_t c1) { #else simde_v128_private r_; - r_.i64[ 0] = c0; - r_.i64[ 1] = c1; + r_.i64[0] = c0; + r_.i64[1] = c1; return simde_v128_from_private(r_); #endif @@ -365,6 +522,27 @@ simde_wasm_i64x2_make (int64_t c0, int64_t c1) { #define wasm_i64x2_make(c0, c1) simde_wasm_i64x2_make((c0), (c1)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u64x2_make (uint64_t c0, uint64_t c1) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_u64x2_make(c0, c1); + #elif defined(SIMDE_X86_SSE2_NATIVE) + return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t, c1), HEDLEY_STATIC_CAST(int64_t, c0)); + #else + simde_v128_private r_; + + r_.u64[0] = c0; + r_.u64[1] = c1; + + return 
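/* Editor's note, illustration only: the new simde_wasm_u8x16/u16x8/u32x4/
 * u64x2_make constructors place c0 in lane 0. Because _mm_set_epi8/16/32 take
 * their arguments highest lane first, the SSE2 paths list the values in
 * reverse. A small check of that equivalence, guarded so it only runs where
 * SSE2 is available: */
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#if defined(__SSE2__)
#include <emmintrin.h>
#endif

int main(void) {
#if defined(__SSE2__)
  const uint32_t expected[4] = { 1, 2, 3, 4 };  /* lane 0 .. lane 3       */
  __m128i v = _mm_set_epi32(4, 3, 2, 1);        /* high lane listed first */
  uint32_t lanes[4];
  memcpy(lanes, &v, sizeof lanes);
  printf("%d\n", memcmp(lanes, expected, sizeof lanes) == 0); /* 1 */
#endif
  return 0;
}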
simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u64x2_make(c0, c1) simde_wasm_u64x2_make((c0), (c1)) +#endif + + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_f32x4_make (simde_float32 c0, simde_float32 c1, simde_float32 c2, simde_float32 c3) { @@ -469,6 +647,62 @@ simde_wasm_f64x2_make (simde_float64 c0, simde_float64 c1) { (c8), (c9), (c10), (c11), (c12), (c13), (c14), (c15)) #endif +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define \ + simde_wasm_u8x16_const( \ + c0, c1, c2, c3, c4, c5, c6, c7, \ + c8, c9, c10, c11, c12, c13, c14, c15) \ + wasm_u8x16_const( \ + (c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7), \ + (c8), (c9), (c10), (c11), (c12), (c13), (c14), (c15)) +#elif defined(SIMDE_STATEMENT_EXPR_) && defined(SIMDE_ASSERT_CONSTANT_) && defined(SIMDE_STATIC_ASSERT) + #define \ + simde_wasm_u8x16_const( \ + c0, c1, c2, c3, c4, c5, c6, c7, \ + c8, c9, c10, c11, c12, c13, c14, c15) \ + SIMDE_STATEMENT_EXPR_(({ \ + SIMDE_ASSERT_CONSTANT_(c0); \ + SIMDE_ASSERT_CONSTANT_(c1); \ + SIMDE_ASSERT_CONSTANT_(c2); \ + SIMDE_ASSERT_CONSTANT_(c3); \ + SIMDE_ASSERT_CONSTANT_(c4); \ + SIMDE_ASSERT_CONSTANT_(c5); \ + SIMDE_ASSERT_CONSTANT_(c6); \ + SIMDE_ASSERT_CONSTANT_(c7); \ + SIMDE_ASSERT_CONSTANT_(c8); \ + SIMDE_ASSERT_CONSTANT_(c9); \ + SIMDE_ASSERT_CONSTANT_(c10); \ + SIMDE_ASSERT_CONSTANT_(c11); \ + SIMDE_ASSERT_CONSTANT_(c12); \ + SIMDE_ASSERT_CONSTANT_(c13); \ + SIMDE_ASSERT_CONSTANT_(c13); \ + SIMDE_ASSERT_CONSTANT_(c15); \ + \ + simde_wasm_u8x16_make( \ + c0, c1, c2, c3, c4, c5, c6, c7, \ + c8, c9, c10, c11, c12, c13, c14, c15); \ + })) +#else + SIMDE_FUNCTION_ATTRIBUTES + simde_v128_t + simde_wasm_u8x16_const ( + uint8_t c0, uint8_t c1, uint8_t c2, uint8_t c3, uint8_t c4, uint8_t c5, uint8_t c6, uint8_t c7, + uint8_t c8, uint8_t c9, uint8_t c10, uint8_t c11, uint8_t c12, uint8_t c13, uint8_t c14, uint8_t c15) { + return simde_wasm_u8x16_make( + c0, c1, c2, c3, c4, c5, c6, c7, + c8, c9, c10, c11, c12, c13, c14, c15); + } +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define \ + wasm_u8x16_const( \ + c0, c1, c2, c3, c4, c5, c6, c7, \ + c8, c9, c10, c11, c12, c13, c14, c15) \ + simde_wasm_u8x16_const( \ + (c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7), \ + (c8), (c9), (c10), (c11), (c12), (c13), (c14), (c15)) +#endif + #if defined(SIMDE_WASM_SIMD128_NATIVE) #define \ simde_wasm_i16x8_const( \ @@ -509,6 +743,46 @@ simde_wasm_f64x2_make (simde_float64 c0, simde_float64 c1) { (c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7)) #endif +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define \ + simde_wasm_u16x8_const( \ + c0, c1, c2, c3, c4, c5, c6, c7) \ + wasm_u16x8_const( \ + (c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7)) +#elif defined(SIMDE_STATEMENT_EXPR_) && defined(SIMDE_ASSERT_CONSTANT_) && defined(SIMDE_STATIC_ASSERT) + #define \ + simde_wasm_u16x8_const( \ + c0, c1, c2, c3, c4, c5, c6, c7) \ + SIMDE_STATEMENT_EXPR_(({ \ + SIMDE_ASSERT_CONSTANT_(c0); \ + SIMDE_ASSERT_CONSTANT_(c1); \ + SIMDE_ASSERT_CONSTANT_(c2); \ + SIMDE_ASSERT_CONSTANT_(c3); \ + SIMDE_ASSERT_CONSTANT_(c4); \ + SIMDE_ASSERT_CONSTANT_(c5); \ + SIMDE_ASSERT_CONSTANT_(c6); \ + SIMDE_ASSERT_CONSTANT_(c7); \ + \ + simde_wasm_u16x8_make( \ + c0, c1, c2, c3, c4, c5, c6, c7); \ + })) +#else + SIMDE_FUNCTION_ATTRIBUTES + simde_v128_t + simde_wasm_u16x8_const ( + uint16_t c0, uint16_t c1, uint16_t c2, uint16_t c3, uint16_t c4, uint16_t c5, uint16_t c6, uint16_t c7) { + return simde_wasm_u16x8_make( + c0, c1, c2, c3, c4, c5, c6, c7); + } +#endif +#if 
defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define \ + wasm_u16x8_const( \ + c0, c1, c2, c3, c4, c5, c6, c7) \ + simde_wasm_u16x8_const( \ + (c0), (c1), (c2), (c3), (c4), (c5), (c6), (c7)) +#endif + #if defined(SIMDE_WASM_SIMD128_NATIVE) #define \ simde_wasm_i32x4_const( \ @@ -545,6 +819,42 @@ simde_wasm_f64x2_make (simde_float64 c0, simde_float64 c1) { (c0), (c1), (c2), (c3)) #endif +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define \ + simde_wasm_u32x4_const( \ + c0, c1, c2, c3) \ + wasm_u32x4_const( \ + (c0), (c1), (c2), (c3)) +#elif defined(SIMDE_STATEMENT_EXPR_) && defined(SIMDE_ASSERT_CONSTANT_) && defined(SIMDE_STATIC_ASSERT) + #define \ + simde_wasm_u32x4_const( \ + c0, c1, c2, c3) \ + SIMDE_STATEMENT_EXPR_(({ \ + SIMDE_ASSERT_CONSTANT_(c0); \ + SIMDE_ASSERT_CONSTANT_(c1); \ + SIMDE_ASSERT_CONSTANT_(c2); \ + SIMDE_ASSERT_CONSTANT_(c3); \ + \ + simde_wasm_u32x4_make( \ + c0, c1, c2, c3); \ + })) +#else + SIMDE_FUNCTION_ATTRIBUTES + simde_v128_t + simde_wasm_u32x4_const ( + uint32_t c0, uint32_t c1, uint32_t c2, uint32_t c3) { + return simde_wasm_u32x4_make( + c0, c1, c2, c3); + } +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define \ + wasm_u32x4_const( \ + c0, c1, c2, c3) \ + simde_wasm_u32x4_const( \ + (c0), (c1), (c2), (c3)) +#endif + #if defined(SIMDE_WASM_SIMD128_NATIVE) #define \ simde_wasm_i64x2_const( \ @@ -579,6 +889,40 @@ simde_wasm_f64x2_make (simde_float64 c0, simde_float64 c1) { (c0), (c1)) #endif +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define \ + simde_wasm_u64x2_const( \ + c0, c1) \ + wasm_u64x2_const( \ + (c0), (c1)) +#elif defined(SIMDE_STATEMENT_EXPR_) && defined(SIMDE_ASSERT_CONSTANT_) && defined(SIMDE_STATIC_ASSERT) + #define \ + simde_wasm_u64x2_const( \ + c0, c1) \ + SIMDE_STATEMENT_EXPR_(({ \ + SIMDE_ASSERT_CONSTANT_(c0); \ + SIMDE_ASSERT_CONSTANT_(c1); \ + \ + simde_wasm_u64x2_make( \ + c0, c1); \ + })) +#else + SIMDE_FUNCTION_ATTRIBUTES + simde_v128_t + simde_wasm_u64x2_const ( + uint64_t c0, uint64_t c1) { + return simde_wasm_u64x2_make( + c0, c1); + } +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define \ + wasm_u64x2_const( \ + c0, c1) \ + simde_wasm_u64x2_const( \ + (c0), (c1)) +#endif + #if defined(SIMDE_WASM_SIMD128_NATIVE) #define \ simde_wasm_f32x4_const( \ @@ -642,11 +986,7 @@ simde_wasm_f64x2_make (simde_float64 c0, simde_float64 c1) { } #endif #if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) - #define \ - wasm_f64x2_const( \ - c0, c1) \ - simde_wasm_f64x2_const( \ - (c0), (c1)) + #define wasm_f64x2_const(c0, c1) simde_wasm_f64x2_const((c0), (c1)) #endif /* splat */ @@ -679,6 +1019,52 @@ simde_wasm_i8x16_splat (int8_t a) { #define wasm_i8x16_splat(a) simde_wasm_i8x16_splat((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u8x16_splat (uint8_t a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_u8x16_splat(a); + #else + simde_v128_private r_; + + #if defined(SIMDE_X86_SSE2_NATIVE) + r_.sse_m128i = _mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, a)); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u8 = vdupq_n_u8(a); + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_u8 = vec_splats(HEDLEY_STATIC_CAST(unsigned char, a)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { + r_.u8[i] = a; + } + #endif + + return simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u8x16_splat(a) simde_wasm_u8x16_splat((a)) +#endif + +#if 
defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_i8x16_const_splat(a) wasm_i8x16_const_splat((a)) +#else + #define simde_wasm_i8x16_const_splat(a) simde_wasm_i8x16_splat(a); +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_i8x16_const_splat(a) simde_wasm_i8x16_const_splat((a)) +#endif + +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_u8x16_const_splat(a) wasm_u8x16_const_splat((a)) +#else + #define simde_wasm_u8x16_const_splat(a) simde_wasm_u8x16_splat(a); +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u8x16_const_splat(a) simde_wasm_u8x16_const_splat((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i16x8_splat (int16_t a) { @@ -707,6 +1093,52 @@ simde_wasm_i16x8_splat (int16_t a) { #define wasm_i16x8_splat(a) simde_wasm_i16x8_splat((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u16x8_splat (uint16_t a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_u16x8_splat(a); + #else + simde_v128_private r_; + + #if defined(SIMDE_X86_SSE2_NATIVE) + r_.sse_m128i = _mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, a)); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u16 = vdupq_n_u16(a); + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_u16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, a)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = a; + } + #endif + + return simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u16x8_splat(a) simde_wasm_u16x8_splat((a)) +#endif + +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_i16x8_const_splat(a) wasm_i16x8_const_splat((a)) +#else + #define simde_wasm_i16x8_const_splat(a) simde_wasm_i16x8_splat(a); +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_i16x8_const_splat(a) simde_wasm_i16x8_const_splat((a)) +#endif + +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_u16x8_const_splat(a) wasm_u16x8_const_splat((a)) +#else + #define simde_wasm_u16x8_const_splat(a) simde_wasm_u16x8_splat(a); +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u16x8_const_splat(a) simde_wasm_u16x8_const_splat((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i32x4_splat (int32_t a) { @@ -735,6 +1167,52 @@ simde_wasm_i32x4_splat (int32_t a) { #define wasm_i32x4_splat(a) simde_wasm_i32x4_splat((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u32x4_splat (uint32_t a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_u32x4_splat(a); + #else + simde_v128_private r_; + + #if defined(SIMDE_X86_SSE2_NATIVE) + r_.sse_m128i = _mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, a)); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u32 = vdupq_n_u32(a); + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_u32 = vec_splats(a); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = a; + } + #endif + + return simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u32x4_splat(a) simde_wasm_u32x4_splat((a)) +#endif + +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_i32x4_const_splat(a) wasm_i32x4_const_splat((a)) +#else + #define simde_wasm_i32x4_const_splat(a) simde_wasm_i32x4_splat(a); +#endif +#if 
defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_i32x4_const_splat(a) simde_wasm_i32x4_const_splat((a)) +#endif + +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_u32x4_const_splat(a) wasm_u32x4_const_splat((a)) +#else + #define simde_wasm_u32x4_const_splat(a) simde_wasm_u32x4_splat(a); +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u32x4_const_splat(a) simde_wasm_u32x4_const_splat((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i64x2_splat (int64_t a) { @@ -763,6 +1241,52 @@ simde_wasm_i64x2_splat (int64_t a) { #define wasm_i64x2_splat(a) simde_wasm_i64x2_splat((a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde_v128_t +simde_wasm_u64x2_splat (uint64_t a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_u64x2_splat(a); + #else + simde_v128_private r_; + + #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0)) + r_.sse_m128i = _mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, a)); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + r_.neon_u64 = vdupq_n_u64(a); + #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) + r_.altivec_u64 = vec_splats(HEDLEY_STATIC_CAST(unsigned long long, a)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = a; + } + #endif + + return simde_v128_from_private(r_); + #endif +} +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u64x2_splat(a) simde_wasm_u64x2_splat((a)) +#endif + +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_i64x2_const_splat(a) wasm_i64x2_const_splat((a)) +#else + #define simde_wasm_i64x2_const_splat(a) simde_wasm_i64x2_splat(a); +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_i64x2_const_splat(a) simde_wasm_i64x2_const_splat((a)) +#endif + +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_u64x2_const_splat(a) wasm_u64x2_const_splat((a)) +#else + #define simde_wasm_u64x2_const_splat(a) simde_wasm_u64x2_splat(a); +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u64x2_const_splat(a) simde_wasm_u64x2_const_splat((a)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_f32x4_splat (simde_float32 a) { @@ -993,6 +1517,36 @@ simde_wasm_u16x8_extract_lane (simde_v128_t a, const int lane) { #define wasm_u16x8_extract_lane(a, lane) simde_wasm_u16x8_extract_lane((a), (lane)) #endif +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_wasm_u32x4_extract_lane (simde_v128_t a, const int lane) { + simde_v128_private a_ = simde_v128_to_private(a); + return a_.u32[lane & 3]; +} +#if defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_wasm_u32x4_extract_lane(a, lane) HEDLEY_STATIC_CAST(uint32_t, wasm_u32x4_extract_lane((a), (lane))) +#elif 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_BUG_CLANG_BAD_VGET_SET_LANE_TYPES) + #define simde_wasm_u64x2_extract_lane(a, lane) vgetq_lane_u64(simde_v128_to_neon_u64(a), (lane) & 1) +#endif +#if defined(SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES) + #define wasm_u64x2_extract_lane(a, lane) simde_wasm_u64x2_extract_lane((a), (lane)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde_float32 simde_wasm_f32x4_extract_lane (simde_v128_t a, const int lane) { @@ -3115,20 +3669,10 @@ simde_wasm_f32x4_abs (simde_v128_t a) { r_.neon_f32 = vabsq_f32(a_.neon_f32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_abs(a_.altivec_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - int32_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32 < SIMDE_FLOAT32_C(0.0)); - r_.f32 = - HEDLEY_REINTERPRET_CAST( - __typeof__(r_.f32), - ( - (HEDLEY_REINTERPRET_CAST(__typeof__(m), -a_.f32) & m) | - (HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32) & ~m) - ) - ); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (a_.f32[i] < SIMDE_FLOAT32_C(0.0)) ? -a_.f32[i] : a_.f32[i]; + r_.f32[i] = simde_math_signbit(a_.f32[i]) ? -a_.f32[i] : a_.f32[i]; } #endif @@ -3155,20 +3699,10 @@ simde_wasm_f64x2_abs (simde_v128_t a) { r_.neon_f64 = vabsq_f64(a_.neon_f64); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f64 = vec_abs(a_.altivec_f64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - int64_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f64 < SIMDE_FLOAT64_C(0.0)); - r_.f64 = - HEDLEY_REINTERPRET_CAST( - __typeof__(r_.f64), - ( - (HEDLEY_REINTERPRET_CAST(__typeof__(m), -a_.f64) & m) | - (HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f64) & ~m) - ) - ); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (a_.f64[i] < SIMDE_FLOAT64_C(0.0)) ? -a_.f64[i] : a_.f64[i]; + r_.f64[i] = simde_math_signbit(a_.f64[i]) ? 
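/* Editor's note, illustration only: the f32x4/f64x2_abs fallbacks now test
 * simde_math_signbit instead of comparing against zero, so that -0.0 is
 * mapped to +0.0 as wasm requires; an (x < 0) test leaves -0.0 untouched.
 * Scalar illustration of the difference: */
#include <math.h>
#include <stdio.h>

static float abs_by_compare(float x) { return (x < 0.0f) ? -x : x; }
static float abs_by_signbit(float x) { return signbit(x) ? -x : x; }

int main(void) {
  printf("%d %d\n",
         signbit(abs_by_compare(-0.0f)) != 0,   /* 1: sign bit survives */
         signbit(abs_by_signbit(-0.0f)) != 0);  /* 0: sign bit cleared  */
  return 0;
}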
-a_.f64[i] : a_.f64[i]; } #endif @@ -3579,9 +4113,9 @@ simde_wasm_i8x16_shl (simde_v128_t a, uint32_t count) { r_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vshlq_s8(a_.neon_i8, vdupq_n_s8(HEDLEY_STATIC_CAST(int8_t, count))); + r_.neon_i8 = vshlq_s8(a_.neon_i8, vdupq_n_s8(HEDLEY_STATIC_CAST(int8_t, count & 7))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i8 = vec_sl(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, count))); + r_.altivec_i8 = vec_sl(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, count & 7))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i8 = a_.i8 << (count & 7); #else @@ -3611,9 +4145,9 @@ simde_wasm_i16x8_shl (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_sll_epi16(a_.sse_m128i, _mm_cvtsi32_si128(count & 15)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count))); + r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count & 15))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i16 = vec_sl(a_.altivec_i16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, count))); + r_.altivec_i16 = vec_sl(a_.altivec_i16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, count & 15))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i16 = a_.i16 << (count & 15); #else @@ -3643,9 +4177,9 @@ simde_wasm_i32x4_shl (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_sll_epi32(a_.sse_m128i, _mm_cvtsi32_si128(count & 31)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count))); + r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count & 31))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_sl(a_.altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, count))); + r_.altivec_i32 = vec_sl(a_.altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, count & 31))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i32 = a_.i32 << (count & 31); #else @@ -3666,6 +4200,9 @@ SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i64x2_shl (simde_v128_t a, uint32_t count) { #if defined(SIMDE_WASM_SIMD128_NATIVE) + #if defined(SIMDE_BUG_CLANG_60655) + count = count & 63; + #endif return wasm_i64x2_shl(a, count); #else simde_v128_private @@ -3675,9 +4212,9 @@ simde_wasm_i64x2_shl (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_sll_epi64(a_.sse_m128i, _mm_cvtsi32_si128(count & 63)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vshlq_s64(a_.neon_i64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, count))); + r_.neon_i64 = vshlq_s64(a_.neon_i64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, count & 63))); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_i64 = vec_sl(a_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, count))); + r_.altivec_i64 = vec_sl(a_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, count & 63))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i64 = a_.i64 << (count & 63); #else @@ -3707,9 +4244,9 @@ simde_wasm_i8x16_shr (simde_v128_t a, uint32_t count) { r_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vshlq_s8(a_.neon_i8, vdupq_n_s8(HEDLEY_STATIC_CAST(int8_t, -count))); + r_.neon_i8 = vshlq_s8(a_.neon_i8, vdupq_n_s8(-HEDLEY_STATIC_CAST(int8_t, count & 7))); #elif 
defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i8 = vec_sra(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, count))); + r_.altivec_i8 = vec_sra(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, count & 7))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i8 = a_.i8 >> (count & 7); #else @@ -3739,9 +4276,9 @@ simde_wasm_i16x8_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_sra_epi16(a_.sse_m128i, _mm_cvtsi32_si128(count & 15)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -count))); + r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(-HEDLEY_STATIC_CAST(int16_t, count & 15))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i16 = vec_sra(a_.altivec_i16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, count))); + r_.altivec_i16 = vec_sra(a_.altivec_i16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, count & 15))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i16 = a_.i16 >> (count & 15); #else @@ -3771,9 +4308,9 @@ simde_wasm_i32x4_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE4_1_NATIVE) return _mm_sra_epi32(a_.sse_m128i, _mm_cvtsi32_si128(count & 31)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -count))); + r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-HEDLEY_STATIC_CAST(int32_t, count & 31))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_sra(a_.altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, count))); + r_.altivec_i32 = vec_sra(a_.altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, count & 31))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i32 = a_.i32 >> (count & 31); #else @@ -3794,6 +4331,9 @@ SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_i64x2_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_WASM_SIMD128_NATIVE) + #if defined(SIMDE_BUG_CLANG_60655) + count = count & 63; + #endif return wasm_i64x2_shr(a, count); #else simde_v128_private @@ -3803,9 +4343,9 @@ simde_wasm_i64x2_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_AVX512VL_NATIVE) return _mm_sra_epi64(a_.sse_m128i, _mm_cvtsi32_si128(count & 63)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vshlq_s64(a_.neon_i64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -count))); + r_.neon_i64 = vshlq_s64(a_.neon_i64, vdupq_n_s64(-HEDLEY_STATIC_CAST(int64_t, count & 63))); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_i64 = vec_sra(a_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, count))); + r_.altivec_i64 = vec_sra(a_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, count & 63))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.i64 = a_.i64 >> (count & 63); #else @@ -3833,9 +4373,9 @@ simde_wasm_u8x16_shr (simde_v128_t a, uint32_t count) { r_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vshlq_u8(a_.neon_u8, vdupq_n_s8(HEDLEY_STATIC_CAST(int8_t, -count))); + r_.neon_u8 = vshlq_u8(a_.neon_u8, vdupq_n_s8(-HEDLEY_STATIC_CAST(int8_t, count & 7))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u8 = vec_sr(a_.altivec_u8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, count))); + r_.altivec_u8 = vec_sr(a_.altivec_u8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, count & 7))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.u8 = a_.u8 >> 
(count & 7); #else @@ -3865,9 +4405,9 @@ simde_wasm_u16x8_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_srl_epi16(a_.sse_m128i, _mm_cvtsi32_si128(count & 15)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -count))); + r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(-HEDLEY_STATIC_CAST(int16_t, count & 15))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i16 = vec_sra(a_.altivec_i16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, count))); + r_.altivec_u16 = vec_sr(a_.altivec_u16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, count & 15))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.u16 = a_.u16 >> (count & 15); #else @@ -3897,9 +4437,9 @@ simde_wasm_u32x4_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE4_1_NATIVE) return _mm_srl_epi32(a_.sse_m128i, _mm_cvtsi32_si128(count & 31)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -count))); + r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(-HEDLEY_STATIC_CAST(int32_t, count & 31))); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_sra(a_.altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, count))); + r_.altivec_u32 = vec_sr(a_.altivec_u32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, count & 31))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.u32 = a_.u32 >> (count & 31); #else @@ -3920,6 +4460,9 @@ SIMDE_FUNCTION_ATTRIBUTES simde_v128_t simde_wasm_u64x2_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_WASM_SIMD128_NATIVE) + #if defined(SIMDE_BUG_CLANG_60655) + count = count & 63; + #endif return wasm_u64x2_shr(a, count); #else simde_v128_private @@ -3929,9 +4472,9 @@ simde_wasm_u64x2_shr (simde_v128_t a, uint32_t count) { #if defined(SIMDE_X86_SSE4_1_NATIVE) return _mm_srl_epi64(a_.sse_m128i, _mm_cvtsi32_si128(count & 63)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -count))); + r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-HEDLEY_STATIC_CAST(int64_t, count & 63))); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_i64 = vec_sra(a_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, count))); + r_.altivec_u64 = vec_sr(a_.altivec_u64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, count & 63))); #elif defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_VECTOR_SCALAR) r_.u64 = a_.u64 >> (count & 63); #else @@ -4317,12 +4860,6 @@ simde_wasm_i16x8_mul (simde_v128_t a, simde_v128_t b) { r_.sse_m128i = _mm_mullo_epi16(a_.sse_m128i, b_.sse_m128i); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i16 = - vec_pack( - vec_mule(a_.altivec_i16, b_.altivec_i16), - vec_mulo(a_.altivec_i16, b_.altivec_i16) - ); #elif defined(SIMDE_VECTOR_SUBSCRIPT) r_.i16 = a_.i16 * b_.i16; #else @@ -4471,26 +5008,6 @@ simde_wasm_i16x8_q15mulr_sat (simde_v128_t a, simde_v128_t b) { /* https://github.com/WebAssembly/simd/pull/365 */ #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i16 = vqrdmulhq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_X86_SSSE3_NATIVE) - __m128i y = _mm_mulhrs_epi16(a_.sse_m128i, b_.sse_m128i); - __m128i tmp = _mm_cmpeq_epi16(y, _mm_set1_epi16(INT16_MAX)); - r_.sse_m128i = _mm_xor_si128(y, tmp); - #elif defined(SIMDE_X86_SSE2_NATIVE) - 
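/* Editor's note, illustration only: wasm shifts use the count modulo the lane
 * width. The x86 paths already masked the count before _mm_sll/_mm_srl/_mm_sra,
 * but the NEON and AltiVec paths passed it through unmasked, where an
 * out-of-range count is not guaranteed to wrap (and negating an unmasked count
 * can overflow the narrow NEON shift type). The patch therefore masks with
 * & 7 / 15 / 31 / 63 everywhere. The required scalar semantics: */
#include <stdint.h>
#include <stdio.h>

static uint8_t wasm_u8_shr(uint8_t v, uint32_t count) {
  return (uint8_t) (v >> (count & 7));  /* count taken mod 8 for 8-bit lanes */
}

int main(void) {
  /* shifting by 9 behaves exactly like shifting by 1 */
  printf("%u %u\n", wasm_u8_shr(0x80, 9), wasm_u8_shr(0x80, 1)); /* 64 64 */
  return 0;
}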
const __m128i prod_lo = _mm_mullo_epi16(a_.sse_m128i, b_.sse_m128i); - const __m128i prod_hi = _mm_mulhi_epi16(a_.sse_m128i, b_.sse_m128i); - const __m128i tmp = - _mm_add_epi16( - _mm_avg_epu16( - _mm_srli_epi16(prod_lo, 14), - _mm_setzero_si128() - ), - _mm_add_epi16(prod_hi, prod_hi) - ); - r_.sse_m128i = - _mm_xor_si128( - tmp, - _mm_cmpeq_epi16(_mm_set1_epi16(INT16_MAX), tmp) - ); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -4747,43 +5264,22 @@ simde_wasm_f32x4_min (simde_v128_t a, simde_v128_t b) { b_ = simde_v128_to_private(b), r_; - #if defined(SIMDE_X86_SSE4_1_NATIVE) - r_.sse_m128 = _mm_blendv_ps( - _mm_set1_ps(SIMDE_MATH_NANF), - _mm_min_ps(a_.sse_m128, b_.sse_m128), - _mm_cmpord_ps(a_.sse_m128, b_.sse_m128)); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128 m = _mm_cmpord_ps(a_.sse_m128, b_.sse_m128); - r_.sse_m128 = - _mm_or_ps( - _mm_and_ps(m, _mm_min_ps(a_.sse_m128, b_.sse_m128)), - _mm_andnot_ps(m, _mm_set1_ps(SIMDE_MATH_NANF)) - ); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vminq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL int) condition; - SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL int) a_lt_b = - vec_cmpgt(b_.altivec_f32, a_.altivec_f32); - - #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - condition = vec_orc(a_lt_b, vec_cmpeq(a_.altivec_f32, a_.altivec_f32)); - #else - SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL int) a_not_nan = - vec_cmpeq(a_.altivec_f32, a_.altivec_f32); - condition = vec_or(a_lt_b, vec_nor(a_not_nan, a_not_nan)); - #endif - - r_.altivec_f32 = - vec_sel( - b_.altivec_f32, - a_.altivec_f32, - condition - ); + #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(6,0,0)) + // Inspired by https://github.com/v8/v8/blob/c750b6c85bd1ad1d27f7acc1812165f465515144/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc#L202 + simde_v128_private scratch; + scratch.sse_m128 = a_.sse_m128; + scratch.sse_m128 = _mm_min_ps(scratch.sse_m128, b_.sse_m128); + r_.sse_m128 = b_.sse_m128; + r_.sse_m128 = _mm_min_ps(r_.sse_m128, a_.sse_m128); + scratch.sse_m128 = _mm_or_ps(scratch.sse_m128, r_.sse_m128); + r_.sse_m128 = _mm_cmpunord_ps(r_.sse_m128, scratch.sse_m128); + scratch.sse_m128 = _mm_or_ps(scratch.sse_m128, r_.sse_m128); + r_.sse_m128i = _mm_srli_epi32(r_.sse_m128i, 10); + r_.sse_m128 = _mm_andnot_ps(r_.sse_m128, scratch.sse_m128); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (simde_math_isnan(a_.f32[i]) || (a_.f32[i] < b_.f32[i])) ? 
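/* Editor's note, illustration only: the SSE emulation of i16x8.q15mulr_sat_s
 * is dropped in this hunk (the _mm_mulhrs_epi16 and plain SSE2 paths go away),
 * leaving vqrdmulhq_s16 on NEON and the scalar loop elsewhere. The scalar
 * definition being emulated is a rounded, saturating Q15 multiply: */
#include <stdint.h>
#include <stdio.h>

static int16_t q15mulr_sat(int16_t a, int16_t b) {
  int32_t p = ((int32_t) a * (int32_t) b + 0x4000) >> 15; /* round to nearest */
  if (p > INT16_MAX) p = INT16_MAX;  /* only INT16_MIN * INT16_MIN overflows  */
  return (int16_t) p;
}

int main(void) {
  printf("%d\n", q15mulr_sat(INT16_MIN, INT16_MIN)); /* 32767 after saturation */
  printf("%d\n", q15mulr_sat(16384, 16384));         /* 0.5 * 0.5 -> 8192      */
  return 0;
}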
a_.f32[i] : b_.f32[i]; + r_.f32[i] = SIMDE_WASM_SIMD128_FMINF(a_.f32[i], b_.f32[i]); } #endif @@ -4805,34 +5301,22 @@ simde_wasm_f64x2_min (simde_v128_t a, simde_v128_t b) { b_ = simde_v128_to_private(b), r_; - #if defined(SIMDE_X86_SSE4_1_NATIVE) - r_.sse_m128d = _mm_blendv_pd( - _mm_set1_pd(SIMDE_MATH_NAN), - _mm_min_pd(a_.sse_m128d, b_.sse_m128d), - _mm_cmpord_pd(a_.sse_m128d, b_.sse_m128d)); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128d m = _mm_cmpord_pd(a_.sse_m128d, b_.sse_m128d); - r_.sse_m128d = - _mm_or_pd( - _mm_and_pd(m, _mm_min_pd(a_.sse_m128d, b_.sse_m128d)), - _mm_andnot_pd(m, _mm_set1_pd(SIMDE_MATH_NAN)) - ); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_f64 = - vec_sel( - b_.altivec_f64, - a_.altivec_f64, - vec_orc( - vec_cmpgt(b_.altivec_f64, a_.altivec_f64), - vec_cmpeq(a_.altivec_f64, a_.altivec_f64) - ) - ); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64); + #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(6,0,0)) + // Inspired by https://github.com/v8/v8/blob/c750b6c85bd1ad1d27f7acc1812165f465515144/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc#L263 + simde_v128_private scratch; + scratch.sse_m128d = a_.sse_m128d; + scratch.sse_m128d = _mm_min_pd(scratch.sse_m128d, b_.sse_m128d); + r_.sse_m128d = b_.sse_m128d; + r_.sse_m128d = _mm_min_pd(r_.sse_m128d, a_.sse_m128d); + scratch.sse_m128d = _mm_or_pd(scratch.sse_m128d, r_.sse_m128d); + r_.sse_m128d = _mm_cmpunord_pd(r_.sse_m128d, scratch.sse_m128d); + scratch.sse_m128d = _mm_or_pd(scratch.sse_m128d, r_.sse_m128d); + r_.sse_m128i = _mm_srli_epi64(r_.sse_m128i, 13); + r_.sse_m128d = _mm_andnot_pd(r_.sse_m128d, scratch.sse_m128d); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (simde_math_isnan(a_.f64[i]) || (a_.f64[i] < b_.f64[i])) ? 
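/* Editor's note, illustration only: plain _mm_min_ps/_mm_min_pd cannot
 * implement the strict wasm min directly because the SSE instruction returns
 * its *second* operand whenever the comparison is unordered (and for ±0
 * pairs), so the result depends on operand order. The V8-inspired sequence
 * above therefore computes min(a, b) and min(b, a), ORs them so a NaN or -0
 * from either order is kept, then uses cmpunord plus a shifted mask to force
 * a quiet NaN where needed. The operand-order asymmetry, shown with SSE
 * intrinsics (guarded so the example still builds elsewhere): */
#include <stdio.h>
#include <math.h>
#if defined(__SSE2__)
#include <emmintrin.h>

static float lane0(__m128 v) { float f; _mm_store_ss(&f, v); return f; }
#endif

int main(void) {
#if defined(__SSE2__)
  __m128 a = _mm_set1_ps(NAN), b = _mm_set1_ps(1.0f);
  printf("%g %g\n", lane0(_mm_min_ps(a, b)), lane0(_mm_min_ps(b, a))); /* 1 nan */
#endif
  return 0;
}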
a_.f64[i] : b_.f64[i]; + r_.f64[i] = SIMDE_WASM_SIMD128_FMIN(a_.f64[i], b_.f64[i]); } #endif @@ -5077,59 +5561,23 @@ simde_wasm_f32x4_max (simde_v128_t a, simde_v128_t b) { b_ = simde_v128_to_private(b), r_; - #if defined(SIMDE_X86_SSE4_1_NATIVE) - r_.sse_m128 = _mm_blendv_ps( - _mm_set1_ps(SIMDE_MATH_NANF), - _mm_max_ps(a_.sse_m128, b_.sse_m128), - _mm_cmpord_ps(a_.sse_m128, b_.sse_m128)); - #elif defined(SIMDE_X86_SSE_NATIVE) - __m128 m = _mm_or_ps(_mm_cmpneq_ps(a_.sse_m128, a_.sse_m128), _mm_cmpgt_ps(a_.sse_m128, b_.sse_m128)); - #if defined(SIMDE_X86_SSE4_1_NATIVE) - r_.ssse_m128 = _mm_blendv_ps(b_.sse_m128, a_.sse_m128, m); - #else - r_.sse_m128 = - _mm_or_ps( - _mm_and_ps(m, a_.sse_m128), - _mm_andnot_ps(m, b_.sse_m128) - ); - #endif - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vmaxq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_f32 = - vec_sel( - b_.altivec_f32, - a_.altivec_f32, - vec_orc( - vec_cmpgt(a_.altivec_f32, b_.altivec_f32), - vec_cmpeq(a_.altivec_f32, a_.altivec_f32) - ) - ); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL int) cmpres = vec_cmpeq(a_.altivec_f32, a_.altivec_f32); - r_.altivec_f32 = - vec_sel( - b_.altivec_f32, - a_.altivec_f32, - vec_or( - vec_cmpgt(a_.altivec_f32, b_.altivec_f32), - vec_nor(cmpres, cmpres) - ) - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - int32_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), (a_.f32 != a_.f32) | (a_.f32 > b_.f32)); - r_.f32 = - HEDLEY_REINTERPRET_CAST( - __typeof__(r_.f32), - ( - ( m & HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32)) | - (~m & HEDLEY_REINTERPRET_CAST(__typeof__(m), b_.f32)) - ) - ); + #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(6,0,0)) + // Inspired by https://github.com/v8/v8/blob/c750b6c85bd1ad1d27f7acc1812165f465515144/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc#L231 + simde_v128_private scratch; + scratch.sse_m128 = a_.sse_m128; + scratch.sse_m128 = _mm_max_ps(scratch.sse_m128, b_.sse_m128); + r_.sse_m128 = b_.sse_m128; + r_.sse_m128 = _mm_max_ps(r_.sse_m128, a_.sse_m128); + r_.sse_m128 = _mm_xor_ps(r_.sse_m128, scratch.sse_m128); + scratch.sse_m128 = _mm_or_ps(scratch.sse_m128, r_.sse_m128); + scratch.sse_m128 = _mm_sub_ps(scratch.sse_m128, r_.sse_m128); + r_.sse_m128 = _mm_cmpunord_ps(r_.sse_m128, scratch.sse_m128); + r_.sse_m128i = _mm_srli_epi32(r_.sse_m128i, 10); + r_.sse_m128 = _mm_andnot_ps(r_.sse_m128, scratch.sse_m128); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (simde_math_isnan(a_.f32[i]) || (a_.f32[i] > b_.f32[i])) ? 
a_.f32[i] : b_.f32[i]; + r_.f32[i] = SIMDE_WASM_SIMD128_FMAXF(a_.f32[i], b_.f32[i]); } #endif @@ -5151,59 +5599,23 @@ simde_wasm_f64x2_max (simde_v128_t a, simde_v128_t b) { b_ = simde_v128_to_private(b), r_; - #if defined(SIMDE_X86_SSE4_1_NATIVE) - r_.sse_m128d = _mm_blendv_pd( - _mm_set1_pd(SIMDE_MATH_NAN), - _mm_max_pd(a_.sse_m128d, b_.sse_m128d), - _mm_cmpord_pd(a_.sse_m128d, b_.sse_m128d)); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128d m = _mm_or_pd(_mm_cmpneq_pd(a_.sse_m128d, a_.sse_m128d), _mm_cmpgt_pd(a_.sse_m128d, b_.sse_m128d)); - #if defined(SIMDE_X86_SSE4_1_NATIVE) - r_.ssse_m128d = _mm_blendv_pd(b_.sse_m128d, a_.sse_m128d, m); - #else - r_.sse_m128d = - _mm_or_pd( - _mm_and_pd(m, a_.sse_m128d), - _mm_andnot_pd(m, b_.sse_m128d) - ); - #endif - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vmaxq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_f64 = - vec_sel( - b_.altivec_f64, - a_.altivec_f64, - vec_orc( - vec_cmpgt(a_.altivec_f64, b_.altivec_f64), - vec_cmpeq(a_.altivec_f64, a_.altivec_f64) - ) - ); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL long long) cmpres = vec_cmpeq(a_.altivec_f64, a_.altivec_f64); - r_.altivec_f64 = - vec_sel( - b_.altivec_f64, - a_.altivec_f64, - vec_or( - vec_cmpgt(a_.altivec_f64, b_.altivec_f64), - vec_nor(cmpres, cmpres) - ) - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - int64_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), (a_.f64 != a_.f64) | (a_.f64 > b_.f64)); - r_.f64 = - HEDLEY_REINTERPRET_CAST( - __typeof__(r_.f64), - ( - ( m & HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f64)) | - (~m & HEDLEY_REINTERPRET_CAST(__typeof__(m), b_.f64)) - ) - ); + #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(6,0,0)) + // Inspired by https://github.com/v8/v8/blob/c750b6c85bd1ad1d27f7acc1812165f465515144/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.cc#L301 + simde_v128_private scratch; + scratch.sse_m128d = a_.sse_m128d; + scratch.sse_m128d = _mm_max_pd(scratch.sse_m128d, b_.sse_m128d); + r_.sse_m128d = b_.sse_m128d; + r_.sse_m128d = _mm_max_pd(r_.sse_m128d, a_.sse_m128d); + r_.sse_m128d = _mm_xor_pd(r_.sse_m128d, scratch.sse_m128d); + scratch.sse_m128d = _mm_or_pd(scratch.sse_m128d, r_.sse_m128d); + scratch.sse_m128d = _mm_sub_pd(scratch.sse_m128d, r_.sse_m128d); + r_.sse_m128d = _mm_cmpunord_pd(r_.sse_m128d, scratch.sse_m128d); + r_.sse_m128i = _mm_srli_epi64(r_.sse_m128i, 13); + r_.sse_m128d = _mm_andnot_pd(r_.sse_m128d, scratch.sse_m128d); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (simde_math_isnan(a_.f64[i]) || (a_.f64[i] > b_.f64[i])) ? 
a_.f64[i] : b_.f64[i]; + r_.f64[i] = SIMDE_WASM_SIMD128_FMAX(a_.f64[i], b_.f64[i]); } #endif @@ -5630,11 +6042,11 @@ simde_wasm_f64x2_pmin (simde_v128_t a, simde_v128_t b) { a_.neon_f64 ); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = + r_.altivec_f64 = vec_sel( - a_.altivec_f32, - b_.altivec_f32, - vec_cmpgt(a_.altivec_f32, b_.altivec_f32) + a_.altivec_f64, + b_.altivec_f64, + vec_cmpgt(a_.altivec_f64, b_.altivec_f64) ); #else SIMDE_VECTORIZE @@ -5663,7 +6075,7 @@ simde_wasm_f32x4_pmax (simde_v128_t a, simde_v128_t b) { b_ = simde_v128_to_private(b), r_; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE_NATIVE) r_.sse_m128 = _mm_max_ps(b_.sse_m128, a_.sse_m128); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vbslq_f32(vcltq_f32(a_.neon_f32, b_.neon_f32), b_.neon_f32, a_.neon_f32); @@ -7934,7 +8346,9 @@ simde_wasm_v128_load16_lane (const void * a, simde_v128_t vec, const int lane) simde_v128_private a_ = simde_v128_to_private(vec); - a_.i16[lane] = *HEDLEY_REINTERPRET_CAST(const int16_t *, a); + int16_t tmp = 0; + simde_memcpy(&tmp, a, sizeof(int16_t)); + a_.i16[lane] = tmp; return simde_v128_from_private(a_); } @@ -7952,7 +8366,9 @@ simde_wasm_v128_load32_lane (const void * a, simde_v128_t vec, const int lane) simde_v128_private a_ = simde_v128_to_private(vec); - a_.i32[lane] = *HEDLEY_REINTERPRET_CAST(const int32_t *, a); + int32_t tmp = 0; + simde_memcpy(&tmp, a, sizeof(int32_t)); + a_.i32[lane] = tmp; return simde_v128_from_private(a_); } @@ -7970,7 +8386,9 @@ simde_wasm_v128_load64_lane (const void * a, simde_v128_t vec, const int lane) simde_v128_private a_ = simde_v128_to_private(vec); - a_.i64[lane] = *HEDLEY_REINTERPRET_CAST(const int64_t *, a); + int64_t tmp = 0; + simde_memcpy(&tmp, a, sizeof(int64_t)); + a_.i64[lane] = tmp; return simde_v128_from_private(a_); } @@ -8183,7 +8601,7 @@ simde_wasm_i32x4_trunc_sat_f32x4 (simde_v128_t a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32); #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) - SIMDE_CONVERT_VECTOR_(r_.f32, a_.f32); + SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32); #elif defined(SIMDE_X86_SSE2_NATIVE) const __m128i i32_max_mask = _mm_castps_si128(_mm_cmpgt_ps(a_.sse_m128, _mm_set1_ps(SIMDE_FLOAT32_C(2147483520.0)))); const __m128 clamped = _mm_max_ps(a_.sse_m128, _mm_set1_ps(HEDLEY_STATIC_CAST(simde_float32, INT32_MIN))); @@ -8205,7 +8623,7 @@ simde_wasm_i32x4_trunc_sat_f32x4 (simde_v128_t a) { ); #endif r_.sse_m128i = _mm_and_si128(r_.sse_m128i, _mm_castps_si128(_mm_cmpord_ps(a_.sse_m128, a_.sse_m128))); - #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_IEEE754_STORAGE) + #elif defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_IEEE754_STORAGE) && !defined(SIMDE_ARCH_POWER) SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32); const __typeof__(a_.f32) max_representable = { SIMDE_FLOAT32_C(2147483520.0), SIMDE_FLOAT32_C(2147483520.0), SIMDE_FLOAT32_C(2147483520.0), SIMDE_FLOAT32_C(2147483520.0) }; @@ -8580,7 +8998,7 @@ simde_wasm_f32x4_ceil (simde_v128_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_ceilf(a_.f32[i]); + r_.f32[i] = simde_math_quietf(simde_math_ceilf(a_.f32[i])); } #endif @@ -8603,30 +9021,6 @@ simde_wasm_f64x2_ceil (simde_v128_t a) { #if defined(SIMDE_X86_SSE4_1_NATIVE) r_.sse_m128d = _mm_round_pd(a_.sse_m128d, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); - #elif defined(SIMDE_X86_SSE2_NATIVE) - /* https://github.com/WebAssembly/simd/pull/232 */ - - 
const __m128d all_but_sign_set = _mm_castsi128_pd(_mm_set1_epi64x(INT64_C(0x7FFFFFFFFFFFFFFF))); - /* https://stackoverflow.com/a/55077612 explains this a bit */ - const __m128d bignum = _mm_set1_pd(4.50359962737049600000e+15); - const __m128d sign_cleared = _mm_and_pd(a_.sse_m128d, all_but_sign_set); - - __m128d mask = - _mm_and_pd( - _mm_cmpnle_pd(bignum, sign_cleared), - all_but_sign_set - ); - const __m128d tmp = - _mm_or_pd( - _mm_andnot_pd(mask, a_.sse_m128d), - _mm_and_pd (mask, _mm_sub_pd(_mm_add_pd(sign_cleared, bignum), bignum)) - ); - - r_.sse_m128d = - _mm_add_pd( - tmp, - _mm_and_pd(_mm_and_pd(_mm_cmplt_pd(tmp, a_.sse_m128d), all_but_sign_set), _mm_set1_pd(1.0)) - ); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vrndpq_f64(a_.neon_f64); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) @@ -8634,7 +9028,7 @@ simde_wasm_f64x2_ceil (simde_v128_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_ceil(a_.f64[i]); + r_.f64[i] = simde_math_quiet(simde_math_ceil(a_.f64[i])); } #endif @@ -8689,7 +9083,7 @@ simde_wasm_f32x4_floor (simde_v128_t a) { ) ); #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) - r_.neon_f32 = vrndmq_f32(a_.f32); + r_.neon_f32 = vrndmq_f32(a_.neon_f32); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) const int32x4_t input_as_int = vcvtq_s32_f32(a_.f32); const float32x4_t input_truncated = vcvtq_f32_s32(input_as_int); @@ -8722,7 +9116,7 @@ simde_wasm_f32x4_floor (simde_v128_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_floorf(a_.f32[i]); + r_.f32[i] = simde_math_quietf(simde_math_floorf(a_.f32[i])); } #endif @@ -8745,7 +9139,7 @@ simde_wasm_f64x2_floor (simde_v128_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_floor(a_.f64[i]); + r_.f64[i] = simde_math_quiet(simde_math_floor(a_.f64[i])); } return simde_v128_from_private(r_); @@ -8769,7 +9163,7 @@ simde_wasm_f32x4_trunc (simde_v128_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_truncf(a_.f32[i]); + r_.f32[i] = simde_math_quietf(simde_math_truncf(a_.f32[i])); } return simde_v128_from_private(r_); @@ -8791,7 +9185,7 @@ simde_wasm_f64x2_trunc (simde_v128_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_trunc(a_.f64[i]); + r_.f64[i] = simde_math_quiet(simde_math_trunc(a_.f64[i])); } return simde_v128_from_private(r_); @@ -8815,7 +9209,7 @@ simde_wasm_f32x4_nearest (simde_v128_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_roundf(a_.f32[i]); + r_.f32[i] = simde_math_quietf(simde_math_nearbyintf(a_.f32[i])); } return simde_v128_from_private(r_); @@ -8837,7 +9231,7 @@ simde_wasm_f64x2_nearest (simde_v128_t a) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_round(a_.f64[i]); + r_.f64[i] = simde_math_quiet(simde_math_nearbyint(a_.f64[i])); } return simde_v128_from_private(r_); @@ -8868,7 +9262,7 @@ simde_wasm_f32x4_sqrt (simde_v128_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_sqrtf(a_.f32[i]); + r_.f32[i] = simde_math_quietf(simde_math_sqrtf(a_.f32[i])); } #endif @@ -8889,7 +9283,7 @@ simde_wasm_f64x2_sqrt (simde_v128_t a) { a_ = simde_v128_to_private(a), r_; - 
#if defined(SIMDE_X86_SSE_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) r_.sse_m128d = _mm_sqrt_pd(a_.sse_m128d); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vsqrtq_f64(a_.neon_f64); @@ -8898,7 +9292,7 @@ simde_wasm_f64x2_sqrt (simde_v128_t a) { #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_sqrt(a_.f64[i]); + r_.f64[i] = simde_math_quiet(simde_math_sqrt(a_.f64[i])); } #endif diff --git a/lib/simd_wrapper/simde/x86/aes.h b/lib/simd_wrapper/simde/x86/aes.h new file mode 100644 index 00000000000..1d5b0492684 --- /dev/null +++ b/lib/simd_wrapper/simde/x86/aes.h @@ -0,0 +1,417 @@ +/* MIT License + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#if !defined(SIMDE_X86_AES_H) +#define SIMDE_X86_AES_H + +/* + * Advanced Encryption Standard + * @author Dani Huertas + * @email huertas.dani@gmail.com + * + * Based on the document FIPS PUB 197 + */ + +#include "sse2.h" + +/* + * Multiplication in GF(2^8) + * http://en.wikipedia.org/wiki/Finite_field_arithmetic + * Irreducible polynomial m(x) = x8 + x4 + x3 + x + 1 + * + * NOTE: This function can be easily replaced with a look up table for a speed + * boost, at the expense of an increase in memory size. + +SIMDE_FUNCTION_ATTRIBUTES +uint8_t gmult(uint8_t a, uint8_t b) { + uint8_t p = 0, i = 0, hbs = 0; + + for (i = 0; i < 8; i++) { + if (b & 1) { + p ^= a; + } + + hbs = a & 0x80; + a <<= 1; + if (hbs) a ^= 0x1b; // 0000 0001 0001 1011 + b >>= 1; + } + + return (uint8_t)p; +} + */ + +#if !(defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)) + +#include "../simde-aes.h" + +/* + * Transformation in the Cipher and Inverse Cipher in which a Round + * Key is added to the State using an XOR operation. The length of a + * Round Key equals the size of the State (i.e., for Nb = 4, the Round + * Key length equals 128 bits/16 bytes). + */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_add_round_key(uint8_t *state, simde__m128i_private w, uint8_t r) { + + int Nb = simde_x_aes_Nb; + uint8_t c; + + for (c = 0; c < Nb; c++) { + state[Nb*0+c] = state[Nb*0+c]^w.u8[4*Nb*r+4*c+0]; + state[Nb*1+c] = state[Nb*1+c]^w.u8[4*Nb*r+4*c+1]; + state[Nb*2+c] = state[Nb*2+c]^w.u8[4*Nb*r+4*c+2]; + state[Nb*3+c] = state[Nb*3+c]^w.u8[4*Nb*r+4*c+3]; + } +} + +/* + * Transformation in the Cipher that takes all of the columns of the + * State and mixes their data (independently of one another) to + * produce new columns. 
+ */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_mix_columns(uint8_t *state) { + + int Nb = simde_x_aes_Nb; + // uint8_t k[] = {0x02, 0x01, 0x01, 0x03}; // a(x) = {02} + {01}x + {01}x2 + {03}x3 + uint8_t i, j, col[4], res[4]; + + for (j = 0; j < Nb; j++) { + for (i = 0; i < 4; i++) { + col[i] = state[Nb*i+j]; + } + + //coef_mult(k, col, res); + simde_x_aes_coef_mult_lookup(0, col, res); + + for (i = 0; i < 4; i++) { + state[Nb*i+j] = res[i]; + } + } +} + +/* + * Transformation in the Inverse Cipher that is the inverse of + * MixColumns(). + */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_inv_mix_columns(uint8_t *state) { + + int Nb = simde_x_aes_Nb; + // uint8_t k[] = {0x0e, 0x09, 0x0d, 0x0b}; // a(x) = {0e} + {09}x + {0d}x2 + {0b}x3 + uint8_t i, j, col[4], res[4]; + + for (j = 0; j < Nb; j++) { + for (i = 0; i < 4; i++) { + col[i] = state[Nb*i+j]; + } + + //coef_mult(k, col, res); + simde_x_aes_coef_mult_lookup(4, col, res); + + for (i = 0; i < 4; i++) { + state[Nb*i+j] = res[i]; + } + } +} + +/* + * Transformation in the Cipher that processes the State by cyclically + * shifting the last three rows of the State by different offsets. + */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_shift_rows(uint8_t *state) { + + int Nb = simde_x_aes_Nb; + uint8_t i, k, s, tmp; + + for (i = 1; i < 4; i++) { + // shift(1,4)=1; shift(2,4)=2; shift(3,4)=3 + // shift(r, 4) = r; + s = 0; + while (s < i) { + tmp = state[Nb*i+0]; + + for (k = 1; k < Nb; k++) { + state[Nb*i+k-1] = state[Nb*i+k]; + } + + state[Nb*i+Nb-1] = tmp; + s++; + } + } +} + +/* + * Transformation in the Inverse Cipher that is the inverse of + * ShiftRows(). + */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_inv_shift_rows(uint8_t *state) { + + uint8_t Nb = simde_x_aes_Nb; + uint8_t i, k, s, tmp; + + for (i = 1; i < 4; i++) { + s = 0; + while (s < i) { + tmp = state[Nb*i+Nb-1]; + + for (k = Nb-1; k > 0; k--) { + state[Nb*i+k] = state[Nb*i+k-1]; + } + + state[Nb*i+0] = tmp; + s++; + } + } +} + +/* + * Transformation in the Cipher that processes the State using a non + * linear byte substitution table (S-box) that operates on each of the + * State bytes independently. + */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_sub_bytes(uint8_t *state) { + + int Nb = simde_x_aes_Nb; + uint8_t i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < Nb; j++) { + // s_box row: yyyy ---- + // s_box col: ---- xxxx + // s_box[16*(yyyy) + xxxx] == s_box[yyyyxxxx] + state[Nb*i+j] = simde_x_aes_s_box[state[Nb*i+j]]; + } + } +} + +/* + * Transformation in the Inverse Cipher that is the inverse of + * SubBytes(). 
+ */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_inv_sub_bytes(uint8_t *state) { + + int Nb = simde_x_aes_Nb; + uint8_t i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < Nb; j++) { + state[Nb*i+j] = simde_x_aes_inv_s_box[state[Nb*i+j]]; + } + } +} + +/* + * Performs the AES cipher operation + */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_enc(simde__m128i_private in, simde__m128i_private *out, simde__m128i_private w, int is_last) { + + int Nb = simde_x_aes_Nb; + uint8_t state[4*simde_x_aes_Nb]; + uint8_t r = 0, i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < Nb; j++) { + state[Nb*i+j] = in.u8[i+4*j]; + } + } + + simde_x_aes_sub_bytes(state); + simde_x_aes_shift_rows(state); + + if (!is_last) + simde_x_aes_mix_columns(state); + + simde_x_aes_add_round_key(state, w, r); + + for (i = 0; i < 4; i++) { + for (j = 0; j < Nb; j++) { + out->u8[i+4*j] = state[Nb*i+j]; + } + } +} + +/* + * Performs the AES inverse cipher operation + */ +SIMDE_FUNCTION_ATTRIBUTES +void simde_x_aes_dec(simde__m128i_private in, simde__m128i_private *out, simde__m128i_private w, int is_last) { + + int Nb = simde_x_aes_Nb; + uint8_t state[4*simde_x_aes_Nb]; + uint8_t r = 0, i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < Nb; j++) { + state[Nb*i+j] = in.u8[i+4*j]; + } + } + + simde_x_aes_inv_shift_rows(state); + simde_x_aes_inv_sub_bytes(state); + + if (!is_last) + simde_x_aes_inv_mix_columns(state); + + simde_x_aes_add_round_key(state, w, r); + + for (i = 0; i < 4; i++) { + for (j = 0; j < Nb; j++) { + out->u8[i+4*j] = state[Nb*i+j]; + } + } +} +#endif // if !(defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)) + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_aesenc_si128(simde__m128i a, simde__m128i round_key) { + #if defined(SIMDE_X86_AES_NATIVE) + return _mm_aesenc_si128(a, round_key); + #else + simde__m128i_private result_; + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private round_key_ = simde__m128i_to_private(round_key); + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + result_.neon_u8 = veorq_u8( + vaesmcq_u8(vaeseq_u8(a_.neon_u8, vdupq_n_u8(0))), + round_key_.neon_u8); + #else + simde_x_aes_enc(a_, &result_, round_key_, 0); + #endif + return simde__m128i_from_private(result_); + #endif +} +#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) + #define _mm_aesenc_si128(a, b) simde_mm_aesenc_si128(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_aesdec_si128(simde__m128i a, simde__m128i round_key) { + #if defined(SIMDE_X86_AES_NATIVE) + return _mm_aesdec_si128(a, round_key); + #else + simde__m128i_private result_; + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private round_key_ = simde__m128i_to_private(round_key); + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + result_.neon_u8 = veorq_u8( + vaesimcq_u8(vaesdq_u8(a_.neon_u8, vdupq_n_u8(0))), + round_key_.neon_u8); + #else + simde_x_aes_dec(a_, &result_, round_key_, 0); + #endif + return simde__m128i_from_private(result_); + #endif +} +#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) + #define _mm_aesdec_si128(a, b) simde_mm_aesdec_si128(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_aesenclast_si128(simde__m128i a, simde__m128i round_key) { + #if defined(SIMDE_X86_AES_NATIVE) + return _mm_aesenclast_si128(a, round_key); + #else + simde__m128i_private result_; + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private round_key_ = 
simde__m128i_to_private(round_key); + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + result_.neon_u8 = vaeseq_u8(a_.neon_u8, vdupq_n_u8(0)); + result_.neon_i32 = veorq_s32(result_.neon_i32, round_key_.neon_i32); // _mm_xor_si128 + #else + simde_x_aes_enc(a_, &result_, round_key_, 1); + #endif + return simde__m128i_from_private(result_); + #endif +} +#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) + #define _mm_aesenclast_si128(a, b) simde_mm_aesenclast_si128(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_aesdeclast_si128(simde__m128i a, simde__m128i round_key) { + #if defined(SIMDE_X86_AES_NATIVE) + return _mm_aesdeclast_si128(a, round_key); + #else + simde__m128i_private result_; + simde__m128i_private a_ = simde__m128i_to_private(a); + simde__m128i_private round_key_ = simde__m128i_to_private(round_key); + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + result_.neon_u8 = veorq_u8( + vaesdq_u8(a_.neon_u8, vdupq_n_u8(0)), + round_key_.neon_u8); + #else + simde_x_aes_dec(a_, &result_, round_key_, 1); + #endif + return simde__m128i_from_private(result_); + #endif +} +#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) + #define _mm_aesdeclast_si128(a, b) simde_mm_aesdeclast_si128(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m128i simde_mm_aesimc_si128(simde__m128i a) { + #if defined(SIMDE_X86_AES_NATIVE) + return _mm_aesimc_si128(a); + #else + simde__m128i_private result_ = simde__m128i_to_private(simde_mm_setzero_si128()); + simde__m128i_private a_ = simde__m128i_to_private(a); + + #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) + result_.neon_u8 = vaesimcq_u8(a_.neon_u8); + #else + int Nb = simde_x_aes_Nb; + // uint8_t k[] = {0x0e, 0x09, 0x0d, 0x0b}; // a(x) = {0e} + {09}x + {0d}x2 + {0b}x3 + uint8_t i, j, col[4], res[4]; + + for (j = 0; j < Nb; j++) { + for (i = 0; i < 4; i++) { + col[i] = a_.u8[Nb*j+i]; + } + + //coef_mult(k, col, res); + simde_x_aes_coef_mult_lookup(4, col, res); + + for (i = 0; i < 4; i++) { + result_.u8[Nb*j+i] = res[i]; + } + } + #endif + return simde__m128i_from_private(result_); + #endif +} +#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) + #define _mm_aesimc_si128(a) simde_mm_aesimc_si128(a) +#endif + +#undef simde_x_aes_Nb + +#endif /* !defined(SIMDE_X86_AES_H) */ diff --git a/lib/simd_wrapper/simde/x86/avx.h b/lib/simd_wrapper/simde/x86/avx.h index a10974c92d1..2314f955690 100644 --- a/lib/simd_wrapper/simde/x86/avx.h +++ b/lib/simd_wrapper/simde/x86/avx.h @@ -30,6 +30,7 @@ #define SIMDE_X86_AVX_H #include "sse4.2.h" +#include "../simde-f16.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -165,6 +166,11 @@ typedef union { SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; #endif + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_ALIGN_TO_32 simde_float16 f16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; + #else + SIMDE_ALIGN_TO_32 simde_float16 f16[16]; + #endif SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_32 simde_float64 f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; @@ -184,6 +190,7 @@ typedef union { SIMDE_ALIGN_TO_32 simde_int128 i128[2]; SIMDE_ALIGN_TO_32 simde_uint128 u128[2]; #endif + SIMDE_ALIGN_TO_32 simde_float16 f16[16]; SIMDE_ALIGN_TO_32 simde_float32 f32[8]; SIMDE_ALIGN_TO_32 simde_float64 f64[4]; #endif @@ -1943,6 +1950,8 @@ simde__m128 
simde_mm_broadcast_ss (simde_float32 const * a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm_broadcast_ss(a); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128_from_wasm_v128(wasm_v128_load32_splat(a)); #else return simde_mm_set1_ps(*a); #endif @@ -2127,7 +2136,7 @@ simde_mm256_round_ps (simde__m256 a, const int rounding) { #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm256_round_ps(a, rounding) SIMDE_STATEMENT_EXPR_(({ \ simde__m256_private \ - simde_mm256_round_ps_r_, \ + simde_mm256_round_ps_r_ = simde__m256_to_private(simde_mm256_setzero_ps()), \ simde_mm256_round_ps_a_ = simde__m256_to_private(a); \ \ for (size_t simde_mm256_round_ps_i = 0 ; simde_mm256_round_ps_i < (sizeof(simde_mm256_round_ps_r_.m128) / sizeof(simde_mm256_round_ps_r_.m128[0])) ; simde_mm256_round_ps_i++) { \ @@ -2201,7 +2210,7 @@ simde_mm256_round_pd (simde__m256d a, const int rounding) { #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm256_round_pd(a, rounding) SIMDE_STATEMENT_EXPR_(({ \ simde__m256d_private \ - simde_mm256_round_pd_r_, \ + simde_mm256_round_pd_r_ = simde__m256d_to_private(simde_mm256_setzero_pd()), \ simde_mm256_round_pd_a_ = simde__m256d_to_private(a); \ \ for (size_t simde_mm256_round_pd_i = 0 ; simde_mm256_round_pd_i < (sizeof(simde_mm256_round_pd_r_.m128d) / sizeof(simde_mm256_round_pd_r_.m128d[0])) ; simde_mm256_round_pd_i++) { \ @@ -3093,7 +3102,7 @@ simde_mm256_cmp_ps #elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(128) #define simde_mm256_cmp_ps(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m256_private \ - simde_mm256_cmp_ps_r_, \ + simde_mm256_cmp_ps_r_ = simde__m256_to_private(simde_mm256_setzero_ps()), \ simde_mm256_cmp_ps_a_ = simde__m256_to_private((a)), \ simde_mm256_cmp_ps_b_ = simde__m256_to_private((b)); \ \ @@ -3539,7 +3548,8 @@ simde_mm256_insert_epi8 (simde__m256i a, int8_t i, const int index) return simde__m256i_from_private(a_); } -#if defined(SIMDE_X86_AVX_NATIVE) +#if defined(SIMDE_X86_AVX_NATIVE) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) #define simde_mm256_insert_epi8(a, i, index) _mm256_insert_epi8(a, i, index) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -3557,7 +3567,8 @@ simde_mm256_insert_epi16 (simde__m256i a, int16_t i, const int index) return simde__m256i_from_private(a_); } -#if defined(SIMDE_X86_AVX_NATIVE) +#if defined(SIMDE_X86_AVX_NATIVE) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) #define simde_mm256_insert_epi16(a, i, index) _mm256_insert_epi16(a, i, index) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -3575,7 +3586,8 @@ simde_mm256_insert_epi32 (simde__m256i a, int32_t i, const int index) return simde__m256i_from_private(a_); } -#if defined(SIMDE_X86_AVX_NATIVE) +#if defined(SIMDE_X86_AVX_NATIVE) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) #define simde_mm256_insert_epi32(a, i, index) _mm256_insert_epi32(a, i, index) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -3613,6 +3625,9 @@ simde__m256d simde_mm256_insertf128_pd(simde__m256d a, simde__m128d b, int imm8) return simde__m256d_from_private(a_); } +#if defined(SIMDE_X86_AVX_NATIVE) + #define simde_mm256_insertf128_pd(a, b, imm8) _mm256_insertf128_pd(a, b, imm8) +#endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) #undef _mm256_insertf128_pd #define _mm256_insertf128_pd(a, b, imm8) simde_mm256_insertf128_pd(a, b, imm8) @@ -3628,6 +3643,9 @@ 
simde__m256 simde_mm256_insertf128_ps(simde__m256 a, simde__m128 b, int imm8) return simde__m256_from_private(a_); } +#if defined(SIMDE_X86_AVX_NATIVE) + #define simde_mm256_insertf128_ps(a, b, imm8) _mm256_insertf128_ps(a, b, imm8) +#endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) #undef _mm256_insertf128_ps #define _mm256_insertf128_ps(a, b, imm8) simde_mm256_insertf128_ps(a, b, imm8) @@ -3643,6 +3661,9 @@ simde__m256i simde_mm256_insertf128_si256(simde__m256i a, simde__m128i b, int im return simde__m256i_from_private(a_); } +#if defined(SIMDE_X86_AVX_NATIVE) + #define simde_mm256_insertf128_si256(a, b, imm8) _mm256_insertf128_si256(a, b, imm8) +#endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) #undef _mm256_insertf128_si256 #define _mm256_insertf128_si256(a, b, imm8) simde_mm256_insertf128_si256(a, b, imm8) @@ -3668,7 +3689,8 @@ simde_mm256_extract_epi32 (simde__m256i a, const int index) simde__m256i_private a_ = simde__m256i_to_private(a); return a_.i32[index]; } -#if defined(SIMDE_X86_AVX_NATIVE) +#if defined(SIMDE_X86_AVX_NATIVE) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) #define simde_mm256_extract_epi32(a, index) _mm256_extract_epi32(a, index) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -3789,12 +3811,15 @@ simde_mm256_loadu_ps (const float a[HEDLEY_ARRAY_PARAM(8)]) { #define _mm256_loadu_ps(a) simde_mm256_loadu_ps(a) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ + && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm256_loadu_epi8(mem_addr) _mm256_loadu_epi8(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_loadu_epi8(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm256_loadu_epi8(mem_addr); - #elif defined(SIMDE_X86_AVX_NATIVE) + #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); #else simde__m256i r; @@ -3802,18 +3827,22 @@ simde_mm256_loadu_epi8(void const * mem_addr) { return r; #endif } +#endif #define simde_x_mm256_loadu_epi8(mem_addr) simde_mm256_loadu_epi8(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm256_loadu_epi8 #define _mm256_loadu_epi8(a) simde_mm256_loadu_epi8(a) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ + && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm256_loadu_epi16(mem_addr) _mm256_loadu_epi16(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_loadu_epi16(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm256_loadu_epi16(mem_addr); - #elif defined(SIMDE_X86_AVX_NATIVE) + #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); #else simde__m256i r; @@ -3821,18 +3850,22 @@ simde_mm256_loadu_epi16(void const * mem_addr) { return r; #endif } +#endif #define 
simde_x_mm256_loadu_epi16(mem_addr) simde_mm256_loadu_epi16(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm256_loadu_epi16 #define _mm256_loadu_epi16(a) simde_mm256_loadu_epi16(a) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ + && !defined(SIMDE_BUG_CLANG_REV_344862) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm256_loadu_epi32(mem_addr) _mm256_loadu_epi32(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_loadu_epi32(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm256_loadu_epi32(mem_addr); - #elif defined(SIMDE_X86_AVX_NATIVE) + #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); #else simde__m256i r; @@ -3840,18 +3873,22 @@ simde_mm256_loadu_epi32(void const * mem_addr) { return r; #endif } +#endif #define simde_x_mm256_loadu_epi32(mem_addr) simde_mm256_loadu_epi32(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm256_loadu_epi32 #define _mm256_loadu_epi32(a) simde_mm256_loadu_epi32(a) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ + && !defined(SIMDE_BUG_CLANG_REV_344862) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm256_loadu_epi64(mem_addr) _mm256_loadu_epi64(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_loadu_epi64(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm256_loadu_epi64(mem_addr); - #elif defined(SIMDE_X86_AVX_NATIVE) + #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); #else simde__m256i r; @@ -3859,6 +3896,7 @@ simde_mm256_loadu_epi64(void const * mem_addr) { return r; #endif } +#endif #define simde_x_mm256_loadu_epi64(mem_addr) simde_mm256_loadu_epi64(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm256_loadu_epi64 @@ -3931,7 +3969,7 @@ simde_mm256_loadu2_m128i (const simde__m128i* hiaddr, const simde__m128i* loaddr SIMDE_FUNCTION_ATTRIBUTES simde__m128d -simde_mm_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask) { +simde_mm_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask) { #if defined(SIMDE_X86_AVX_NATIVE) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) return _mm_maskload_pd(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m128d, mask)); @@ -3939,19 +3977,26 @@ simde_mm_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde return _mm_maskload_pd(mem_addr, mask); #endif #else - simde__m128d_private - mem_ = simde__m128d_to_private(simde_mm_loadu_pd(mem_addr)), - r_; - simde__m128i_private mask_ = simde__m128i_to_private(mask); + simde__m128d_private r_; + simde__m128i_private + mask_ = simde__m128i_to_private(mask), + mask_shr_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = 
vandq_s64(mem_.neon_i64, vshrq_n_s64(mask_.neon_i64, 63)); + mask_shr_.neon_i64 = vshrq_n_s64(mask_.neon_i64, 63); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde_mm_and_pd(simde_mm_load_pd(mem_addr), + simde__m128d_from_wasm_v128(wasm_i64x2_shr(mask_.wasm_v128, 63))); #else SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = mem_.i64[i] & (mask_.i64[i] >> 63); + for (size_t i = 0 ; i < (sizeof(mask_.i64) / sizeof(mask_.i64[0])) ; i++) { + mask_shr_.i64[i] = mask_.i64[i] >> 63; } #endif + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = mask_shr_.i64[i] ? mem_addr[i] : SIMDE_FLOAT64_C(0.0); + } return simde__m128d_from_private(r_); #endif @@ -3974,10 +4019,9 @@ simde_mm256_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], si simde__m256d_private r_; simde__m256i_private mask_ = simde__m256i_to_private(mask); - r_ = simde__m256d_to_private(simde_mm256_loadu_pd(mem_addr)); SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] &= mask_.i64[i] >> 63; + r_.f64[i] = (mask_.i64[i] >> 63) ? mem_addr[i] : SIMDE_FLOAT64_C(0.0); } return simde__m256d_from_private(r_); @@ -3998,20 +4042,28 @@ simde_mm_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde return _mm_maskload_ps(mem_addr, mask); #endif #else - simde__m128_private - mem_ = simde__m128_to_private(simde_mm_loadu_ps(mem_addr)), - r_; - simde__m128i_private mask_ = simde__m128i_to_private(mask); + simde__m128_private r_; + simde__m128i_private + mask_ = simde__m128i_to_private(mask), + mask_shr_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vandq_s32(mem_.neon_i32, vshrq_n_s32(mask_.neon_i32, 31)); + mask_shr_.neon_i32 = vshrq_n_s32(mask_.neon_i32, 31); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde_mm_and_ps(simde_mm_load_ps(mem_addr), + simde__m128_from_wasm_v128(wasm_i32x4_shr(mask_.wasm_v128, 31))); #else SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = mem_.i32[i] & (mask_.i32[i] >> 31); + for (size_t i = 0 ; i < (sizeof(mask_.i32) / sizeof(mask_.i32[0])) ; i++) { + mask_shr_.i32[i] = mask_.i32[i] >> 31; } #endif + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = mask_shr_.i32[i] ? mem_addr[i] : SIMDE_FLOAT32_C(0.0); + } + return simde__m128_from_private(r_); #endif } @@ -4022,7 +4074,7 @@ simde_mm_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde SIMDE_FUNCTION_ATTRIBUTES simde__m256 -simde_mm256_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask) { +simde_mm256_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask) { #if defined(SIMDE_X86_AVX_NATIVE) #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) return _mm256_maskload_ps(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m256, mask)); @@ -4033,10 +4085,9 @@ simde_mm256_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], si simde__m256_private r_; simde__m256i_private mask_ = simde__m256i_to_private(mask); - r_ = simde__m256_to_private(simde_mm256_loadu_ps(mem_addr)); SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] &= mask_.i32[i] >> 31; + r_.f32[i] = (mask_.i32[i] >> 31) ? 
mem_addr[i] : SIMDE_FLOAT32_C(0.0); } return simde__m256_from_private(r_); @@ -4060,11 +4111,18 @@ simde_mm_maskstore_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m12 simde__m128i_private mask_ = simde__m128i_to_private(mask); simde__m128d_private a_ = simde__m128d_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - if (mask_.u64[i] >> 63) - mem_addr[i] = a_.f64[i]; - } + #if defined(SIMDE_WASM_SIMD128_NATIVE) + if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i64x2_extract_lane(mask_.wasm_v128, 0)) & 0x8000000000000000ull) != 0) + mem_addr[0] = wasm_f64x2_extract_lane(a_.wasm_v128, 0); + if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i64x2_extract_lane(mask_.wasm_v128, 1)) & 0x8000000000000000ull) != 0) + mem_addr[1] = wasm_f64x2_extract_lane(a_.wasm_v128, 1); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { + if (mask_.u64[i] >> 63) + mem_addr[i] = a_.f64[i]; + } + #endif #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -4110,11 +4168,22 @@ simde_mm_maskstore_ps (simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m12 simde__m128i_private mask_ = simde__m128i_to_private(mask); simde__m128_private a_ = simde__m128_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - if (mask_.u32[i] & (UINT32_C(1) << 31)) - mem_addr[i] = a_.f32[i]; - } + #if defined(SIMDE_WASM_SIMD128_NATIVE) + if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 0)) & 0x80000000ull) != 0) + mem_addr[0] = wasm_f32x4_extract_lane(a_.wasm_v128, 0); + if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 1)) & 0x80000000ull) != 0) + mem_addr[1] = wasm_f32x4_extract_lane(a_.wasm_v128, 1); + if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 2)) & 0x80000000ull) != 0) + mem_addr[2] = wasm_f32x4_extract_lane(a_.wasm_v128, 2); + if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 3)) & 0x80000000ull) != 0) + mem_addr[3] = wasm_f32x4_extract_lane(a_.wasm_v128, 3); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + if (mask_.u32[i] & (UINT32_C(1) << 31)) + mem_addr[i] = a_.f32[i]; + } + #endif #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -4575,6 +4644,8 @@ simde_mm_permute_ps (simde__m128 a, const int imm8) } #if defined(SIMDE_X86_AVX_NATIVE) # define simde_mm_permute_ps(a, imm8) _mm_permute_ps(a, imm8) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +# define simde_mm_permute_ps(a, imm8) simde__m128_from_wasm_v128(wasm_i32x4_shuffle(simde__m128_to_wasm_v128(a), simde__m128_to_wasm_v128(a), ((imm8) & 3), (((imm8) >> 2) & 3 ), (((imm8) >> 4) & 3), (((imm8) >> 6) & 3))) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) #undef _mm_permute_ps @@ -4599,6 +4670,8 @@ simde_mm_permute_pd (simde__m128d a, const int imm8) } #if defined(SIMDE_X86_AVX_NATIVE) # define simde_mm_permute_pd(a, imm8) _mm_permute_pd(a, imm8) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +# define simde_mm_permute_pd(a, imm8) simde__m128d_from_wasm_v128(wasm_i64x2_shuffle(simde__m128d_to_wasm_v128(a), simde__m128d_to_wasm_v128(a), ((imm8) & 1), (((imm8) >> 1) & 1 ))) #endif #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) #undef _mm_permute_pd @@ -4616,10 +4689,18 @@ simde_mm_permutevar_ps (simde__m128 a, simde__m128i b) { a_ = simde__m128_to_private(a); simde__m128i_private b_ = 
simde__m128i_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[b_.i32[i] & 3]; - } + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_make( + (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 0) & 3]), + (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 1) & 3]), + (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 2) & 3]), + (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 3) & 3])); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = a_.f32[b_.i32[i] & 3]; + } + #endif return simde__m128_from_private(r_); #endif @@ -4640,10 +4721,16 @@ simde_mm_permutevar_pd (simde__m128d a, simde__m128i b) { a_ = simde__m128d_to_private(a); simde__m128i_private b_ = simde__m128i_to_private(b); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[(b_.i64[i] & 2) >> 1]; - } + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_make( + (a_.f64[(wasm_i64x2_extract_lane(b_.wasm_v128, 0) >> 1) & 1]), + (a_.f64[(wasm_i64x2_extract_lane(b_.wasm_v128, 1) >> 1) & 1])); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { + r_.f64[i] = a_.f64[(b_.i64[i] & 2) >> 1]; + } + #endif return simde__m128d_from_private(r_); #endif @@ -5049,8 +5136,8 @@ simde_mm256_shuffle_pd (simde__m256d a, simde__m256d b, const int imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) #define simde_mm256_shuffle_pd(a, b, imm8) \ simde_mm256_set_m128d( \ - simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 1), simde_mm256_extractf128_pd(b, 1), (imm8 >> 0) & 3), \ - simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 0), simde_mm256_extractf128_pd(b, 0), (imm8 >> 2) & 3)) + simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 1), simde_mm256_extractf128_pd(b, 1), (imm8 >> 2) & 3), \ + simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 0), simde_mm256_extractf128_pd(b, 0), (imm8 >> 0) & 3)) #elif defined(SIMDE_SHUFFLE_VECTOR_) #define simde_mm256_shuffle_pd(a, b, imm8) \ SIMDE_SHUFFLE_VECTOR_(64, 32, a, b, \ @@ -5258,6 +5345,8 @@ void simde_mm256_stream_ps (simde_float32 mem_addr[8], simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_stream_ps(mem_addr, a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) + __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, sizeof(a)); #endif @@ -5272,6 +5361,8 @@ void simde_mm256_stream_pd (simde_float64 mem_addr[4], simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_stream_pd(mem_addr, a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) + __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a)); #endif @@ -5286,8 +5377,10 @@ void simde_mm256_stream_si256 (simde__m256i* mem_addr, simde__m256i a) { #if defined(SIMDE_X86_AVX_NATIVE) _mm256_stream_si256(mem_addr, a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) + __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a)); + simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a)); #endif } #if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) @@ -6072,7 +6165,7 @@ 
simde_mm_testnzc_pd (simde__m128d a, simde__m128d b) { v128_t m = wasm_u64x2_shr(wasm_v128_and(a_.wasm_v128, b_.wasm_v128), 63); v128_t m2 = wasm_u64x2_shr(wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128), 63); return HEDLEY_STATIC_CAST(int, (wasm_i64x2_extract_lane(m, 0) | wasm_i64x2_extract_lane(m, 1)) - & (wasm_i64x2_extract_lane(m2, 0) | wasm_i64x2_extract_lane(m2, 1))); + & (wasm_i64x2_extract_lane(m2, 0) | wasm_i64x2_extract_lane(m2, 1))); #else uint64_t rc = 0, rz = 0; for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { diff --git a/lib/simd_wrapper/simde/x86/avx2.h b/lib/simd_wrapper/simde/x86/avx2.h index e6d4b1f4667..c01c1e8f032 100644 --- a/lib/simd_wrapper/simde/x86/avx2.h +++ b/lib/simd_wrapper/simde/x86/avx2.h @@ -46,7 +46,7 @@ simde_mm256_abs_epi8 (simde__m256i a) { r_, a_ = simde__m256i_to_private(a); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_abs_epi8(a_.m128i[0]); r_.m128i[1] = simde_mm_abs_epi8(a_.m128i[1]); #else @@ -74,7 +74,7 @@ simde_mm256_abs_epi16 (simde__m256i a) { r_, a_ = simde__m256i_to_private(a); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_abs_epi16(a_.m128i[0]); r_.m128i[1] = simde_mm_abs_epi16(a_.m128i[1]); #else @@ -102,7 +102,7 @@ simde_mm256_abs_epi32(simde__m256i a) { r_, a_ = simde__m256i_to_private(a); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_abs_epi32(a_.m128i[0]); r_.m128i[1] = simde_mm_abs_epi32(a_.m128i[1]); #else @@ -131,7 +131,7 @@ simde_mm256_add_epi8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_add_epi8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_add_epi8(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -162,7 +162,7 @@ simde_mm256_add_epi16 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_add_epi16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_add_epi16(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -207,7 +207,7 @@ simde_mm256_add_epi32 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_add_epi32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_add_epi32(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -252,7 +252,7 @@ simde_mm256_add_epi64 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_add_epi64(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_add_epi64(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_CLANG_BAD_VI64_OPS) @@ -302,7 +302,7 @@ simde_mm256_alignr_epi8 (simde__m256i a, simde__m256i b, int count) } #if defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_BUG_PGI_30106) # define simde_mm256_alignr_epi8(a, b, count) _mm256_alignr_epi8(a, b, count) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_alignr_epi8(a, b, count) \ 
simde_mm256_set_m128i( \ simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (count)), \ @@ -324,7 +324,7 @@ simde_mm256_and_si256 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_and_si128(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_and_si128(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -355,7 +355,7 @@ simde_mm256_andnot_si256 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_andnot_si128(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_andnot_si128(a_.m128i[1], b_.m128i[1]); #else @@ -384,7 +384,7 @@ simde_mm256_adds_epi8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_adds_epi8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_adds_epi8(a_.m128i[1], b_.m128i[1]); #else @@ -413,7 +413,7 @@ simde_mm256_adds_epi16(simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_adds_epi16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_adds_epi16(a_.m128i[1], b_.m128i[1]); #else @@ -456,7 +456,7 @@ simde_mm256_adds_epu8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_adds_epu8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_adds_epu8(a_.m128i[1], b_.m128i[1]); #else @@ -485,7 +485,7 @@ simde_mm256_adds_epu16(simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_adds_epu16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_adds_epu16(a_.m128i[1], b_.m128i[1]); #else @@ -569,7 +569,7 @@ simde_mm_blend_epi32(simde__m128i a, simde__m128i b, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm_blend_epi32(a, b, imm8) _mm_blend_epi32(a, b, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) +#elif SIMDE_NATURAL_FLOAT_VECTOR_SIZE_LE(128) # define simde_mm_blend_epi32(a, b, imm8) \ simde_mm_castps_si128(simde_mm_blend_ps(simde_mm_castsi128_ps(a), simde_mm_castsi128_ps(b), (imm8))) #endif @@ -598,7 +598,7 @@ simde_mm256_blend_epi16(simde__m256i a, simde__m256i b, const int imm8) # define simde_mm256_blend_epi16(a, b, imm8) _mm256_castpd_si256(_mm256_blend_epi16(a, b, imm8)) #elif defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_blend_epi16(a, b, imm8) _mm256_blend_epi16(a, b, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_blend_epi16(a, b, imm8) \ simde_mm256_set_m128i( \ simde_mm_blend_epi16(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8)), \ @@ -628,7 +628,7 @@ simde_mm256_blend_epi32(simde__m256i a, simde__m256i b, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_blend_epi32(a, b, imm8) _mm256_blend_epi32(a, b, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) +#elif 
SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_blend_epi32(a, b, imm8) \ simde_mm256_set_m128i( \ simde_mm_blend_epi32(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8) >> 4), \ @@ -652,17 +652,17 @@ simde_mm256_blendv_epi8(simde__m256i a, simde__m256i b, simde__m256i mask) { b_ = simde__m256i_to_private(b), mask_ = simde__m256i_to_private(mask); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_blendv_epi8(a_.m128i[0], b_.m128i[0], mask_.m128i[0]); r_.m128i[1] = simde_mm_blendv_epi8(a_.m128i[1], b_.m128i[1], mask_.m128i[1]); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + __typeof__(mask_.i8) tmp = mask_.i8 >> 7; + r_.i8 = (tmp & b_.i8) | (~tmp & a_.i8); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - if (mask_.u8[i] & 0x80) { - r_.u8[i] = b_.u8[i]; - } else { - r_.u8[i] = a_.u8[i]; - } + int8_t tmp = mask_.i8[i] >> 7; + r_.i8[i] = (tmp & b_.i8[i]) | (~tmp & a_.i8[i]); } #endif @@ -858,14 +858,20 @@ simde__m128 simde_mm_broadcastss_ps (simde__m128 a) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm_broadcastss_ps(a); + #elif defined(SIMDE_X86_SSE_NATIVE) + return simde_mm_shuffle_ps(a, a, 0); #else simde__m128_private r_; simde__m128_private a_= simde__m128_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[0]; - } + #if defined(SIMDE_SHUFFLE_VECTOR_) + r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = a_.f32[0]; + } + #endif return simde__m128_from_private(r_); #endif @@ -884,10 +890,19 @@ simde_mm256_broadcastss_ps (simde__m128 a) { simde__m256_private r_; simde__m128_private a_= simde__m128_to_private(a); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[0]; - } + #if defined(SIMDE_X86_AVX_NATIVE) + __m128 tmp = _mm_permute_ps(a_.n, 0); + r_.n = _mm256_insertf128_ps(_mm256_castps128_ps256(tmp), tmp, 1); + #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) + r_.f32 = __builtin_shufflevector(a_.f32, a_.f32, 0, 0, 0, 0, 0, 0, 0, 0); + #elif SIMDE_NATURAL_FLOAT_VECTOR_SIZE_LE(128) + r_.m128[0] = r_.m128[1] = simde_mm_broadcastss_ps(simde__m128_from_private(a_)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = a_.f32[0]; + } + #endif return simde__m256_from_private(r_); #endif @@ -939,7 +954,7 @@ simde_mm256_broadcastsi128_si256 (simde__m128i a) { simde__m256i_private r_; simde__m128i_private a_ = simde__m128i_to_private(a); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i_private[0] = a_; r_.m128i_private[1] = a_; #else @@ -1047,7 +1062,7 @@ simde_mm256_cmpeq_epi8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_cmpeq_epi8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_cmpeq_epi8(a_.m128i[1], b_.m128i[1]); #else @@ -1076,7 +1091,7 @@ simde_mm256_cmpeq_epi16 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_cmpeq_epi16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = 
simde_mm_cmpeq_epi16(a_.m128i[1], b_.m128i[1]); #else @@ -1105,7 +1120,7 @@ simde_mm256_cmpeq_epi32 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_cmpeq_epi32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_cmpeq_epi32(a_.m128i[1], b_.m128i[1]); #else @@ -1134,7 +1149,7 @@ simde_mm256_cmpeq_epi64 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_cmpeq_epi64(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_cmpeq_epi64(a_.m128i[1], b_.m128i[1]); #else @@ -1163,7 +1178,7 @@ simde_mm256_cmpgt_epi8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_cmpgt_epi8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_cmpgt_epi8(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -1194,7 +1209,7 @@ simde_mm256_cmpgt_epi16 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_cmpgt_epi16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_cmpgt_epi16(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -1225,7 +1240,7 @@ simde_mm256_cmpgt_epi32 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_cmpgt_epi32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_cmpgt_epi32(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -1256,7 +1271,7 @@ simde_mm256_cmpgt_epi64 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_cmpgt_epi64(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_cmpgt_epi64(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -1587,7 +1602,8 @@ simde_mm256_extract_epi8 (simde__m256i a, const int index) simde__m256i_private a_ = simde__m256i_to_private(a); return a_.i8[index]; } -#if defined(SIMDE_X86_AVX2_NATIVE) +#if defined(SIMDE_X86_AVX2_NATIVE) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) #define simde_mm256_extract_epi8(a, index) _mm256_extract_epi8(a, index) #endif #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) @@ -1602,7 +1618,8 @@ simde_mm256_extract_epi16 (simde__m256i a, const int index) simde__m256i_private a_ = simde__m256i_to_private(a); return a_.i16[index]; } -#if defined(SIMDE_X86_AVX2_NATIVE) +#if defined(SIMDE_X86_AVX2_NATIVE) && \ + (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) #define simde_mm256_extract_epi16(a, index) _mm256_extract_epi16(a, index) #endif #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) @@ -2252,11 +2269,11 @@ simde_mm256_i32gather_ps(const simde_float32* base_addr, simde__m256i vindex, co return simde__m256_from_private(r_); } #if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_i32gather_ps(base_addr, vindex, scale) _mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, 
simde_float32 const*, base_addr), vindex, scale) + #define simde_mm256_i32gather_ps(base_addr, vindex, scale) _mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, (base_addr)), (vindex), (scale)) #endif #if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) #undef _mm256_i32gather_ps - #define _mm256_i32gather_ps(base_addr, vindex, scale) simde_mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale) + #define _mm256_i32gather_ps(base_addr, vindex, scale) simde_mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, (base_addr)), (vindex), (scale)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -2715,7 +2732,7 @@ simde_mm256_madd_epi16 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_madd_epi16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_madd_epi16(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) @@ -2759,7 +2776,7 @@ simde_mm256_maddubs_epi16 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_maddubs_epi16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_maddubs_epi16(a_.m128i[1], b_.m128i[1]); #else @@ -2788,19 +2805,24 @@ simde_mm_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m return _mm_maskload_epi32(mem_addr, mask); #else simde__m128i_private - mem_ = simde__m128i_to_private(simde_x_mm_loadu_epi32(mem_addr)), r_, - mask_ = simde__m128i_to_private(mask); + mask_ = simde__m128i_to_private(mask), + mask_shr_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vandq_s32(mem_.neon_i32, vshrq_n_s32(mask_.neon_i32, 31)); + mask_shr_.neon_i32 = vshrq_n_s32(mask_.neon_i32, 31); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = mem_.i32[i] & (mask_.i32[i] >> 31); + mask_shr_.i32[i] = mask_.i32[i] >> 31; } #endif + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = mask_shr_.i32[i] ? mem_addr[i] : INT32_C(0); + } + return simde__m128i_from_private(r_); #endif } @@ -2817,11 +2839,11 @@ simde_mm256_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde #else simde__m256i_private mask_ = simde__m256i_to_private(mask), - r_ = simde__m256i_to_private(simde_x_mm256_loadu_epi32(mem_addr)); + r_; SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] &= mask_.i32[i] >> 31; + r_.i32[i] = (mask_.i32[i] >> 31) ? 
mem_addr[i] : INT32_C(0); } return simde__m256i_from_private(r_); @@ -2834,24 +2856,29 @@ simde_mm256_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde SIMDE_FUNCTION_ATTRIBUTES simde__m128i -simde_mm_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask) { +simde_mm_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm_maskload_epi64(HEDLEY_REINTERPRET_CAST(const long long *, mem_addr), mask); #else simde__m128i_private - mem_ = simde__m128i_to_private(simde_x_mm_loadu_epi64((mem_addr))), r_, - mask_ = simde__m128i_to_private(mask); + mask_ = simde__m128i_to_private(mask), + mask_shr_; #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vandq_s64(mem_.neon_i64, vshrq_n_s64(mask_.neon_i64, 63)); + mask_shr_.neon_i64 = vshrq_n_s64(mask_.neon_i64, 63); #else SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = mem_.i64[i] & (mask_.i64[i] >> 63); + for (size_t i = 0 ; i < (sizeof(mask_.i64) / sizeof(mask_.i64[0])) ; i++) { + mask_shr_.i64[i] = mask_.i64[i] >> 63; } #endif + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = mask_shr_.i64[i] ? mem_addr[i] : INT64_C(0); + } + return simde__m128i_from_private(r_); #endif } @@ -2868,11 +2895,11 @@ simde_mm256_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde #else simde__m256i_private mask_ = simde__m256i_to_private(mask), - r_ = simde__m256i_to_private(simde_x_mm256_loadu_epi64((mem_addr))); + r_; SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] &= mask_.i64[i] >> 63; + r_.i64[i] = (mask_.i64[i] >> 63) ? mem_addr[i] : INT64_C(0); } return simde__m256i_from_private(r_); @@ -2978,7 +3005,7 @@ simde_mm256_max_epi8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_max_epi8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_max_epi8(a_.m128i[1], b_.m128i[1]); #else @@ -3007,7 +3034,7 @@ simde_mm256_max_epu8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_max_epu8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_max_epu8(a_.m128i[1], b_.m128i[1]); #else @@ -3036,7 +3063,7 @@ simde_mm256_max_epu16 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_max_epu16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_max_epu16(a_.m128i[1], b_.m128i[1]); #else @@ -3065,7 +3092,7 @@ simde_mm256_max_epu32 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_max_epu32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_max_epu32(a_.m128i[1], b_.m128i[1]); #else @@ -3094,7 +3121,7 @@ simde_mm256_max_epi16 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_max_epi16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = 
simde_mm_max_epi16(a_.m128i[1], b_.m128i[1]); #else @@ -3123,7 +3150,7 @@ simde_mm256_max_epi32 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_max_epi32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_max_epi32(a_.m128i[1], b_.m128i[1]); #else @@ -3152,7 +3179,7 @@ simde_mm256_min_epi8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_min_epi8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_min_epi8(a_.m128i[1], b_.m128i[1]); #else @@ -3181,7 +3208,7 @@ simde_mm256_min_epi16 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_min_epi16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_min_epi16(a_.m128i[1], b_.m128i[1]); #else @@ -3210,7 +3237,7 @@ simde_mm256_min_epi32 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_min_epi32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_min_epi32(a_.m128i[1], b_.m128i[1]); #else @@ -3239,7 +3266,7 @@ simde_mm256_min_epu8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_min_epu8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_min_epu8(a_.m128i[1], b_.m128i[1]); #else @@ -3268,7 +3295,7 @@ simde_mm256_min_epu16 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_min_epu16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_min_epu16(a_.m128i[1], b_.m128i[1]); #else @@ -3297,7 +3324,7 @@ simde_mm256_min_epu32 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_min_epu32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_min_epu32(a_.m128i[1], b_.m128i[1]); #else @@ -3324,7 +3351,7 @@ simde_mm256_movemask_epi8 (simde__m256i a) { simde__m256i_private a_ = simde__m256i_to_private(a); uint32_t r = 0; - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) for (size_t i = 0 ; i < (sizeof(a_.m128i) / sizeof(a_.m128i[0])) ; i++) { r |= HEDLEY_STATIC_CAST(uint32_t,simde_mm_movemask_epi8(a_.m128i[i])) << (16 * i); } @@ -3380,7 +3407,7 @@ simde_mm256_mpsadbw_epu8 (simde__m256i a, simde__m256i b, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) && SIMDE_DETECT_CLANG_VERSION_CHECK(3,9,0) #define simde_mm256_mpsadbw_epu8(a, b, imm8) _mm256_mpsadbw_epu8(a, b, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) #define simde_mm256_mpsadbw_epu8(a, b, imm8) \ simde_mm256_set_m128i( \ simde_mm_mpsadbw_epu8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8 >> 3)), \ @@ -3402,7 +3429,7 @@ simde_mm256_mul_epi32 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = 
simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_mul_epi32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_mul_epi32(a_.m128i[1], b_.m128i[1]); #else @@ -3432,7 +3459,7 @@ simde_mm256_mul_epu32 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_mul_epu32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_mul_epu32(a_.m128i[1], b_.m128i[1]); #else @@ -3597,7 +3624,7 @@ simde_mm256_or_si256 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_or_si128(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_or_si128(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -3628,7 +3655,7 @@ simde_mm256_packs_epi16 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_packs_epi16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_packs_epi16(a_.m128i[1], b_.m128i[1]); #else @@ -3664,7 +3691,7 @@ simde_mm256_packs_epi32 (simde__m256i a, simde__m256i b) { simde__m256i_to_private(b) }; - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_packs_epi32(v_[0].m128i[0], v_[1].m128i[0]); r_.m128i[1] = simde_mm_packs_epi32(v_[0].m128i[1], v_[1].m128i[1]); #else @@ -3694,7 +3721,7 @@ simde_mm256_packus_epi16 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_packus_epi16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_packus_epi16(a_.m128i[1], b_.m128i[1]); #else @@ -3728,7 +3755,7 @@ simde_mm256_packus_epi32 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_packus_epi32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_packus_epi32(a_.m128i[1], b_.m128i[1]); #else @@ -3883,7 +3910,7 @@ simde_mm256_sad_epu8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_sad_epu8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_sad_epu8(a_.m128i[1], b_.m128i[1]); #else @@ -3917,7 +3944,7 @@ simde_mm256_shuffle_epi8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_shuffle_epi8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_shuffle_epi8(a_.m128i[1], b_.m128i[1]); #else @@ -3955,18 +3982,18 @@ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_shuffle_epi32(a, imm8) _mm256_shuffle_epi32(a, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(__PGI) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) && !defined(__PGI) # define simde_mm256_shuffle_epi32(a, imm8) \ simde_mm256_set_m128i( \ 
simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \ simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 0), (imm8))) #elif defined(SIMDE_SHUFFLE_VECTOR_) # define simde_mm256_shuffle_epi32(a, imm8) (__extension__ ({ \ - const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \ + const simde__m256i_private simde_tmp_a_ = simde__m256i_to_private(a); \ simde__m256i_from_private((simde__m256i_private) { .i32 = \ SIMDE_SHUFFLE_VECTOR_(32, 32, \ - (simde__tmp_a_).i32, \ - (simde__tmp_a_).i32, \ + (simde_tmp_a_).i32, \ + (simde_tmp_a_).i32, \ ((imm8) ) & 3, \ ((imm8) >> 2) & 3, \ ((imm8) >> 4) & 3, \ @@ -3983,18 +4010,18 @@ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_shufflehi_epi16(a, imm8) _mm256_shufflehi_epi16(a, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_shufflehi_epi16(a, imm8) \ simde_mm256_set_m128i( \ simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \ simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 0), (imm8))) #elif defined(SIMDE_SHUFFLE_VECTOR_) # define simde_mm256_shufflehi_epi16(a, imm8) (__extension__ ({ \ - const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \ + const simde__m256i_private simde_tmp_a_ = simde__m256i_to_private(a); \ simde__m256i_from_private((simde__m256i_private) { .i16 = \ SIMDE_SHUFFLE_VECTOR_(16, 32, \ - (simde__tmp_a_).i16, \ - (simde__tmp_a_).i16, \ + (simde_tmp_a_).i16, \ + (simde_tmp_a_).i16, \ 0, 1, 2, 3, \ (((imm8) ) & 3) + 4, \ (((imm8) >> 2) & 3) + 4, \ @@ -4019,18 +4046,18 @@ simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_shufflelo_epi16(a, imm8) _mm256_shufflelo_epi16(a, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_shufflelo_epi16(a, imm8) \ simde_mm256_set_m128i( \ simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \ simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), (imm8))) #elif defined(SIMDE_SHUFFLE_VECTOR_) # define simde_mm256_shufflelo_epi16(a, imm8) (__extension__ ({ \ - const simde__m256i_private simde__tmp_a_ = simde__m256i_to_private(a); \ + const simde__m256i_private simde_tmp_a_ = simde__m256i_to_private(a); \ simde__m256i_from_private((simde__m256i_private) { .i16 = \ SIMDE_SHUFFLE_VECTOR_(16, 32, \ - (simde__tmp_a_).i16, \ - (simde__tmp_a_).i16, \ + (simde_tmp_a_).i16, \ + (simde_tmp_a_).i16, \ (((imm8) ) & 3), \ (((imm8) >> 2) & 3), \ (((imm8) >> 4) & 3), \ @@ -4065,7 +4092,7 @@ simde_mm256_sign_epi8 (simde__m256i a, simde__m256i b) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (b_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i]; + r_.i8[i] = (b_.i8[i] == INT8_C(0)) ? INT8_C(0) : (b_.i8[i] < INT8_C(0)) ? -a_.i8[i] : a_.i8[i]; } return simde__m256i_from_private(r_); @@ -4089,7 +4116,7 @@ simde_mm256_sign_epi16 (simde__m256i a, simde__m256i b) { SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (b_.i16[i] < INT32_C(0)) ? -a_.i16[i] : a_.i16[i]; + r_.i16[i] = (b_.i16[i] == INT16_C(0)) ? INT16_C(0) : (b_.i16[i] < INT16_C(0)) ? 
-a_.i16[i] : a_.i16[i]; } return simde__m256i_from_private(r_); @@ -4113,7 +4140,7 @@ simde_mm256_sign_epi32(simde__m256i a, simde__m256i b) { SIMDE_VECTORIZE for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { - r_.i32[i] = (b_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i]; + r_.i32[i] = (b_.i32[i] == INT32_C(0)) ? INT32_C(0) : (b_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i]; } return simde__m256i_from_private(r_); @@ -4134,7 +4161,7 @@ simde_mm256_sll_epi16 (simde__m256i a, simde__m128i count) { r_, a_ = simde__m256i_to_private(a); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_sll_epi16(a_.m128i[0], count); r_.m128i[1] = simde_mm_sll_epi16(a_.m128i[1], count); #else @@ -4173,7 +4200,7 @@ simde_mm256_sll_epi32 (simde__m256i a, simde__m128i count) { r_, a_ = simde__m256i_to_private(a); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_sll_epi32(a_.m128i[0], count); r_.m128i[1] = simde_mm_sll_epi32(a_.m128i[1], count); #else @@ -4212,7 +4239,7 @@ simde_mm256_sll_epi64 (simde__m256i a, simde__m128i count) { r_, a_ = simde__m256i_to_private(a); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_sll_epi64(a_.m128i[0], count); r_.m128i[1] = simde_mm_sll_epi64(a_.m128i[1], count); #else @@ -4271,7 +4298,7 @@ simde_mm256_slli_epi16 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_slli_epi16(a, imm8) _mm256_slli_epi16(a, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_slli_epi16(a, imm8) \ simde_mm256_set_m128i( \ simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \ @@ -4308,7 +4335,7 @@ simde_mm256_slli_epi32 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_slli_epi32(a, imm8) _mm256_slli_epi32(a, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_slli_epi32(a, imm8) \ simde_mm256_set_m128i( \ simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \ @@ -4340,7 +4367,7 @@ simde_mm256_slli_epi64 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_slli_epi64(a, imm8) _mm256_slli_epi64(a, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_slli_epi64(a, imm8) \ simde_mm256_set_m128i( \ simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \ @@ -4371,7 +4398,7 @@ simde_mm256_slli_si256 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_slli_si256(a, imm8) _mm256_slli_si256(a, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(__PGI) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) && !defined(__PGI) # define simde_mm256_slli_si256(a, imm8) \ simde_mm256_set_m128i( \ simde_mm_slli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \ @@ -4425,7 +4452,7 @@ simde_mm256_sllv_epi32 (simde__m256i a, simde__m256i b) { b_ = simde__m256i_to_private(b), r_; - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_sllv_epi32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_sllv_epi32(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) @@ -4485,7 +4512,7 @@ simde_mm256_sllv_epi64 (simde__m256i a, simde__m256i b) { b_ = simde__m256i_to_private(b), r_; - #if 
SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_sllv_epi64(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_sllv_epi64(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) @@ -4517,7 +4544,7 @@ simde_mm256_sra_epi16 (simde__m256i a, simde__m128i count) { r_, a_ = simde__m256i_to_private(a); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_sra_epi16(a_.m128i[0], count); r_.m128i[1] = simde_mm_sra_epi16(a_.m128i[1], count); #else @@ -4556,7 +4583,7 @@ simde_mm256_sra_epi32 (simde__m256i a, simde__m128i count) { r_, a_ = simde__m256i_to_private(a); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_sra_epi32(a_.m128i[0], count); r_.m128i[1] = simde_mm_sra_epi32(a_.m128i[1], count); #else @@ -4608,7 +4635,7 @@ simde_mm256_srai_epi16 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_srai_epi16(a, imm8) _mm256_srai_epi16(a, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_srai_epi16(a, imm8) \ simde_mm256_set_m128i( \ simde_mm_srai_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \ @@ -4643,7 +4670,7 @@ simde_mm256_srai_epi32 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_srai_epi32(a, imm8) _mm256_srai_epi32(a, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_srai_epi32(a, imm8) \ simde_mm256_set_m128i( \ simde_mm_srai_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \ @@ -4695,7 +4722,7 @@ simde_mm256_srav_epi32 (simde__m256i a, simde__m256i count) { a_ = simde__m256i_to_private(a), count_ = simde__m256i_to_private(count); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_srav_epi32(a_.m128i[0], count_.m128i[0]); r_.m128i[1] = simde_mm_srav_epi32(a_.m128i[1], count_.m128i[1]); #else @@ -4725,7 +4752,7 @@ simde_mm256_srl_epi16 (simde__m256i a, simde__m128i count) { r_, a_ = simde__m256i_to_private(a); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_srl_epi16(a_.m128i[0], count); r_.m128i[1] = simde_mm_srl_epi16(a_.m128i[1], count); #else @@ -4762,7 +4789,7 @@ simde_mm256_srl_epi32 (simde__m256i a, simde__m128i count) { r_, a_ = simde__m256i_to_private(a); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_srl_epi32(a_.m128i[0], count); r_.m128i[1] = simde_mm_srl_epi32(a_.m128i[1], count); #else @@ -4799,7 +4826,7 @@ simde_mm256_srl_epi64 (simde__m256i a, simde__m128i count) { r_, a_ = simde__m256i_to_private(a); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_srl_epi64(a_.m128i[0], count); r_.m128i[1] = simde_mm_srl_epi64(a_.m128i[1], count); #else @@ -4861,7 +4888,7 @@ simde_mm256_srli_epi16 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_srli_epi16(a, imm8) _mm256_srli_epi16(a, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_srli_epi16(a, imm8) \ simde_mm256_set_m128i( \ simde_mm_srli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \ @@ -4898,7 +4925,7 @@ simde_mm256_srli_epi32 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define 
simde_mm256_srli_epi32(a, imm8) _mm256_srli_epi32(a, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_srli_epi32(a, imm8) \ simde_mm256_set_m128i( \ simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \ @@ -4930,7 +4957,7 @@ simde_mm256_srli_epi64 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_srli_epi64(a, imm8) _mm256_srli_epi64(a, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) # define simde_mm256_srli_epi64(a, imm8) \ simde_mm256_set_m128i( \ simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \ @@ -4961,7 +4988,7 @@ simde_mm256_srli_si256 (simde__m256i a, const int imm8) } #if defined(SIMDE_X86_AVX2_NATIVE) # define simde_mm256_srli_si256(a, imm8) _mm256_srli_si256(a, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && !defined(__PGI) +#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) && !defined(__PGI) # define simde_mm256_srli_si256(a, imm8) \ simde_mm256_set_m128i( \ simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \ @@ -5090,6 +5117,8 @@ simde__m256i simde_mm256_stream_load_si256 (const simde__m256i* mem_addr) { #if defined(SIMDE_X86_AVX2_NATIVE) return _mm256_stream_load_si256(HEDLEY_CONST_CAST(simde__m256i*, mem_addr)); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) + return __builtin_nontemporal_load(mem_addr); #else simde__m256i r; simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r)); @@ -5111,7 +5140,7 @@ simde_mm256_sub_epi8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_sub_epi8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_sub_epi8(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -5142,7 +5171,7 @@ simde_mm256_sub_epi16 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_sub_epi16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_sub_epi16(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -5187,7 +5216,7 @@ simde_mm256_sub_epi32 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_sub_epi32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_sub_epi32(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -5232,7 +5261,7 @@ simde_mm256_sub_epi64 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_sub_epi64(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_sub_epi64(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) @@ -5262,7 +5291,7 @@ simde_x_mm256_sub_epu32 (simde__m256i a, simde__m256i b) { #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.u32 = a_.u32 - b_.u32; - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_x_mm_sub_epu32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_x_mm_sub_epu32(a_.m128i[1], b_.m128i[1]); #else @@ -5286,7 +5315,7 @@ simde_mm256_subs_epi8 
(simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_subs_epi8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_subs_epi8(a_.m128i[1], b_.m128i[1]); #else @@ -5315,7 +5344,7 @@ simde_mm256_subs_epi16(simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_subs_epi16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_subs_epi16(a_.m128i[1], b_.m128i[1]); #else @@ -5358,7 +5387,7 @@ simde_mm256_subs_epu8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_subs_epu8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_subs_epu8(a_.m128i[1], b_.m128i[1]); #else @@ -5387,7 +5416,7 @@ simde_mm256_subs_epu16(simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_subs_epu16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_subs_epu16(a_.m128i[1], b_.m128i[1]); #else @@ -5433,7 +5462,7 @@ simde_mm256_unpacklo_epi8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_unpacklo_epi8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_unpacklo_epi8(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_SHUFFLE_VECTOR_) @@ -5469,7 +5498,7 @@ simde_mm256_unpacklo_epi16 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_unpacklo_epi16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_unpacklo_epi16(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_SHUFFLE_VECTOR_) @@ -5502,7 +5531,7 @@ simde_mm256_unpacklo_epi32 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_unpacklo_epi32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_unpacklo_epi32(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_SHUFFLE_VECTOR_) @@ -5535,7 +5564,7 @@ simde_mm256_unpacklo_epi64 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_unpacklo_epi64(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_unpacklo_epi64(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_SHUFFLE_VECTOR_) @@ -5567,7 +5596,7 @@ simde_mm256_unpackhi_epi8 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_unpackhi_epi8(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_unpackhi_epi8(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_SHUFFLE_VECTOR_) @@ -5603,7 +5632,7 @@ simde_mm256_unpackhi_epi16 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if 
SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_unpackhi_epi16(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_unpackhi_epi16(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_SHUFFLE_VECTOR_) @@ -5637,7 +5666,7 @@ simde_mm256_unpackhi_epi32 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_unpackhi_epi32(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_unpackhi_epi32(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_SHUFFLE_VECTOR_) @@ -5670,7 +5699,7 @@ simde_mm256_unpackhi_epi64 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_unpackhi_epi64(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_unpackhi_epi64(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_SHUFFLE_VECTOR_) @@ -5702,7 +5731,7 @@ simde_mm256_xor_si256 (simde__m256i a, simde__m256i b) { a_ = simde__m256i_to_private(a), b_ = simde__m256i_to_private(b); - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) r_.m128i[0] = simde_mm_xor_si128(a_.m128i[0], b_.m128i[0]); r_.m128i[1] = simde_mm_xor_si128(a_.m128i[1], b_.m128i[1]); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) diff --git a/lib/simd_wrapper/simde/x86/avx512.h b/lib/simd_wrapper/simde/x86/avx512.h index 1215e8e4e46..103b46661d0 100644 --- a/lib/simd_wrapper/simde/x86/avx512.h +++ b/lib/simd_wrapper/simde/x86/avx512.h @@ -55,6 +55,7 @@ #include "avx512/cvt.h" #include "avx512/cvtt.h" #include "avx512/cvts.h" +#include "avx512/cvtus.h" #include "avx512/dbsad.h" #include "avx512/div.h" #include "avx512/dpbf16.h" @@ -71,8 +72,13 @@ #include "avx512/fmsub.h" #include "avx512/fnmadd.h" #include "avx512/fnmsub.h" +#include "avx512/fpclass.h" +#include "avx512/gather.h" #include "avx512/insert.h" +#include "avx512/kand.h" #include "avx512/kshift.h" +#include "avx512/knot.h" +#include "avx512/kxor.h" #include "avx512/load.h" #include "avx512/loadu.h" #include "avx512/lzcnt.h" @@ -92,11 +98,14 @@ #include "avx512/or.h" #include "avx512/packs.h" #include "avx512/packus.h" +#include "avx512/permutex.h" #include "avx512/permutexvar.h" #include "avx512/permutex2var.h" #include "avx512/popcnt.h" #include "avx512/range.h" #include "avx512/range_round.h" +#include "avx512/rcp.h" +#include "avx512/reduce.h" #include "avx512/rol.h" #include "avx512/rolv.h" #include "avx512/ror.h" diff --git a/lib/simd_wrapper/simde/x86/avx512/2intersect.h b/lib/simd_wrapper/simde/x86/avx512/2intersect.h index 66884f1dd1b..81b0ee1fbb8 100644 --- a/lib/simd_wrapper/simde/x86/avx512/2intersect.h +++ b/lib/simd_wrapper/simde/x86/avx512/2intersect.h @@ -37,36 +37,35 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES void simde_mm_2intersect_epi32(simde__m128i a, simde__m128i b, simde__mmask8 *k1, simde__mmask8 *k2) { - #if defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm_2intersect_epi32(a, b, k1, k2); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - simde__mmask8 - k1_ = 0, - k2_ = 0; - - for (size_t i = 0 ; i < sizeof(a_.i32) / sizeof(a_.i32[0]) ; i++) { - #if defined(SIMDE_ENABLE_OPENMP) - #pragma omp simd reduction(|:k1_) reduction(|:k2_) - #else - SIMDE_VECTORIZE - #endif - for (size_t j = 0 ; j < sizeof(b_.i32) / sizeof(b_.i32[0]) ; 
j++) { - const int32_t m = a_.i32[i] == b_.i32[j]; - k1_ |= m << i; - k2_ |= m << j; - } + simde__m128i_private + a_ = simde__m128i_to_private(a), + b_ = simde__m128i_to_private(b); + simde__mmask8 + k1_ = 0, + k2_ = 0; + + for (size_t i = 0 ; i < sizeof(a_.i32) / sizeof(a_.i32[0]) ; i++) { + #if defined(SIMDE_ENABLE_OPENMP) + #pragma omp simd reduction(|:k1_) reduction(|:k2_) + #else + SIMDE_VECTORIZE + #endif + for (size_t j = 0 ; j < sizeof(b_.i32) / sizeof(b_.i32[0]) ; j++) { + const int32_t m = a_.i32[i] == b_.i32[j]; + k1_ |= m << i; + k2_ |= m << j; } + } - *k1 = k1_; - *k2 = k2_; - #endif + *k1 = k1_; + *k2 = k2_; } +#if defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm_2intersect_epi32(a, b, k1, k2) _mm_2intersect_epi32(a, b, k1, k2) +#endif #if defined(SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef __mm_2intersect_epi32 - #define __mm_2intersect_epi32(a,b, k1, k2) simde_mm_2intersect_epi32(a, b, k1, k2) + #undef _mm_2intersect_epi32 + #define _mm_2intersect_epi32(a, b, k1, k2) simde_mm_2intersect_epi32(a, b, k1, k2) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -100,8 +99,8 @@ simde_mm_2intersect_epi64(simde__m128i a, simde__m128i b, simde__mmask8 *k1, sim #endif } #if defined(SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef __mm_2intersect_epi64 - #define __mm_2intersect_epi64(a,b, k1, k2) simde_mm_2intersect_epi64(a, b, k1, k2) + #undef _mm_2intersect_epi64 + #define _mm_2intersect_epi64(a, b, k1, k2) simde_mm_2intersect_epi64(a, b, k1, k2) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -136,7 +135,7 @@ simde_mm256_2intersect_epi32(simde__m256i a, simde__m256i b, simde__mmask8 *k1, } #if defined(SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_2intersect_epi32 - #define _mm256_2intersect_epi32(a,b, k1, k2) simde_mm256_2intersect_epi32(a, b, k1, k2) + #define _mm256_2intersect_epi32(a, b, k1, k2) simde_mm256_2intersect_epi32(a, b, k1, k2) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -171,7 +170,7 @@ simde_mm256_2intersect_epi64(simde__m256i a, simde__m256i b, simde__mmask8 *k1, } #if defined(SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_2intersect_epi64 - #define _mm256_2intersect_epi64(a,b, k1, k2) simde_mm256_2intersect_epi64(a, b, k1, k2) + #define _mm256_2intersect_epi64(a, b, k1, k2) simde_mm256_2intersect_epi64(a, b, k1, k2) #endif SIMDE_FUNCTION_ATTRIBUTES diff --git a/lib/simd_wrapper/simde/x86/avx512/abs.h b/lib/simd_wrapper/simde/x86/avx512/abs.h index 5c0871b7532..5ff001485d8 100644 --- a/lib/simd_wrapper/simde/x86/avx512/abs.h +++ b/lib/simd_wrapper/simde/x86/avx512/abs.h @@ -524,7 +524,7 @@ simde_mm512_mask_abs_ps(simde__m512 src, simde__mmask16 k, simde__m512 v2) { SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_abs_pd(simde__m512d v2) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,3,0)) + #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_87467) return _mm512_abs_pd(v2); #elif defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) /* gcc bug: https://gcc.gnu.org/legacy-ml/gcc-patches/2018-01/msg01962.html */ @@ -560,7 +560,7 @@ simde_mm512_abs_pd(simde__m512d v2) { SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_mask_abs_pd(simde__m512d src, 
simde__mmask8 k, simde__m512d v2) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,3,0)) + #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_87467) return _mm512_mask_abs_pd(src, k, v2); #elif defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) /* gcc bug: https://gcc.gnu.org/legacy-ml/gcc-patches/2018-01/msg01962.html */ diff --git a/lib/simd_wrapper/simde/x86/avx512/add.h b/lib/simd_wrapper/simde/x86/avx512/add.h index 2c4c98e6c5a..d192b2f57e5 100644 --- a/lib/simd_wrapper/simde/x86/avx512/add.h +++ b/lib/simd_wrapper/simde/x86/avx512/add.h @@ -402,23 +402,7 @@ simde_mm512_add_epi32 (simde__m512i a, simde__m512i b) { a_ = simde__m512i_to_private(a), b_ = simde__m512i_to_private(b); - #if defined(SIMDE_ARM_SVE_NATIVE) - const size_t n = sizeof(a_.i32) / sizeof(a_.i32[0]); - size_t i = 0; - svbool_t pg = svwhilelt_b32(i, n); - do { - svint32_t - va = svld1_s32(pg, &(a_.i32[i])), - vb = svld1_s32(pg, &(b_.i32[i])); - svst1_s32(pg, &(r_.i32[i]), svadd_s32_x(pg, va, vb)); - i += svcntw(); - pg = svwhilelt_b32(i, n); - } while (svptest_any(svptrue_b32(), pg)); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_add_epi32(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = a_.i32 + b_.i32; #else SIMDE_VECTORIZE diff --git a/lib/simd_wrapper/simde/x86/avx512/cast.h b/lib/simd_wrapper/simde/x86/avx512/cast.h index 5c4cafa5f32..7f67a5730d9 100644 --- a/lib/simd_wrapper/simde/x86/avx512/cast.h +++ b/lib/simd_wrapper/simde/x86/avx512/cast.h @@ -100,6 +100,39 @@ simde_mm512_castps_si512 (simde__m512 a) { #define _mm512_castps_si512(a) simde_mm512_castps_si512(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_castph_si512 (simde__m512h a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_castph_si512(a); + #else + simde__m512i r; + simde_memcpy(&r, &a, sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_castph_si512 + #define _mm512_castph_si512(a) simde_mm512_castph_si512(a) +#endif + + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_castsi512_ph (simde__m512i a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_castsi512_ph(a); + #else + simde__m512h r; + simde_memcpy(&r, &a, sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_castsi512_ph + #define _mm512_castsi512_ph(a) simde_mm512_castsi512_ph(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_castsi512_ps (simde__m512i a) { diff --git a/lib/simd_wrapper/simde/x86/avx512/cmp.h b/lib/simd_wrapper/simde/x86/avx512/cmp.h index 313d8bcb2c6..2a3b99c3b18 100644 --- a/lib/simd_wrapper/simde/x86/avx512/cmp.h +++ b/lib/simd_wrapper/simde/x86/avx512/cmp.h @@ -38,6 +38,208 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(__clang__) && SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 +SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask64 +simde_mm512_cmp_epi8_mask (simde__m512i a, simde__m512i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 <= b_.i8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = (a_.i8[i] <= b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 != b_.i8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = (a_.i8[i] != b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), ~(a_.i8 < b_.i8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = !(a_.i8[i] < b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), ~(a_.i8 <= b_.i8)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = !(a_.i8[i] <= b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi8_mask(simde__m512i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512BW_NATIVE) + #define simde_mm512_cmp_epi8_mask(a, b, imm8) _mm512_cmp_epi8_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_epi8_mask + #define _mm512_cmp_epi8_mask(a, b, imm8) simde_mm512_cmp_epi8_mask((a), (b), (imm8)) +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm256_cmp_epi32_mask (simde__m256i a, simde__m256i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 == b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? 
~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 <= b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] <= b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m256i_to_private(simde_mm256_setzero_si256()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 != b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] != b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.i32 < b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = !(a_.i32[i] < b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.i32 <= b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = !(a_.i32[i] <= b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m256i_to_private(simde_x_mm256_setone_si256()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm256_movepi32_mask(simde__m256i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_cmp_epi32_mask(a, b, imm8) _mm256_cmp_epi32_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_cmp_epi32_mask + #define _mm256_cmp_epi32_mask(a, b, imm8) simde_mm256_cmp_epi32_mask((a), (b), (imm8)) +#endif + SIMDE_HUGE_FUNCTION_ATTRIBUTES simde__mmask16 simde_mm512_cmp_ps_mask (simde__m512 a, simde__m512 b, const int imm8) @@ -237,7 +439,7 @@ simde_mm512_cmp_ps_mask (simde__m512 a, simde__m512 b, const int imm8) #elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(128) #define simde_mm512_cmp_ps_mask(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512_private \ - simde_mm512_cmp_ps_mask_r_, \ + simde_mm512_cmp_ps_mask_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ simde_mm512_cmp_ps_mask_a_ = simde__m512_to_private((a)), \ simde_mm512_cmp_ps_mask_b_ = simde__m512_to_private((b)); \ \ @@ -250,7 +452,7 @@ simde_mm512_cmp_ps_mask (simde__m512 a, simde__m512 b, const int imm8) #elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(256) #define simde_mm512_cmp_ps_mask(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512_private \ - simde_mm512_cmp_ps_mask_r_, \ + simde_mm512_cmp_ps_mask_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ simde_mm512_cmp_ps_mask_a_ = simde__m512_to_private((a)), \ simde_mm512_cmp_ps_mask_b_ = simde__m512_to_private((b)); \ \ @@ -485,7 +687,7 @@ simde_mm512_cmp_pd_mask (simde__m512d a, simde__m512d b, const int imm8) #elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(128) #define simde_mm512_cmp_pd_mask(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512d_private \ - simde_mm512_cmp_pd_mask_r_, \ + 
simde_mm512_cmp_pd_mask_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ simde_mm512_cmp_pd_mask_a_ = simde__m512d_to_private((a)), \ simde_mm512_cmp_pd_mask_b_ = simde__m512d_to_private((b)); \ \ @@ -498,7 +700,7 @@ simde_mm512_cmp_pd_mask (simde__m512d a, simde__m512d b, const int imm8) #elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(256) #define simde_mm512_cmp_pd_mask(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512d_private \ - simde_mm512_cmp_pd_mask_r_, \ + simde_mm512_cmp_pd_mask_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ simde_mm512_cmp_pd_mask_a_ = simde__m512d_to_private((a)), \ simde_mm512_cmp_pd_mask_b_ = simde__m512d_to_private((b)); \ \ @@ -534,6 +736,978 @@ simde_mm512_cmp_pd_mask (simde__m512d a, simde__m512d b, const int imm8) #define _mm_cmp_pd_mask(a, b, imm8) simde_mm_cmp_pd_mask((a), (b), (imm8)) #endif +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_cmp_ph_mask (simde__m512h a, simde__m512h b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) { + simde__m512h_private + r_, + a_ = simde__m512h_to_private(a), + b_ = simde__m512h_to_private(b); + + switch (imm8) { + case SIMDE_CMP_EQ_OQ: + case SIMDE_CMP_EQ_OS: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 == b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = ( + simde_float16_as_uint16(a_.f16[i]) == simde_float16_as_uint16(b_.f16[i]) + && !simde_isnanhf(a_.f16[i]) && !simde_isnanhf(b_.f16[i]) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_LT_OQ: + case SIMDE_CMP_LT_OS: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 < b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = (simde_float16_to_float32(a_.f16[i]) < simde_float16_to_float32(b_.f16[i])) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_LE_OQ: + case SIMDE_CMP_LE_OS: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 <= b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = (simde_float16_to_float32(a_.f16[i]) <= simde_float16_to_float32(b_.f16[i])) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_UNORD_Q: + case SIMDE_CMP_UNORD_S: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 != a_.f16) | (b_.f16 != b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = ( + (simde_float16_to_float32(a_.f16[i]) != simde_float16_to_float32(a_.f16[i])) + || (simde_float16_to_float32(b_.f16[i]) != simde_float16_to_float32(b_.f16[i])) + ) ? 
~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_NEQ_UQ: + case SIMDE_CMP_NEQ_US: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 != b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = ( + (simde_float16_as_uint16(a_.f16[i]) != simde_float16_as_uint16(b_.f16[i])) + || simde_isnanhf(a_.f16[i]) || simde_isnanhf(b_.f16[i]) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_NEQ_OQ: + case SIMDE_CMP_NEQ_OS: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 == a_.f16) & (b_.f16 == b_.f16) & (a_.f16 != b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = ( + !(simde_isnanhf(a_.f16[i]) || simde_isnanhf(b_.f16[i])) + && (simde_float16_as_uint16(a_.f16[i]) != simde_float16_as_uint16(b_.f16[i])) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_NLT_UQ: + case SIMDE_CMP_NLT_US: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.f16 < b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = !( + simde_float16_to_float32(a_.f16[i]) < simde_float16_to_float32(b_.f16[i]) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_NLE_UQ: + case SIMDE_CMP_NLE_US: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.f16 <= b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = !( + simde_float16_to_float32(a_.f16[i]) <= simde_float16_to_float32(b_.f16[i]) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_ORD_Q: + case SIMDE_CMP_ORD_S: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ((a_.f16 == a_.f16) & (b_.f16 == b_.f16))); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = (simde_isnanhf(a_.f16[i]) || simde_isnanhf(b_.f16[i])) ? INT16_C(0) : ~INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_EQ_UQ: + case SIMDE_CMP_EQ_US: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 != a_.f16) | (b_.f16 != b_.f16) | (a_.f16 == b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = ( + (simde_isnanhf(a_.f16[i]) || simde_isnanhf(b_.f16[i])) + || (simde_float16_as_uint16(a_.f16[i]) == simde_float16_as_uint16(b_.f16[i])) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_NGE_UQ: + case SIMDE_CMP_NGE_US: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.f16 >= b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = !( + simde_float16_to_float32(a_.f16[i]) >= simde_float16_to_float32(b_.f16[i]) + ) ? 
~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_NGT_UQ: + case SIMDE_CMP_NGT_US: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.f16 > b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = !( + simde_float16_to_float32(a_.f16[i]) > simde_float16_to_float32(b_.f16[i]) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_FALSE_OQ: + case SIMDE_CMP_FALSE_OS: + r_ = simde__m512h_to_private(simde_mm512_setzero_ph()); + break; + + case SIMDE_CMP_GE_OQ: + case SIMDE_CMP_GE_OS: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 >= b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = ( + simde_float16_to_float32(a_.f16[i]) >= simde_float16_to_float32(b_.f16[i]) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_GT_OQ: + case SIMDE_CMP_GT_OS: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 > b_.f16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.i16[i] = ( + simde_float16_to_float32(a_.f16[i]) > simde_float16_to_float32(b_.f16[i]) + ) ? ~INT16_C(0) : INT16_C(0); + } + #endif + break; + + case SIMDE_CMP_TRUE_UQ: + case SIMDE_CMP_TRUE_US: + r_ = simde__m512h_to_private(simde_x_mm512_setone_ph()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi16_mask(simde_mm512_castph_si512(simde__m512h_from_private(r_))); +} +#if defined(SIMDE_X86_AVX512FP16_NATIVE) + #define simde_mm512_cmp_ph_mask(a, b, imm8) _mm512_cmp_ph_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_ph_mask + #define _mm512_cmp_ph_mask(a, b, imm8) simde_mm512_cmp_ph_mask((a), (b), (imm8)) +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_cmp_epi16_mask (simde__m512i a, simde__m512i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 == b_.i16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 <= b_.i16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + r_.i16[i] = (a_.i16[i] <= b_.i16[i]) ? 
~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 != b_.i16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + r_.i16[i] = (a_.i16[i] != b_.i16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.i16 < b_.i16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + r_.i16[i] = !(a_.i16[i] < b_.i16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.i16 <= b_.i16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { + r_.i16[i] = !(a_.i16[i] <= b_.i16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi16_mask(simde__m512i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512BW_NATIVE) + #define simde_mm512_cmp_epi16_mask(a, b, imm8) _mm512_cmp_epi16_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_epi16_mask + #define _mm512_cmp_epi16_mask(a, b, imm8) simde_mm512_cmp_epi16_mask((a), (b), (imm8)) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) + #define simde_mm512_mask_cmp_epi16_mask(k1, a, b, imm8) _mm512_mask_cmp_epi16_mask(k1, a, b, imm8) +#else + #define simde_mm512_mask_cmp_epi16_mask(k1, a, b, imm8) (k1) & simde_mm512_cmp_epi16_mask(a, b, imm8) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmp_epi16_mask +#define _mm512_mask_cmp_epi16_mask(k1, a, b, imm8) simde_mm512_mask_cmp_epi16_mask((k1), (a), (b), (imm8)) +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm512_cmp_epi32_mask (simde__m512i a, simde__m512i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 == b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 <= b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] <= b_.i32[i]) ? 
~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 != b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = (a_.i32[i] != b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.i32 < b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = !(a_.i32[i] < b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.i32 <= b_.i32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = !(a_.i32[i] <= b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi32_mask(simde__m512i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_cmp_epi32_mask(a, b, imm8) _mm512_cmp_epi32_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_epi32_mask + #define _mm512_cmp_epi32_mask(a, b, imm8) simde_mm512_cmp_epi32_mask((a), (b), (imm8)) +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm512_cmp_epi64_mask (simde__m512i a, simde__m512i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.i64 == b_.i64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = (a_.i64[i] == b_.i64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.i64 < b_.i64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = (a_.i64[i] < b_.i64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.i64 <= b_.i64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = (a_.i64[i] <= b_.i64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.i64 != b_.i64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = (a_.i64[i] != b_.i64[i]) ? 
~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.i64 < b_.i64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = !(a_.i64[i] < b_.i64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.i64 <= b_.i64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = !(a_.i64[i] <= b_.i64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi64_mask(simde__m512i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_cmp_epi64_mask(a, b, imm8) _mm512_cmp_epi64_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_epi64_mask + #define _mm512_cmp_epi64_mask(a, b, imm8) simde_mm512_cmp_epi64_mask((a), (b), (imm8)) +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_cmp_epu16_mask (simde__m512i a, simde__m512i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (a_.u16 == b_.u16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = (a_.u16[i] == b_.u16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (a_.u16 < b_.u16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = (a_.u16[i] < b_.u16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (a_.u16 <= b_.u16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = (a_.u16[i] <= b_.u16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (a_.u16 != b_.u16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = (a_.u16[i] != b_.u16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), ~(a_.u16 < b_.u16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = !(a_.u16[i] < b_.u16[i]) ? 
~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), ~(a_.u16 <= b_.u16)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { + r_.u16[i] = !(a_.u16[i] <= b_.u16[i]) ? ~UINT16_C(0) : UINT16_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi16_mask(simde__m512i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512BW_NATIVE) + #define simde_mm512_cmp_epu16_mask(a, b, imm8) _mm512_cmp_epu16_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_epu16_mask + #define _mm512_cmp_epu16_mask(a, b, imm8) simde_mm512_cmp_epu16_mask((a), (b), (imm8)) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) + #define simde_mm512_mask_cmp_epu16_mask(k1, a, b, imm8) _mm512_mask_cmp_epu16_mask(k1, a, b, imm8) +#else + #define simde_mm512_mask_cmp_epu16_mask(k1, a, b, imm8) (k1) & simde_mm512_cmp_epu16_mask(a, b, imm8) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmp_epu16_mask +#define _mm512_mask_cmp_epu16_mask(k1, a, b, imm8) simde_mm512_mask_cmp_epu16_mask((k1), (a), (b), (imm8)) +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm256_cmp_epu32_mask (simde__m256i a, simde__m256i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m256i_private + r_, + a_ = simde__m256i_to_private(a), + b_ = simde__m256i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 == b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] == b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 < b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] < b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 <= b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] <= b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m256i_to_private(simde_mm256_setzero_si256()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 != b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] != b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), ~(a_.u32 < b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = !(a_.u32[i] < b_.u32[i]) ? 
~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), ~(a_.u32 <= b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = !(a_.u32[i] <= b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m256i_to_private(simde_x_mm256_setone_si256()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm256_movepi32_mask(simde__m256i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_cmp_epu32_mask(a, b, imm8) _mm256_cmp_epu32_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_cmp_epu32_mask + #define _mm256_cmp_epu32_mask(a, b, imm8) simde_mm256_cmp_epu32_mask((a), (b), (imm8)) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_mask_cmp_epu32_mask(k1, a, b, imm8) _mm256_mask_cmp_epu32_mask(k1, a, b, imm8) +#else + #define simde_mm256_mask_cmp_epu32_mask(k1, a, b, imm8) (k1) & simde_mm256_cmp_epu32_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_cmp_epu32_mask +#define _mm256_mask_cmp_epu32_mask(k1, a, b, imm8) simde_mm256_mask_cmp_epu32_mask((k1), (a), (b), (imm8)) +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm512_cmp_epu32_mask (simde__m512i a, simde__m512i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 == b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] == b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 < b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] < b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 <= b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] <= b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 != b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = (a_.u32[i] != b_.u32[i]) ?
~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), ~(a_.u32 < b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = !(a_.u32[i] < b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), ~(a_.u32 <= b_.u32)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { + r_.u32[i] = !(a_.u32[i] <= b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi32_mask(simde__m512i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_cmp_epu32_mask(a, b, imm8) _mm512_cmp_epu32_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_epu32_mask + #define _mm512_cmp_epu32_mask(a, b, imm8) simde_mm512_cmp_epu32_mask((a), (b), (imm8)) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_cmp_epu32_mask(k1, a, b, imm8) _mm512_mask_cmp_epu32_mask(k1, a, b, imm8) +#else + #define simde_mm512_mask_cmp_epu32_mask(k1, a, b, imm8) (k1) & simde_mm512_cmp_epu32_mask(a, b, imm8) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmp_epu32_mask +#define _mm512_mask_cmp_epu32_mask(k1, a, b, imm8) simde_mm512_mask_cmp_epu32_mask((k1), (a), (b), (imm8)) +#endif + +SIMDE_HUGE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm512_cmp_epu64_mask (simde__m512i a, simde__m512i b, const int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + switch (imm8) { + case SIMDE_MM_CMPINT_EQ: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (a_.u64 == b_.u64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (a_.u64 < b_.u64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = (a_.u64[i] < b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_LE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (a_.u64 <= b_.u64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = (a_.u64[i] <= b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_FALSE: + r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); + break; + + + case SIMDE_MM_CMPINT_NE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (a_.u64 != b_.u64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = (a_.u64[i] != b_.u64[i]) ? 
~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLT: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), ~(a_.u64 < b_.u64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = !(a_.u64[i] < b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_NLE: + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), ~(a_.u64 <= b_.u64)); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { + r_.u64[i] = !(a_.u64[i] <= b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); + } + #endif + break; + + case SIMDE_MM_CMPINT_TRUE: + r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); + break; + + default: + HEDLEY_UNREACHABLE(); + } + + return simde_mm512_movepi64_mask(simde__m512i_from_private(r_)); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_cmp_epu64_mask(a, b, imm8) _mm512_cmp_epu64_mask((a), (b), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmp_epu64_mask + #define _mm512_cmp_epu64_mask(a, b, imm8) simde_mm512_cmp_epu64_mask((a), (b), (imm8)) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_cmp_epu64_mask(k1, a, b, imm8) _mm512_mask_cmp_epu64_mask(k1, a, b, imm8) +#else + #define simde_mm512_mask_cmp_epu64_mask(k1, a, b, imm8) (k1) & simde_mm512_cmp_epu64_mask(a, b, imm8) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmp_epu64_mask +#define _mm512_mask_cmp_epu64_mask(k1, a, b, imm8) simde_mm512_mask_cmp_epu64_mask((k1), (a), (b), (imm8)) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/x86/avx512/cmpeq.h b/lib/simd_wrapper/simde/x86/avx512/cmpeq.h index 148c9318464..41f90b3e9b6 100644 --- a/lib/simd_wrapper/simde/x86/avx512/cmpeq.h +++ b/lib/simd_wrapper/simde/x86/avx512/cmpeq.h @@ -167,6 +167,54 @@ simde_mm512_mask_cmpeq_epi64_mask (simde__mmask8 k1, simde__m512i a, simde__m512 #define _mm512_mask_cmpeq_epi64_mask(k1, a, b) simde_mm512_mask_cmpeq_epi64_mask(k1, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_cmpeq_epu16_mask (simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_cmpeq_epu16_mask(a, b); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + simde__mmask32 r; + + #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + simde__m512i_private tmp; + + tmp.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u16), a_.u16 == b_.u16); + r = simde_mm512_movepi16_mask(simde__m512i_from_private(tmp)); + #else + r = 0; + + SIMDE_VECTORIZE_REDUCTION(|:r) + for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { + r |= (a_.u16[i] == b_.u16[i]) ? 
(UINT16_C(1) << i) : 0; + } + #endif + + return r; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpeq_epu16_mask + #define _mm512_cmpeq_epu16_mask(a, b) simde_mm512_cmpeq_epu16_mask((a), (b)) +#endif + + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_mask_cmpeq_epu16_mask(simde__mmask32 k1, simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_mask_cmpeq_epu16_mask(k1, a, b); + #else + return k1 & simde_mm512_cmpeq_epu16_mask(a, b); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cmpeq_epu16_mask + #define _mm512_mask_cmpeq_epu16_mask(k1, a, b) simde_mm512_mask_cmpeq_epu16_mask(k1, a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__mmask16 simde_mm512_cmpeq_ps_mask (simde__m512 a, simde__m512 b) { diff --git a/lib/simd_wrapper/simde/x86/avx512/cmpge.h b/lib/simd_wrapper/simde/x86/avx512/cmpge.h index a94a0c4107b..d0d428790d9 100644 --- a/lib/simd_wrapper/simde/x86/avx512/cmpge.h +++ b/lib/simd_wrapper/simde/x86/avx512/cmpge.h @@ -78,8 +78,8 @@ simde_mm_cmpge_epi8_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi8_mask - #define _mm512_cmpge_epi8_mask(a, b) simde_mm512_cmpge_epi8_mask((a), (b)) + #undef _mm_cmpge_epi8_mask + #define _mm_cmpge_epi8_mask(a, b) simde_mm_cmpge_epi8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -93,7 +93,7 @@ simde_mm_mask_cmpge_epi8_mask(simde__mmask16 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VBW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epi8_mask - #define _mm_mask_cmpge_epi8_mask(src, k, a, b) simde_mm_mask_cmpge_epi8_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epi8_mask(k, a, b) simde_mm_mask_cmpge_epi8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -134,8 +134,8 @@ simde_mm256_cmpge_epi8_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VBW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi8_mask - #define _mm512_cmpge_epi8_mask(a, b) simde_mm512_cmpge_epi8_mask((a), (b)) + #undef _mm256_cmpge_epi8_mask + #define _mm256_cmpge_epi8_mask(a, b) simde_mm256_cmpge_epi8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -149,7 +149,7 @@ simde_mm256_mask_cmpge_epi8_mask(simde__mmask32 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epi8_mask - #define _mm256_mask_cmpge_epi8_mask(src, k, a, b) simde_mm256_mask_cmpge_epi8_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epi8_mask(k, a, b) simde_mm256_mask_cmpge_epi8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -209,7 +209,7 @@ simde_mm512_mask_cmpge_epi8_mask(simde__mmask64 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epi8_mask - #define _mm512_mask_cmpge_epi8_mask(src, k, a, b) simde_mm512_mask_cmpge_epi8_mask((src), (k), (a), (b)) + #define _mm512_mask_cmpge_epi8_mask(k, a, b) simde_mm512_mask_cmpge_epi8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -252,8 +252,8 @@ simde_mm_cmpge_epu8_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu8_mask - #define _mm512_cmpge_epu8_mask(a, b) 
simde_mm512_cmpge_epu8_mask((a), (b)) + #undef _mm_cmpge_epu8_mask + #define _mm_cmpge_epu8_mask(a, b) simde_mm_cmpge_epu8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -267,7 +267,7 @@ simde_mm_mask_cmpge_epu8_mask(simde__mmask16 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epu8_mask - #define _mm_mask_cmpge_epu8_mask(src, k, a, b) simde_mm_mask_cmpge_epu8_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epu8_mask(k, a, b) simde_mm_mask_cmpge_epu8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -308,8 +308,8 @@ simde_mm256_cmpge_epu8_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu8_mask - #define _mm512_cmpge_epu8_mask(a, b) simde_mm512_cmpge_epu8_mask((a), (b)) + #undef _mm256_cmpge_epu8_mask + #define _mm256_cmpge_epu8_mask(a, b) simde_mm256_cmpge_epu8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -323,7 +323,7 @@ simde_mm256_mask_cmpge_epu8_mask(simde__mmask32 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epu8_mask - #define _mm256_mask_cmpge_epu8_mask(src, k, a, b) simde_mm256_mask_cmpge_epu8_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epu8_mask(k, a, b) simde_mm256_mask_cmpge_epu8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -383,7 +383,7 @@ simde_mm512_mask_cmpge_epu8_mask(simde__mmask64 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epu8_mask - #define _mm512_mask_cmpge_epu8_mask(src, k, a, b) simde_mm512_mask_cmpge_epu8_mask((src), (k), (a), (b)) + #define _mm512_mask_cmpge_epu8_mask(k, a, b) simde_mm512_mask_cmpge_epu8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -426,8 +426,8 @@ simde_mm_cmpge_epi16_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi16_mask - #define _mm512_cmpge_epi16_mask(a, b) simde_mm512_cmpge_epi16_mask((a), (b)) + #undef _mm_cmpge_epi16_mask + #define _mm_cmpge_epi16_mask(a, b) simde_mm_cmpge_epi16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -441,7 +441,7 @@ simde_mm_mask_cmpge_epi16_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epi16_mask - #define _mm_mask_cmpge_epi16_mask(src, k, a, b) simde_mm_mask_cmpge_epi16_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epi16_mask(k, a, b) simde_mm_mask_cmpge_epi16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -482,8 +482,8 @@ simde_mm256_cmpge_epi16_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi16_mask - #define _mm512_cmpge_epi16_mask(a, b) simde_mm512_cmpge_epi16_mask((a), (b)) + #undef _mm256_cmpge_epi16_mask + #define _mm256_cmpge_epi16_mask(a, b) simde_mm256_cmpge_epi16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -497,7 +497,7 @@ simde_mm256_mask_cmpge_epi16_mask(simde__mmask16 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || 
defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epi16_mask - #define _mm256_mask_cmpge_epi16_mask(src, k, a, b) simde_mm256_mask_cmpge_epi16_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epi16_mask(k, a, b) simde_mm256_mask_cmpge_epi16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -557,7 +557,7 @@ simde_mm512_mask_cmpge_epi16_mask(simde__mmask32 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epi16_mask - #define _mm512_mask_cmpge_epi16_mask(src, k, a, b) simde_mm512_mask_cmpge_epi16_mask((src), (k), (a), (b)) + #define _mm512_mask_cmpge_epi16_mask(k, a, b) simde_mm512_mask_cmpge_epi16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -600,8 +600,8 @@ simde_mm_cmpge_epu16_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu16_mask - #define _mm512_cmpge_epu16_mask(a, b) simde_mm512_cmpge_epu16_mask((a), (b)) + #undef _mm_cmpge_epu16_mask + #define _mm_cmpge_epu16_mask(a, b) simde_mm_cmpge_epu16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -615,7 +615,7 @@ simde_mm_mask_cmpge_epu16_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epu16_mask - #define _mm_mask_cmpge_epu16_mask(src, k, a, b) simde_mm_mask_cmpge_epu16_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epu16_mask(k, a, b) simde_mm_mask_cmpge_epu16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -656,8 +656,8 @@ simde_mm256_cmpge_epu16_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu16_mask - #define _mm512_cmpge_epu16_mask(a, b) simde_mm512_cmpge_epu16_mask((a), (b)) + #undef _mm256_cmpge_epu16_mask + #define _mm256_cmpge_epu16_mask(a, b) simde_mm256_cmpge_epu16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -671,7 +671,7 @@ simde_mm256_mask_cmpge_epu16_mask(simde__mmask16 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epu16_mask - #define _mm256_mask_cmpge_epu16_mask(src, k, a, b) simde_mm256_mask_cmpge_epu16_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epu16_mask(k, a, b) simde_mm256_mask_cmpge_epu16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -731,7 +731,7 @@ simde_mm512_mask_cmpge_epu16_mask(simde__mmask32 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epu16_mask - #define _mm512_mask_cmpge_epu16_mask(src, k, a, b) simde_mm512_mask_cmpge_epu16_mask((src), (k), (a), (b)) + #define _mm512_mask_cmpge_epu16_mask(k, a, b) simde_mm512_mask_cmpge_epu16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -774,8 +774,8 @@ simde_mm_cmpge_epi32_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi32_mask - #define _mm512_cmpge_epi32_mask(a, b) simde_mm512_cmpge_epi32_mask((a), (b)) + #undef _mm_cmpge_epi32_mask + #define _mm_cmpge_epi32_mask(a, b) simde_mm_cmpge_epi32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -789,7 +789,7 @@ simde_mm_mask_cmpge_epi32_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if 
defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epi32_mask - #define _mm_mask_cmpge_epi32_mask(src, k, a, b) simde_mm_mask_cmpge_epi32_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epi32_mask(k, a, b) simde_mm_mask_cmpge_epi32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -830,8 +830,8 @@ simde_mm256_cmpge_epi32_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi32_mask - #define _mm512_cmpge_epi32_mask(a, b) simde_mm512_cmpge_epi32_mask((a), (b)) + #undef _mm256_cmpge_epi32_mask + #define _mm256_cmpge_epi32_mask(a, b) simde_mm256_cmpge_epi32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -845,7 +845,7 @@ simde_mm256_mask_cmpge_epi32_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epi32_mask - #define _mm256_mask_cmpge_epi32_mask(src, k, a, b) simde_mm256_mask_cmpge_epi32_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epi32_mask(k, a, b) simde_mm256_mask_cmpge_epi32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -905,7 +905,7 @@ simde_mm512_mask_cmpge_epi32_mask(simde__mmask16 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epi32_mask - #define _mm512_mask_cmpge_epi32_mask(src, k, a, b) simde_mm512_mask_cmpge_epi32_mask((src), (k), (a), (b)) + #define _mm512_mask_cmpge_epi32_mask(k, a, b) simde_mm512_mask_cmpge_epi32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -948,8 +948,8 @@ simde_mm_cmpge_epu32_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu32_mask - #define _mm512_cmpge_epu32_mask(a, b) simde_mm512_cmpge_epu32_mask((a), (b)) + #undef _mm_cmpge_epu32_mask + #define _mm_cmpge_epu32_mask(a, b) simde_mm_cmpge_epu32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -963,7 +963,7 @@ simde_mm_mask_cmpge_epu32_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epu32_mask - #define _mm_mask_cmpge_epu32_mask(src, k, a, b) simde_mm_mask_cmpge_epu32_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epu32_mask(k, a, b) simde_mm_mask_cmpge_epu32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1004,8 +1004,8 @@ simde_mm256_cmpge_epu32_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu32_mask - #define _mm512_cmpge_epu32_mask(a, b) simde_mm512_cmpge_epu32_mask((a), (b)) + #undef _mm256_cmpge_epu32_mask + #define _mm256_cmpge_epu32_mask(a, b) simde_mm256_cmpge_epu32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1019,7 +1019,7 @@ simde_mm256_mask_cmpge_epu32_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epu32_mask - #define _mm256_mask_cmpge_epu32_mask(src, k, a, b) simde_mm256_mask_cmpge_epu32_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epu32_mask(k, a, b) simde_mm256_mask_cmpge_epu32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1079,7 +1079,7 @@ simde_mm512_mask_cmpge_epu32_mask(simde__mmask16 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epu32_mask - #define _mm512_mask_cmpge_epu32_mask(src, k, a, b) simde_mm512_mask_cmpge_epu32_mask((src), (k), (a), (b)) + #define 
_mm512_mask_cmpge_epu32_mask(k, a, b) simde_mm512_mask_cmpge_epu32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1137,7 +1137,7 @@ simde_mm_mask_cmpge_epi64_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epi64_mask - #define _mm_mask_cmpge_epi64_mask(src, k, a, b) simde_mm_mask_cmpge_epi64_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epi64_mask(k, a, b) simde_mm_mask_cmpge_epi64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1193,7 +1193,7 @@ simde_mm256_mask_cmpge_epi64_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epi64_mask - #define _mm256_mask_cmpge_epi64_mask(src, k, a, b) simde_mm256_mask_cmpge_epi64_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epi64_mask(k, a, b) simde_mm256_mask_cmpge_epi64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1253,7 +1253,7 @@ simde_mm512_mask_cmpge_epi64_mask(simde__mmask8 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epi64_mask - #define _mm512_mask_cmpge_epi64_mask(src, k, a, b) simde_mm512_mask_cmpge_epi64_mask((src), (k), (a), (b)) + #define _mm512_mask_cmpge_epi64_mask(k, a, b) simde_mm512_mask_cmpge_epi64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1294,8 +1294,8 @@ simde_mm_cmpge_epu64_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu64_mask - #define _mm512_cmpge_epu64_mask(a, b) simde_mm512_cmpge_epu64_mask((a), (b)) + #undef _mm_cmpge_epu64_mask + #define _mm_cmpge_epu64_mask(a, b) simde_mm_cmpge_epu64_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1309,7 +1309,7 @@ simde_mm_mask_cmpge_epu64_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpge_epu64_mask - #define _mm_mask_cmpge_epu64_mask(src, k, a, b) simde_mm_mask_cmpge_epu64_mask((src), (k), (a), (b)) + #define _mm_mask_cmpge_epu64_mask(k, a, b) simde_mm_mask_cmpge_epu64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1350,8 +1350,8 @@ simde_mm256_cmpge_epu64_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu64_mask - #define _mm512_cmpge_epu64_mask(a, b) simde_mm512_cmpge_epu64_mask((a), (b)) + #undef _mm256_cmpge_epu64_mask + #define _mm256_cmpge_epu64_mask(a, b) simde_mm256_cmpge_epu64_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1365,7 +1365,7 @@ simde_mm256_mask_cmpge_epu64_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpge_epu64_mask - #define _mm256_mask_cmpge_epu64_mask(src, k, a, b) simde_mm256_mask_cmpge_epu64_mask((src), (k), (a), (b)) + #define _mm256_mask_cmpge_epu64_mask(k, a, b) simde_mm256_mask_cmpge_epu64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1425,7 +1425,7 @@ simde_mm512_mask_cmpge_epu64_mask(simde__mmask8 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmpge_epu64_mask - #define _mm512_mask_cmpge_epu64_mask(src, k, a, b) simde_mm512_mask_cmpge_epu64_mask((src), (k), (a), (b)) + #define _mm512_mask_cmpge_epu64_mask(k, a, b) simde_mm512_mask_cmpge_epu64_mask((k), (a), (b)) #endif SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/x86/avx512/cmpgt.h 
b/lib/simd_wrapper/simde/x86/avx512/cmpgt.h index 2894df9bb82..15245f968fb 100644 --- a/lib/simd_wrapper/simde/x86/avx512/cmpgt.h +++ b/lib/simd_wrapper/simde/x86/avx512/cmpgt.h @@ -109,6 +109,29 @@ simde_mm512_cmpgt_epu8_mask (simde__m512i a, simde__m512i b) { #define _mm512_cmpgt_epu8_mask(a, b) simde_mm512_cmpgt_epu8_mask(a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_cmpgt_epi16_mask (simde__m512i a, simde__m512i b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_cmpgt_epi16_mask(a, b); + #else + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a), + b_ = simde__m512i_to_private(b); + + for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { + r_.m256i[i] = simde_mm256_cmpgt_epi16(a_.m256i[i], b_.m256i[i]); + } + + return simde_mm512_movepi16_mask(simde__m512i_from_private(r_)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_cmpgt_epi16_mask + #define _mm512_cmpgt_epi16_mask(a, b) simde_mm512_cmpgt_epi16_mask(a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__mmask16 simde_mm512_cmpgt_epi32_mask (simde__m512i a, simde__m512i b) { diff --git a/lib/simd_wrapper/simde/x86/avx512/cmple.h b/lib/simd_wrapper/simde/x86/avx512/cmple.h index c83227f4824..9b3c3aad24a 100644 --- a/lib/simd_wrapper/simde/x86/avx512/cmple.h +++ b/lib/simd_wrapper/simde/x86/avx512/cmple.h @@ -76,8 +76,8 @@ simde_mm_cmple_epi8_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi8_mask - #define _mm512_cmple_epi8_mask(a, b) simde_mm512_cmple_epi8_mask((a), (b)) + #undef _mm_cmple_epi8_mask + #define _mm_cmple_epi8_mask(a, b) simde_mm_cmple_epi8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -91,7 +91,7 @@ simde_mm_mask_cmple_epi8_mask(simde__mmask16 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VBW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmple_epi8_mask - #define _mm_mask_cmple_epi8_mask(src, k, a, b) simde_mm_mask_cmple_epi8_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epi8_mask(k, a, b) simde_mm_mask_cmple_epi8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -132,8 +132,8 @@ simde_mm256_cmple_epi8_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VBW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi8_mask - #define _mm512_cmple_epi8_mask(a, b) simde_mm512_cmple_epi8_mask((a), (b)) + #undef _mm256_cmple_epi8_mask + #define _mm256_cmple_epi8_mask(a, b) simde_mm256_cmple_epi8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -147,7 +147,7 @@ simde_mm256_mask_cmple_epi8_mask(simde__mmask32 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmple_epi8_mask - #define _mm256_mask_cmple_epi8_mask(src, k, a, b) simde_mm256_mask_cmple_epi8_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epi8_mask(k, a, b) simde_mm256_mask_cmple_epi8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -207,7 +207,7 @@ simde_mm512_mask_cmple_epi8_mask(simde__mmask64 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epi8_mask - #define _mm512_mask_cmple_epi8_mask(src, k, a, b) simde_mm512_mask_cmple_epi8_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epi8_mask(k, a, b) simde_mm512_mask_cmple_epi8_mask((k), (a), (b)) #endif 
SIMDE_FUNCTION_ATTRIBUTES @@ -250,8 +250,8 @@ simde_mm_cmple_epu8_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu8_mask - #define _mm512_cmple_epu8_mask(a, b) simde_mm512_cmple_epu8_mask((a), (b)) + #undef _mm_cmple_epu8_mask + #define _mm_cmple_epu8_mask(a, b) simde_mm_cmple_epu8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -265,7 +265,7 @@ simde_mm_mask_cmple_epu8_mask(simde__mmask16 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmple_epu8_mask - #define _mm_mask_cmple_epu8_mask(src, k, a, b) simde_mm_mask_cmple_epu8_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epu8_mask(k, a, b) simde_mm_mask_cmple_epu8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -306,8 +306,8 @@ simde_mm256_cmple_epu8_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu8_mask - #define _mm512_cmple_epu8_mask(a, b) simde_mm512_cmple_epu8_mask((a), (b)) + #undef _mm256_cmple_epu8_mask + #define _mm256_cmple_epu8_mask(a, b) simde_mm256_cmple_epu8_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -321,7 +321,7 @@ simde_mm256_mask_cmple_epu8_mask(simde__mmask32 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmple_epu8_mask - #define _mm256_mask_cmple_epu8_mask(src, k, a, b) simde_mm256_mask_cmple_epu8_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epu8_mask(k, a, b) simde_mm256_mask_cmple_epu8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -381,7 +381,7 @@ simde_mm512_mask_cmple_epu8_mask(simde__mmask64 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epu8_mask - #define _mm512_mask_cmple_epu8_mask(src, k, a, b) simde_mm512_mask_cmple_epu8_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epu8_mask(k, a, b) simde_mm512_mask_cmple_epu8_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -424,8 +424,8 @@ simde_mm_cmple_epi16_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi16_mask - #define _mm512_cmple_epi16_mask(a, b) simde_mm512_cmple_epi16_mask((a), (b)) + #undef _mm_cmple_epi16_mask + #define _mm_cmple_epi16_mask(a, b) simde_mm_cmple_epi16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -439,7 +439,7 @@ simde_mm_mask_cmple_epi16_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmple_epi16_mask - #define _mm_mask_cmple_epi16_mask(src, k, a, b) simde_mm_mask_cmple_epi16_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epi16_mask(k, a, b) simde_mm_mask_cmple_epi16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -480,8 +480,8 @@ simde_mm256_cmple_epi16_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi16_mask - #define _mm512_cmple_epi16_mask(a, b) simde_mm512_cmple_epi16_mask((a), (b)) + #undef _mm256_cmple_epi16_mask + #define 
_mm256_cmple_epi16_mask(a, b) simde_mm256_cmple_epi16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -495,7 +495,7 @@ simde_mm256_mask_cmple_epi16_mask(simde__mmask16 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmple_epi16_mask - #define _mm256_mask_cmple_epi16_mask(src, k, a, b) simde_mm256_mask_cmple_epi16_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epi16_mask(k, a, b) simde_mm256_mask_cmple_epi16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -555,7 +555,7 @@ simde_mm512_mask_cmple_epi16_mask(simde__mmask32 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epi16_mask - #define _mm512_mask_cmple_epi16_mask(src, k, a, b) simde_mm512_mask_cmple_epi16_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epi16_mask(k, a, b) simde_mm512_mask_cmple_epi16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -598,8 +598,8 @@ simde_mm_cmple_epu16_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu16_mask - #define _mm512_cmple_epu16_mask(a, b) simde_mm512_cmple_epu16_mask((a), (b)) + #undef _mm_cmple_epu16_mask + #define _mm_cmple_epu16_mask(a, b) simde_mm_cmple_epu16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -613,7 +613,7 @@ simde_mm_mask_cmple_epu16_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmple_epu16_mask - #define _mm_mask_cmple_epu16_mask(src, k, a, b) simde_mm_mask_cmple_epu16_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epu16_mask(k, a, b) simde_mm_mask_cmple_epu16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -654,8 +654,8 @@ simde_mm256_cmple_epu16_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu16_mask - #define _mm512_cmple_epu16_mask(a, b) simde_mm512_cmple_epu16_mask((a), (b)) + #undef _mm256_cmple_epu16_mask + #define _mm256_cmple_epu16_mask(a, b) simde_mm256_cmple_epu16_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -669,7 +669,7 @@ simde_mm256_mask_cmple_epu16_mask(simde__mmask16 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmple_epu16_mask - #define _mm256_mask_cmple_epu16_mask(src, k, a, b) simde_mm256_mask_cmple_epu16_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epu16_mask(k, a, b) simde_mm256_mask_cmple_epu16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -729,7 +729,7 @@ simde_mm512_mask_cmple_epu16_mask(simde__mmask32 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epu16_mask - #define _mm512_mask_cmple_epu16_mask(src, k, a, b) simde_mm512_mask_cmple_epu16_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epu16_mask(k, a, b) simde_mm512_mask_cmple_epu16_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -772,8 +772,8 @@ simde_mm_cmple_epi32_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi32_mask - #define _mm512_cmple_epi32_mask(a, b) 
simde_mm512_cmple_epi32_mask((a), (b)) + #undef _mm_cmple_epi32_mask + #define _mm_cmple_epi32_mask(a, b) simde_mm_cmple_epi32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -787,7 +787,7 @@ simde_mm_mask_cmple_epi32_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmple_epi32_mask - #define _mm_mask_cmple_epi32_mask(src, k, a, b) simde_mm_mask_cmple_epi32_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epi32_mask(k, a, b) simde_mm_mask_cmple_epi32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -828,8 +828,8 @@ simde_mm256_cmple_epi32_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi32_mask - #define _mm512_cmple_epi32_mask(a, b) simde_mm512_cmple_epi32_mask((a), (b)) + #undef _mm256_cmple_epi32_mask + #define _mm256_cmple_epi32_mask(a, b) simde_mm256_cmple_epi32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -843,7 +843,7 @@ simde_mm256_mask_cmple_epi32_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmple_epi32_mask - #define _mm256_mask_cmple_epi32_mask(src, k, a, b) simde_mm256_mask_cmple_epi32_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epi32_mask(k, a, b) simde_mm256_mask_cmple_epi32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -903,7 +903,7 @@ simde_mm512_mask_cmple_epi32_mask(simde__mmask16 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epi32_mask - #define _mm512_mask_cmple_epi32_mask(src, k, a, b) simde_mm512_mask_cmple_epi32_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epi32_mask(k, a, b) simde_mm512_mask_cmple_epi32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -946,8 +946,8 @@ simde_mm_cmple_epu32_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu32_mask - #define _mm512_cmple_epu32_mask(a, b) simde_mm512_cmple_epu32_mask((a), (b)) + #undef _mm_cmple_epu32_mask + #define _mm_cmple_epu32_mask(a, b) simde_mm_cmple_epu32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -961,7 +961,7 @@ simde_mm_mask_cmple_epu32_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmple_epu32_mask - #define _mm_mask_cmple_epu32_mask(src, k, a, b) simde_mm_mask_cmple_epu32_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epu32_mask(k, a, b) simde_mm_mask_cmple_epu32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1002,8 +1002,8 @@ simde_mm256_cmple_epu32_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu32_mask - #define _mm512_cmple_epu32_mask(a, b) simde_mm512_cmple_epu32_mask((a), (b)) + #undef _mm256_cmple_epu32_mask + #define _mm256_cmple_epu32_mask(a, b) simde_mm256_cmple_epu32_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1017,7 +1017,7 @@ simde_mm256_mask_cmple_epu32_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmple_epu32_mask - #define _mm256_mask_cmple_epu32_mask(src, k, a, b) simde_mm256_mask_cmple_epu32_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epu32_mask(k, a, b) simde_mm256_mask_cmple_epu32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1077,7 +1077,7 @@ 
simde_mm512_mask_cmple_epu32_mask(simde__mmask16 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epu32_mask - #define _mm512_mask_cmple_epu32_mask(src, k, a, b) simde_mm512_mask_cmple_epu32_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epu32_mask(k, a, b) simde_mm512_mask_cmple_epu32_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1135,7 +1135,7 @@ simde_mm_mask_cmple_epi64_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmple_epi64_mask - #define _mm_mask_cmple_epi64_mask(src, k, a, b) simde_mm_mask_cmple_epi64_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epi64_mask(k, a, b) simde_mm_mask_cmple_epi64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1191,7 +1191,7 @@ simde_mm256_mask_cmple_epi64_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmple_epi64_mask - #define _mm256_mask_cmple_epi64_mask(src, k, a, b) simde_mm256_mask_cmple_epi64_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epi64_mask(k, a, b) simde_mm256_mask_cmple_epi64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1251,7 +1251,7 @@ simde_mm512_mask_cmple_epi64_mask(simde__mmask8 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epi64_mask - #define _mm512_mask_cmple_epi64_mask(src, k, a, b) simde_mm512_mask_cmple_epi64_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epi64_mask(k, a, b) simde_mm512_mask_cmple_epi64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1292,8 +1292,8 @@ simde_mm_cmple_epu64_mask (simde__m128i a, simde__m128i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu64_mask - #define _mm512_cmple_epu64_mask(a, b) simde_mm512_cmple_epu64_mask((a), (b)) + #undef _mm_cmple_epu64_mask + #define _mm_cmple_epu64_mask(a, b) simde_mm_cmple_epu64_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1307,7 +1307,7 @@ simde_mm_mask_cmple_epu64_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmple_epu64_mask - #define _mm_mask_cmple_epu64_mask(src, k, a, b) simde_mm_mask_cmple_epu64_mask((src), (k), (a), (b)) + #define _mm_mask_cmple_epu64_mask(k, a, b) simde_mm_mask_cmple_epu64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1348,8 +1348,8 @@ simde_mm256_cmple_epu64_mask (simde__m256i a, simde__m256i b) { #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu64_mask - #define _mm512_cmple_epu64_mask(a, b) simde_mm512_cmple_epu64_mask((a), (b)) + #undef _mm256_cmple_epu64_mask + #define _mm256_cmple_epu64_mask(a, b) simde_mm256_cmple_epu64_mask((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1363,7 +1363,7 @@ simde_mm256_mask_cmple_epu64_mask(simde__mmask8 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmple_epu64_mask - #define _mm256_mask_cmple_epu64_mask(src, k, a, b) simde_mm256_mask_cmple_epu64_mask((src), (k), (a), (b)) + #define _mm256_mask_cmple_epu64_mask(k, a, b) simde_mm256_mask_cmple_epu64_mask((k), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -1423,7 +1423,7 @@ simde_mm512_mask_cmple_epu64_mask(simde__mmask8 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_cmple_epu64_mask - #define 
_mm512_mask_cmple_epu64_mask(src, k, a, b) simde_mm512_mask_cmple_epu64_mask((src), (k), (a), (b)) + #define _mm512_mask_cmple_epu64_mask(k, a, b) simde_mm512_mask_cmple_epu64_mask((k), (a), (b)) #endif SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/x86/avx512/cmpneq.h b/lib/simd_wrapper/simde/x86/avx512/cmpneq.h index 6583155ddc7..6e9bf3364fc 100644 --- a/lib/simd_wrapper/simde/x86/avx512/cmpneq.h +++ b/lib/simd_wrapper/simde/x86/avx512/cmpneq.h @@ -61,7 +61,7 @@ simde_mm_mask_cmpneq_epi8_mask(simde__mmask16 k1, simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epi8_mask - #define _mm_mask_cmpneq_epi8_mask(a, b) simde_mm_mask_cmpneq_epi8_mask((a), (b)) + #define _mm_mask_cmpneq_epi8_mask(k1, a, b) simde_mm_mask_cmpneq_epi8_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -89,7 +89,7 @@ simde_mm_mask_cmpneq_epu8_mask(simde__mmask16 k1, simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epu8_mask - #define _mm_mask_cmpneq_epu8_mask(a, b) simde_mm_mask_cmpneq_epu8_mask((a), (b)) + #define _mm_mask_cmpneq_epu8_mask(k1, a, b) simde_mm_mask_cmpneq_epu8_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -117,7 +117,7 @@ simde_mm_mask_cmpneq_epi16_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epi16_mask - #define _mm_mask_cmpneq_epi16_mask(a, b) simde_mm_mask_cmpneq_epi16_mask((a), (b)) + #define _mm_mask_cmpneq_epi16_mask(k1, a, b) simde_mm_mask_cmpneq_epi16_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -145,7 +145,7 @@ simde_mm_mask_cmpneq_epu16_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epu16_mask - #define _mm_mask_cmpneq_epu16_mask(a, b) simde_mm_mask_cmpneq_epu16_mask((a), (b)) + #define _mm_mask_cmpneq_epu16_mask(k1, a, b) simde_mm_mask_cmpneq_epu16_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -173,7 +173,7 @@ simde_mm_mask_cmpneq_epi32_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epi32_mask - #define _mm_mask_cmpneq_epi32_mask(a, b) simde_mm_mask_cmpneq_epi32_mask((a), (b)) + #define _mm_mask_cmpneq_epi32_mask(k1, a, b) simde_mm_mask_cmpneq_epi32_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -201,7 +201,7 @@ simde_mm_mask_cmpneq_epu32_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epu32_mask - #define _mm_mask_cmpneq_epu32_mask(a, b) simde_mm_mask_cmpneq_epu32_mask((a), (b)) + #define _mm_mask_cmpneq_epu32_mask(k1, a, b) simde_mm_mask_cmpneq_epu32_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -229,7 +229,7 @@ simde_mm_mask_cmpneq_epi64_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epi64_mask - #define _mm_mask_cmpneq_epi64_mask(a, b) simde_mm_mask_cmpneq_epi64_mask((a), (b)) + #define _mm_mask_cmpneq_epi64_mask(k1, a, b) simde_mm_mask_cmpneq_epi64_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -257,7 +257,7 @@ simde_mm_mask_cmpneq_epu64_mask(simde__mmask8 k1, 
simde__m128i a, simde__m128i b } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_cmpneq_epu64_mask - #define _mm_mask_cmpneq_epu64_mask(a, b) simde_mm_mask_cmpneq_epu64_mask((a), (b)) + #define _mm_mask_cmpneq_epu64_mask(k1, a, b) simde_mm_mask_cmpneq_epu64_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -285,7 +285,7 @@ simde_mm256_mask_cmpneq_epi8_mask(simde__mmask32 k1, simde__m256i a, simde__m256 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epi8_mask - #define _mm256_mask_cmpneq_epi8_mask(a, b) simde_mm256_mask_cmpneq_epi8_mask((a), (b)) + #define _mm256_mask_cmpneq_epi8_mask(k1, a, b) simde_mm256_mask_cmpneq_epi8_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -313,7 +313,7 @@ simde_mm256_mask_cmpneq_epu8_mask(simde__mmask32 k1, simde__m256i a, simde__m256 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epu8_mask - #define _mm256_mask_cmpneq_epu8_mask(a, b) simde_mm256_mask_cmpneq_epu8_mask((a), (b)) + #define _mm256_mask_cmpneq_epu8_mask(k1, a, b) simde_mm256_mask_cmpneq_epu8_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -341,7 +341,7 @@ simde_mm256_mask_cmpneq_epi16_mask(simde__mmask16 k1, simde__m256i a, simde__m25 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epi16_mask - #define _mm256_mask_cmpneq_epi16_mask(a, b) simde_mm256_mask_cmpneq_epi16_mask((a), (b)) + #define _mm256_mask_cmpneq_epi16_mask(k1, a, b) simde_mm256_mask_cmpneq_epi16_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -369,7 +369,7 @@ simde_mm256_mask_cmpneq_epu16_mask(simde__mmask16 k1, simde__m256i a, simde__m25 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epu16_mask - #define _mm256_mask_cmpneq_epu16_mask(a, b) simde_mm256_mask_cmpneq_epu16_mask((a), (b)) + #define _mm256_mask_cmpneq_epu16_mask(k1, a, b) simde_mm256_mask_cmpneq_epu16_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -397,7 +397,7 @@ simde_mm256_mask_cmpneq_epi32_mask(simde__mmask8 k1, simde__m256i a, simde__m256 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epi32_mask - #define _mm256_mask_cmpneq_epi32_mask(a, b) simde_mm256_mask_cmpneq_epi32_mask((a), (b)) + #define _mm256_mask_cmpneq_epi32_mask(k1, a, b) simde_mm256_mask_cmpneq_epi32_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -425,7 +425,7 @@ simde_mm256_mask_cmpneq_epu32_mask(simde__mmask8 k1, simde__m256i a, simde__m256 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epu32_mask - #define _mm256_mask_cmpneq_epu32_mask(a, b) simde_mm256_mask_cmpneq_epu32_mask((a), (b)) + #define _mm256_mask_cmpneq_epu32_mask(k1, a, b) simde_mm256_mask_cmpneq_epu32_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -453,7 +453,7 @@ simde_mm256_mask_cmpneq_epi64_mask(simde__mmask8 k1, simde__m256i a, simde__m256 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epi64_mask - #define _mm256_mask_cmpneq_epi64_mask(a, b) simde_mm256_mask_cmpneq_epi64_mask((a), (b)) + #define _mm256_mask_cmpneq_epi64_mask(k1, a, b) simde_mm256_mask_cmpneq_epi64_mask((k1), (a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -481,7 +481,7 @@ simde_mm256_mask_cmpneq_epu64_mask(simde__mmask8 k1, simde__m256i 
a, simde__m256 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_cmpneq_epu64_mask - #define _mm256_mask_cmpneq_epu64_mask(a, b) simde_mm256_mask_cmpneq_epu64_mask((a), (b)) + #define _mm256_mask_cmpneq_epu64_mask(k1, a, b) simde_mm256_mask_cmpneq_epu64_mask((k1), (a), (b)) #endif SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/x86/avx512/compress.h b/lib/simd_wrapper/simde/x86/avx512/compress.h index 1eb6fae45e5..06fffc733b1 100644 --- a/lib/simd_wrapper/simde/x86/avx512/compress.h +++ b/lib/simd_wrapper/simde/x86/avx512/compress.h @@ -34,14 +34,17 @@ simde_mm256_mask_compress_pd (simde__m256d src, simde__mmask8 k, simde__m256d a) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_compress_pd - #define _mm256_mask_compress_pd(src, k, a) _mm256_mask_compress_pd(src, k, a) + #define _mm256_mask_compress_pd(src, k, a) simde_mm256_mask_compress_pd(src, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm256_mask_compressstoreu_pd (void* base_addr, simde__mmask8 k, simde__m256d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm256_mask_compressstoreu_pd(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = _pext_u32(-1, k); + _mm256_mask_storeu_pd(base_addr, store_mask, _mm256_maskz_compress_pd(k, a)); #else simde__m256d_private a_ = simde__m256d_to_private(a); @@ -61,7 +64,7 @@ simde_mm256_mask_compressstoreu_pd (void* base_addr, simde__mmask8 k, simde__m25 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_compressstoreu_pd - #define _mm256_mask_compressstoreu_pd(base_addr, k, a) _mm256_mask_compressstoreu_pd(base_addr, k, a) + #define _mm256_mask_compressstoreu_pd(base_addr, k, a) simde_mm256_mask_compressstoreu_pd(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -90,7 +93,7 @@ simde_mm256_maskz_compress_pd (simde__mmask8 k, simde__m256d a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_compress_pd - #define _mm256_maskz_compress_pd(k, a) _mm256_maskz_compress_pd(k, a) + #define _mm256_maskz_compress_pd(k, a) simde_mm256_maskz_compress_pd(k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -120,14 +123,17 @@ simde_mm256_mask_compress_ps (simde__m256 src, simde__mmask8 k, simde__m256 a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_compress_ps - #define _mm256_mask_compress_ps(src, k, a) _mm256_mask_compress_ps(src, k, a) + #define _mm256_mask_compress_ps(src, k, a) simde_mm256_mask_compress_ps(src, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm256_mask_compressstoreu_ps (void* base_addr, simde__mmask8 k, simde__m256 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm256_mask_compressstoreu_ps(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = _pext_u32(-1, k); + _mm256_mask_storeu_ps(base_addr, store_mask, _mm256_maskz_compress_ps(k, a)); #else simde__m256_private a_ = 
simde__m256_to_private(a); @@ -146,8 +152,8 @@ simde_mm256_mask_compressstoreu_ps (void* base_addr, simde__mmask8 k, simde__m25 #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_compressstoreu_pd - #define _mm256_mask_compressstoreu_ps(base_addr, k, a) _mm256_mask_compressstoreu_ps(base_addr, k, a) + #undef _mm256_mask_compressstoreu_ps + #define _mm256_mask_compressstoreu_ps(base_addr, k, a) simde_mm256_mask_compressstoreu_ps(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -176,7 +182,7 @@ simde_mm256_maskz_compress_ps (simde__mmask8 k, simde__m256 a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_compress_ps - #define _mm256_maskz_compress_ps(k, a) _mm256_maskz_compress_ps(k, a) + #define _mm256_maskz_compress_ps(k, a) simde_mm256_maskz_compress_ps(k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -206,14 +212,17 @@ simde_mm256_mask_compress_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_compress_epi32 - #define _mm256_mask_compress_epi32(src, k, a) _mm256_mask_compress_epi32(src, k, a) + #define _mm256_mask_compress_epi32(src, k, a) simde_mm256_mask_compress_epi32(src, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm256_mask_compressstoreu_epi32 (void* base_addr, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm256_mask_compressstoreu_epi32(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = _pext_u32(-1, k); + _mm256_mask_storeu_epi32(base_addr, store_mask, _mm256_maskz_compress_epi32(k, a)); #else simde__m256i_private a_ = simde__m256i_to_private(a); @@ -233,7 +242,7 @@ simde_mm256_mask_compressstoreu_epi32 (void* base_addr, simde__mmask8 k, simde__ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_compressstoreu_epi32 - #define _mm256_mask_compressstoreu_epi32(base_addr, k, a) _mm256_mask_compressstoreu_epi32(base_addr, k, a) + #define _mm256_mask_compressstoreu_epi32(base_addr, k, a) simde_mm256_mask_compressstoreu_epi32(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -262,7 +271,7 @@ simde_mm256_maskz_compress_epi32 (simde__mmask8 k, simde__m256i a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_compress_epi32 - #define _mm256_maskz_compress_epi32(k, a) _mm256_maskz_compress_epi32(k, a) + #define _mm256_maskz_compress_epi32(k, a) simde_mm256_maskz_compress_epi32(k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -292,14 +301,17 @@ simde_mm256_mask_compress_epi64 (simde__m256i src, simde__mmask8 k, simde__m256i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_compress_epi64 - #define _mm256_mask_compress_epi64(src, k, a) _mm256_mask_compress_epi64(src, k, a) + #define _mm256_mask_compress_epi64(src, k, a) simde_mm256_mask_compress_epi64(src, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm256_mask_compressstoreu_epi64 (void* base_addr, simde__mmask8 k, simde__m256i a) { - #if 
defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm256_mask_compressstoreu_epi64(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = _pext_u32(-1, k); + _mm256_mask_storeu_epi64(base_addr, store_mask, _mm256_maskz_compress_epi64(k, a)); #else simde__m256i_private a_ = simde__m256i_to_private(a); @@ -319,7 +331,7 @@ simde_mm256_mask_compressstoreu_epi64 (void* base_addr, simde__mmask8 k, simde__ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_compressstoreu_epi64 - #define _mm256_mask_compressstoreu_epi64(base_addr, k, a) _mm256_mask_compressstoreu_epi64(base_addr, k, a) + #define _mm256_mask_compressstoreu_epi64(base_addr, k, a) simde_mm256_mask_compressstoreu_epi64(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -348,7 +360,7 @@ simde_mm256_maskz_compress_epi64 (simde__mmask8 k, simde__m256i a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_compress_epi64 - #define _mm256_maskz_compress_epi64(k, a) _mm256_maskz_compress_epi64(k, a) + #define _mm256_maskz_compress_epi64(k, a) simde_mm256_maskz_compress_epi64(k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -378,14 +390,17 @@ simde_mm512_mask_compress_pd (simde__m512d src, simde__mmask8 k, simde__m512d a) } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_compress_pd - #define _mm512_mask_compress_pd(src, k, a) _mm512_mask_compress_pd(src, k, a) + #define _mm512_mask_compress_pd(src, k, a) simde_mm512_mask_compress_pd(src, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm512_mask_compressstoreu_pd (void* base_addr, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm512_mask_compressstoreu_pd(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = _pext_u32(-1, k); + _mm512_mask_storeu_pd(base_addr, store_mask, _mm512_maskz_compress_pd(k, a)); #else simde__m512d_private a_ = simde__m512d_to_private(a); @@ -405,7 +420,7 @@ simde_mm512_mask_compressstoreu_pd (void* base_addr, simde__mmask8 k, simde__m51 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_compressstoreu_pd - #define _mm512_mask_compressstoreu_pd(base_addr, k, a) _mm512_mask_compressstoreu_pd(base_addr, k, a) + #define _mm512_mask_compressstoreu_pd(base_addr, k, a) simde_mm512_mask_compressstoreu_pd(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -434,7 +449,7 @@ simde_mm512_maskz_compress_pd (simde__mmask8 k, simde__m512d a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_compress_pd - #define _mm512_maskz_compress_pd(k, a) _mm512_maskz_compress_pd(k, a) + #define _mm512_maskz_compress_pd(k, a) simde_mm512_maskz_compress_pd(k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -464,14 +479,17 @@ simde_mm512_mask_compress_ps (simde__m512 src, simde__mmask16 k, simde__m512 a) } #if 
defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_compress_ps - #define _mm512_mask_compress_ps(src, k, a) _mm512_mask_compress_ps(src, k, a) + #define _mm512_mask_compress_ps(src, k, a) simde_mm512_mask_compress_ps(src, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm512_mask_compressstoreu_ps (void* base_addr, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm512_mask_compressstoreu_ps(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask16 store_mask = _pext_u32(-1, k); + _mm512_mask_storeu_ps(base_addr, store_mask, _mm512_maskz_compress_ps(k, a)); #else simde__m512_private a_ = simde__m512_to_private(a); @@ -490,8 +508,8 @@ simde_mm512_mask_compressstoreu_ps (void* base_addr, simde__mmask16 k, simde__m5 #endif } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_compressstoreu_pd - #define _mm512_mask_compressstoreu_ps(base_addr, k, a) _mm512_mask_compressstoreu_ps(base_addr, k, a) + #undef _mm512_mask_compressstoreu_ps + #define _mm512_mask_compressstoreu_ps(base_addr, k, a) simde_mm512_mask_compressstoreu_ps(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -520,7 +538,7 @@ simde_mm512_maskz_compress_ps (simde__mmask16 k, simde__m512 a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_compress_ps - #define _mm512_maskz_compress_ps(k, a) _mm512_maskz_compress_ps(k, a) + #define _mm512_maskz_compress_ps(k, a) simde_mm512_maskz_compress_ps(k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -550,14 +568,47 @@ simde_mm512_mask_compress_epi32 (simde__m512i src, simde__mmask16 k, simde__m512 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_compress_epi32 - #define _mm512_mask_compress_epi32(src, k, a) _mm512_mask_compress_epi32(src, k, a) + #define _mm512_mask_compress_epi32(src, k, a) simde_mm512_mask_compress_epi32(src, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_compressstoreu_epi16 (void* base_addr, simde__mmask32 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) && !defined(__znver4__) + _mm512_mask_compressstoreu_epi16(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VBMI2_NATIVE) && defined(__znver4__) + simde__mmask32 store_mask = _pext_u32(-1, k); + _mm512_mask_storeu_epi16(base_addr, store_mask, _mm512_maskz_compress_epi16(k, a)); + #else + simde__m512i_private + a_ = simde__m512i_to_private(a); + size_t ri = 0; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0])) ; i++) { + if ((k >> i) & 1) { + a_.i16[ri++] = a_.i16[i]; + } + } + + simde_memcpy(base_addr, &a_, ri * sizeof(a_.i16[0])); + + return; + #endif +} +#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_compressstoreu_epi16 + #define _mm512_mask_compressstoreu_epi16(base_addr, k, a) simde_mm512_mask_compressstoreu_epi16(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm512_mask_compressstoreu_epi32 (void* base_addr, simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && 
defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm512_mask_compressstoreu_epi32(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask16 store_mask = _pext_u32(-1, k); + _mm512_mask_storeu_epi32(base_addr, store_mask, _mm512_maskz_compress_epi32(k, a)); #else simde__m512i_private a_ = simde__m512i_to_private(a); @@ -577,7 +628,7 @@ simde_mm512_mask_compressstoreu_epi32 (void* base_addr, simde__mmask16 k, simde_ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_compressstoreu_epi32 - #define _mm512_mask_compressstoreu_epi32(base_addr, k, a) _mm512_mask_compressstoreu_epi32(base_addr, k, a) + #define _mm512_mask_compressstoreu_epi32(base_addr, k, a) simde_mm512_mask_compressstoreu_epi32(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -606,7 +657,7 @@ simde_mm512_maskz_compress_epi32 (simde__mmask16 k, simde__m512i a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_compress_epi32 - #define _mm512_maskz_compress_epi32(k, a) _mm512_maskz_compress_epi32(k, a) + #define _mm512_maskz_compress_epi32(k, a) simde_mm512_maskz_compress_epi32(k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -636,14 +687,17 @@ simde_mm512_mask_compress_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_compress_epi64 - #define _mm512_mask_compress_epi64(src, k, a) _mm512_mask_compress_epi64(src, k, a) + #define _mm512_mask_compress_epi64(src, k, a) simde_mm512_mask_compress_epi64(src, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES void simde_mm512_mask_compressstoreu_epi64 (void* base_addr, simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) + #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) _mm512_mask_compressstoreu_epi64(base_addr, k, a); + #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) + simde__mmask8 store_mask = _pext_u32(-1, k); + _mm512_mask_storeu_epi64(base_addr, store_mask, _mm512_maskz_compress_epi64(k, a)); #else simde__m512i_private a_ = simde__m512i_to_private(a); @@ -663,7 +717,7 @@ simde_mm512_mask_compressstoreu_epi64 (void* base_addr, simde__mmask8 k, simde__ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_compressstoreu_epi64 - #define _mm512_mask_compressstoreu_epi64(base_addr, k, a) _mm512_mask_compressstoreu_epi64(base_addr, k, a) + #define _mm512_mask_compressstoreu_epi64(base_addr, k, a) simde_mm512_mask_compressstoreu_epi64(base_addr, k, a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -692,7 +746,7 @@ simde_mm512_maskz_compress_epi64 (simde__mmask8 k, simde__m512i a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_compress_epi64 - #define _mm512_maskz_compress_epi64(k, a) _mm512_maskz_compress_epi64(k, a) + #define _mm512_maskz_compress_epi64(k, a) simde_mm512_maskz_compress_epi64(k, a) #endif SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/x86/avx512/cvt.h b/lib/simd_wrapper/simde/x86/avx512/cvt.h index 6abf8e897d9..579bcac10d7 100644 --- a/lib/simd_wrapper/simde/x86/avx512/cvt.h +++ 
b/lib/simd_wrapper/simde/x86/avx512/cvt.h @@ -32,6 +32,7 @@ #include "types.h" #include "mov.h" +#include "../../simde-f16.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -106,9 +107,36 @@ simde_mm_maskz_cvtepi64_pd(simde__mmask8 k, simde__m128i a) { } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_cvtepi64_pd - #define _mm_maskz_cvtepi64_pd(k, a) simde_mm_maskz_cvtepi64_pd(k, a) + #define _mm_maskz_cvtepi64_pd(k, a) simde_mm_maskz_cvtepi64_pd((k), (a)) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_cvtepi16_epi32 (simde__m256i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cvtepi16_epi32(a); + #else + simde__m512i_private r_; + simde__m256i_private a_ = simde__m256i_to_private(a); + + #if defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.i32, a_.i16); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = a_.i16[i]; + } + #endif + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cvtepi16_epi32 + #define _mm512_cvtepi16_epi32(a) simde_mm512_cvtepi16_epi32(a) +#endif + + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm512_cvtepi16_epi8 (simde__m512i a) { @@ -172,7 +200,10 @@ simde_mm512_cvtepi8_epi16 (simde__m256i a) { simde__m512i_private r_; simde__m256i_private a_ = simde__m256i_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_X86_AVX2_NATIVE) + r_.m256i[0] = _mm256_cvtepi8_epi16(a_.m128i[0]); + r_.m256i[1] = _mm256_cvtepi8_epi16(a_.m128i[1]); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.i16, a_.i8); #else SIMDE_VECTORIZE @@ -189,6 +220,35 @@ simde_mm512_cvtepi8_epi16 (simde__m256i a) { #define _mm512_cvtepi8_epi16(a) simde_mm512_cvtepi8_epi16(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_cvtepi32_ps (simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cvtepi32_ps(a); + #else + simde__m512_private r_; + simde__m512i_private a_ = simde__m512i_to_private(a); + + #if defined(SIMDE_X86_AVX_NATIVE) + r_.m256[0] = _mm256_cvtepi32_ps(a_.m256i[0]); + r_.m256[1] = _mm256_cvtepi32_ps(a_.m256i[1]); + #elif defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.i32[i]); + } + #endif + + return simde__m512_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cvtepi32_ps + #define _mm512_cvtepi32_ps(a) simde_mm512_cvtepi32_ps(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm512_cvtepi64_epi32 (simde__m512i a) { @@ -212,7 +272,33 @@ simde_mm512_cvtepi64_epi32 (simde__m512i a) { } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_cvtepi64_epi32 - #define _mm512_cvtepi64_epi32(a) simde_mm512_cvtepi64_epi32(a) + #define _mm512_cvtepi64_epi32(a) simde_mm512_cvtepi64_epi32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_cvtepu16_epi32 (simde__m256i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cvtepu16_epi32(a); + #else + simde__m512i_private r_; + simde__m256i_private a_ = simde__m256i_to_private(a); + + #if defined(SIMDE_CONVERT_VECTOR_) + SIMDE_CONVERT_VECTOR_(r_.i32, a_.u16); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = 
HEDLEY_STATIC_CAST(int32_t, a_.u16[i]); + } + #endif + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cvtepu16_epi32 + #define _mm512_cvtepu16_epi32(a) simde_mm512_cvtepu16_epi32(a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -247,8 +333,67 @@ simde_mm512_cvtepu32_ps (simde__m512i a) { #endif } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtepu32_epi32 - #define _mm512_cvtepu32_epi32(a) simde_mm512_cvtepu32_ps(a) + #undef _mm512_cvtepu32_ps + #define _mm512_cvtepu32_ps(a) simde_mm512_cvtepu32_ps(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_cvtph_ps(simde__m256i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cvtph_ps(a); + #endif + simde__m256i_private a_ = simde__m256i_to_private(a); + simde__m512_private r_; + + #if defined(SIMDE_X86_F16C_NATIVE) + r_.m256[0] = _mm256_cvtph_ps(a_.m128i[0]); + r_.m256[1] = _mm256_cvtph_ps(a_.m128i[1]); + #elif defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = simde_float16_to_float32(a_.f16[i]); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = simde_float16_to_float32(simde_uint16_as_float16(a_.u16[i])); + } + #endif + + return simde__m512_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cvtph_ps + #define _mm512_cvtph_ps(a) simde_mm512_cvtph_ps(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_cvtps_epi32(simde__m512 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cvtps_epi32(a); + #endif + simde__m512_private a_ = simde__m512_to_private(a); + simde__m512i_private r_; + + #if defined(SIMDE_X86_AVX_NATIVE) + r_.m256i[0] = _mm256_cvtps_epi32(a_.m256[0]); + r_.m256i[1] = _mm256_cvtps_epi32(a_.m256[1]); + #elif defined(simde_math_nearbyintf) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyintf(a_.f32[i])); + } + #else + HEDLEY_UNREACHABLE(); + #endif + + return simde__m512i_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cvtps_epi32 + #define _mm512_cvtps_epi32(a) simde_mm512_cvtps_epi32(a) #endif SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/x86/avx512/cvts.h b/lib/simd_wrapper/simde/x86/avx512/cvts.h index c35c2f9e488..0194889a73a 100644 --- a/lib/simd_wrapper/simde/x86/avx512/cvts.h +++ b/lib/simd_wrapper/simde/x86/avx512/cvts.h @@ -31,6 +31,8 @@ #include "types.h" #include "mov.h" +#include "storeu.h" +#include "loadu.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -362,6 +364,34 @@ simde_mm512_mask_cvtsepi32_epi8 (simde__m128i src, simde__mmask16 k, simde__m512 #define _mm512_mask_cvtsepi32_epi8(src, k, a) simde_mm512_mask_cvtsepi32_epi8(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_cvtsepi32_storeu_epi8 (void* base_addr, simde__mmask16 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + _mm512_mask_cvtsepi32_storeu_epi8(base_addr, k, a); + #else + simde__m256i_private r_ = simde__m256i_to_private(simde_mm256_loadu_epi8(base_addr)); + simde__m512i_private a_ = simde__m512i_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + r_.i8[i] = ((k>>i) &1 ) ? + ((a_.i32[i] < INT8_MIN) + ? (INT8_MIN) + : ((a_.i32[i] > INT8_MAX) + ? 
(INT8_MAX) + : HEDLEY_STATIC_CAST(int8_t, a_.i32[i]))) : r_.i8[i]; + } + + simde_mm256_storeu_epi8(base_addr, simde__m256i_from_private(r_)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cvtsepi32_storeu_epi8 + #define _mm512_mask_cvtsepi32_storeu_epi8(base_addr, k, a) simde_mm512_mask_cvtsepi32_storeu_epi8(base_addr, k, a) +#endif + + SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm512_maskz_cvtsepi32_epi8 (simde__mmask16 k, simde__m512i a) { @@ -444,6 +474,34 @@ simde_mm512_mask_cvtsepi32_epi16 (simde__m256i src, simde__mmask16 k, simde__m51 #define _mm512_mask_cvtsepi32_epi16(src, k, a) simde_mm512_mask_cvtsepi32_epi16(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_cvtsepi32_storeu_epi16 (void* base_addr, simde__mmask16 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + _mm512_mask_cvtsepi32_storeu_epi16(base_addr, k, a); + #else + simde__m256i_private r_; + simde__m256i_private src_ = simde__m256i_to_private(simde_mm256_loadu_epi16(base_addr)); + simde__m512i_private a_ = simde__m512i_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + r_.i16[i] = ((k>>i) &1 ) ? + ((a_.i32[i] < INT16_MIN) + ? (INT16_MIN) + : ((a_.i32[i] > INT16_MAX) + ? (INT16_MAX) + : HEDLEY_STATIC_CAST(int16_t, a_.i32[i]))) : src_.i16[i]; + } + + simde_mm256_storeu_epi16(base_addr, simde__m256i_from_private(r_)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cvtsepi32_storeu_epi16 + #define _mm512_mask_cvtsepi32_storeu_epi16(base_addr, k, a) simde_mm512_mask_cvtsepi32_storeu_epi16(base_addr, k, a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm512_maskz_cvtsepi32_epi16 (simde__mmask16 k, simde__m512i a) { diff --git a/lib/simd_wrapper/simde/x86/avx512/cvtt.h b/lib/simd_wrapper/simde/x86/avx512/cvtt.h index 044507ce444..937f7fb7253 100644 --- a/lib/simd_wrapper/simde/x86/avx512/cvtt.h +++ b/lib/simd_wrapper/simde/x86/avx512/cvtt.h @@ -98,6 +98,32 @@ simde_mm_maskz_cvttpd_epi64(simde__mmask8 k, simde__m128d a) { #define _mm_maskz_cvttpd_epi64(k, a) simde_mm_maskz_cvttpd_epi64(k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_cvttps_epi32 (simde__m512 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_cvttps_epi32(a); + #else + simde__m512i_private r_; + simde__m512_private a_ = simde__m512_to_private(a); + + #if defined(simde_math_truncf) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_truncf(a_.f32[i])); + } + #else + HEDLEY_UNREACHABLE(); + #endif + + return simde__m512i_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_cvttps_epi32 + #define _mm512_cvttps_epi32(a) simde_mm512_cvttps_epi32(a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/x86/avx512/cvtus.h b/lib/simd_wrapper/simde/x86/avx512/cvtus.h new file mode 100644 index 00000000000..ce423f6c9c9 --- /dev/null +++ b/lib/simd_wrapper/simde/x86/avx512/cvtus.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit 
persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_CVTUS_H) +#define SIMDE_X86_AVX512_CVTUS_H + +#include "types.h" +#include "mov.h" +#include "storeu.h" +#include "loadu.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_cvtusepi32_storeu_epi8 (void* base_addr, simde__mmask16 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + _mm512_mask_cvtusepi32_storeu_epi8(base_addr, k, a); + #else + simde__m256i_private r_ = simde__m256i_to_private(simde_mm256_loadu_epi8(base_addr)); + simde__m512i_private a_ = simde__m512i_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r_.i8[i] = ((k>>i) &1 ) ? + ((a_.u32[i] > UINT8_MAX) + ? (HEDLEY_STATIC_CAST(int8_t, UINT8_MAX)) + : HEDLEY_STATIC_CAST(int8_t, a_.u32[i])) : r_.i8[i]; + } + + simde_mm256_storeu_epi8(base_addr, simde__m256i_from_private(r_)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_cvtusepi32_storeu_epi8 + #define _mm512_mask_cvtusepi32_storeu_epi8(base_addr, k, a) simde_mm512_mask_cvtusepi32_storeu_epi8((base_addr), (k), (a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_CVTUS_H) */ diff --git a/lib/simd_wrapper/simde/x86/avx512/dpbf16.h b/lib/simd_wrapper/simde/x86/avx512/dpbf16.h index 56f2c68f15d..81e2aead258 100644 --- a/lib/simd_wrapper/simde/x86/avx512/dpbf16.h +++ b/lib/simd_wrapper/simde/x86/avx512/dpbf16.h @@ -20,7 +20,7 @@ simde_mm_dpbf16_ps (simde__m128 src, simde__m128bh a, simde__m128bh b) { a_ = simde__m128bh_to_private(a), b_ = simde__m128bh_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) + #if ! ( defined(SIMDE_ARCH_X86) && defined(HEDLEY_GCC_VERSION) ) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) uint32_t x1 SIMDE_VECTOR(32); uint32_t x2 SIMDE_VECTOR(32); simde__m128_private @@ -109,7 +109,7 @@ simde_mm256_dpbf16_ps (simde__m256 src, simde__m256bh a, simde__m256bh b) { a_ = simde__m256bh_to_private(a), b_ = simde__m256bh_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) + #if ! 
( defined(SIMDE_ARCH_X86) && defined(HEDLEY_GCC_VERSION) ) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) uint32_t x1 SIMDE_VECTOR(64); uint32_t x2 SIMDE_VECTOR(64); simde__m256_private @@ -198,7 +198,7 @@ simde_mm512_dpbf16_ps (simde__m512 src, simde__m512bh a, simde__m512bh b) { a_ = simde__m512bh_to_private(a), b_ = simde__m512bh_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) + #if ! ( defined(SIMDE_ARCH_X86) && defined(HEDLEY_GCC_VERSION) ) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) uint32_t x1 SIMDE_VECTOR(128); uint32_t x2 SIMDE_VECTOR(128); simde__m512_private diff --git a/lib/simd_wrapper/simde/x86/avx512/extract.h b/lib/simd_wrapper/simde/x86/avx512/extract.h index 2261513ea5a..251715cf4c9 100644 --- a/lib/simd_wrapper/simde/x86/avx512/extract.h +++ b/lib/simd_wrapper/simde/x86/avx512/extract.h @@ -35,6 +35,23 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde__m128 +simde_mm256_extractf32x4_ps (simde__m256 a, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { + simde__m256_private a_ = simde__m256_to_private(a); + + return a_.m128[imm8 & 1]; +} +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_extractf32x4_ps(a, imm8) _mm256_extractf32x4_ps(a, imm8) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_extractf32x4_ps + #define _mm256_extractf32x4_ps(a, imm8) simde_mm256_extractf32x4_ps((a), (imm8)) +#endif + + SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm512_extractf32x4_ps (simde__m512 a, int imm8) @@ -61,27 +78,43 @@ simde_mm512_extractf32x4_ps (simde__m512 a, int imm8) #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_extractf32x4_ps - #define _mm512_extractf32x4_ps(a, imm8) simde_mm512_extractf32x4_ps(a, imm8) + #define _mm512_extractf32x4_ps(a, imm8) simde_mm512_extractf32x4_ps((a), (imm8)) #endif #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) #define simde_mm512_mask_extractf32x4_ps(src, k, a, imm8) _mm512_mask_extractf32x4_ps(src, k, a, imm8) #else - #define simde_mm512_mask_extractf32x4_ps(src, k, a, imm8) simde_mm_mask_mov_ps(src, k, simde_mm512_extractf32x4_ps(a, imm8)) + #define simde_mm512_mask_extractf32x4_ps(src, k, a, imm8) simde_mm_mask_mov_ps((src), (k), simde_mm512_extractf32x4_ps((a), (imm8))) #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_extractf32x4_ps - #define _mm512_mask_extractf32x4_ps(src, k, a, imm8) simde_mm512_mask_extractf32x4_ps(src, k, a, imm8) + #define _mm512_mask_extractf32x4_ps(src, k, a, imm8) simde_mm512_mask_extractf32x4_ps((src), (k), (a), (imm8)) #endif #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) #define simde_mm512_maskz_extractf32x4_ps(k, a, imm8) _mm512_maskz_extractf32x4_ps(k, a, imm8) #else - #define simde_mm512_maskz_extractf32x4_ps(k, a, imm8) simde_mm_maskz_mov_ps(k, simde_mm512_extractf32x4_ps(a, imm8)) + #define simde_mm512_maskz_extractf32x4_ps(k, a, imm8) simde_mm_maskz_mov_ps((k), simde_mm512_extractf32x4_ps((a), (imm8))) #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_extractf32x4_ps - #define 
_mm512_maskz_extractf32x4_ps(k, a, imm8) simde_mm512_maskz_extractf32x4_ps(k, a, imm8) + #define _mm512_maskz_extractf32x4_ps(k, a, imm8) simde_mm512_maskz_extractf32x4_ps((k), (a), (imm8)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256 +simde_mm512_extractf32x8_ps (simde__m512 a, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { + simde__m512_private a_ = simde__m512_to_private(a); + + return a_.m256[imm8 & 1]; +} +#if defined(SIMDE_X86_AVX512DQ_NATIVE) + #define simde_mm512_extractf32x8_ps(a, imm8) _mm512_extractf32x8_ps(a, imm8) +#endif +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) + #undef _mm512_extractf32x8_ps + #define _mm512_extractf32x8_ps(a, imm8) simde_mm512_extractf32x8_ps(a, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -156,6 +189,42 @@ simde_mm512_extracti32x4_epi32 (simde__m512i a, int imm8) #define _mm512_maskz_extracti32x4_epi32(k, a, imm8) simde_mm512_maskz_extracti32x4_epi32(k, a, imm8) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm512_extracti32x8_epi32 (simde__m512i a, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { + simde__m512i_private a_ = simde__m512i_to_private(a); + + return a_.m256i[imm8 & 1]; +} +#if defined(SIMDE_X86_AVX512DQ_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) + #define simde_mm512_extracti32x8_epi32(a, imm8) _mm512_extracti32x8_epi32(a, imm8) +#endif +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) + #undef _mm512_extracti32x8_epi32 + #define _mm512_extracti32x8_epi32(a, imm8) simde_mm512_extracti32x8_epi32((a), (imm8)) +#endif + +#if defined(SIMDE_X86_AVX51FDQ_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) + #define simde_mm512_mask_extracti32x8_epi32(src, k, a, imm8) _mm512_mask_extracti32x8_epi32(src, k, a, imm8) +#else + #define simde_mm512_mask_extracti32x8_epi32(src, k, a, imm8) simde_mm256_mask_mov_epi32((src), (k), simde_mm512_extracti32x8_epi32((a), (imm8))) +#endif +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_extracti32x8_epi32 + #define _mm512_mask_extracti32x8_epi32(src, k, a, imm8) simde_mm512_mask_extracti32x8_epi32((src), (k), (a), (imm8)) +#endif + +#if defined(SIMDE_X86_AVX512DQ_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) + #define simde_mm512_maskz_extracti32x8_epi32(k, a, imm8) _mm512_maskz_extracti32x8_epi32(k, a, imm8) +#else + #define simde_mm512_maskz_extracti32x8_epi32(k, a, imm8) simde_mm256_maskz_mov_epi32((k), simde_mm512_extracti32x8_epi32((a), (imm8))) +#endif +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_extracti32x8_epi32 + #define _mm512_maskz_extracti32x8_epi32(k, a, imm8) simde_mm512_maskz_extracti32x8_epi32((k), (a), (imm8)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm512_extracti64x4_epi64 (simde__m512i a, int imm8) @@ -169,27 +238,27 @@ simde_mm512_extracti64x4_epi64 (simde__m512i a, int imm8) #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_extracti64x4_epi64 - #define _mm512_extracti64x4_epi64(a, imm8) simde_mm512_extracti64x4_epi64(a, imm8) + #define _mm512_extracti64x4_epi64(a, imm8) simde_mm512_extracti64x4_epi64((a), (imm8)) #endif #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) #define simde_mm512_mask_extracti64x4_epi64(src, k, a, imm8) 
_mm512_mask_extracti64x4_epi64(src, k, a, imm8) #else - #define simde_mm512_mask_extracti64x4_epi64(src, k, a, imm8) simde_mm256_mask_mov_epi64(src, k, simde_mm512_extracti64x4_epi64(a, imm8)) + #define simde_mm512_mask_extracti64x4_epi64(src, k, a, imm8) simde_mm256_mask_mov_epi64((src), (k), simde_mm512_extracti64x4_epi64((a), (imm8))) #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_extracti64x4_epi64 - #define _mm512_mask_extracti64x4_epi64(src, k, a, imm8) simde_mm512_mask_extracti64x4_epi64(src, k, a, imm8) + #define _mm512_mask_extracti64x4_epi64(src, k, a, imm8) simde_mm512_mask_extracti64x4_epi64((src), (k), (a), (imm8)) #endif #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) #define simde_mm512_maskz_extracti64x4_epi64(k, a, imm8) _mm512_maskz_extracti64x4_epi64(k, a, imm8) #else - #define simde_mm512_maskz_extracti64x4_epi64(k, a, imm8) simde_mm256_maskz_mov_epi64(k, simde_mm512_extracti64x4_epi64(a, imm8)) + #define simde_mm512_maskz_extracti64x4_epi64(k, a, imm8) simde_mm256_maskz_mov_epi64((k), simde_mm512_extracti64x4_epi64((a), (imm8))) #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_extracti64x4_epi64 - #define _mm512_maskz_extracti64x4_epi64(k, a, imm8) simde_mm512_maskz_extracti64x4_epi64(k, a, imm8) + #define _mm512_maskz_extracti64x4_epi64(k, a, imm8) simde_mm512_maskz_extracti64x4_epi64((k), (a), (imm8)) #endif SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/x86/avx512/fmsub.h b/lib/simd_wrapper/simde/x86/avx512/fmsub.h index 626294cb388..4f52d407431 100644 --- a/lib/simd_wrapper/simde/x86/avx512/fmsub.h +++ b/lib/simd_wrapper/simde/x86/avx512/fmsub.h @@ -47,7 +47,7 @@ simde_mm256_mask3_fmsub_pd (simde__m256d a, simde__m256d b, simde__m256d c, simd } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask3_fmsub_pd - #define _mm256_mask3_fmsub_pd(a, b, c, k) _mm256_mask3_fmsub_pd(a, b, c, k) + #define _mm256_mask3_fmsub_pd(a, b, c, k) simde_mm256_mask3_fmsub_pd(a, b, c, k) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -61,7 +61,7 @@ simde_mm256_mask_fmsub_pd (simde__m256d a, simde__mmask8 k, simde__m256d b, simd } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_fmsub_pd - #define _mm256_mask_fmsub_pd(a, k, b, c) _mm256_mask_fmsub_pd(a, k, b, c) + #define _mm256_mask_fmsub_pd(a, k, b, c) simde_mm256_mask_fmsub_pd(a, k, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -75,7 +75,7 @@ simde_mm256_maskz_fmsub_pd (simde__mmask8 k, simde__m256d a, simde__m256d b, sim } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_fmsub_pd - #define _mm256_maskz_fmsub_pd(k, a, b, c) _mm256_maskz_fmsub_pd(k, a, b, c) + #define _mm256_maskz_fmsub_pd(k, a, b, c) simde_mm256_maskz_fmsub_pd(k, a, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -89,7 +89,7 @@ simde_mm_mask3_fmsub_pd (simde__m128d a, simde__m128d b, simde__m128d c, simde__ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask3_fmsub_pd - #define _mm_mask3_fmsub_pd(a, b, c, k) _mm_mask3_fmsub_pd(a, b, c, k) + #define _mm_mask3_fmsub_pd(a, b, c, k) simde_mm_mask3_fmsub_pd(a, b, c, k) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -103,7 +103,7 @@ simde_mm_mask_fmsub_pd (simde__m128d a, simde__mmask8 k, simde__m128d b, simde__ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_fmsub_pd - #define _mm_mask_fmsub_pd(a, k, b, c) _mm_mask_fmsub_pd(a, k, b, c) + #define _mm_mask_fmsub_pd(a, k, b, c) 
simde_mm_mask_fmsub_pd(a, k, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -117,7 +117,7 @@ simde_mm_maskz_fmsub_pd (simde__mmask8 k, simde__m128d a, simde__m128d b, simde_ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_fmsub_pd - #define _mm_maskz_fmsub_pd(k, a, b, c) _mm_maskz_fmsub_pd(k, a, b, c) + #define _mm_maskz_fmsub_pd(k, a, b, c) simde_mm_maskz_fmsub_pd(k, a, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -131,7 +131,7 @@ simde_mm256_mask3_fmsub_ps (simde__m256 a, simde__m256 b, simde__m256 c, simde__ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask3_fmsub_ps - #define _mm256_mask3_fmsub_ps(a, b, c, k) _mm256_mask3_fmsub_ps(a, b, c, k) + #define _mm256_mask3_fmsub_ps(a, b, c, k) simde_mm256_mask3_fmsub_ps(a, b, c, k) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -145,7 +145,7 @@ simde_mm256_mask_fmsub_ps (simde__m256 a, simde__mmask8 k, simde__m256 b, simde_ } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_mask_fmsub_ps - #define _mm256_mask_fmsub_ps(a, k, b, c) _mm256_mask_fmsub_ps(a, k, b, c) + #define _mm256_mask_fmsub_ps(a, k, b, c) simde_mm256_mask_fmsub_ps(a, k, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -159,7 +159,7 @@ simde_mm256_maskz_fmsub_ps (simde__mmask8 k, simde__m256 a, simde__m256 b, simde } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_fmsub_ps - #define _mm256_maskz_fmsub_ps(k, a, b, c) _mm256_maskz_fmsub_ps(k, a, b, c) + #define _mm256_maskz_fmsub_ps(k, a, b, c) simde_mm256_maskz_fmsub_ps(k, a, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -173,7 +173,7 @@ simde_mm_mask3_fmsub_ps (simde__m128 a, simde__m128 b, simde__m128 c, simde__mma } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask3_fmsub_ps - #define _mm_mask3_fmsub_ps(a, b, c, k) _mm_mask3_fmsub_ps(a, b, c, k) + #define _mm_mask3_fmsub_ps(a, b, c, k) simde_mm_mask3_fmsub_ps(a, b, c, k) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -187,7 +187,7 @@ simde_mm_mask_fmsub_ps (simde__m128 a, simde__mmask8 k, simde__m128 b, simde__m1 } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_fmsub_ps - #define _mm_mask_fmsub_ps(a, k, b, c) _mm_mask_fmsub_ps(a, k, b, c) + #define _mm_mask_fmsub_ps(a, k, b, c) simde_mm_mask_fmsub_ps(a, k, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -201,7 +201,7 @@ simde_mm_maskz_fmsub_ps (simde__mmask8 k, simde__m128 a, simde__m128 b, simde__m } #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_fmsub_ps - #define _mm_maskz_fmsub_ps(k, a, b, c) _mm_maskz_fmsub_ps(k, a, b, c) + #define _mm_maskz_fmsub_ps(k, a, b, c) simde_mm_maskz_fmsub_ps(k, a, b, c) #endif SIMDE_FUNCTION_ATTRIBUTES diff --git a/lib/simd_wrapper/simde/x86/avx512/fpclass.h b/lib/simd_wrapper/simde/x86/avx512/fpclass.h new file mode 100644 index 00000000000..1765570d7ac --- /dev/null +++ b/lib/simd_wrapper/simde/x86/avx512/fpclass.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_FPCLASS_H) +#define SIMDE_X86_AVX512_FPCLASS_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm256_fpclass_ps_mask(simde__m256 a, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 0x88) { + simde__mmask8 r = 0; + simde__m256_private a_ = simde__m256_to_private(a); + + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r |= simde_math_fpclassf(a_.f32[i], imm8) ? (UINT8_C(1) << i) : 0; + } + return r; +} +#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) +# define simde_mm256_fpclass_ps_mask(a, imm8) _mm256_fpclass_ps_mask((a), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) +# undef _mm256_fpclass_ps_mask +# define _mm256_fpclass_ps_mask(a, imm8) simde_mm256_fpclass_ps_mask((a), (imm8)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_mm512_fpclass_ph_mask(simde__m512h a, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 0x88) { + simde__mmask32 r = 0; + simde__m512h_private a_ = simde__m512h_to_private(a); + + for (size_t i = 0 ; i < (sizeof(a_.f16) / sizeof(a_.f16[0])) ; i++) { + r |= simde_fpclasshf(a_.f16[i], imm8) ? (UINT8_C(1) << i) : 0; + } + return r; +} +#if defined(SIMDE_X86_AVX512FP16_NATIVE) +# define simde_mm512_fpclass_ph_mask(a, imm8) _mm512_fpclass_ph_mask((a), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) +# undef _mm512_fpclass_ph_mask +# define _mm512_fpclass_ph_mask(a, imm8) simde_mm512_fpclass_ph_mask((a), (imm8)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_mm512_fpclass_pd_mask(simde__m512d a, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 0x88) { + simde__mmask8 r = 0; + simde__m512d_private a_ = simde__m512d_to_private(a); + + for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { + r |= simde_math_fpclass(a_.f64[i], imm8) ? 
(UINT8_C(1) << i) : 0; + } + return r; +} +#if defined(SIMDE_X86_AVX512DQ_NATIVE) +# define simde_mm512_fpclass_pd_mask(a, imm8) _mm512_fpclass_pd_mask((a), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) +# undef _mm512_fpclass_pd_mask +# define _mm512_fpclass_pd_mask(a, imm8) simde_mm512_fpclass_pd_mask((a), (imm8)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_FPCLASS_H) */ diff --git a/lib/simd_wrapper/simde/x86/avx512/gather.h b/lib/simd_wrapper/simde/x86/avx512/gather.h new file mode 100644 index 00000000000..8dec2ee0aa9 --- /dev/null +++ b/lib/simd_wrapper/simde/x86/avx512/gather.h @@ -0,0 +1,312 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. 
Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_GATHER_H) +#define SIMDE_X86_AVX512_GATHER_H + +#include "types.h" +#include "../avx2.h" +#include "extract.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_i32gather_ps(simde__m512i vindex, const void* base_addr, const int32_t scale) + SIMDE_REQUIRE_CONSTANT(scale) + HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { + simde__m512i_private vindex_ = simde__m512i_to_private(vindex); + simde__m512_private r_ = simde__m512_to_private(simde_mm512_setzero_ps()); + const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { + const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); + simde_float32 dst; + simde_memcpy(&dst, src, sizeof(dst)); + r_.f32[i] = dst; + } + + return simde__m512_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0)) + #define simde_mm512_i32gather_ps(vindex, base_addr, scale) _mm512_i32gather_ps((vindex), (base_addr), (scale)) +#elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i32gather_ps(vindex, base_addr, scale) SIMDE_STATEMENT_EXPR_(({\ + simde__m512_private simde_mm512_i32gather_ps_r_; \ + simde__m512i_private simde_mm512_i32gather_ps_vindex_ = simde__m512i_to_private((vindex)); \ + simde_mm512_i32gather_ps_r_.m256[0] = _mm256_i32gather_ps( \ + HEDLEY_STATIC_CAST(float const*, (base_addr)), simde_mm512_i32gather_ps_vindex_.m256i[0], (scale)); \ + simde_mm512_i32gather_ps_r_.m256[1] = _mm256_i32gather_ps( \ + HEDLEY_STATIC_CAST(float const*, (base_addr)), simde_mm512_i32gather_ps_vindex_.m256i[1], (scale)); \ + simde__m512_from_private(simde_mm512_i32gather_ps_r_); \ + })) +#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i32gather_ps(vindex, base_addr, scale) \ + simde_x_mm512_set_m256( \ + _mm256_i32gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \ + simde_mm512_extracti32x8_epi32((vindex), 1), (scale)), \ + _mm256_i32gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \ + simde_mm512_extracti32x8_epi32((vindex), 0), (scale)) ) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_i32gather_ps + #define _mm512_i32gather_ps(vindex, base_addr, scale) simde_mm512_i32gather_ps((vindex), (base_addr), (scale)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm512_i64gather_epi32(simde__m512i vindex, const void* base_addr, const int32_t scale) + SIMDE_REQUIRE_CONSTANT(scale) + HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { + simde__m512i_private vindex_; + simde__m256i_private r_; + vindex_ = simde__m512i_to_private(vindex); + r_ = simde__m256i_to_private(simde_mm256_setzero_si256()); + const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { + const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); + int32_t dst; + simde_memcpy(&dst, src, sizeof(dst)); + r_.i32[i] = dst; + } + + return simde__m256i_from_private(r_); +} +#if 
defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_i64gather_epi32(vindex, base_addr, scale) _mm512_i64gather_epi32((vindex), (base_addr), (scale)) +#elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_epi32(vindex, base_addr, scale) SIMDE_STATEMENT_EXPR_(({\ + simde__m256i_private simde_mm512_i64gather_epi32_r_; \ + simde__m512i_private simde_mm512_i64gather_epi32_vindex_ = simde__m512i_to_private((vindex)); \ + simde_mm512_i64gather_epi32_r_.m128i[0] = _mm256_i64gather_epi32( \ + HEDLEY_STATIC_CAST(int const*, (base_addr)), simde_mm512_i64gather_epi32_vindex_.m256i[0], (scale)); \ + simde_mm512_i64gather_epi32_r_.m128i[1] = _mm256_i64gather_epi32( \ + HEDLEY_STATIC_CAST(int const*, (base_addr)), simde_mm512_i64gather_epi32_vindex_.m256i[1], (scale)); \ + simde__m256i_from_private(simde_mm512_i64gather_epi32_r_); \ + })) +#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_epi32(vindex, base_addr, scale) \ + _mm256_insertf128_si256( \ + _mm256_castsi128_si256( \ + _mm256_i64gather_epi32(HEDLEY_STATIC_CAST(int const*, (base_addr)), \ + simde_mm512_extracti64x4_epi64((vindex), 0), (scale))), \ + _mm256_i64gather_epi32(HEDLEY_STATIC_CAST(int const*, (base_addr)), \ + simde_mm512_extracti64x4_epi64((vindex), 1), (scale)), \ + 1) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_i64gather_epi32 + #define _mm512_i64gather_epi32(vindex, base_addr, scale) simde_mm512_i64gather_epi32((vindex), (base_addr), (scale)) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_i64gather_epi32(src, k, vindex, base_addr, scale) _mm512_mask_i64gather_epi32((src), (k), (vindex), (base_addr), (scale)) +#else + #define simde_mm512_mask_i64gather_epi32(src, k, vindex, base_addr, scale) simde_mm256_mask_mov_epi32(src, k, simde_mm512_i64gather_epi32((vindex), (base_addr), (scale))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_i64gather_epi32 + #define _mm512_mask_i64gather_epi32(src, k, vindex, base_addr, scale) simde_mm512_mask_i64gather_epi32((src), (k), (vindex), (base_addr), (scale)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_i64gather_epi64(simde__m512i vindex, const void* base_addr, const int32_t scale) + SIMDE_REQUIRE_CONSTANT(scale) + HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { + simde__m512i_private + vindex_ = simde__m512i_to_private(vindex), + r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); + const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { + const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); + int64_t dst; + simde_memcpy(&dst, src, sizeof(dst)); + r_.i64[i] = dst; + } + + return simde__m512i_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_i64gather_epi64(vindex, base_addr, scale) _mm512_i64gather_epi64((vindex), (base_addr), (scale)) +#elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_epi64(vindex, base_addr, scale) SIMDE_STATEMENT_EXPR_(({\ + simde__m512i_private simde_mm512_i64gather_epi64_r_, \ + simde_mm512_i64gather_epi64_vindex_ = simde__m512i_to_private((vindex)); \ + simde_mm512_i64gather_epi64_r_.m256i[0] = 
_mm256_i64gather_epi64( \ + HEDLEY_STATIC_CAST(long long const*, (base_addr)), simde_mm512_i64gather_epi64_vindex_.m256i[0], (scale)); \ + simde_mm512_i64gather_epi64_r_.m256i[1] = _mm256_i64gather_epi64( \ + HEDLEY_STATIC_CAST(long long const*, (base_addr)), simde_mm512_i64gather_epi64_vindex_.m256i[1], (scale)); \ + simde__m512i_from_private(simde_mm512_i64gather_epi64_r_); \ + })) +#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_epi64(vindex, base_addr, scale) \ + simde_x_mm512_set_m256i( \ + _mm256_i64gather_epi64(HEDLEY_STATIC_CAST(long long const*, (base_addr)), \ + simde_mm512_extracti32x8_epi32((vindex), 1), (scale)), \ + _mm256_i64gather_epi64(HEDLEY_STATIC_CAST(long long const*, (base_addr)), \ + simde_mm512_extracti32x8_epi32((vindex), 0), (scale)) ) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_i64gather_epi64 + #define _mm512_i64gather_epi64(vindex, base_addr, scale) simde_mm512_i64gather_epi64(vindex, (base_addr), (scale)) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_i64gather_epi64(src, k, vindex, base_addr, scale) _mm512_mask_i64gather_epi64((src), (k), (vindex), (base_addr), (scale)) +#else + #define simde_mm512_mask_i64gather_epi64(src, k, vindex, base_addr, scale) simde_mm512_mask_mov_epi64((src), (k), simde_mm512_i64gather_epi64((vindex), (base_addr), (scale))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_i64gather_epi64 + #define _mm512_mask_i64gather_epi64(src, k, vindex, base_addr, scale) simde_mm512_mask_i64gather_epi64((src), (k), (vindex), (base_addr), (scale)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_mm512_i64gather_pd(simde__m512i vindex, const void* base_addr, const int32_t scale) + SIMDE_REQUIRE_CONSTANT(scale) + HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { + simde__m512i_private vindex_; + simde__m512d_private r_; + vindex_ = simde__m512i_to_private(vindex); + r_ = simde__m512d_to_private(simde_mm512_setzero_pd()); + const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { + const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); + simde_float64 dst; + simde_memcpy(&dst, src, sizeof(dst)); + r_.f64[i] = dst; + } + + return simde__m512d_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_i64gather_pd(vindex, base_addr, scale) _mm512_i64gather_pd((vindex), (base_addr), (scale)) +#elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_pd(vindex, base_addr, scale) SIMDE_STATEMENT_EXPR_(({\ + simde__m512d_private simde_mm512_i64gather_pd_r_; \ + simde__m512i_private simde_mm512_i64gather_pd_vindex_ = simde__m512i_to_private((vindex)); \ + simde_mm512_i64gather_pd_r_.m256d[0] = _mm256_i64gather_pd( \ + HEDLEY_STATIC_CAST(double const*, (base_addr)), simde_mm512_i64gather_pd_vindex_.m256i[0], (scale)); \ + simde_mm512_i64gather_pd_r_.m256d[1] = _mm256_i64gather_pd( \ + HEDLEY_STATIC_CAST(double const*, (base_addr)), simde_mm512_i64gather_pd_vindex_.m256i[1], (scale)); \ + simde__m512d_from_private(simde_mm512_i64gather_pd_r_); \ + })) +#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_pd(vindex, base_addr, scale) 
\ + simde_x_mm512_set_m256d( \ + _mm256_i64gather_pd(HEDLEY_STATIC_CAST(double const*, (base_addr)), \ + simde_mm512_extracti64x4_epi64((vindex), 1), (scale)), \ + _mm256_i64gather_pd(HEDLEY_STATIC_CAST(double const*, (base_addr)), \ + simde_mm512_extracti64x4_epi64((vindex), 0), (scale)) ) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_i64gather_pd + #define _mm512_i64gather_pd(vindex, base_addr, scale) simde_mm512_i64gather_pd((vindex), (base_addr), (scale)) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_i64gather_pd(src, k, vindex, base_addr, scale) _mm512_mask_i64gather_pd((src), (k), (vindex), (base_addr), (scale)) +#else + #define simde_mm512_mask_i64gather_pd(src, k, vindex, base_addr, scale) simde_mm512_mask_mov_pd((src), (k), simde_mm512_i64gather_pd((vindex), (base_addr), (scale))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_i64gather_pd + #define _mm512_mask_i64gather_pd(src, k, vindex, base_addr, scale) simde_mm512_mask_i64gather_pd((src), (k), (vindex), (base_addr), (scale)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256 +simde_mm512_i64gather_ps(simde__m512i vindex, const void* base_addr, const int32_t scale) + SIMDE_REQUIRE_CONSTANT(scale) + HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { + simde__m512i_private vindex_; + simde__m256_private r_; + vindex_ = simde__m512i_to_private(vindex); + r_ = simde__m256_to_private(simde_mm256_setzero_ps()); + const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { + const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); + simde_float32 dst; + simde_memcpy(&dst, src, sizeof(dst)); + r_.f32[i] = dst; + } + + return simde__m256_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_i64gather_ps(vindex, base_addr, scale) _mm512_i64gather_ps((vindex), (base_addr), (scale)) +#elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_ps(vindex, base_addr, scale) SIMDE_STATEMENT_EXPR_(({\ + simde__m256_private simde_mm512_i64gather_ps_r_; \ + simde__m512i_private simde_mm512_i64gather_ps_vindex_ = simde__m512i_to_private((vindex)); \ + simde_mm512_i64gather_ps_r_.m128[0] = _mm256_i64gather_ps( \ + HEDLEY_STATIC_CAST(float const*, (base_addr)), simde_mm512_i64gather_ps_vindex_.m256i[0], (scale)); \ + simde_mm512_i64gather_ps_r_.m128[1] = _mm256_i64gather_ps( \ + HEDLEY_STATIC_CAST(float const*, (base_addr)), simde_mm512_i64gather_ps_vindex_.m256i[1], (scale)); \ + simde__m256_from_private(simde_mm512_i64gather_ps_r_); \ + })) +#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_i64gather_ps(vindex, base_addr, scale) \ + _mm256_insertf128_ps( \ + _mm256_castps128_ps256( \ + _mm256_i64gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \ + simde_mm512_extracti64x4_epi64((vindex), 0), (scale))), \ + _mm256_i64gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \ + simde_mm512_extracti64x4_epi64((vindex), 1), (scale)), \ + 1) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_i64gather_ps + #define _mm512_i64gather_ps(vindex, base_addr, scale) simde_mm512_i64gather_ps((vindex), (base_addr), (scale)) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + 
#define simde_mm512_mask_i64gather_ps(src, k, vindex, base_addr, scale) _mm512_mask_i64gather_ps((src), (k), (vindex), (base_addr), (scale)) +#else + #define simde_mm512_mask_i64gather_ps(src, k, vindex, base_addr, scale) simde_mm256_mask_mov_ps((src), (k), simde_mm512_i64gather_ps((vindex), (base_addr), (scale))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_i64gather_ps + #define _mm512_mask_i64gather_ps(src, k, vindex, base_addr, scale) simde_mm512_mask_i64gather_ps((src), (k), (vindex), (base_addr), (scale)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_GATHER_H) */ diff --git a/lib/simd_wrapper/simde/x86/avx512/insert.h b/lib/simd_wrapper/simde/x86/avx512/insert.h index 5a9da038a8b..67120d31cdc 100644 --- a/lib/simd_wrapper/simde/x86/avx512/insert.h +++ b/lib/simd_wrapper/simde/x86/avx512/insert.h @@ -41,7 +41,13 @@ simde_mm512_insertf32x4 (simde__m512 a, simde__m128 b, int imm8) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { #if defined(SIMDE_X86_AVX512F_NATIVE) simde__m512 r; - SIMDE_CONSTIFY_4_(_mm512_insertf32x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_ps ()), imm8, a, b); + switch(imm8) { + case 0: r = _mm512_insertf32x4(a, b, 0); break; + case 1: r = _mm512_insertf32x4(a, b, 1); break; + case 2: r = _mm512_insertf32x4(a, b, 2); break; + case 3: r = _mm512_insertf32x4(a, b, 3); break; + default: HEDLEY_UNREACHABLE(); r = simde_mm512_setzero_ps(); break; + } return r; #else simde__m512_private a_ = simde__m512_to_private(a); @@ -295,7 +301,7 @@ simde_mm512_mask_insertf32x8(simde__m512 src, simde__mmask16 k, simde__m512 a, s } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_insertf32x8 - #define _mm512_mask_insertf32x8(src, k, a, b, imm8) simde_mm512_mask_insertf32x8(src, k, a, b, imms8) + #define _mm512_mask_insertf32x8(src, k, a, b, imm8) simde_mm512_mask_insertf32x8(src, k, a, b, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -313,7 +319,7 @@ simde_mm512_maskz_insertf32x8(simde__mmask16 k, simde__m512 a, simde__m256 b, co } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_insertf32x8 - #define _mm512_maskz_insertf32x8(k, a, b, imm8) simde_mm512_maskz_insertf32x8(k, a, b, imms8) + #define _mm512_maskz_insertf32x8(k, a, b, imm8) simde_mm512_maskz_insertf32x8(k, a, b, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -349,7 +355,7 @@ simde_mm512_mask_insertf64x2(simde__m512d src, simde__mmask8 k, simde__m512d a, } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_insertf64x2 - #define _mm512_mask_insertf64x2(src, k, a, b, imm8) simde_mm512_mask_insertf64x2(src, k, a, b, imms8) + #define _mm512_mask_insertf64x2(src, k, a, b, imm8) simde_mm512_mask_insertf64x2(src, k, a, b, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -367,7 +373,7 @@ simde_mm512_maskz_insertf64x2(simde__mmask8 k, simde__m512d a, simde__m128d b, c } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_insertf64x2 - #define _mm512_maskz_insertf64x2(k, a, b, imm8) simde_mm512_maskz_insertf64x2(k, a, b, imms8) + #define _mm512_maskz_insertf64x2(k, a, b, imm8) simde_mm512_maskz_insertf64x2(k, a, b, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -403,7 +409,7 @@ simde_mm512_mask_inserti32x8(simde__m512i src, simde__mmask16 k, simde__m512i a, } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_inserti32x8 - #define _mm512_mask_inserti32x8(src, k, a, b, imm8) simde_mm512_mask_inserti32x8(src, k, a, b, imms8) + #define _mm512_mask_inserti32x8(src, 
k, a, b, imm8) simde_mm512_mask_inserti32x8(src, k, a, b, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -421,7 +427,7 @@ simde_mm512_maskz_inserti32x8(simde__mmask16 k, simde__m512i a, simde__m256i b, } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_inserti32x8 - #define _mm512_maskz_inserti32x8(k, a, b, imm8) simde_mm512_maskz_inserti32x8(k, a, b, imms8) + #define _mm512_maskz_inserti32x8(k, a, b, imm8) simde_mm512_maskz_inserti32x8(k, a, b, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -457,7 +463,7 @@ simde_mm512_mask_inserti64x2(simde__m512i src, simde__mmask8 k, simde__m512i a, } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_inserti64x2 - #define _mm512_mask_inserti64x2(src, k, a, b, imm8) simde_mm512_mask_inserti64x2(src, k, a, b, imms8) + #define _mm512_mask_inserti64x2(src, k, a, b, imm8) simde_mm512_mask_inserti64x2(src, k, a, b, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -475,7 +481,7 @@ simde_mm512_maskz_inserti64x2(simde__mmask8 k, simde__m512i a, simde__m128i b, c } #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_inserti64x2 - #define _mm512_maskz_inserti64x2(k, a, b, imm8) simde_mm512_maskz_inserti64x2(k, a, b, imms8) + #define _mm512_maskz_inserti64x2(k, a, b, imm8) simde_mm512_maskz_inserti64x2(k, a, b, imm8) #endif SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/x86/avx512/kand.h b/lib/simd_wrapper/simde/x86/avx512/kand.h new file mode 100644 index 00000000000..78641000746 --- /dev/null +++ b/lib/simd_wrapper/simde/x86/avx512/kand.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. 
Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_KAND_H) +#define SIMDE_X86_AVX512_KAND_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_mm512_kand (simde__mmask16 a, simde__mmask16 b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_kand(a, b); + #else + return a & b; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_kand + #define _mm512_kand(a, b) simde_mm512_kand((a), (b)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_KAND_H) */ diff --git a/lib/simd_wrapper/simde/x86/avx512/knot.h b/lib/simd_wrapper/simde/x86/avx512/knot.h new file mode 100644 index 00000000000..3b4696e8b30 --- /dev/null +++ b/lib/simd_wrapper/simde/x86/avx512/knot.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. 
Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_KNOT_H) +#define SIMDE_X86_AVX512_KNOT_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_knot_mask8 (simde__mmask8 a) { + #if defined(SIMDE_X86_AVX512DQ_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _knot_mask8(a); + #else + return ~a; + #endif +} +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) + #undef _knot_mask8 + #define _knot_mask8(a) simde_knot_mask8(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_knot_mask16 (simde__mmask16 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _knot_mask16(a); + #else + return ~a; + #endif +} +#define simde_mm512_knot(a) simde_knot_mask16(a) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _knot_mask16 + #undef _mm512_knot + #define _knot_mask16(a) simde_knot_mask16(a) + #define _mm512_knot(a) simde_knot_mask16(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_knot_mask32 (simde__mmask32 a) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _knot_mask32(a); + #else + return ~a; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _knot_mask32 + #define _knot_mask32(a) simde_knot_mask32(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask64 +simde_knot_mask64 (simde__mmask64 a) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _knot_mask64(a); + #else + return ~a; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _knot_mask64 + #define _knot_mask64(a) simde_knot_mask64(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_KNOT_H) */ diff --git a/lib/simd_wrapper/simde/x86/avx512/kxor.h b/lib/simd_wrapper/simde/x86/avx512/kxor.h new file mode 100644 index 00000000000..45f5d04da61 --- /dev/null +++ b/lib/simd_wrapper/simde/x86/avx512/kxor.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_KXOR_H) +#define SIMDE_X86_AVX512_KXOR_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask8 +simde_kxor_mask8 (simde__mmask8 a, simde__mmask8 b) { + #if defined(SIMDE_X86_AVX512DQ_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _kxor_mask8(a, b); + #else + return a^b; + #endif +} +#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) + #undef _kxor_mask8 + #define _kxor_mask8(a, b) simde_kxor_mask8(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask16 +simde_kxor_mask16 (simde__mmask16 a, simde__mmask16 b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _kxor_mask16(a, b); + #else + return a^b; + #endif +} +#define simde_mm512_kxor(a, b) simde_kxor_mask16(a, b) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _kxor_mask16 + #undef _mm512_kxor + #define _kxor_mask16(a, b) simde_kxor_mask16(a, b) + #define _mm512_kxor(a, b) simde_kxor_mask16(a, b) +#endif + + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask32 +simde_kxor_mask32 (simde__mmask32 a, simde__mmask32 b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _kxor_mask32(a, b); + #else + return a^b; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _kxor_mask32 + #define _kxor_mask32(a, b) simde_kxor_mask32(a, b) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__mmask64 +simde_kxor_mask64 (simde__mmask64 a, simde__mmask64 b) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) \ + && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + return _kxor_mask64(a, b); + #else + return a^b; + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _kxor_mask64 + #define _kxor_mask64(a, b) simde_kxor_mask64(a, b) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_KXOR_H) */ diff --git a/lib/simd_wrapper/simde/x86/avx512/load.h b/lib/simd_wrapper/simde/x86/avx512/load.h index 03d7327c74b..6a4af937dad 100644 --- a/lib/simd_wrapper/simde/x86/avx512/load.h +++ b/lib/simd_wrapper/simde/x86/avx512/load.h @@ -33,6 +33,54 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_mm512_load_pd (void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_load_pd(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512d)); + #else + simde__m512d r; + simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512d), sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_load_pd + #define _mm512_load_pd(a) simde_mm512_load_pd(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_load_ps 
(void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_load_ps(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512)); + #else + simde__m512 r; + simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512), sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_load_ps + #define _mm512_load_ps(a) simde_mm512_load_ps(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_load_ph (void const * mem_addr) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_load_ph(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512h)); + #else + simde__m512h r; + simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512h), sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_load_ph + #define _mm512_load_ph(a) simde_mm512_load_ph(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_load_si512 (void const * mem_addr) { diff --git a/lib/simd_wrapper/simde/x86/avx512/loadu.h b/lib/simd_wrapper/simde/x86/avx512/loadu.h index 06f3bd83bfb..4a31966b484 100644 --- a/lib/simd_wrapper/simde/x86/avx512/loadu.h +++ b/lib/simd_wrapper/simde/x86/avx512/loadu.h @@ -73,46 +73,222 @@ simde_mm512_loadu_pd (void const * mem_addr) { #define _mm512_loadu_pd(a) simde_mm512_loadu_pd(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_loadu_ph (void const * mem_addr) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_loadu_ph(mem_addr); + #else + simde__m512h r; + simde_memcpy(&r, mem_addr, sizeof(r)); + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_loadu_ph + #define _mm512_loadu_ph(a) simde_mm512_loadu_ph(a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_loadu_si512 (void const * mem_addr) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_loadu_si512(HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + simde__m512i r; + #if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0) + HEDLEY_DIAGNOSTIC_PUSH + SIMDE_DIAGNOSTIC_DISABLE_PACKED_ + struct simde_mm512_loadu_si512_s { + __typeof__(r) v; + } __attribute__((__packed__, __may_alias__)); + r = HEDLEY_REINTERPRET_CAST(const struct simde_mm512_loadu_si512_s *, mem_addr)->v; + HEDLEY_DIAGNOSTIC_POP #else - simde__m512i r; - - #if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_PACKED_ - struct simde_mm512_loadu_si512_s { - __typeof__(r) v; - } __attribute__((__packed__, __may_alias__)); - r = HEDLEY_REINTERPRET_CAST(const struct simde_mm512_loadu_si512_s *, mem_addr)->v; - HEDLEY_DIAGNOSTIC_POP - #else - simde_memcpy(&r, mem_addr, sizeof(r)); - #endif - - return r; + simde_memcpy(&r, mem_addr, sizeof(r)); #endif + + return r; } -#define simde_mm512_loadu_epi8(mem_addr) simde_mm512_loadu_si512(mem_addr) -#define simde_mm512_loadu_epi16(mem_addr) simde_mm512_loadu_si512(mem_addr) -#define simde_mm512_loadu_epi32(mem_addr) simde_mm512_loadu_si512(mem_addr) -#define simde_mm512_loadu_epi64(mem_addr) simde_mm512_loadu_si512(mem_addr) +#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(10,0,0)) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) + #define simde_mm512_loadu_si512(mem_addr) _mm512_loadu_si512(mem_addr) + #define simde_mm512_loadu_epi32(mem_addr) _mm512_loadu_epi32(mem_addr) + #define simde_mm512_loadu_epi64(mem_addr) _mm512_loadu_epi64(mem_addr) +#else + #define simde_mm512_loadu_epi32(mem_addr) simde_mm512_loadu_si512(mem_addr) 
+ #define simde_mm512_loadu_epi64(mem_addr) simde_mm512_loadu_si512(mem_addr) +#endif +#if defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(11,0,0)) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) + #define simde_mm512_loadu_epi8(mem_addr) _mm512_loadu_epi8(mem_addr) + #define simde_mm512_loadu_epi16(mem_addr) _mm512_loadu_epi16(mem_addr) +#else + #define simde_mm512_loadu_epi8(mem_addr) simde_mm512_loadu_si512(mem_addr) + #define simde_mm512_loadu_epi16(mem_addr) simde_mm512_loadu_si512(mem_addr) +#endif #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_loadu_epi8 #undef _mm512_loadu_epi16 - #define _mm512_loadu_epi8(a) simde_mm512_loadu_si512(a) - #define _mm512_loadu_epi16(a) simde_mm512_loadu_si512(a) + #define _mm512_loadu_epi8(a) simde_mm512_loadu_epi8(a) + #define _mm512_loadu_epi16(a) simde_mm512_loadu_epi16(a) #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_loadu_epi32 #undef _mm512_loadu_epi64 #undef _mm512_loadu_si512 #define _mm512_loadu_si512(a) simde_mm512_loadu_si512(a) - #define _mm512_loadu_epi32(a) simde_mm512_loadu_si512(a) - #define _mm512_loadu_epi64(a) simde_mm512_loadu_si512(a) + #define _mm512_loadu_epi32(a) simde_mm512_loadu_epi32(a) + #define _mm512_loadu_epi64(a) simde_mm512_loadu_epi64(a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_maskz_loadu_epi16 (simde__mmask16 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_loadu_epi16(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm256_maskz_mov_epi16(k, simde_mm256_loadu_epi16(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_loadu_epi16 + #define _mm256_maskz_loadu_epi16(k, mem_addr) simde_mm256_maskz_loadu_epi16(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256 +simde_mm256_maskz_loadu_ps (simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + return _mm256_maskz_loadu_ps(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm256_maskz_mov_ps(k, simde_mm256_loadu_ps(HEDLEY_REINTERPRET_CAST(const float*, mem_addr))); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_loadu_ps + #define _mm256_maskz_loadu_ps(k, mem_addr) simde_mm256_maskz_loadu_ps(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_mask_loadu_epi16 (simde__m512i src, simde__mmask32 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_mask_loadu_epi16(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_mask_mov_epi16(src, k, simde_mm512_loadu_epi16(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_loadu_epi16 + #define _mm512_mask_loadu_epi16(src, k, mem_addr) simde_mm512_mask_loadu_epi16(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_maskz_loadu_epi16 (simde__mmask32 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + return _mm512_maskz_loadu_epi16(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_maskz_mov_epi16(k, simde_mm512_loadu_epi16(mem_addr)); + #endif +} +#if 
defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_loadu_epi16 + #define _mm512_maskz_loadu_epi16(k, mem_addr) simde_mm512_maskz_loadu_epi16(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_mask_loadu_epi32 (simde__m512i src, simde__mmask16 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_loadu_epi32(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_mask_mov_epi32(src, k, simde_mm512_loadu_epi32(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_loadu_epi32 + #define _mm512_mask_loadu_epi32(src, k, mem_addr) simde_mm512_mask_loadu_epi32(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_mask_loadu_epi64 (simde__m512i src, simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_loadu_epi64(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_mask_mov_epi64(src, k, simde_mm512_loadu_epi64(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_loadu_epi64 + #define _mm512_mask_loadu_epi64(src, k, mem_addr) simde_mm512_mask_loadu_epi64(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_maskz_loadu_epi64 (simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_maskz_loadu_epi64(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_maskz_mov_epi64(k, simde_mm512_loadu_epi64(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_loadu_epi64 + #define _mm512_maskz_loadu_epi64(k, mem_addr) simde_mm512_maskz_loadu_epi64((k), (mem_addr)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_mm512_mask_loadu_pd (simde__m512d src, simde__mmask8 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_loadu_pd(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_mask_mov_pd(src, k, simde_mm512_loadu_pd(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_loadu_pd + #define _mm512_mask_loadu_pd(src, k, mem_addr) simde_mm512_mask_loadu_pd(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_mask_loadu_ps (simde__m512 src, simde__mmask16 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_mask_loadu_ps(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_mask_mov_ps(src, k, simde_mm512_loadu_ps(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_loadu_ps + #define _mm512_mask_loadu_ps(src, k, mem_addr) simde_mm512_mask_loadu_ps(src, k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_maskz_loadu_ps (simde__mmask16 k, void const * mem_addr) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_maskz_loadu_ps(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_maskz_mov_ps(k, simde_mm512_loadu_ps(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_loadu_ps + #define _mm512_maskz_loadu_ps(k, mem_addr) simde_mm512_maskz_loadu_ps(k, mem_addr) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_mm512_maskz_loadu_pd (simde__mmask8 k, void const * mem_addr) { + #if 
defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_maskz_loadu_pd(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); + #else + return simde_mm512_maskz_mov_pd(k, simde_mm512_loadu_pd(mem_addr)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_loadu_pd + #define _mm512_maskz_loadu_pd(k, mem_addr) simde_mm512_maskz_loadu_pd(k, mem_addr) #endif SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/x86/avx512/madd.h b/lib/simd_wrapper/simde/x86/avx512/madd.h index 153bf067dd1..547d71ce4be 100644 --- a/lib/simd_wrapper/simde/x86/avx512/madd.h +++ b/lib/simd_wrapper/simde/x86/avx512/madd.h @@ -61,7 +61,7 @@ simde_mm_maskz_madd_epi16 (simde__mmask8 k, simde__m128i a, simde__m128i b) { } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_madd_epi16 - #define _mm_maskz_madd_epi16(src, k, a, b) simde_mm_maskz_madd_epi16(src, k, a, b) + #define _mm_maskz_madd_epi16(k, a, b) simde_mm_maskz_madd_epi16(k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -89,7 +89,7 @@ simde_mm256_maskz_madd_epi16 (simde__mmask8 k, simde__m256i a, simde__m256i b) { } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_madd_epi16 - #define _mm256_maskz_madd_epi16(src, k, a, b) simde_mm256_maskz_madd_epi16(src, k, a, b) + #define _mm256_maskz_madd_epi16(k, a, b) simde_mm256_maskz_madd_epi16(k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -120,7 +120,7 @@ simde_mm512_madd_epi16 (simde__m512i a, simde__m512i b) { } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_madd_epi16 - #define _mm512_madd_epi16(src, k, a, b) simde_mm512_madd_epi16(src, k, a, b) + #define _mm512_madd_epi16(a, b) simde_mm512_madd_epi16(a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -148,7 +148,7 @@ simde_mm512_maskz_madd_epi16 (simde__mmask16 k, simde__m512i a, simde__m512i b) } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_madd_epi16 - #define _mm512_maskz_madd_epi16(src, k, a, b) simde_mm512_maskz_madd_epi16(src, k, a, b) + #define _mm512_maskz_madd_epi16(k, a, b) simde_mm512_maskz_madd_epi16(k, a, b) #endif SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/x86/avx512/maddubs.h b/lib/simd_wrapper/simde/x86/avx512/maddubs.h index 4b3d73917aa..43b5594cfa3 100644 --- a/lib/simd_wrapper/simde/x86/avx512/maddubs.h +++ b/lib/simd_wrapper/simde/x86/avx512/maddubs.h @@ -48,7 +48,7 @@ simde_mm_mask_maddubs_epi16 (simde__m128i src, simde__mmask8 k, simde__m128i a, } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_mask_maddubs_epi16 - #define _mm_mask_maddubs_epi16(a, b) simde_mm_mask_maddubs_epi16(a, b) + #define _mm_mask_maddubs_epi16(src, k, a, b) simde_mm_mask_maddubs_epi16(src, k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -62,7 +62,7 @@ simde_mm_maskz_maddubs_epi16 (simde__mmask8 k, simde__m128i a, simde__m128i b) { } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_maddubs_epi16 - #define _mm_maskz_maddubs_epi16(a, b) simde_mm_maskz_maddubs_epi16(a, b) + #define _mm_maskz_maddubs_epi16(k, a, b) simde_mm_maskz_maddubs_epi16(k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -76,7 +76,7 @@ simde_mm256_mask_maddubs_epi16 (simde__m256i src, simde__mmask16 k, simde__m256i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef 
_mm256_mask_maddubs_epi16 - #define _mm256_mask_maddubs_epi16(a, b) simde_mm256_mask_maddubs_epi16(a, b) + #define _mm256_mask_maddubs_epi16(src, k, a, b) simde_mm256_mask_maddubs_epi16(src, k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -90,7 +90,7 @@ simde_mm256_maskz_maddubs_epi16 (simde__mmask16 k, simde__m256i a, simde__m256i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_maddubs_epi16 - #define _mm256_maskz_maddubs_epi16(a, b) simde_mm256_maskz_maddubs_epi16(a, b) + #define _mm256_maskz_maddubs_epi16(k, a, b) simde_mm256_maskz_maddubs_epi16(k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -136,7 +136,7 @@ simde_mm512_mask_maddubs_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_maddubs_epi16 - #define _mm512_mask_maddubs_epi16(a, b) simde_mm512_mask_maddubs_epi16(a, b) + #define _mm512_mask_maddubs_epi16(src, k, a, b) simde_mm512_mask_maddubs_epi16(src, k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -150,7 +150,7 @@ simde_mm512_maskz_maddubs_epi16 (simde__mmask32 k, simde__m512i a, simde__m512i } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_maddubs_epi16 - #define _mm512_maskz_maddubs_epi16(a, b) simde_mm512_maskz_maddubs_epi16(a, b) + #define _mm512_maskz_maddubs_epi16(k, a, b) simde_mm512_maskz_maddubs_epi16(k, a, b) #endif SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/x86/avx512/max.h b/lib/simd_wrapper/simde/x86/avx512/max.h index 8bec526ad7a..29ef0b37c50 100644 --- a/lib/simd_wrapper/simde/x86/avx512/max.h +++ b/lib/simd_wrapper/simde/x86/avx512/max.h @@ -553,6 +553,30 @@ simde_mm512_max_pd (simde__m512d a, simde__m512d b) { #define _mm512_max_pd(a, b) simde_mm512_max_pd(a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_max_ph (simde__m512h a, simde__m512h b) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_max_ph(a, b); + #else + simde__m512h_private + r_, + a_ = simde__m512h_to_private(a), + b_ = simde__m512h_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.f16[i] = simde_float16_to_float32(a_.f16[i]) > simde_float16_to_float32(b_.f16[i]) ? a_.f16[i] : b_.f16[i]; + } + + return simde__m512h_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_max_ph + #define _mm512_max_ph(a, b) simde_mm512_max_ph(a, b) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_mask_max_pd(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { diff --git a/lib/simd_wrapper/simde/x86/avx512/min.h b/lib/simd_wrapper/simde/x86/avx512/min.h index 03ee638b405..2e1dd8437f1 100644 --- a/lib/simd_wrapper/simde/x86/avx512/min.h +++ b/lib/simd_wrapper/simde/x86/avx512/min.h @@ -581,6 +581,30 @@ simde_mm512_maskz_min_pd(simde__mmask8 k, simde__m512d a, simde__m512d b) { #define _mm512_maskz_min_pd(k, a, b) simde_mm512_maskz_min_pd(k, a, b) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_min_ph (simde__m512h a, simde__m512h b) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_min_ph(a, b); + #else + simde__m512h_private + r_, + a_ = simde__m512h_to_private(a), + b_ = simde__m512h_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.f16[i] = simde_float16_to_float32(a_.f16[i]) < simde_float16_to_float32(b_.f16[i]) ? 
a_.f16[i] : b_.f16[i]; + } + + return simde__m512h_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_min_ph + #define _mm512_min_ph(a, b) simde_mm512_min_ph(a, b) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/x86/avx512/mov.h b/lib/simd_wrapper/simde/x86/avx512/mov.h index 25d5e49b976..cee9dbb375b 100644 --- a/lib/simd_wrapper/simde/x86/avx512/mov.h +++ b/lib/simd_wrapper/simde/x86/avx512/mov.h @@ -451,6 +451,12 @@ simde_mm512_mask_mov_ps (simde__m512 src, simde__mmask16 k, simde__m512 a) { #define _mm512_mask_mov_ps(src, k, a) simde_mm512_mask_mov_ps(src, k, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_x_mm512_mask_mov_ph (simde__m512h src, simde__mmask32 k, simde__m512h a) { + return simde_mm512_castsi512_ph(simde_mm512_mask_mov_epi16(simde_mm512_castph_si512(src), k, simde_mm512_castph_si512(a))); +} + SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_maskz_mov_epi8 (simde__mmask16 k, simde__m128i a) { diff --git a/lib/simd_wrapper/simde/x86/avx512/multishift.h b/lib/simd_wrapper/simde/x86/avx512/multishift.h index e6a6c097917..5388d0d0744 100644 --- a/lib/simd_wrapper/simde/x86/avx512/multishift.h +++ b/lib/simd_wrapper/simde/x86/avx512/multishift.h @@ -57,7 +57,7 @@ simde_mm_maskz_multishift_epi64_epi8 (simde__mmask16 k, simde__m128i a, simde__m } #if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_multishift_epi64_epi8 - #define _mm_maskz_multishift_epi64_epi8(src, k, a, b) simde_mm_maskz_multishift_epi64_epi8(src, k, a, b) + #define _mm_maskz_multishift_epi64_epi8(k, a, b) simde_mm_maskz_multishift_epi64_epi8(k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -109,7 +109,7 @@ simde_mm256_maskz_multishift_epi64_epi8 (simde__mmask32 k, simde__m256i a, simde } #if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_maskz_multishift_epi64_epi8 - #define _mm256_maskz_multishift_epi64_epi8(src, k, a, b) simde_mm256_maskz_multishift_epi64_epi8(src, k, a, b) + #define _mm256_maskz_multishift_epi64_epi8(k, a, b) simde_mm256_maskz_multishift_epi64_epi8(k, a, b) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -161,7 +161,7 @@ simde_mm512_maskz_multishift_epi64_epi8 (simde__mmask64 k, simde__m512i a, simde } #if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_multishift_epi64_epi8 - #define _mm512_maskz_multishift_epi64_epi8(src, k, a, b) simde_mm512_maskz_multishift_epi64_epi8(src, k, a, b) + #define _mm512_maskz_multishift_epi64_epi8(k, a, b) simde_mm512_maskz_multishift_epi64_epi8(k, a, b) #endif SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/x86/avx512/permutex.h b/lib/simd_wrapper/simde/x86/avx512/permutex.h new file mode 100644 index 00000000000..91c35cc2183 --- /dev/null +++ b/lib/simd_wrapper/simde/x86/avx512/permutex.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or 
substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_PERMUTEX_H) +#define SIMDE_X86_AVX512_PERMUTEX_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +SIMDE_FUNCTION_ATTRIBUTES +simde__m256i +simde_mm256_permutex_epi64 (simde__m256i a, const int imm8) { + simde__m256i_private + a_ = simde__m256i_to_private(a), + r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { + r_.i64[i] = a_.i64[(imm8 >> (i*2)) & 3]; + } + + return simde__m256i_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + #define simde_mm256_permutex_epi64(a, imm8) _mm256_permutex_epi64((a), (imm8)) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_permutex_epi64 + #define _mm256_permutex_epi64(a, imm8) simde_mm256_permutex_epi64((a), (imm8)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_permutex_epi64 (simde__m512i a, const int imm8) { + simde__m512i_private + a_ = simde__m512i_to_private(a), + r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.m256i_private[0].i64) / sizeof(r_.m256i_private[0].i64[0])) ; i++) { + r_.m256i_private[0].i64[i] = a_.m256i_private[0].i64[(imm8 >> (i*2)) & 3]; + r_.m256i_private[1].i64[i] = a_.m256i_private[1].i64[(imm8 >> (i*2)) & 3]; + } + + return simde__m512i_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_permutex_epi64(a, imm8) _mm512_permutex_epi64((a), (imm8)) +#elif defined(SIMDE_STATEMENT_EXPR_) + #define simde_mm512_permutex_epi64(a, imm8) SIMDE_STATEMENT_EXPR_(({\ + simde__m512i_private simde_mm512_permutex_epi64_a_ = simde__m512i_to_private((a)), simde_mm512_permutex_epi64_r_; \ + simde_mm512_permutex_epi64_r_.m256i[0] = simde_mm256_permutex_epi64(simde_mm512_permutex_epi64_a_.m256i[0], (imm8)); \ + simde_mm512_permutex_epi64_r_.m256i[1] = simde_mm256_permutex_epi64(simde_mm512_permutex_epi64_a_.m256i[1], (imm8)); \ + simde__m512i_from_private(simde_mm512_permutex_epi64_r_); \ + })) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_permutex_epi64 + #define _mm512_permutex_epi64(a, imm8) simde_mm512_permutex_epi64((a), (imm8)) +#endif + +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_mask_permutex_epi64(src, k, a, imm8) _mm512_mask_permutex_epi64((src), (k), (a), (imm8)) +#else + #define simde_mm512_mask_permutex_epi64(src, k, a, imm8) simde_mm512_mask_mov_epi64((src), (k), simde_mm512_permutex_epi64((a), (imm8))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_permutex_epi64 + #define _mm512_mask_permutex_epi64(src, k, a, imm8) simde_mm512_mask_permutex_epi64((src), (k), (a), (imm8)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_PERMUTEX_H) */ diff --git a/lib/simd_wrapper/simde/x86/avx512/permutex2var.h 
b/lib/simd_wrapper/simde/x86/avx512/permutex2var.h index 0637ac1b904..b6480c200b0 100644 --- a/lib/simd_wrapper/simde/x86/avx512/permutex2var.h +++ b/lib/simd_wrapper/simde/x86/avx512/permutex2var.h @@ -703,8 +703,8 @@ simde_mm256_permutex2var_epi16 (simde__m256i a, simde__m256i idx, simde__m256i b _mm256_castsi256_ps(tb), _mm256_castsi256_ps(select))); - lo = HEDLEY_REINTERPRET_CAST(__typeof__(lo), _mm256_blend_epi16(_mm256_slli_epi32(hilo2, 16), hilo, 0x55)); - hi = HEDLEY_REINTERPRET_CAST(__typeof__(hi), _mm256_blend_epi16(hilo2, _mm256_srli_epi32(hilo, 16), 0x55)); + lo = _mm256_blend_epi16(_mm256_slli_epi32(hilo2, 16), hilo, 0x55); + hi = _mm256_blend_epi16(hilo2, _mm256_srli_epi32(hilo, 16), 0x55); select = _mm256_cmpeq_epi16(_mm256_and_si256(idx, ones), ones); return _mm256_blendv_epi8(lo, hi, select); @@ -1178,8 +1178,8 @@ simde_mm512_permutex2var_epi16 (simde__m512i a, simde__m512i idx, simde__m512i b _mm256_castsi256_ps(hilo2), _mm256_castsi256_ps(select))); - lo = HEDLEY_REINTERPRET_CAST(__typeof__(lo), _mm256_blend_epi16(_mm256_slli_epi32(hilo2, 16), hilo1, 0x55)); - hi = HEDLEY_REINTERPRET_CAST(__typeof__(hi), _mm256_blend_epi16(hilo2, _mm256_srli_epi32(hilo1, 16), 0x55)); + lo = _mm256_blend_epi16(_mm256_slli_epi32(hilo2, 16), hilo1, 0x55); + hi = _mm256_blend_epi16(hilo2, _mm256_srli_epi32(hilo1, 16), 0x55); select = _mm256_cmpeq_epi16(_mm256_and_si256(idx1, ones), ones); r_.m256i[i] = _mm256_blendv_epi8(lo, hi, select); diff --git a/lib/simd_wrapper/simde/x86/avx512/permutexvar.h b/lib/simd_wrapper/simde/x86/avx512/permutexvar.h index 6172372366f..1b4bf7ac68f 100644 --- a/lib/simd_wrapper/simde/x86/avx512/permutexvar.h +++ b/lib/simd_wrapper/simde/x86/avx512/permutexvar.h @@ -1146,6 +1146,20 @@ simde_mm512_permutexvar_ps (simde__m512i idx, simde__m512 a) { #define _mm512_permutexvar_ps(idx, a) simde_mm512_permutexvar_ps(idx, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_permutexvar_ph (simde__m512i idx, simde__m512h a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_permutexvar_ph(idx, a); + #else + return simde_mm512_castsi512_ph(simde_mm512_permutexvar_epi16(idx, simde_mm512_castph_si512(a))); + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_permutexvar_ph + #define _mm512_permutexvar_ph(idx, a) simde_mm512_permutexvar_ph(idx, a) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_mask_permutexvar_ps (simde__m512 src, simde__mmask16 k, simde__m512i idx, simde__m512 a) { diff --git a/lib/simd_wrapper/simde/x86/avx512/range.h b/lib/simd_wrapper/simde/x86/avx512/range.h index 5361aa3676f..1d8c0fb497f 100644 --- a/lib/simd_wrapper/simde/x86/avx512/range.h +++ b/lib/simd_wrapper/simde/x86/avx512/range.h @@ -128,7 +128,7 @@ simde_mm256_range_ps (simde__m256 a, simde__m256 b, int imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm256_range_ps(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m256_private \ - simde_mm256_range_ps_r_, \ + simde_mm256_range_ps_r_ = simde__m256_to_private(simde_mm256_setzero_ps()), \ simde_mm256_range_ps_a_ = simde__m256_to_private(a), \ simde_mm256_range_ps_b_ = simde__m256_to_private(b); \ \ @@ -208,7 +208,7 @@ simde_mm512_range_ps (simde__m512 a, simde__m512 b, int imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm512_range_ps(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512_private \ - simde_mm512_range_ps_r_, \ + simde_mm512_range_ps_r_ = 
simde__m512_to_private(simde_mm512_setzero_ps()), \ simde_mm512_range_ps_a_ = simde__m512_to_private(a), \ simde_mm512_range_ps_b_ = simde__m512_to_private(b); \ \ @@ -221,7 +221,7 @@ simde_mm512_range_ps (simde__m512 a, simde__m512 b, int imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm512_range_ps(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512_private \ - simde_mm512_range_ps_r_, \ + simde_mm512_range_ps_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ simde_mm512_range_ps_a_ = simde__m512_to_private(a), \ simde_mm512_range_ps_b_ = simde__m512_to_private(b); \ \ @@ -368,7 +368,7 @@ simde_mm256_range_pd (simde__m256d a, simde__m256d b, int imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm256_range_pd(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m256d_private \ - simde_mm256_range_pd_r_, \ + simde_mm256_range_pd_r_ = simde__m256d_to_private(simde_mm256_setzero_pd()), \ simde_mm256_range_pd_a_ = simde__m256d_to_private(a), \ simde_mm256_range_pd_b_ = simde__m256d_to_private(b); \ \ @@ -448,7 +448,7 @@ simde_mm512_range_pd (simde__m512d a, simde__m512d b, int imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm512_range_pd(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512d_private \ - simde_mm512_range_pd_r_, \ + simde_mm512_range_pd_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ simde_mm512_range_pd_a_ = simde__m512d_to_private(a), \ simde_mm512_range_pd_b_ = simde__m512d_to_private(b); \ \ @@ -461,7 +461,7 @@ simde_mm512_range_pd (simde__m512d a, simde__m512d b, int imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm512_range_pd(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512d_private \ - simde_mm512_range_pd_r_, \ + simde_mm512_range_pd_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ simde_mm512_range_pd_a_ = simde__m512d_to_private(a), \ simde_mm512_range_pd_b_ = simde__m512d_to_private(b); \ \ @@ -615,7 +615,7 @@ simde_mm512_range_pd (simde__m512d a, simde__m512d b, int imm8) #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_range_ss - #define _mm_maskz_range_ss(k, a, b, imm8) simde_mm_mask_range_ss(k, a, b, imm8) + #define _mm_maskz_range_ss(k, a, b, imm8) simde_mm_maskz_range_ss(k, a, b, imm8) #endif #if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) @@ -736,7 +736,7 @@ simde_mm512_range_pd (simde__m512d a, simde__m512d b, int imm8) #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_range_sd - #define _mm_maskz_range_sd(k, a, b, imm8) simde_mm_mask_range_sd(k, a, b, imm8) + #define _mm_maskz_range_sd(k, a, b, imm8) simde_mm_maskz_range_sd(k, a, b, imm8) #endif SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/x86/avx512/range_round.h b/lib/simd_wrapper/simde/x86/avx512/range_round.h index 6f4a7b6b86e..7bf13207582 100644 --- a/lib/simd_wrapper/simde/x86/avx512/range_round.h +++ b/lib/simd_wrapper/simde/x86/avx512/range_round.h @@ -117,7 +117,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_range_round_ps - #define _mm512_mask_range_round_ps(src, k, a, b, imm8) simde_mm512_mask_range_round_ps(src, k, a, b, imm8) + #define _mm512_mask_range_round_ps(src, k, a, b, imm8, sae) simde_mm512_mask_range_round_ps(src, k, a, b, imm8, sae) #endif #if defined(SIMDE_X86_AVX512DQ_NATIVE) @@ -173,7 +173,7 @@ SIMDE_BEGIN_DECLS_ #endif #if 
defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_range_round_ps - #define _mm512_maskz_range_round_ps(k, a, b, imm8) simde_mm512_maskz_range_round_ps(k, a, b, imm8) + #define _mm512_maskz_range_round_ps(k, a, b, imm8, sae) simde_mm512_maskz_range_round_ps(k, a, b, imm8, sae) #endif #if defined(SIMDE_X86_AVX512DQ_NATIVE) @@ -285,7 +285,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_mask_range_round_pd - #define _mm512_mask_range_round_pd(src, k, a, b, imm8) simde_mm512_mask_range_round_pd(src, k, a, b, imm8) + #define _mm512_mask_range_round_pd(src, k, a, b, imm8, sae) simde_mm512_mask_range_round_pd(src, k, a, b, imm8, sae) #endif #if defined(SIMDE_X86_AVX512DQ_NATIVE) @@ -341,7 +341,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_range_round_pd - #define _mm512_maskz_range_round_pd(k, a, b, imm8) simde_mm512_maskz_range_round_pd(k, a, b, imm8) + #define _mm512_maskz_range_round_pd(k, a, b, imm8, sae) simde_mm512_maskz_range_round_pd(k, a, b, imm8, sae) #endif #if defined(SIMDE_X86_AVX512DQ_NATIVE) @@ -453,7 +453,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm_mask_range_round_ss - #define _mm_mask_range_round_ss(src, k, a, b, imm8) simde_mm_mask_range_round_ss(src, k, a, b, imm8) + #define _mm_mask_range_round_ss(src, k, a, b, imm8, sae) simde_mm_mask_range_round_ss(src, k, a, b, imm8, sae) #endif #if defined(SIMDE_X86_AVX512DQ_NATIVE) @@ -509,7 +509,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_range_round_ss - #define _mm_maskz_range_round_ss(k, a, b, imm8) simde_mm_maskz_range_round_ss(k, a, b, imm8) + #define _mm_maskz_range_round_ss(k, a, b, imm8, sae) simde_mm_maskz_range_round_ss(k, a, b, imm8, sae) #endif #if defined(SIMDE_X86_AVX512DQ_NATIVE) @@ -621,7 +621,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm_mask_range_round_sd - #define _mm_mask_range_round_sd(src, k, a, b, imm8) simde_mm_mask_range_round_sd(src, k, a, b, imm8) + #define _mm_mask_range_round_sd(src, k, a, b, imm8, sae) simde_mm_mask_range_round_sd(src, k, a, b, imm8, sae) #endif #if defined(SIMDE_X86_AVX512DQ_NATIVE) @@ -677,7 +677,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_range_round_sd - #define _mm_maskz_range_round_sd(k, a, b, imm8) simde_mm_maskz_range_round_sd(k, a, b, imm8) + #define _mm_maskz_range_round_sd(k, a, b, imm8, sae) simde_mm_maskz_range_round_sd(k, a, b, imm8, sae) #endif SIMDE_END_DECLS_ diff --git a/lib/simd_wrapper/simde/x86/avx512/rcp.h b/lib/simd_wrapper/simde/x86/avx512/rcp.h new file mode 100644 index 00000000000..b1b394cfe30 --- /dev/null +++ b/lib/simd_wrapper/simde/x86/avx512/rcp.h @@ -0,0 +1,65 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
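For context on the range_round.h alias fixes above: the `*_range_round_*` intrinsics take a trailing rounding/`sae` argument, but the emulated `_mm512_mask_range_round_ps`-style aliases were defined with one parameter too few, so code written against the real Intel signatures could not expand them. A minimal sketch of the same fix, assuming toy names (impl_range_round and range_round are hypothetical, not SIMDe macros):

/* Toy model of the alias-macro fix; not part of SIMDe. */
#include <stdio.h>

static int impl_range_round(int a, int b, int imm8, int sae) {
  return a + b + imm8 + sae;  /* stand-in for the real emulation */
}

/* Broken shape (what the aliases looked like before):
 *   #define range_round(a, b, imm8) impl_range_round(a, b, imm8)
 * A caller passing the documented four arguments would not expand.
 * Fixed shape: the trailing sae argument is accepted and forwarded. */
#define range_round(a, b, imm8, sae) impl_range_round(a, b, imm8, sae)

int main(void) {
  printf("%d\n", range_round(1, 2, 8, 4)); /* matches the four-argument intrinsic shape */
  return 0;
}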
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_RCP_H) +#define SIMDE_X86_AVX512_RCP_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +// TODO: "The maximum relative error for this approximation is less than 2^-14." +// vs 1.5*2^-12 for _mm{,256}_rcp_ps + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_mm512_rcp14_ps (simde__m512 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_rcp14_ps(a); + #else + simde__m512_private + r_, + a_ = simde__m512_to_private(a); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = SIMDE_FLOAT32_C(1.0) / a_.f32[i]; + } + + return simde__m512_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_rcp14_ps + #define _mm512_rcp14_ps(a) simde_mm512_rcp14_ps(a) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_RCP_H) */ diff --git a/lib/simd_wrapper/simde/x86/avx512/reduce.h b/lib/simd_wrapper/simde/x86/avx512/reduce.h new file mode 100644 index 00000000000..c007572e20a --- /dev/null +++ b/lib/simd_wrapper/simde/x86/avx512/reduce.h @@ -0,0 +1,355 @@ +/* SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Copyright: + * 2023 Michael R. 
Crusoe + */ + +#if !defined(SIMDE_X86_AVX512_REDUCE_H) +#define SIMDE_X86_AVX512_REDUCE_H + +#include "types.h" + +HEDLEY_DIAGNOSTIC_PUSH +SIMDE_DISABLE_UNWANTED_DIAGNOSTICS +SIMDE_BEGIN_DECLS_ + +#if defined(__clang__) && SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 +SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16 +simde_mm512_reduce_max_ph(simde__m512h a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_reduce_max_ph(a); + #else + simde__m512h_private a_; + simde_float16 r; + a_ = simde__m512h_to_private(a); + + r = SIMDE_NINFINITYHF; + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE_REDUCTION(max:r) + #endif + for (size_t i = 0 ; i < (sizeof(a_.f16) / sizeof(a_.f16[0])) ; i++) { + r = simde_float16_to_float32(a_.f16[i]) > simde_float16_to_float32(r) ? a_.f16[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_max_ph(a) simde_mm512_reduce_max_ph((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float16 +simde_mm512_reduce_min_ph(simde__m512h a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_reduce_min_ph(a); + #else + simde__m512h_private a_; + simde_float16 r; + a_ = simde__m512h_to_private(a); + + r = SIMDE_INFINITYHF; + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE_REDUCTION(min:r) + #endif + for (size_t i = 0 ; i < (sizeof(a_.f16) / sizeof(a_.f16[0])) ; i++) { + r = simde_float16_to_float32(a_.f16[i]) < simde_float16_to_float32(r) ? a_.f16[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_min_ph(a) simde_mm512_reduce_min_ph((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_mm512_reduce_max_epi32(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_max_epi32(a); + #else + simde__m512i_private a_; + int32_t r; + a_ = simde__m512i_to_private(a); + + r = -INT32_MAX; + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + r = a_.i32[i] > r ? a_.i32[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_max_epi32(a) simde_mm512_reduce_max_epi32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_mm512_reduce_max_epi64(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_max_epi64(a); + #else + simde__m512i_private a_; + int64_t r; + a_ = simde__m512i_to_private(a); + + r = -INT64_MAX; + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { + r = a_.i64[i] > r ? a_.i64[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_max_epi64(a) simde_mm512_reduce_max_epi64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_mm512_reduce_max_epu32(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_max_epu32(a); + #else + simde__m512i_private a_; + uint32_t r; + a_ = simde__m512i_to_private(a); + + r = 0; + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r = a_.u32[i] > r ? 
a_.u32[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_max_epu32(a) simde_mm512_reduce_max_epu32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_mm512_reduce_max_epu64(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_max_epu64(a); + #else + simde__m512i_private a_; + uint64_t r; + a_ = simde__m512i_to_private(a); + + r = 0; + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { + r = a_.u64[i] > r ? a_.u64[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_max_epu64(a) simde_mm512_reduce_max_epu64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64 +simde_mm512_reduce_max_pd(simde__m512d a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_max_pd(a); + #else + simde__m512d_private a_; + simde_float64 r; + a_ = simde__m512d_to_private(a); + + r = -SIMDE_MATH_INFINITY; + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { + r = a_.f64[i] > r ? a_.f64[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_max_pd(a) simde_mm512_reduce_max_pd((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32 +simde_mm512_reduce_max_ps(simde__m512 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_max_ps(a); + #else + simde__m512_private a_; + simde_float32 r; + a_ = simde__m512_to_private(a); + + r = -SIMDE_MATH_INFINITYF; + SIMDE_VECTORIZE_REDUCTION(max:r) + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r = a_.f32[i] > r ? a_.f32[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_max_ps(a) simde_mm512_reduce_max_ps((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int32_t +simde_mm512_reduce_min_epi32(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_min_epi32(a); + #else + simde__m512i_private a_; + int32_t r; + a_ = simde__m512i_to_private(a); + + r = INT32_MAX; + SIMDE_VECTORIZE_REDUCTION(min:r) + for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { + r = a_.i32[i] < r ? a_.i32[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_min_epi32(a) simde_mm512_reduce_min_epi32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +int64_t +simde_mm512_reduce_min_epi64(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_min_epi64(a); + #else + simde__m512i_private a_; + int64_t r; + a_ = simde__m512i_to_private(a); + + r = INT64_MAX; + SIMDE_VECTORIZE_REDUCTION(min:r) + for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { + r = a_.i64[i] < r ? a_.i64[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_min_epi64(a) simde_mm512_reduce_min_epi64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint32_t +simde_mm512_reduce_min_epu32(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_min_epu32(a); + #else + simde__m512i_private a_; + uint32_t r; + a_ = simde__m512i_to_private(a); + + r = UINT32_MAX; + SIMDE_VECTORIZE_REDUCTION(min:r) + for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { + r = a_.u32[i] < r ? 
a_.u32[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_min_epu32(a) simde_mm512_reduce_min_epu32((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +uint64_t +simde_mm512_reduce_min_epu64(simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_min_epu64(a); + #else + simde__m512i_private a_; + uint64_t r; + a_ = simde__m512i_to_private(a); + + r = UINT64_MAX; + SIMDE_VECTORIZE_REDUCTION(min:r) + for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { + r = a_.u64[i] < r ? a_.u64[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_min_epu64(a) simde_mm512_reduce_min_epu64((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float64 +simde_mm512_reduce_min_pd(simde__m512d a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_min_pd(a); + #else + simde__m512d_private a_; + simde_float64 r; + a_ = simde__m512d_to_private(a); + + r = SIMDE_MATH_INFINITY; + SIMDE_VECTORIZE_REDUCTION(min:r) + for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { + r = a_.f64[i] < r ? a_.f64[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_min_pd(a) simde_mm512_reduce_min_pd((a)) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +simde_float32 +simde_mm512_reduce_min_ps(simde__m512 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + return _mm512_reduce_min_ps(a); + #else + simde__m512_private a_; + simde_float32 r; + a_ = simde__m512_to_private(a); + + r = SIMDE_MATH_INFINITYF; + SIMDE_VECTORIZE_REDUCTION(min:r) + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r = a_.f32[i] < r ? a_.f32[i] : r; + } + return r; + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +# define _mm512_reduce_min_ps(a) simde_mm512_reduce_min_ps((a)) +#endif + +SIMDE_END_DECLS_ +HEDLEY_DIAGNOSTIC_POP + +#endif /* !defined(SIMDE_X86_AVX512_REDUCE_H) */ diff --git a/lib/simd_wrapper/simde/x86/avx512/rol.h b/lib/simd_wrapper/simde/x86/avx512/rol.h index 835bf6bbbe7..5bdf98bc11f 100644 --- a/lib/simd_wrapper/simde/x86/avx512/rol.h +++ b/lib/simd_wrapper/simde/x86/avx512/rol.h @@ -73,7 +73,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_rol_epi32 - #define _mm_maskz_rol_epi32(src, k, a, imm8) simde_mm_maskz_rol_epi32(src, k, a, imm8) + #define _mm_maskz_rol_epi32(k, a, imm8) simde_mm_maskz_rol_epi32(k, a, imm8) #endif #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) diff --git a/lib/simd_wrapper/simde/x86/avx512/ror.h b/lib/simd_wrapper/simde/x86/avx512/ror.h index 464f71f0f78..7cac56c7ed8 100644 --- a/lib/simd_wrapper/simde/x86/avx512/ror.h +++ b/lib/simd_wrapper/simde/x86/avx512/ror.h @@ -73,7 +73,7 @@ SIMDE_BEGIN_DECLS_ #endif #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm_maskz_ror_epi32 - #define _mm_maskz_ror_epi32(src, k, a, imm8) simde_mm_maskz_ror_epi32(src, k, a, imm8) + #define _mm_maskz_ror_epi32(k, a, imm8) simde_mm_maskz_ror_epi32(k, a, imm8) #endif #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) diff --git a/lib/simd_wrapper/simde/x86/avx512/round.h b/lib/simd_wrapper/simde/x86/avx512/round.h index 954e348c1a7..684dbe04551 100644 --- a/lib/simd_wrapper/simde/x86/avx512/round.h +++ b/lib/simd_wrapper/simde/x86/avx512/round.h @@ -10,7 
+10,7 @@ SIMDE_BEGIN_DECLS_ #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) #define simde_x_mm512_round_ps(a, rounding) SIMDE_STATEMENT_EXPR_(({ \ simde__m512_private \ - simde_x_mm512_round_ps_r_, \ + simde_x_mm512_round_ps_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ simde_x_mm512_round_ps_a_ = simde__m512_to_private(a); \ \ for (size_t simde_x_mm512_round_ps_i = 0 ; simde_x_mm512_round_ps_i < (sizeof(simde_x_mm512_round_ps_r_.m256) / sizeof(simde_x_mm512_round_ps_r_.m256[0])) ; simde_x_mm512_round_ps_i++) { \ @@ -148,7 +148,7 @@ SIMDE_BEGIN_DECLS_ #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) #define simde_x_mm512_round_pd(a, rounding) SIMDE_STATEMENT_EXPR_(({ \ simde__m512d_private \ - simde_x_mm512_round_pd_r_, \ + simde_x_mm512_round_pd_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ simde_x_mm512_round_pd_a_ = simde__m512d_to_private(a); \ \ for (size_t simde_x_mm512_round_pd_i = 0 ; simde_x_mm512_round_pd_i < (sizeof(simde_x_mm512_round_pd_r_.m256d) / sizeof(simde_x_mm512_round_pd_r_.m256d[0])) ; simde_x_mm512_round_pd_i++) { \ diff --git a/lib/simd_wrapper/simde/x86/avx512/roundscale.h b/lib/simd_wrapper/simde/x86/avx512/roundscale.h index b44923c243a..80c9abf2bc0 100644 --- a/lib/simd_wrapper/simde/x86/avx512/roundscale.h +++ b/lib/simd_wrapper/simde/x86/avx512/roundscale.h @@ -18,7 +18,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_roundscale_ps_internal_ (simde__m128 result, simde__m128 a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); simde__m128 r, clear_sign; @@ -73,7 +73,7 @@ SIMDE_BEGIN_DECLS_ #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm256_roundscale_ps(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m256_private \ - simde_mm256_roundscale_ps_r_, \ + simde_mm256_roundscale_ps_r_ = simde__m256_to_private(simde_mm256_setzero_ps()), \ simde_mm256_roundscale_ps_a_ = simde__m256_to_private(a); \ \ for (size_t simde_mm256_roundscale_ps_i = 0 ; simde_mm256_roundscale_ps_i < (sizeof(simde_mm256_roundscale_ps_r_.m128) / sizeof(simde_mm256_roundscale_ps_r_.m128[0])) ; simde_mm256_roundscale_ps_i++) { \ @@ -85,7 +85,7 @@ SIMDE_BEGIN_DECLS_ #else SIMDE_FUNCTION_ATTRIBUTES simde__m256 - simde_mm256_roundscale_ps_internal_ (simde__m256 result, simde__m256 a, int imm8) + simde_mm256_roundscale_ps_internal_ (simde__m256 result, simde__m256 a, const int imm8) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); @@ -141,7 +141,7 @@ SIMDE_BEGIN_DECLS_ #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm512_roundscale_ps(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512_private \ - simde_mm512_roundscale_ps_r_, \ + simde_mm512_roundscale_ps_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ simde_mm512_roundscale_ps_a_ = simde__m512_to_private(a); \ \ for (size_t simde_mm512_roundscale_ps_i = 0 ; simde_mm512_roundscale_ps_i < (sizeof(simde_mm512_roundscale_ps_r_.m256) / sizeof(simde_mm512_roundscale_ps_r_.m256[0])) ; simde_mm512_roundscale_ps_i++) { \ @@ -154,7 +154,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_roundscale_ps_internal_ (simde__m512 result, simde__m512 a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); simde__m512 r, clear_sign; @@ -210,7 +210,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES 
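The recurring change in range.h, round.h and roundscale.h above is the same one-liner: the result temporary declared inside each statement-expression fallback now starts out as a zeroed vector instead of being left uninitialized. Every lane is overwritten before the value is returned, so behaviour is unchanged; the initializer only silences maybe-uninitialized warnings. A minimal sketch of the pattern, assuming toy names (vec4 and vec4_add1 are not SIMDe types) and GNU statement expressions, as SIMDE_STATEMENT_EXPR_ uses:

/* Toy model of the zero-initialized result temporary. */
typedef struct { float f32[4]; } vec4;

static vec4 vec4_zero(void) { vec4 z = { { 0.0f, 0.0f, 0.0f, 0.0f } }; return z; }

#if defined(__GNUC__)
  /* Before: "vec4 r_, a_ = (a);" -- r_ is fully written by the loop below, but
   * some compilers still warn. After: start from vec4_zero(), same result. */
  #define vec4_add1(a) __extension__ ({                                        \
    vec4 vec4_add1_r_ = vec4_zero(), vec4_add1_a_ = (a);                       \
    for (unsigned vec4_add1_i = 0; vec4_add1_i < 4; vec4_add1_i++)             \
      vec4_add1_r_.f32[vec4_add1_i] = vec4_add1_a_.f32[vec4_add1_i] + 1.0f;    \
    vec4_add1_r_;                                                              \
  })
#endif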
simde__m128d simde_mm_roundscale_pd_internal_ (simde__m128d result, simde__m128d a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); simde__m128d r, clear_sign; @@ -265,7 +265,7 @@ SIMDE_BEGIN_DECLS_ #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm256_roundscale_pd(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m256d_private \ - simde_mm256_roundscale_pd_r_, \ + simde_mm256_roundscale_pd_r_ = simde__m256d_to_private(simde_mm256_setzero_pd()), \ simde_mm256_roundscale_pd_a_ = simde__m256d_to_private(a); \ \ for (size_t simde_mm256_roundscale_pd_i = 0 ; simde_mm256_roundscale_pd_i < (sizeof(simde_mm256_roundscale_pd_r_.m128d) / sizeof(simde_mm256_roundscale_pd_r_.m128d[0])) ; simde_mm256_roundscale_pd_i++) { \ @@ -278,7 +278,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m256d simde_mm256_roundscale_pd_internal_ (simde__m256d result, simde__m256d a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); simde__m256d r, clear_sign; @@ -333,7 +333,7 @@ SIMDE_BEGIN_DECLS_ #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm512_roundscale_pd(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ simde__m512d_private \ - simde_mm512_roundscale_pd_r_, \ + simde_mm512_roundscale_pd_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ simde_mm512_roundscale_pd_a_ = simde__m512d_to_private(a); \ \ for (size_t simde_mm512_roundscale_pd_i = 0 ; simde_mm512_roundscale_pd_i < (sizeof(simde_mm512_roundscale_pd_r_.m256d) / sizeof(simde_mm512_roundscale_pd_r_.m256d[0])) ; simde_mm512_roundscale_pd_i++) { \ @@ -346,7 +346,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_roundscale_pd_internal_ (simde__m512d result, simde__m512d a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { + SIMDE_REQUIRE_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); simde__m512d r, clear_sign; @@ -401,7 +401,7 @@ SIMDE_BEGIN_DECLS_ #else SIMDE_FUNCTION_ATTRIBUTES simde__m128 - simde_mm_roundscale_ss_internal_ (simde__m128 result, simde__m128 b, int imm8) + simde_mm_roundscale_ss_internal_ (simde__m128 result, simde__m128 b, const int imm8) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); @@ -508,7 +508,7 @@ SIMDE_BEGIN_DECLS_ #else SIMDE_FUNCTION_ATTRIBUTES simde__m128d - simde_mm_roundscale_sd_internal_ (simde__m128d result, simde__m128d b, int imm8) + simde_mm_roundscale_sd_internal_ (simde__m128d result, simde__m128d b, const int imm8) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { HEDLEY_STATIC_CAST(void, imm8); diff --git a/lib/simd_wrapper/simde/x86/avx512/roundscale_round.h b/lib/simd_wrapper/simde/x86/avx512/roundscale_round.h index debc1133051..f941e48da0e 100644 --- a/lib/simd_wrapper/simde/x86/avx512/roundscale_round.h +++ b/lib/simd_wrapper/simde/x86/avx512/roundscale_round.h @@ -8,6 +8,11 @@ HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ +#if defined(HEDLEY_MSVC_VERSION) +#pragma warning( push ) +#pragma warning( disable : 4244 ) +#endif + #if defined(SIMDE_X86_AVX512F_NATIVE) #define simde_mm512_roundscale_round_ps(a, imm8, sae) _mm512_roundscale_round_ps(a, imm8, sae) #elif defined(SIMDE_FAST_EXCEPTIONS) @@ -37,8 +42,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_roundscale_round_ps (simde__m512 a, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - 
SIMDE_REQUIRE_CONSTANT(sae) { + SIMDE_REQUIRE_RANGE(imm8, 0, 15) { simde__m512 r; if (sae & SIMDE_MM_FROUND_NO_EXC) { @@ -93,8 +97,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_mask_roundscale_round_ps (simde__m512 src, simde__mmask8 k, simde__m512 a, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { + SIMDE_REQUIRE_RANGE(imm8, 0, 15) { simde__m512 r; if (sae & SIMDE_MM_FROUND_NO_EXC) { @@ -149,8 +152,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_maskz_roundscale_round_ps (simde__mmask8 k, simde__m512 a, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { + SIMDE_REQUIRE_RANGE(imm8, 0, 15) { simde__m512 r; if (sae & SIMDE_MM_FROUND_NO_EXC) { @@ -205,8 +207,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_roundscale_round_pd (simde__m512d a, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { + SIMDE_REQUIRE_RANGE(imm8, 0, 15) { simde__m512d r; if (sae & SIMDE_MM_FROUND_NO_EXC) { @@ -261,8 +262,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_mask_roundscale_round_pd (simde__m512d src, simde__mmask8 k, simde__m512d a, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { + SIMDE_REQUIRE_RANGE(imm8, 0, 15) { simde__m512d r; if (sae & SIMDE_MM_FROUND_NO_EXC) { @@ -317,8 +317,7 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m512d simde_mm512_maskz_roundscale_round_pd (simde__mmask8 k, simde__m512d a, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { + SIMDE_REQUIRE_RANGE(imm8, 0, 15) { simde__m512d r; if (sae & SIMDE_MM_FROUND_NO_EXC) { @@ -369,10 +368,10 @@ SIMDE_BEGIN_DECLS_ #else #define simde_mm_roundscale_round_ss(a, b, imm8, sae) simde_mm_roundscale_ss(a, b, imm8) #endif -#else +#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) SIMDE_FUNCTION_ATTRIBUTES simde__m128 - simde_mm_roundscale_round_ss (simde__m128 a, simde__m128 b, int imm8, int sae) + simde_mm_roundscale_round_ss (simde__m128 a, simde__m128 b, const int imm8, const int sae) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) SIMDE_REQUIRE_CONSTANT(sae) { simde__m128 r; @@ -425,10 +424,10 @@ SIMDE_BEGIN_DECLS_ #else #define simde_mm_mask_roundscale_round_ss(src, k, a, b, imm8, sae) simde_mm_mask_roundscale_ss(src, k, a, b, imm8) #endif -#else +#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) SIMDE_FUNCTION_ATTRIBUTES simde__m128 - simde_mm_mask_roundscale_round_ss (simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b, int imm8, int sae) + simde_mm_mask_roundscale_round_ss (simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b, const int imm8, const int sae) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) SIMDE_REQUIRE_CONSTANT(sae) { simde__m128 r; @@ -481,10 +480,10 @@ SIMDE_BEGIN_DECLS_ #else #define simde_mm_maskz_roundscale_round_ss(k, a, b, imm8, sae) simde_mm_maskz_roundscale_ss(k, a, b, imm8) #endif -#else +#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) SIMDE_FUNCTION_ATTRIBUTES simde__m128 - simde_mm_maskz_roundscale_round_ss (simde__mmask8 k, simde__m128 a, simde__m128 b, int imm8, int sae) + simde_mm_maskz_roundscale_round_ss (simde__mmask8 k, simde__m128 a, simde__m128 b, const int imm8, const int sae) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) SIMDE_REQUIRE_CONSTANT(sae) { simde__m128 r; @@ -512,6 +511,11 @@ 
SIMDE_BEGIN_DECLS_ #define _mm_maskz_roundscale_round_ss(k, a, b, imm8, sae) simde_mm_maskz_roundscale_round_ss(k, a, b, imm8, sae) #endif +#if defined(HEDLEY_MSVC_VERSION) +#pragma warning( pop ) +#endif + + #if defined(SIMDE_X86_AVX512F_NATIVE) #define simde_mm_roundscale_round_sd(a, b, imm8, sae) _mm_roundscale_round_sd(a, b, imm8, sae) #elif defined(SIMDE_FAST_EXCEPTIONS) @@ -537,10 +541,10 @@ SIMDE_BEGIN_DECLS_ #else #define simde_mm_roundscale_round_sd(a, b, imm8, sae) simde_mm_roundscale_sd(a, b, imm8) #endif -#else +#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) SIMDE_FUNCTION_ATTRIBUTES simde__m128d - simde_mm_roundscale_round_sd (simde__m128d a, simde__m128d b, int imm8, int sae) + simde_mm_roundscale_round_sd (simde__m128d a, simde__m128d b, const int imm8, const int sae) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) SIMDE_REQUIRE_CONSTANT(sae) { simde__m128d r; @@ -593,10 +597,10 @@ SIMDE_BEGIN_DECLS_ #else #define simde_mm_mask_roundscale_round_sd(src, k, a, b, imm8, sae) simde_mm_mask_roundscale_sd(src, k, a, b, imm8) #endif -#else +#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) SIMDE_FUNCTION_ATTRIBUTES simde__m128d - simde_mm_mask_roundscale_round_sd (simde__m128d src, simde__mmask8 k, simde__m128d a, simde__m128d b, int imm8, int sae) + simde_mm_mask_roundscale_round_sd (simde__m128d src, simde__mmask8 k, simde__m128d a, simde__m128d b, const int imm8, const int sae) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) SIMDE_REQUIRE_CONSTANT(sae) { simde__m128d r; @@ -649,10 +653,10 @@ SIMDE_BEGIN_DECLS_ #else #define simde_mm_maskz_roundscale_round_sd(k, a, b, imm8, sae) simde_mm_maskz_roundscale_sd(k, a, b, imm8) #endif -#else +#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) SIMDE_FUNCTION_ATTRIBUTES simde__m128d - simde_mm_maskz_roundscale_round_sd (simde__mmask8 k, simde__m128d a, simde__m128d b, int imm8, int sae) + simde_mm_maskz_roundscale_round_sd (simde__mmask8 k, simde__m128d a, simde__m128d b, const int imm8, const int sae) SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) SIMDE_REQUIRE_CONSTANT(sae) { simde__m128d r; diff --git a/lib/simd_wrapper/simde/x86/avx512/scalef.h b/lib/simd_wrapper/simde/x86/avx512/scalef.h index c8c6d7c64d4..11673317545 100644 --- a/lib/simd_wrapper/simde/x86/avx512/scalef.h +++ b/lib/simd_wrapper/simde/x86/avx512/scalef.h @@ -284,8 +284,8 @@ simde_mm_scalef_ss (simde__m128 a, simde__m128 b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_mask_scalef_ss (simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_95483) - return _mm_mask_scalef_ss(src, k, a, b); + #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(HEDLEY_GCC_VERSION) + return _mm_mask_scalef_round_ss(src, k, a, b, _MM_FROUND_CUR_DIRECTION); #else simde__m128_private src_ = simde__m128_to_private(src), @@ -305,7 +305,7 @@ simde_mm_mask_scalef_ss (simde__m128 src, simde__mmask8 k, simde__m128 a, simde_ SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_maskz_scalef_ss (simde__mmask8 k, simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_95483) + #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_GCC_105339) return _mm_maskz_scalef_ss(k, a, b); #else simde__m128_private @@ -345,7 +345,7 @@ simde_mm_scalef_sd (simde__m128d a, simde__m128d b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_mask_scalef_sd (simde__m128d src, simde__mmask8 k, simde__m128d a, simde__m128d 
b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_95483) + #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_GCC_105339) return _mm_mask_scalef_sd(src, k, a, b); #else simde__m128d_private @@ -366,7 +366,7 @@ simde_mm_mask_scalef_sd (simde__m128d src, simde__mmask8 k, simde__m128d a, simd SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_maskz_scalef_sd (simde__mmask8 k, simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_95483) + #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_GCC_105339) return _mm_maskz_scalef_sd(k, a, b); #else simde__m128d_private diff --git a/lib/simd_wrapper/simde/x86/avx512/set.h b/lib/simd_wrapper/simde/x86/avx512/set.h index 1e681af688b..d87a72ce3d8 100644 --- a/lib/simd_wrapper/simde/x86/avx512/set.h +++ b/lib/simd_wrapper/simde/x86/avx512/set.h @@ -401,7 +401,7 @@ SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_x_mm512_set_m128i (simde__m128i a, simde__m128i b, simde__m128i c, simde__m128i d) { #if defined(SIMDE_X86_AVX512F_NATIVE) - SIMDE_ALIGN_LIKE_16(simde__m128i) simde__m128i v[] = { d, c, b, a }; + SIMDE_ALIGN_TO_64 simde__m128i v[] = { d, c, b, a }; return simde_mm512_load_si512(HEDLEY_STATIC_CAST(__m512i *, HEDLEY_STATIC_CAST(void *, v))); #else simde__m512i_private r_; @@ -415,11 +415,27 @@ simde_x_mm512_set_m128i (simde__m128i a, simde__m128i b, simde__m128i c, simde__ #endif } +SIMDE_FUNCTION_ATTRIBUTES +simde__m512 +simde_x_mm512_set_m256 (simde__m256 a, simde__m256 b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + SIMDE_ALIGN_TO_64 simde__m256 v[] = { b, a }; + return simde_mm512_load_ps(HEDLEY_STATIC_CAST(__m512 *, HEDLEY_STATIC_CAST(void *, v))); + #else + simde__m512_private r_; + + r_.m256[0] = b; + r_.m256[1] = a; + + return simde__m512_from_private(r_); + #endif +} + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_x_mm512_set_m256i (simde__m256i a, simde__m256i b) { #if defined(SIMDE_X86_AVX512F_NATIVE) - SIMDE_ALIGN_LIKE_32(simde__m256i) simde__m256i v[] = { b, a }; + SIMDE_ALIGN_TO_64 simde__m256i v[] = { b, a }; return simde_mm512_load_si512(HEDLEY_STATIC_CAST(__m512i *, HEDLEY_STATIC_CAST(void *, v))); #else simde__m512i_private r_; @@ -431,6 +447,22 @@ simde_x_mm512_set_m256i (simde__m256i a, simde__m256i b) { #endif } +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_x_mm512_set_m256d (simde__m256d a, simde__m256d b) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + SIMDE_ALIGN_TO_64 simde__m256d v[] = { b, a }; + return simde_mm512_load_pd(HEDLEY_STATIC_CAST(__m512d *, HEDLEY_STATIC_CAST(void *, v))); + #else + simde__m512d_private r_; + + r_.m256d[0] = b; + r_.m256d[1] = a; + + return simde__m512d_from_private(r_); + #endif +} + SIMDE_FUNCTION_ATTRIBUTES simde__m512 simde_mm512_set_ps (simde_float32 e15, simde_float32 e14, simde_float32 e13, simde_float32 e12, @@ -484,6 +516,56 @@ simde_mm512_set_pd (simde_float64 e7, simde_float64 e6, simde_float64 e5, simde_ #define _mm512_set_pd(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_set_pd(e7, e6, e5, e4, e3, e2, e1, e0) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_set_ph (simde_float16 e31, simde_float16 e30, simde_float16 e29, simde_float16 e28, simde_float16 e27, simde_float16 e26, simde_float16 e25, simde_float16 e24, + simde_float16 e23, simde_float16 e22, simde_float16 e21, simde_float16 e20, simde_float16 e19, simde_float16 e18, simde_float16 e17, simde_float16 e16, + simde_float16 e15, simde_float16 e14, simde_float16 e13, 
simde_float16 e12, simde_float16 e11, simde_float16 e10, simde_float16 e9, simde_float16 e8, + simde_float16 e7, simde_float16 e6, simde_float16 e5, simde_float16 e4, simde_float16 e3, simde_float16 e2, simde_float16 e1, simde_float16 e0) { + simde__m512h_private r_; + + r_.f16[0] = e0; + r_.f16[1] = e1; + r_.f16[2] = e2; + r_.f16[3] = e3; + r_.f16[4] = e4; + r_.f16[5] = e5; + r_.f16[6] = e6; + r_.f16[7] = e7; + r_.f16[8] = e8; + r_.f16[9] = e9; + r_.f16[10] = e10; + r_.f16[11] = e11; + r_.f16[12] = e12; + r_.f16[13] = e13; + r_.f16[14] = e14; + r_.f16[15] = e15; + r_.f16[16] = e16; + r_.f16[17] = e17; + r_.f16[18] = e18; + r_.f16[19] = e19; + r_.f16[20] = e20; + r_.f16[21] = e21; + r_.f16[22] = e22; + r_.f16[23] = e23; + r_.f16[24] = e24; + r_.f16[25] = e25; + r_.f16[26] = e26; + r_.f16[27] = e27; + r_.f16[28] = e28; + r_.f16[29] = e29; + r_.f16[30] = e30; + r_.f16[31] = e31; + + return simde__m512h_from_private(r_); +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_set_ph + #define _mm512_set_ph(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \ + simde_mm512_set_ph(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) +#endif + + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/x86/avx512/set1.h b/lib/simd_wrapper/simde/x86/avx512/set1.h index 82c9c8cca2e..33ae841830a 100644 --- a/lib/simd_wrapper/simde/x86/avx512/set1.h +++ b/lib/simd_wrapper/simde/x86/avx512/set1.h @@ -325,6 +325,27 @@ simde_mm512_set1_pd (simde_float64 a) { #define _mm512_set1_pd(a) simde_mm512_set1_pd(a) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_set1_ph (simde_float16 a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_set1_ph(a); + #else + simde__m512h_private r_; + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { + r_.f16[i] = a; + } + + return simde__m512h_from_private(r_); + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_set1_ph + #define _mm512_set1_ph(a) simde_mm512_set1_ph(a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/x86/avx512/setone.h b/lib/simd_wrapper/simde/x86/avx512/setone.h index 087dbb56725..df2f6e8bbae 100644 --- a/lib/simd_wrapper/simde/x86/avx512/setone.h +++ b/lib/simd_wrapper/simde/x86/avx512/setone.h @@ -60,6 +60,12 @@ simde_x_mm512_setone_pd(void) { return simde_mm512_castsi512_pd(simde_x_mm512_setone_si512()); } +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_x_mm512_setone_ph(void) { + return simde_mm512_castsi512_ph(simde_x_mm512_setone_si512()); +} + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/x86/avx512/setzero.h b/lib/simd_wrapper/simde/x86/avx512/setzero.h index c34381735ad..c5bfdc4583f 100644 --- a/lib/simd_wrapper/simde/x86/avx512/setzero.h +++ b/lib/simd_wrapper/simde/x86/avx512/setzero.h @@ -66,8 +66,8 @@ simde_mm512_setzero_ps(void) { #endif } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_setzero_si512 - #define _mm512_setzero_si512() simde_mm512_setzero_si512() + #undef _mm512_setzero_ps + #define _mm512_setzero_ps() simde_mm512_setzero_ps() #endif SIMDE_FUNCTION_ATTRIBUTES @@ -80,10 +80,25 @@ simde_mm512_setzero_pd(void) { #endif } #if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_setzero_si512 - #define 
_mm512_setzero_si512() simde_mm512_setzero_si512() + #undef _mm512_setzero_pd + #define _mm512_setzero_pd() simde_mm512_setzero_pd() #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde_mm512_setzero_ph(void) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + return _mm512_setzero_ph(); + #else + return simde_mm512_castsi512_ph(simde_mm512_setzero_si512()); + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_setzero_ph + #define _mm512_setzero_ph() simde_mm512_setzero_ph() +#endif + + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/x86/avx512/shuffle.h b/lib/simd_wrapper/simde/x86/avx512/shuffle.h index 93fc577afac..d1c537f34ec 100644 --- a/lib/simd_wrapper/simde/x86/avx512/shuffle.h +++ b/lib/simd_wrapper/simde/x86/avx512/shuffle.h @@ -23,6 +23,7 @@ * Copyright: * 2020 Evan Nemerson * 2020 Christopher Moore + * 2023 Michael R. Crusoe */ #if !defined(SIMDE_X86_AVX512_SHUFFLE_H) @@ -31,6 +32,7 @@ #include "types.h" #include "../avx2.h" #include "mov.h" +#include "extract.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -53,8 +55,8 @@ simde_mm512_shuffle_epi8 (simde__m512i a, simde__m512i b) { } #else SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (b_.u8[i] & 0x80) ? 0 : a_.u8[(b_.u8[i] & 0x0f) + (i & 0x30)]; + for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { + r_.i8[i] = (b_.i8[i] & 0x80) ? 0 : a_.i8[(b_.i8[i] & 0x0f) + (i & 0x30)]; } #endif @@ -94,6 +96,35 @@ simde_mm512_maskz_shuffle_epi8 (simde__mmask64 k, simde__m512i a, simde__m512i b #define _mm512_maskz_shuffle_epi8(k, a, b) simde_mm512_maskz_shuffle_epi8(k, a, b) #endif +#if defined(SIMDE_X86_AVX512F_NATIVE) +# define simde_mm512_shuffle_epi32(a, imm8) _mm512_shuffle_epi32((a), (imm8)) +#elif defined(SIMDE_STATEMENT_EXPR_) +# define simde_mm512_shuffle_epi32(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ + simde__m512i_private simde_mm512_shuffle_epi32_r_, \ + simde_mm512_shuffle_epi32_a_ = simde__m512i_to_private((a)); \ + simde_mm512_shuffle_epi32_r_.m128i[0] = simde_mm_shuffle_epi32( \ + simde_mm512_shuffle_epi32_a_.m128i[0], (imm8)); \ + simde_mm512_shuffle_epi32_r_.m128i[1] = simde_mm_shuffle_epi32( \ + simde_mm512_shuffle_epi32_a_.m128i[1], (imm8)); \ + simde_mm512_shuffle_epi32_r_.m128i[2] = simde_mm_shuffle_epi32( \ + simde_mm512_shuffle_epi32_a_.m128i[2], (imm8)); \ + simde_mm512_shuffle_epi32_r_.m128i[3] = simde_mm_shuffle_epi32( \ + simde_mm512_shuffle_epi32_a_.m128i[3], (imm8)); \ + simde__m512i_from_private(simde_mm512_shuffle_epi32_r_); \ + })) +#else +# define simde_mm512_shuffle_epi32(a, imm8) \ + simde_x_mm512_set_m128i( \ + simde_mm_shuffle_epi32(simde_mm512_extracti32x4_epi32(a, 3), (imm8)), \ + simde_mm_shuffle_epi32(simde_mm512_extracti32x4_epi32(a, 2), (imm8)), \ + simde_mm_shuffle_epi32(simde_mm512_extracti32x4_epi32(a, 1), (imm8)), \ + simde_mm_shuffle_epi32(simde_mm512_extracti32x4_epi32(a, 0), (imm8))) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_shuffle_epi32 + #define _mm512_shuffle_epi32(a, imm8) simde_mm512_shuffle_epi32((a), (imm8)) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m256i simde_mm256_shuffle_i32x4 (simde__m256i a, simde__m256i b, const int imm8) @@ -131,6 +162,34 @@ simde_mm256_shuffle_i32x4 (simde__m256i a, simde__m256i b, const int imm8) #define simde_mm256_maskz_shuffle_f64x2(k, a, b, imm8) simde_mm256_maskz_mov_pd(k, simde_mm256_shuffle_f64x2(a, b, imm8)) #define simde_mm256_mask_shuffle_f64x2(src, k, a, b, imm8) 
simde_mm256_mask_mov_pd(src, k, simde_mm256_shuffle_f64x2(a, b, imm8)) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_maskz_shuffle_i32x4 + #undef _mm256_mask_shuffle_i32x4 + #define _mm256_maskz_shuffle_i32x4(k, a, b, imm8) simde_mm256_maskz_shuffle_i32x4(k, a, b, imm8) + #define _mm256_mask_shuffle_i32x4(src, k, a, b, imm8) simde_mm256_mask_shuffle_i32x4(src, k, a, b, imm8) + + #undef _mm256_shuffle_f32x4 + #undef _mm256_maskz_shuffle_f32x4 + #undef _mm256_mask_shuffle_f32x4 + #define _mm256_shuffle_f32x4(a, b, imm8) simde_mm256_shuffle_f32x4(a, b, imm8) + #define _mm256_maskz_shuffle_f32x4(k, a, b, imm8) simde_mm256_maskz_shuffle_f32x4(k, a, b, imm8) + #define _mm256_mask_shuffle_f32x4(src, k, a, b, imm8) simde_mm256_mask_shuffle_f32x4(src, k, a, b, imm8) + + #undef _mm256_shuffle_i64x2 + #undef _mm256_maskz_shuffle_i64x2 + #undef _mm256_mask_shuffle_i64x2 + #define _mm256_shuffle_i64x2(a, b, imm8) simde_mm256_shuffle_i64x2(a, b, imm8) + #define _mm256_maskz_shuffle_i64x2(k, a, b, imm8) simde_mm256_maskz_shuffle_i64x2(k, a, b, imm8) + #define _mm256_mask_shuffle_i64x2(src, k, a, b, imm8) simde_mm256_mask_shuffle_i64x2(src, k, a, b, imm8) + + #undef _mm256_shuffle_f64x2 + #undef _mm256_maskz_shuffle_f64x2 + #undef _mm256_mask_shuffle_f64x2 + #define _mm256_shuffle_f64x2(a, b, imm8) simde_mm256_shuffle_f64x2(a, b, imm8) + #define _mm256_maskz_shuffle_f64x2(k, a, b, imm8) simde_mm256_maskz_shuffle_f64x2(k, a, b, imm8) + #define _mm256_mask_shuffle_f64x2(src, k, a, b, imm8) simde_mm256_mask_shuffle_f64x2(src, k, a, b, imm8) +#endif + SIMDE_FUNCTION_ATTRIBUTES simde__m512i simde_mm512_shuffle_i32x4 (simde__m512i a, simde__m512i b, const int imm8) @@ -170,6 +229,34 @@ simde_mm512_shuffle_i32x4 (simde__m512i a, simde__m512i b, const int imm8) #define simde_mm512_maskz_shuffle_f64x2(k, a, b, imm8) simde_mm512_maskz_mov_pd(k, simde_mm512_shuffle_f64x2(a, b, imm8)) #define simde_mm512_mask_shuffle_f64x2(src, k, a, b, imm8) simde_mm512_mask_mov_pd(src, k, simde_mm512_shuffle_f64x2(a, b, imm8)) +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_maskz_shuffle_i32x4 + #undef _mm512_mask_shuffle_i32x4 + #define _mm512_maskz_shuffle_i32x4(k, a, b, imm8) simde_mm512_maskz_shuffle_i32x4(k, a, b, imm8) + #define _mm512_mask_shuffle_i32x4(src, k, a, b, imm8) simde_mm512_mask_shuffle_i32x4(src, k, a, b, imm8) + + #undef _mm512_shuffle_f32x4 + #undef _mm512_maskz_shuffle_f32x4 + #undef _mm512_mask_shuffle_f32x4 + #define _mm512_shuffle_f32x4(a, b, imm8) simde_mm512_shuffle_f32x4(a, b, imm8) + #define _mm512_maskz_shuffle_f32x4(k, a, b, imm8) simde_mm512_maskz_shuffle_f32x4(k, a, b, imm8) + #define _mm512_mask_shuffle_f32x4(src, k, a, b, imm8) simde_mm512_mask_shuffle_f32x4(src, k, a, b, imm8) + + #undef _mm512_shuffle_i64x2 + #undef _mm512_maskz_shuffle_i64x2 + #undef _mm512_mask_shuffle_i64x2 + #define _mm512_shuffle_i64x2(a, b, imm8) simde_mm512_shuffle_i64x2(a, b, imm8) + #define _mm512_maskz_shuffle_i64x2(k, a, b, imm8) simde_mm512_maskz_shuffle_i64x2(k, a, b, imm8) + #define _mm512_mask_shuffle_i64x2(src, k, a, b, imm8) simde_mm512_mask_shuffle_i64x2(src, k, a, b, imm8) + + #undef _mm512_shuffle_f64x2 + #undef _mm512_maskz_shuffle_f64x2 + #undef _mm512_mask_shuffle_f64x2 + #define _mm512_shuffle_f64x2(a, b, imm8) simde_mm512_shuffle_f64x2(a, b, imm8) + #define _mm512_maskz_shuffle_f64x2(k, a, b, imm8) simde_mm512_maskz_shuffle_f64x2(k, a, b, imm8) + #define _mm512_mask_shuffle_f64x2(src, k, a, b, imm8) 
simde_mm512_mask_shuffle_f64x2(src, k, a, b, imm8) +#endif + #if defined(SIMDE_X86_AVX512F_NATIVE) #define simde_mm512_shuffle_ps(a, b, imm8) _mm512_shuffle_ps(a, b, imm8) #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) @@ -224,8 +311,8 @@ simde_mm512_shuffle_i32x4 (simde__m512i a, simde__m512i b, const int imm8) a_ = simde__m512_to_private(a), b_ = simde__m512_to_private(b); + const size_t halfway = (sizeof(r_.m128_private[0].f32) / sizeof(r_.m128_private[0].f32[0]) / 2); for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) { - const size_t halfway = (sizeof(r_.m128_private[i].f32) / sizeof(r_.m128_private[i].f32[0]) / 2); SIMDE_VECTORIZE for (size_t j = 0 ; j < halfway ; j++) { r_.m128_private[i].f32[j] = a_.m128_private[i].f32[(imm8 >> (j * 2)) & 3]; @@ -241,6 +328,89 @@ simde_mm512_shuffle_i32x4 (simde__m512i a, simde__m512i b, const int imm8) #define _mm512_shuffle_ps(a, b, imm8) simde_mm512_shuffle_ps(a, b, imm8) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512d +simde_mm512_shuffle_pd(simde__m512d a, simde__m512d b, int imm8) + SIMDE_REQUIRE_CONSTANT_RANGE (imm8, 0, 255) { + simde__m512d_private + r_, + a_ = simde__m512d_to_private(a), + b_ = simde__m512d_to_private(b); + + SIMDE_VECTORIZE + for (size_t i = 0 ; i < ((sizeof(r_.f64) / sizeof(r_.f64[0])) / 2) ; i++) { + r_.f64[i * 2] = (imm8 & ( 1 << (i*2) )) ? a_.f64[i * 2 + 1]: a_.f64[i * 2]; + r_.f64[i * 2 + 1] = (imm8 & ( 1 << (i*2+1) )) ? b_.f64[i * 2 + 1]: b_.f64[i * 2]; + } + + return simde__m512d_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) + #define simde_mm512_shuffle_pd(a, b, imm8) _mm512_shuffle_pd(a, b, imm8) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_shuffle_pd + #define _mm512_shuffle_pd(a, b, imm8) simde_mm512_shuffle_pd(a, b, imm8) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) +# define simde_mm512_shufflehi_epi16(a, imm8) _mm512_shufflehi_epi16(a, imm8) +#elif defined(SIMDE_STATEMENT_EXPR_) +# define simde_mm512_shufflehi_epi16(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ + simde__m512i_private simde_mm512_shufflehi_epi16_r_, \ + simde_mm512_shufflehi_epi16_a_ = simde__m512i_to_private((a)); \ + simde_mm512_shufflehi_epi16_r_.m128i[0] = simde_mm_shufflehi_epi16( \ + simde_mm512_shufflehi_epi16_a_.m128i[0], (imm8)); \ + simde_mm512_shufflehi_epi16_r_.m128i[1] = simde_mm_shufflehi_epi16( \ + simde_mm512_shufflehi_epi16_a_.m128i[1], (imm8)); \ + simde_mm512_shufflehi_epi16_r_.m128i[2] = simde_mm_shufflehi_epi16( \ + simde_mm512_shufflehi_epi16_a_.m128i[2], (imm8)); \ + simde_mm512_shufflehi_epi16_r_.m128i[3] = simde_mm_shufflehi_epi16( \ + simde_mm512_shufflehi_epi16_a_.m128i[3], (imm8)); \ + simde__m512i_from_private(simde_mm512_shufflehi_epi16_r_); \ + })) +#else +# define simde_mm512_shufflehi_epi16(a, imm8) \ + simde_x_mm512_set_m128i( \ + simde_mm_shufflehi_epi16(simde_mm512_extracti32x4_epi32((a), 3), (imm8)), \ + simde_mm_shufflehi_epi16(simde_mm512_extracti32x4_epi32((a), 2), (imm8)), \ + simde_mm_shufflehi_epi16(simde_mm512_extracti32x4_epi32((a), 1), (imm8)), \ + simde_mm_shufflehi_epi16(simde_mm512_extracti32x4_epi32((a), 0), (imm8))) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_shufflehi_epi16 + #define _mm512_shufflehi_epi16(a, imm8) simde_mm512_shufflehi_epi16(a, imm8) +#endif + +#if defined(SIMDE_X86_AVX512BW_NATIVE) +# define simde_mm512_shufflelo_epi16(a, imm8) _mm512_shufflelo_epi16(a, imm8) +#elif defined(SIMDE_STATEMENT_EXPR_) +# define simde_mm512_shufflelo_epi16(a, 
imm8) SIMDE_STATEMENT_EXPR_(({ \ + simde__m512i_private simde_mm512_shufflelo_epi16_r_, \ + simde_mm512_shufflelo_epi16_a_ = simde__m512i_to_private((a)); \ + simde_mm512_shufflelo_epi16_r_.m128i[0] = simde_mm_shufflelo_epi16( \ + simde_mm512_shufflelo_epi16_a_.m128i[0], (imm8)); \ + simde_mm512_shufflelo_epi16_r_.m128i[1] = simde_mm_shufflelo_epi16( \ + simde_mm512_shufflelo_epi16_a_.m128i[1], (imm8)); \ + simde_mm512_shufflelo_epi16_r_.m128i[2] = simde_mm_shufflelo_epi16( \ + simde_mm512_shufflelo_epi16_a_.m128i[2], (imm8)); \ + simde_mm512_shufflelo_epi16_r_.m128i[3] = simde_mm_shufflelo_epi16( \ + simde_mm512_shufflelo_epi16_a_.m128i[3], (imm8)); \ + simde__m512i_from_private(simde_mm512_shufflelo_epi16_r_); \ + })) +#else +# define simde_mm512_shufflelo_epi16(a, imm8) \ + simde_x_mm512_set_m128i( \ + simde_mm_shufflelo_epi16(simde_mm512_extracti32x4_epi32((a), 3), (imm8)), \ + simde_mm_shufflelo_epi16(simde_mm512_extracti32x4_epi32((a), 2), (imm8)), \ + simde_mm_shufflelo_epi16(simde_mm512_extracti32x4_epi32((a), 1), (imm8)), \ + simde_mm_shufflelo_epi16(simde_mm512_extracti32x4_epi32((a), 0), (imm8))) +#endif +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_shufflelo_epi16 + #define _mm512_shufflelo_epi16(a, imm8) simde_mm512_shufflelo_epi16(a, imm8) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/x86/avx512/sll.h b/lib/simd_wrapper/simde/x86/avx512/sll.h index 8cc94464807..18fbbb8cebb 100644 --- a/lib/simd_wrapper/simde/x86/avx512/sll.h +++ b/lib/simd_wrapper/simde/x86/avx512/sll.h @@ -102,7 +102,7 @@ simde_mm512_maskz_sll_epi16 (simde__mmask32 k, simde__m512i a, simde__m128i coun } #if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_maskz_sll_epi16 - #define _mm512_maskz_sll_epi16(src, k, a, count) simde_mm512_maskz_sll_epi16(src, k, a, count) + #define _mm512_maskz_sll_epi16(k, a, count) simde_mm512_maskz_sll_epi16(k, a, count) #endif SIMDE_FUNCTION_ATTRIBUTES diff --git a/lib/simd_wrapper/simde/x86/avx512/srai.h b/lib/simd_wrapper/simde/x86/avx512/srai.h index e7ba354aae6..4fcbd95c0b0 100644 --- a/lib/simd_wrapper/simde/x86/avx512/srai.h +++ b/lib/simd_wrapper/simde/x86/avx512/srai.h @@ -64,6 +64,32 @@ simde_mm512_srai_epi16 (simde__m512i a, const int imm8) { #define _mm512_srai_epi16(a, imm8) simde_mm512_srai_epi16(a, imm8) #endif +SIMDE_FUNCTION_ATTRIBUTES +simde__m512i +simde_mm512_srai_epi32 (simde__m512i a, const unsigned int imm8) { + simde__m512i_private + r_, + a_ = simde__m512i_to_private(a); + + #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) + r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int32_t, imm8); + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + r_.i32[i] = a_.i32[i] >> imm8; + } + #endif + + return simde__m512i_from_private(r_); +} +#if defined(SIMDE_X86_AVX512F_NATIVE) +# define simde_mm512_srai_epi32(a, imm8) _mm512_srai_epi32(a, imm8) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_srai_epi32 + #define _mm512_srai_epi32(a, imm8) simde_mm512_srai_epi32(a, imm8) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/x86/avx512/storeu.h b/lib/simd_wrapper/simde/x86/avx512/storeu.h index dee1db09161..e00801faf9f 100644 --- a/lib/simd_wrapper/simde/x86/avx512/storeu.h +++ b/lib/simd_wrapper/simde/x86/avx512/storeu.h @@ -28,11 +28,45 @@ #define SIMDE_X86_AVX512_STOREU_H #include "types.h" +#include "mov.h" +#include "setzero.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS 
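The simde_mm512_shuffle_epi32, shufflehi_epi16 and shufflelo_epi16 fallbacks above all use the same decomposition: these 512-bit shuffles never move data across 128-bit lanes, so when no native 512-bit instruction is available the 128-bit intrinsic is applied to each of the four lanes with the same immediate. A minimal sketch of that pattern, assuming plain SSE2 and a hypothetical four-lane struct (vec512i is a stand-in for the .m128i[4] view of simde__m512i_private):

/* Illustrative only; not a SIMDe type or function. */
#include <emmintrin.h>

typedef struct { __m128i m128i[4]; } vec512i;

static vec512i vec512i_shuffle_epi32_reverse(vec512i a) {
  vec512i r;
  /* 0x1B == _MM_SHUFFLE(0, 1, 2, 3) reverses the four dwords inside each lane;
   * lanes never exchange data, matching _mm512_shuffle_epi32 semantics. */
  r.m128i[0] = _mm_shuffle_epi32(a.m128i[0], 0x1B);
  r.m128i[1] = _mm_shuffle_epi32(a.m128i[1], 0x1B);
  r.m128i[2] = _mm_shuffle_epi32(a.m128i[2], 0x1B);
  r.m128i[3] = _mm_shuffle_epi32(a.m128i[3], 0x1B);
  return r;
}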
SIMDE_BEGIN_DECLS_ +#define simde_mm256_storeu_epi8(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) +#define simde_mm256_storeu_epi16(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) +#define simde_mm256_storeu_epi32(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) +#define simde_mm256_storeu_epi64(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_storeu_epi8 + #undef _mm256_storeu_epi16 + #define _mm256_storeu_epi8(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) + #define _mm256_storeu_epi16(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_storeu_epi32 + #undef _mm256_storeu_epi64 + #define _mm256_storeu_epi32(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) + #define _mm256_storeu_epi64(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm256_mask_storeu_epi16 (void * mem_addr, simde__mmask16 k, simde__m256i a) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) + _mm256_mask_storeu_epi16(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m256i zero = simde_mm256_setzero_si256(); + simde_mm256_storeu_epi16(mem_addr, simde_mm256_mask_mov_epi16(zero, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) + #undef _mm256_mask_storeu_epi16 + #define _mm256_mask_storeu_epi16(mem_addr, k, a) simde_mm256_mask_storeu_epi16(mem_addr, k, a) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_mm512_storeu_ps (void * mem_addr, simde__m512 a) { @@ -61,6 +95,20 @@ simde_mm512_storeu_pd (void * mem_addr, simde__m512d a) { #define _mm512_storeu_pd(mem_addr, a) simde_mm512_storeu_pd(mem_addr, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_storeu_ph (void * mem_addr, simde__m512h a) { + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + _mm512_storeu_ph(mem_addr, a); + #else + simde_memcpy(mem_addr, &a, sizeof(a)); + #endif +} +#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) + #undef _mm512_storeu_ph + #define _mm512_storeu_ph(mem_addr, a) simde_mm512_storeu_ph(mem_addr, a) +#endif + SIMDE_FUNCTION_ATTRIBUTES void simde_mm512_storeu_si512 (void * mem_addr, simde__m512i a) { @@ -74,19 +122,96 @@ simde_mm512_storeu_si512 (void * mem_addr, simde__m512i a) { #define simde_mm512_storeu_epi16(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) #define simde_mm512_storeu_epi32(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) #define simde_mm512_storeu_epi64(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) #undef _mm512_storeu_epi8 #undef _mm512_storeu_epi16 + #define _mm512_storeu_epi16(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) + #define _mm512_storeu_epi8(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) +#endif +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_storeu_epi32 #undef _mm512_storeu_epi64 #undef _mm512_storeu_si512 #define _mm512_storeu_si512(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) - #define _mm512_storeu_epi8(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) - #define _mm512_storeu_epi16(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) #define _mm512_storeu_epi32(mem_addr, a) 
simde_mm512_storeu_si512(mem_addr, a) #define _mm512_storeu_epi64(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) #endif +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_storeu_epi16 (void * mem_addr, simde__mmask32 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512BW_NATIVE) + _mm512_mask_storeu_epi16(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m512i zero = simde_mm512_setzero_si512(); + simde_mm512_storeu_epi16(mem_addr, simde_mm512_mask_mov_epi16(zero, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_storeu_epi16 + #define _mm512_mask_storeu_epi16(mem_addr, k, a) simde_mm512_mask_storeu_epi16(mem_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_storeu_epi32 (void * mem_addr, simde__mmask16 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + _mm512_mask_storeu_epi32(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m512i zero = simde_mm512_setzero_si512(); + simde_mm512_storeu_epi32(mem_addr, simde_mm512_mask_mov_epi32(zero, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_storeu_epi32 + #define _mm512_mask_storeu_epi32(mem_addr, k, a) simde_mm512_mask_storeu_epi32(mem_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_storeu_epi64 (void * mem_addr, simde__mmask8 k, simde__m512i a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + _mm512_mask_storeu_epi64(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m512i zero = simde_mm512_setzero_si512(); + simde_mm512_storeu_epi64(mem_addr, simde_mm512_mask_mov_epi64(zero, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_storeu_epi64 + #define _mm512_mask_storeu_epi64(mem_addr, k, a) simde_mm512_mask_storeu_epi64(mem_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_storeu_ps (void * mem_addr, simde__mmask16 k, simde__m512 a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + _mm512_mask_storeu_ps(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m512 zero = simde_mm512_setzero_ps(); + simde_mm512_storeu_ps(mem_addr, simde_mm512_mask_mov_ps(zero, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_storeu_ps + #define _mm512_mask_storeu_ps(mem_addr, k, a) simde_mm512_mask_storeu_ps(mem_addr, k, a) +#endif + +SIMDE_FUNCTION_ATTRIBUTES +void +simde_mm512_mask_storeu_pd (void * mem_addr, simde__mmask8 k, simde__m512d a) { + #if defined(SIMDE_X86_AVX512F_NATIVE) + _mm512_mask_storeu_pd(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); + #else + const simde__m512d zero = simde_mm512_setzero_pd(); + simde_mm512_storeu_pd(mem_addr, simde_mm512_mask_mov_pd(zero, k, a)); + #endif +} +#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) + #undef _mm512_mask_storeu_pd + #define _mm512_mask_storeu_pd(mem_addr, k, a) simde_mm512_mask_storeu_pd(mem_addr, k, a) +#endif + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/x86/avx512/types.h b/lib/simd_wrapper/simde/x86/avx512/types.h index 37a07e17eb4..639df25949b 100644 --- a/lib/simd_wrapper/simde/x86/avx512/types.h +++ b/lib/simd_wrapper/simde/x86/avx512/types.h @@ -26,8 +26,8 @@ #if !defined(SIMDE_X86_AVX512_TYPES_H) #define SIMDE_X86_AVX512_TYPES_H - #include "../avx.h" +#include "../../simde-f16.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -376,6 +376,73 @@ typedef union { #endif } simde__m512d_private; +typedef union { + 
#if defined(SIMDE_VECTOR_SUBSCRIPT) + SIMDE_AVX512_ALIGN int8_t i8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN int16_t i16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN int32_t i32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN int64_t i64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN uint8_t u8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN uint16_t u16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN uint32_t u32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN uint64_t u64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + #if defined(SIMDE_HAVE_INT128_) + SIMDE_AVX512_ALIGN simde_int128 i128 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN simde_uint128 u128 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + #endif + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_ALIGN_TO_16 simde_float16 f16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + #else + SIMDE_AVX512_ALIGN simde_float16 f16[32]; + #endif + SIMDE_AVX512_ALIGN simde_float32 f32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN simde_float64 f64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN int_fast32_t i32f SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + SIMDE_AVX512_ALIGN uint_fast32_t u32f SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + #else + SIMDE_AVX512_ALIGN int8_t i8[64]; + SIMDE_AVX512_ALIGN int16_t i16[32]; + SIMDE_AVX512_ALIGN int32_t i32[16]; + SIMDE_AVX512_ALIGN int64_t i64[8]; + SIMDE_AVX512_ALIGN uint8_t u8[64]; + SIMDE_AVX512_ALIGN uint16_t u16[32]; + SIMDE_AVX512_ALIGN uint32_t u32[16]; + SIMDE_AVX512_ALIGN uint64_t u64[8]; + #if defined(SIMDE_HAVE_INT128_) + SIMDE_AVX512_ALIGN simde_int128 i128[4]; + SIMDE_AVX512_ALIGN simde_uint128 u128[4]; + #endif + SIMDE_AVX512_ALIGN simde_float16 f16[32]; + SIMDE_AVX512_ALIGN simde_float32 f32[16]; + SIMDE_AVX512_ALIGN simde_float64 f64[8]; + SIMDE_AVX512_ALIGN int_fast32_t i32f[64 / sizeof(int_fast32_t)]; + SIMDE_AVX512_ALIGN uint_fast32_t u32f[64 / sizeof(uint_fast32_t)]; + #endif + + SIMDE_AVX512_ALIGN simde__m128d_private m128d_private[4]; + SIMDE_AVX512_ALIGN simde__m128d m128d[4]; + SIMDE_AVX512_ALIGN simde__m256d_private m256d_private[2]; + SIMDE_AVX512_ALIGN simde__m256d m256d[2]; + + #if defined(SIMDE_X86_AVX512FP16_NATIVE) + SIMDE_AVX512_ALIGN __m512h n; + #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[4]; + #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[4]; + SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[4]; + #endif + #endif +} simde__m512h_private; + + typedef union { #if defined(SIMDE_VECTOR_SUBSCRIPT) SIMDE_AVX512_ALIGN int8_t i8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; @@ -460,7 +527,9 @@ typedef union { * * As for the ICC check, unlike other compilers, merely using the * AVX-512 types causes ICC to generate AVX-512 instructions. 
*/ -#if (defined(_MM_CMPINT_GE) || defined(_MM_CMPINT_NLT)) && (defined(SIMDE_X86_AVX512F_NATIVE) || !defined(HEDLEY_INTEL_VERSION)) +#if (defined(_MM_CMPINT_GE) || defined(_MM_CMPINT_NLT)) && \ + (defined(SIMDE_X86_AVX512F_NATIVE) || \ + !(defined(HEDLEY_INTEL_VERSION) || (defined(HEDLEY_MSVC_VERSION) && !defined(__clang__)))) typedef __m512 simde__m512; typedef __m512i simde__m512i; typedef __m512d simde__m512d; @@ -476,7 +545,7 @@ typedef union { typedef simde__m512_private simde__m512; typedef simde__m512i_private simde__m512i; typedef simde__m512d_private simde__m512d; - #endif + #endif typedef uint8_t simde__mmask8; typedef uint16_t simde__mmask16; @@ -498,6 +567,16 @@ typedef union { #endif #endif +#if defined(SIMDE_X86_AVX512FP16_NATIVE) + typedef __m512h simde__m512h; +#else + #if defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_FLOAT16_VECTOR) + typedef simde_float16 simde__m512h SIMDE_AVX512_ALIGN SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; + #else + typedef simde__m512h_private simde__m512h; + #endif +#endif + /* These are really part of AVX-512VL / AVX-512BW (in GCC __mmask32 is * in avx512vlintrin.h and __mmask64 is in avx512bwintrin.h, in clang * both are in avx512bwintrin.h), not AVX-512F. However, we don't have @@ -512,6 +591,31 @@ typedef union { * issue and we'll try to figure out a work-around. */ typedef uint32_t simde__mmask32; typedef uint64_t simde__mmask64; +#if !defined(__mmask16) && defined(SIMDE_ENABLE_NATIVE_ALIASES) + #if !defined(HEDLEY_INTEL_VERSION) + typedef uint16_t __mmask16; + #else + #define __mmask16 uint16_t; + #endif +#endif +#if !defined(__mmask32) && defined(SIMDE_ENABLE_NATIVE_ALIASES) + #if !defined(HEDLEY_INTEL_VERSION) + typedef uint32_t __mmask32; + #else + #define __mmask32 uint32_t; + #endif +#endif +#if !defined(__mmask64) && defined(SIMDE_ENABLE_NATIVE_ALIASES) + #if !defined(HEDLEY_INTEL_VERSION) + #if defined(HEDLEY_GCC_VERSION) + typedef unsigned long long __mmask64; + #else + typedef uint64_t __mmask64; + #endif + #else + #define __mmask64 uint64_t; + #endif +#endif #if !defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) #if !defined(HEDLEY_INTEL_VERSION) @@ -537,6 +641,18 @@ typedef uint64_t simde__mmask64; #endif #endif +#if !defined(SIMDE_X86_AVX512FP16_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) + #if !defined(HEDLEY_INTEL_VERSION) + //typedef simde__m128h __m128h; + //typedef simde__m256h __m256h; + typedef simde__m512h __m512h; + #else + //#define __m128h simde__m128h + //#define __m256h simde__m256h + #define __m512h simde__m512h + #endif +#endif + HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128bh), "simde__m128bh size incorrect"); HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128bh_private), "simde__m128bh_private size incorrect"); HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256bh), "simde__m256bh size incorrect"); @@ -549,6 +665,8 @@ HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512i), "simde__m512i size incorrect"); HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512i_private), "simde__m512i_private size incorrect"); HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512d), "simde__m512d size incorrect"); HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512d_private), "simde__m512d_private size incorrect"); +HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512h), "simde__m512h size incorrect"); +HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512h_private), "simde__m512h_private size incorrect"); #if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128bh) == 16, "simde__m128bh is not 16-byte aligned"); 
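/* simde__m512h is the native __m512h when AVX512-FP16 is available, a
 * compiler vector of 32 simde_float16 when vector extensions and float16
 * vectors are supported, and the 64-byte private union otherwise.  Portable
 * code reaches the individual half-precision lanes through that union by
 * copying the bytes, exactly as the simde__m512h_to_private()/_from_private()
 * helpers added further down in this file do.  A minimal sketch, assuming the
 * surrounding header is included; the helper name is illustrative only. */
static simde_float16 sketch_m512h_first_lane(simde__m512h v) {
  simde__m512h_private v_ = simde__m512h_to_private(v);
  return v_.f16[0];  /* lane 0 of the 32 half-precision lanes */
}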
HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128bh_private) == 16, "simde__m128bh_private is not 16-byte aligned"); @@ -562,6 +680,27 @@ HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512i) == 32, "simde__m512i is not 32 HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512i_private) == 32, "simde__m512i_private is not 32-byte aligned"); HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512d) == 32, "simde__m512d is not 32-byte aligned"); HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512d_private) == 32, "simde__m512d_private is not 32-byte aligned"); +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512h) == 32, "simde__m512h is not 32-byte aligned"); +HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512h_private) == 32, "simde__m512h_private is not 32-byte aligned"); +#endif + +#define SIMDE_MM_CMPINT_EQ 0 +#define SIMDE_MM_CMPINT_LT 1 +#define SIMDE_MM_CMPINT_LE 2 +#define SIMDE_MM_CMPINT_FALSE 3 +#define SIMDE_MM_CMPINT_NE 4 +#define SIMDE_MM_CMPINT_NLT 5 +#define SIMDE_MM_CMPINT_NLE 6 +#define SIMDE_MM_CMPINT_TRUE 7 +#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && !defined(_MM_CMPINT_EQ) +#define _MM_CMPINT_EQ SIMDE_MM_CMPINT_EQ +#define _MM_CMPINT_LT SIMDE_MM_CMPINT_LT +#define _MM_CMPINT_LE SIMDE_MM_CMPINT_LE +#define _MM_CMPINT_FALSE SIMDE_MM_CMPINT_FALSE +#define _MM_CMPINT_NE SIMDE_MM_CMPINT_NE +#define _MM_CMPINT_NLT SIMDE_MM_CMPINT_NLT +#define _MM_CMPINT_NLE SIMDE_MM_CMPINT_NLE +#define _MM_CMPINT_TRUE SIMDE_MM_CMPINT_TRUE #endif SIMDE_FUNCTION_ATTRIBUTES @@ -660,6 +799,22 @@ simde__m512d_to_private(simde__m512d v) { return r; } +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h +simde__m512h_from_private(simde__m512h_private v) { + simde__m512h r; + simde_memcpy(&r, &v, sizeof(r)); + return r; +} + +SIMDE_FUNCTION_ATTRIBUTES +simde__m512h_private +simde__m512h_to_private(simde__m512h v) { + simde__m512h_private r; + simde_memcpy(&r, &v, sizeof(r)); + return r; +} + SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP diff --git a/lib/simd_wrapper/simde/x86/avx512/xorsign.h b/lib/simd_wrapper/simde/x86/avx512/xorsign.h index 38fb5f94242..f7fdc8c972e 100644 --- a/lib/simd_wrapper/simde/x86/avx512/xorsign.h +++ b/lib/simd_wrapper/simde/x86/avx512/xorsign.h @@ -26,7 +26,7 @@ */ /* This is a SIMDe extension which is not part of AVX-512. It exists - * because a lot of numerical methods in SIMDe have algoriths which do + * because a lot of numerical methods in SIMDe have algorithms which do * something like: * * float sgn = input < 0 ? -1 : 1; diff --git a/lib/simd_wrapper/simde/x86/clmul.h b/lib/simd_wrapper/simde/x86/clmul.h index 5ba97d7ab0c..e2d8b4cf8b3 100644 --- a/lib/simd_wrapper/simde/x86/clmul.h +++ b/lib/simd_wrapper/simde/x86/clmul.h @@ -120,21 +120,7 @@ simde_mm_clmulepi64_si128 (simde__m128i a, simde__m128i b, const int imm8) b_ = simde__m128i_to_private(b), r_; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_AES) - uint64x1_t A = ((imm8) & 0x01) ? vget_high_u64(a_.neon_u64) : vget_low_u64(a_.neon_u64); - uint64x1_t B = ((imm8) & 0x10) ?
vget_high_u64(b_.neon_u64) : vget_low_u64(b_.neon_u64); - #if defined(SIMDE_BUG_CLANG_48257) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_ - #endif - poly64_t A_ = vget_lane_p64(vreinterpret_p64_u64(A), 0); - poly64_t B_ = vget_lane_p64(vreinterpret_p64_u64(B), 0); - #if defined(SIMDE_BUG_CLANG_48257) - HEDLEY_DIAGNOSTIC_POP - #endif - poly128_t R = vmull_p64(A_, B_); - r_.neon_u64 = vreinterpretq_u64_p128(R); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) + #if SIMDE_NATURAL_VECTOR_SIZE_GE(128) #if defined(SIMDE_SHUFFLE_VECTOR_) switch (imm8 & 0x11) { case 0x00: @@ -211,9 +197,9 @@ simde_mm_clmulepi64_si128 (simde__m128i a, simde__m128i b, const int imm8) SIMDE_LCC_REVERT_DEPRECATED_WARNINGS \ })) #else - #define simde_mm_clmulepi64_si128(a, b, imm8) simde_mm_clmulepi64_si128(a, b, imm8) + #define simde_mm_clmulepi64_si128(a, b, imm8) _mm_clmulepi64_si128(a, b, imm8) #endif -#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_AES) +#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(SIMDE_ARCH_ARM_AES) && !defined(__clang__) #define simde_mm_clmulepi64_si128(a, b, imm8) \ simde__m128i_from_neon_u64( \ vreinterpretq_u64_p128( \ @@ -238,84 +224,61 @@ simde_mm256_clmulepi64_epi128 (simde__m256i a, simde__m256i b, const int imm8) b_ = simde__m256i_to_private(b), r_; - #if defined(SIMDE_X86_PCLMUL_NATIVE) - SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS - switch (imm8 & 0x11) { + simde__m128i_private a_lo_, b_lo_, r_lo_, a_hi_, b_hi_, r_hi_; + + #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION) + switch (imm8 & 0x01) { case 0x00: - r_.m128i[0] = _mm_clmulepi64_si128(a_.m128i[0], b_.m128i[0], 0x00); - r_.m128i[1] = _mm_clmulepi64_si128(a_.m128i[1], b_.m128i[1], 0x00); + a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 0, 2); break; case 0x01: - r_.m128i[0] = _mm_clmulepi64_si128(a_.m128i[0], b_.m128i[0], 0x01); - r_.m128i[1] = _mm_clmulepi64_si128(a_.m128i[1], b_.m128i[1], 0x01); + a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 1, 3); break; - case 0x10: - r_.m128i[0] = _mm_clmulepi64_si128(a_.m128i[0], b_.m128i[0], 0x10); - r_.m128i[1] = _mm_clmulepi64_si128(a_.m128i[1], b_.m128i[1], 0x10); + } + switch (imm8 & 0x10) { + case 0x00: + b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 0, 2); break; - case 0x11: - r_.m128i[0] = _mm_clmulepi64_si128(a_.m128i[0], b_.m128i[0], 0x11); - r_.m128i[1] = _mm_clmulepi64_si128(a_.m128i[1], b_.m128i[1], 0x11); + case 0x10: + b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 1, 3); break; } - SIMDE_LCC_REVERT_DEPRECATED_WARNINGS #else - simde__m128i_private a_lo_, b_lo_, r_lo_, a_hi_, b_hi_, r_hi_; - - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION) - switch (imm8 & 0x01) { - case 0x00: - a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 0, 2); - break; - case 0x01: - a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 1, 3); - break; - } - switch (imm8 & 0x10) { - case 0x00: - b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 0, 2); - break; - case 0x10: - b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 1, 3); - break; - } - #else - a_lo_.u64[0] = a_.u64[((imm8 >> 0) & 1) + 0]; - a_lo_.u64[1] = a_.u64[((imm8 >> 0) & 1) + 2]; - b_lo_.u64[0] = b_.u64[((imm8 >> 4) & 1) + 0]; - b_lo_.u64[1] = b_.u64[((imm8 >> 4) & 1) + 2]; - #endif + a_lo_.u64[0] = a_.u64[((imm8 >> 0) & 1) + 0]; + a_lo_.u64[1] = a_.u64[((imm8 >> 0) & 1) + 2]; + b_lo_.u64[0] = b_.u64[((imm8 >> 4) & 1) + 0]; + b_lo_.u64[1] = b_.u64[((imm8 >> 4) & 1) + 2]; + #endif - SIMDE_VECTORIZE - for 
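/* Scalar sketch of the carry-less (polynomial over GF(2)) multiply that the
 * portable path above is built on: XOR-accumulate shifted copies of `a` for
 * every set bit of `b`.  This is the low 64 bits of the 128-bit product; the
 * code above recovers the high 64 bits with the bit-reverse trick (reverse
 * both inputs, multiply, reverse the result and shift right by one). */
#include <stdint.h>
static uint64_t sketch_clmul_lo_u64(uint64_t a, uint64_t b) {
  uint64_t r = 0;
  for (unsigned i = 0; i < 64; i++) {
    if (b & (UINT64_C(1) << i)) r ^= a << i;  /* addition in GF(2) is XOR */
  }
  return r;
}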
(size_t i = 0 ; i < (sizeof(r_hi_.u64) / sizeof(r_hi_.u64[0])) ; i++) { - a_hi_.u64[i] = simde_x_bitreverse_u64(a_lo_.u64[i]); - b_hi_.u64[i] = simde_x_bitreverse_u64(b_lo_.u64[i]); + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_hi_.u64) / sizeof(r_hi_.u64[0])) ; i++) { + a_hi_.u64[i] = simde_x_bitreverse_u64(a_lo_.u64[i]); + b_hi_.u64[i] = simde_x_bitreverse_u64(b_lo_.u64[i]); - r_lo_.u64[i] = simde_x_clmul_u64(a_lo_.u64[i], b_lo_.u64[i]); - r_hi_.u64[i] = simde_x_clmul_u64(a_hi_.u64[i], b_hi_.u64[i]); + r_lo_.u64[i] = simde_x_clmul_u64(a_lo_.u64[i], b_lo_.u64[i]); + r_hi_.u64[i] = simde_x_clmul_u64(a_hi_.u64[i], b_hi_.u64[i]); - r_hi_.u64[i] = simde_x_bitreverse_u64(r_hi_.u64[i]) >> 1; - } + r_hi_.u64[i] = simde_x_bitreverse_u64(r_hi_.u64[i]) >> 1; + } - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION) - r_.u64 = __builtin_shufflevector(r_lo_.u64, r_hi_.u64, 0, 2, 1, 3); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_ = simde__m256i_to_private(simde_mm256_set_m128i(simde__m128i_from_private(r_hi_), simde__m128i_from_private(r_lo_))); - r_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 32, r_.u64, r_.u64, 0, 2, 1, 3); - #else - r_.u64[0] = r_lo_.u64[0]; - r_.u64[1] = r_hi_.u64[0]; - r_.u64[2] = r_lo_.u64[1]; - r_.u64[3] = r_hi_.u64[1]; - #endif + #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION) + r_.u64 = __builtin_shufflevector(r_lo_.u64, r_hi_.u64, 0, 2, 1, 3); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + r_ = simde__m256i_to_private(simde_mm256_set_m128i(simde__m128i_from_private(r_hi_), simde__m128i_from_private(r_lo_))); + r_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 32, r_.u64, r_.u64, 0, 2, 1, 3); + #else + r_.u64[0] = r_lo_.u64[0]; + r_.u64[1] = r_hi_.u64[0]; + r_.u64[2] = r_lo_.u64[1]; + r_.u64[3] = r_hi_.u64[1]; #endif return simde__m256i_from_private(r_); } -#if defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) +#if defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) #define simde_mm256_clmulepi64_epi128(a, b, imm8) _mm256_clmulepi64_epi128(a, b, imm8) #endif -#if defined(SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) #undef _mm256_clmulepi64_epi128 #define _mm256_clmulepi64_epi128(a, b, imm8) simde_mm256_clmulepi64_epi128(a, b, imm8) #endif @@ -409,10 +372,10 @@ simde_mm512_clmulepi64_epi128 (simde__m512i a, simde__m512i b, const int imm8) return simde__m512i_from_private(r_); } -#if defined(SIMDE_X86_VPCLMULQDQ_NATIVE) +#if defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) #define simde_mm512_clmulepi64_epi128(a, b, imm8) _mm512_clmulepi64_epi128(a, b, imm8) #endif -#if defined(SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) #undef _mm512_clmulepi64_epi128 #define _mm512_clmulepi64_epi128(a, b, imm8) simde_mm512_clmulepi64_epi128(a, b, imm8) #endif diff --git a/lib/simd_wrapper/simde/x86/f16c.h b/lib/simd_wrapper/simde/x86/f16c.h index 51ba779acfd..9522bf6f62e 100644 --- a/lib/simd_wrapper/simde/x86/f16c.h +++ b/lib/simd_wrapper/simde/x86/f16c.h @@ -43,34 +43,31 @@ SIMDE_BEGIN_DECLS_ SIMDE_FUNCTION_ATTRIBUTES simde__m128i -simde_mm_cvtps_ph(simde__m128 a, const int sae) { - #if defined(SIMDE_X86_F16C_NATIVE) - SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS - switch (sae & SIMDE_MM_FROUND_NO_EXC) { - case SIMDE_MM_FROUND_NO_EXC: - return _mm_cvtps_ph(a, 
SIMDE_MM_FROUND_NO_EXC); - default: - return _mm_cvtps_ph(a, 0); - } - SIMDE_LCC_REVERT_DEPRECATED_WARNINGS - #else - simde__m128_private a_ = simde__m128_to_private(a); - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); +simde_mm_cvtps_ph(simde__m128 a, const int imm8) { + simde__m128_private a_ = simde__m128_to_private(a); + simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - HEDLEY_STATIC_CAST(void, sae); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - r_.neon_f16 = vcombine_f16(vcvt_f16_f32(a_.neon_f32), vdup_n_f16(SIMDE_FLOAT16_C(0.0))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.u16[i] = simde_float16_as_uint16(simde_float16_from_float32(a_.f32[i])); - } - #endif + HEDLEY_STATIC_CAST(void, imm8); - return simde__m128i_from_private(r_); + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + r_.neon_f16 = vcombine_f16(vcvt_f16_f32(a_.neon_f32), vdup_n_f16(SIMDE_FLOAT16_C(0.0))); + #elif defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.f16[i] = simde_float16_from_float32(a_.f32[i]); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.u16[i] = simde_float16_as_uint16(simde_float16_from_float32(a_.f32[i])); + } #endif + + return simde__m128i_from_private(r_); } +#if defined(SIMDE_X86_F16C_NATIVE) + #define simde_mm_cvtps_ph(a, imm8) _mm_cvtps_ph(a, imm8) +#endif #if defined(SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES) #define _mm_cvtps_ph(a, sae) simde_mm_cvtps_ph(a, sae) #endif @@ -86,6 +83,11 @@ simde_mm_cvtph_ps(simde__m128i a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) r_.neon_f32 = vcvt_f32_f16(vget_low_f16(a_.neon_f16)); + #elif defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.f32[i] = simde_float16_to_float32(a_.f16[i]); + } #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { @@ -102,39 +104,32 @@ simde_mm_cvtph_ps(simde__m128i a) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i -simde_mm256_cvtps_ph(simde__m256 a, const int sae) { - #if defined(SIMDE_X86_F16C_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS - switch (sae & SIMDE_MM_FROUND_NO_EXC) { - case SIMDE_MM_FROUND_NO_EXC: - return _mm256_cvtps_ph(a, SIMDE_MM_FROUND_NO_EXC); - default: - return _mm256_cvtps_ph(a, 0); - } - SIMDE_LCC_REVERT_DEPRECATED_WARNINGS - #else - simde__m256_private a_ = simde__m256_to_private(a); - simde__m128i_private r_; +simde_mm256_cvtps_ph(simde__m256 a, const int imm8) { + simde__m256_private a_ = simde__m256_to_private(a); + simde__m128i_private r_; - HEDLEY_STATIC_CAST(void, sae); + HEDLEY_STATIC_CAST(void, imm8); - #if defined(SIMDE_X86_F16C_NATIVE) - return _mm_castps_si128(_mm_movelh_ps( - _mm_castsi128_ps(_mm_cvtps_ph(a_.m128[0], SIMDE_MM_FROUND_NO_EXC)), - _mm_castsi128_ps(_mm_cvtps_ph(a_.m128[1], SIMDE_MM_FROUND_NO_EXC)) - )); - #else - SIMDE_VECTORIZE + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { + r_.f16[i] = simde_float16_from_float32(a_.f32[i]); + } + #else + SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { r_.u16[i] = 
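/* Usage sketch for the f16c pair above, assuming the surrounding headers are
 * included: pack four floats to half precision and expand them back.  On the
 * portable path the rounding-control argument is ignored (it is cast to void),
 * so 0 is passed here; the helper name is illustrative only. */
static simde__m128 sketch_f16_roundtrip(simde__m128 x) {
  simde__m128i h = simde_mm_cvtps_ph(x, 0);  /* halves land in the low 64 bits, upper lanes zeroed */
  return simde_mm_cvtph_ps(h);               /* back to four binary32 values */
}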
simde_float16_as_uint16(simde_float16_from_float32(a_.f32[i])); } - #endif - - return simde__m128i_from_private(r_); #endif + + + return simde__m128i_from_private(r_); } +#if defined(SIMDE_X86_F16C_NATIVE) + #define simde_mm256_cvtps_ph(a, imm8) _mm256_cvtps_ph(a, imm8) +#endif #if defined(SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES) - #define _mm256_cvtps_ph(a, sae) simde_mm256_cvtps_ph(a, sae) + #define _mm256_cvtps_ph(a, imm8) simde_mm256_cvtps_ph(a, imm8) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -151,10 +146,17 @@ simde_mm256_cvtph_ps(simde__m128i a) { simde__m128i_private a_ = simde__m128i_to_private(a); simde__m256_private r_; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_float16_to_float32(simde_uint16_as_float16(a_.u16[i])); - } + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = simde_float16_to_float32(a_.f16[i]); + } + #else + SIMDE_VECTORIZE + for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { + r_.f32[i] = simde_float16_to_float32(simde_uint16_as_float16(a_.u16[i])); + } + #endif return simde__m256_from_private(r_); #endif diff --git a/lib/simd_wrapper/simde/x86/fma.h b/lib/simd_wrapper/simde/x86/fma.h index 6ed68d5bfc5..630efc54afa 100644 --- a/lib/simd_wrapper/simde/x86/fma.h +++ b/lib/simd_wrapper/simde/x86/fma.h @@ -101,7 +101,7 @@ simde_mm_fmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) { #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f32 = vec_madd(a_.altivec_f32, b_.altivec_f32, c_.altivec_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FMA) + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) r_.neon_f32 = vfmaq_f32(c_.neon_f32, b_.neon_f32, a_.neon_f32); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vmlaq_f32(c_.neon_f32, b_.neon_f32, a_.neon_f32); @@ -489,7 +489,7 @@ simde_mm_fnmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) { b_ = simde__m128_to_private(b), c_ = simde__m128_to_private(c); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FMA) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) r_.neon_f32 = vfmsq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vmlsq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); diff --git a/lib/simd_wrapper/simde/x86/gfni.h b/lib/simd_wrapper/simde/x86/gfni.h index c586382ebd9..5982a340917 100644 --- a/lib/simd_wrapper/simde/x86/gfni.h +++ b/lib/simd_wrapper/simde/x86/gfni.h @@ -124,7 +124,7 @@ simde_x_mm_gf2p8matrix_multiply_epi64_epi8 (simde__m128i x, simde__m128i A) { SIMDE_VECTORIZE #endif for (int i = 0 ; i < 8 ; i++) { - p = _mm_set1_epi16(_mm_movemask_epi8(a)); + p = _mm_set1_epi16(HEDLEY_STATIC_CAST(short, _mm_movemask_epi8(a))); p = _mm_and_si128(p, _mm_cmpgt_epi8(zero, X)); r = _mm_xor_si128(r, p); a = _mm_add_epi8(a, a); @@ -267,7 +267,7 @@ simde_x_mm_gf2p8matrix_multiply_epi64_epi8 (simde__m128i x, simde__m128i A) { for (int i = 0 ; i < 8 ; i++) { #if defined(SIMDE_BUG_CLANG_50932) p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), - vec_bperm(HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned __int128), a), bit_select)); + vec_bperm(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned __int128), a), bit_select)); #else p = vec_bperm(a, bit_select); #endif diff --git a/lib/simd_wrapper/simde/x86/mmx.h b/lib/simd_wrapper/simde/x86/mmx.h index 
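/* Scalar sketch of the lanewise operation behind simde_mm_fmadd_ps in the fma.h
 * hunk above: r = a*b + c.  The vfmaq_f32 path performs this with a single
 * rounding (a true fused multiply-add), while the vmlaq_f32 fallback multiplies
 * and adds with two roundings. */
#include <math.h>
static float sketch_fmadd_lane(float a, float b, float c) {
  return fmaf(a, b, c);  /* single-rounding reference from <math.h> */
}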
b46bd93824f..e294af8e9a6 100644 --- a/lib/simd_wrapper/simde/x86/mmx.h +++ b/lib/simd_wrapper/simde/x86/mmx.h @@ -1467,18 +1467,17 @@ simde_mm_slli_pi16 (simde__m64 a, int count) { simde__m64_private r_; simde__m64_private a_ = simde__m64_to_private(a); - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT) + #if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) + r_.mmi_i16 = psllh_s(a_.mmi_i16, count); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT) if (HEDLEY_UNLIKELY(count > 15)) return simde_mm_setzero_si64(); r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, count); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.i16 = a_.i16 << count; - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t) count)); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = psllh_s(a_.mmi_i16, b_.mmi_i16); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { @@ -2157,10 +2156,10 @@ simde_mm_unpackhi_pi8 (simde__m64 a, simde__m64 b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_i8 = vzip2_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 4, 12, 5, 13, 6, 14, 7, 15); #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) r_.mmi_i8 = punpckhbh_s(a_.mmi_i8, b_.mmi_i8); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 4, 12, 5, 13, 6, 14, 7, 15); #else r_.i8[0] = a_.i8[4]; r_.i8[1] = b_.i8[4]; diff --git a/lib/simd_wrapper/simde/x86/sse.h b/lib/simd_wrapper/simde/x86/sse.h index f5311c14b7b..a8855f5400d 100644 --- a/lib/simd_wrapper/simde/x86/sse.h +++ b/lib/simd_wrapper/simde/x86/sse.h @@ -31,8 +31,10 @@ #define SIMDE_X86_SSE_H #include "mmx.h" +#include "../simde-f16.h" #if defined(_WIN32) && !defined(SIMDE_X86_SSE_NATIVE) && defined(_MSC_VER) + #define NOMINMAX #include #endif @@ -58,6 +60,11 @@ typedef union { SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; #endif + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_ALIGN_TO_16 simde_float16 f16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + #else + SIMDE_ALIGN_TO_16 simde_float16 f16[8]; + #endif SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; @@ -74,6 +81,7 @@ typedef union { SIMDE_ALIGN_TO_16 simde_int128 i128[1]; SIMDE_ALIGN_TO_16 simde_uint128 u128[1]; #endif + SIMDE_ALIGN_TO_16 simde_float16 f16[8]; SIMDE_ALIGN_TO_16 simde_float32 f32[4]; SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)]; SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; @@ -121,6 +129,17 @@ typedef union { SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64; SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; #endif + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + v16i8 lsx_i8; + v8i16 lsx_i16; + v4i32 lsx_i32; + v2i64 lsx_i64; + v16u8 lsx_u8; + v8u16 lsx_u16; + v4u32 lsx_u32; + v2u64 lsx_u64; + v4f32 lsx_f32; + v2f64 lsx_f64; #endif } simde__m128_private; @@ -132,6 +151,8 @@ typedef union { typedef v128_t simde__m128; #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128; +#elif 
defined(SIMDE_LOONGARCH_LSX_NATIVE) + typedef v4f32 simde__m128; #elif defined(SIMDE_VECTOR_SUBSCRIPT) typedef simde_float32 simde__m128 SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; #else @@ -215,6 +236,19 @@ simde__m128_to_private(simde__m128 v) { SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v128_t, wasm, v128); #endif /* defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) */ +#if defined(SIMDE_LOONGARCH_LSX_NATIVE) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v16i8, lsx, i8) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v8i16, lsx, i16) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v4i32, lsx, i32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v2i64, lsx, i64) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v16u8, lsx, u8) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v8u16, lsx, u16) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v4u32, lsx, u32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v2u64, lsx, u64) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v4f32, lsx, f32) + SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v2f64, lsx, f64) +#endif /* defined(SIMDE_LOONGARCH_LSX_NATIVE) */ + enum { #if defined(SIMDE_X86_SSE_NATIVE) SIMDE_MM_ROUND_NEAREST = _MM_ROUND_NEAREST, @@ -228,6 +262,14 @@ enum { SIMDE_MM_ROUND_TOWARD_ZERO = 0x6000 #endif }; +#if defined(_MM_ROUND_MASK) +# define SIMDE_MM_ROUND_MASK _MM_ROUND_MASK +#else +# define SIMDE_MM_ROUND_MASK (0x6000) +#endif +#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) + #define _MM_ROUND_MASK SIMDE_MM_ROUND_MASK +#endif #if defined(_MM_FROUND_TO_NEAREST_INT) # define SIMDE_MM_FROUND_TO_NEAREST_INT _MM_FROUND_TO_NEAREST_INT @@ -395,7 +437,7 @@ enum { #endif SIMDE_FUNCTION_ATTRIBUTES -unsigned int +uint32_t SIMDE_MM_GET_ROUNDING_MODE(void) { #if defined(SIMDE_X86_SSE_NATIVE) return _MM_GET_ROUNDING_MODE(); @@ -443,7 +485,7 @@ SIMDE_MM_GET_ROUNDING_MODE(void) { SIMDE_FUNCTION_ATTRIBUTES void -SIMDE_MM_SET_ROUNDING_MODE(unsigned int a) { +SIMDE_MM_SET_ROUNDING_MODE(uint32_t a) { #if defined(SIMDE_X86_SSE_NATIVE) _MM_SET_ROUNDING_MODE(a); #elif defined(SIMDE_HAVE_FENV_H) @@ -497,7 +539,7 @@ SIMDE_MM_GET_FLUSH_ZERO_MODE (void) { #endif } #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_SET_FLUSH_ZERO_MODE(a) SIMDE_MM_SET_FLUSH_ZERO_MODE(a) + #define _MM_GET_FLUSH_ZERO_MODE(a) SIMDE_MM_GET_FLUSH_ZERO_MODE(a) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -532,7 +574,7 @@ simde_mm_setcsr (uint32_t a) { #if defined(SIMDE_X86_SSE_NATIVE) _mm_setcsr(a); #else - SIMDE_MM_SET_ROUNDING_MODE(HEDLEY_STATIC_CAST(unsigned int, a)); + SIMDE_MM_SET_ROUNDING_MODE(HEDLEY_STATIC_CAST(uint32_t, a & SIMDE_MM_ROUND_MASK)); #endif } #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) @@ -569,6 +611,8 @@ simde_x_mm_round_ps (simde__m128 a, int rounding, int lax_rounding) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_round(a_.altivec_f32)); #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399) r_.neon_f32 = vrndiq_f32(a_.neon_f32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_nearest(a_.wasm_v128); #elif defined(simde_math_nearbyintf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -584,6 +628,10 @@ simde_x_mm_round_ps (simde__m128 a, int rounding, int lax_rounding) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_rint(a_.altivec_f32)); #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) r_.neon_f32 = vrndnq_f32(a_.neon_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfrintrne_s(a_.lsx_f32); + #elif 
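/* Usage sketch for the rounding-mode helpers above, now that SIMDE_MM_ROUND_MASK
 * exists and GET/SET take uint32_t: save the current mode, switch, and restore.
 * Assumes the surrounding header is included; the function name is illustrative. */
#include <stdint.h>
static void sketch_with_truncation(void) {
  uint32_t saved = SIMDE_MM_GET_ROUNDING_MODE();
  SIMDE_MM_SET_ROUNDING_MODE(SIMDE_MM_ROUND_TOWARD_ZERO);
  /* ... code that should round toward zero ... */
  SIMDE_MM_SET_ROUNDING_MODE(saved);
}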
defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_nearest(a_.wasm_v128); #elif defined(simde_math_roundevenf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -599,6 +647,10 @@ simde_x_mm_round_ps (simde__m128 a, int rounding, int lax_rounding) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_floor(a_.altivec_f32)); #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) r_.neon_f32 = vrndmq_f32(a_.neon_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfrintrm_s(a_.lsx_f32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_floor(a_.wasm_v128); #elif defined(simde_math_floorf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -614,6 +666,10 @@ simde_x_mm_round_ps (simde__m128 a, int rounding, int lax_rounding) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_ceil(a_.altivec_f32)); #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) r_.neon_f32 = vrndpq_f32(a_.neon_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfrintrp_s(a_.lsx_f32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_ceil(a_.wasm_v128); #elif defined(simde_math_ceilf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -629,6 +685,10 @@ simde_x_mm_round_ps (simde__m128 a, int rounding, int lax_rounding) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_trunc(a_.altivec_f32)); #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) r_.neon_f32 = vrndq_f32(a_.neon_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfrintrz_s(a_.lsx_f32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_trunc(a_.wasm_v128); #elif defined(simde_math_truncf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -691,6 +751,10 @@ simde_mm_set_ps1 (simde_float32 a) { #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) (void) a; return vec_splats(a); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + return (simde__m128)__lsx_vldrepl_w(&a, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_f32x4_splat(a); #else return simde_mm_set_ps(a, a, a, a); #endif @@ -712,15 +776,17 @@ simde_mm_move_ss (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 4, 1, 2, 3); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vsetq_lane_f32(vgetq_lane_f32(b_.neon_f32, 0), a_.neon_f32, 0); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) m = { ~0U, 0U, 0U, 0U }; r_.altivec_f32 = vec_sel(a_.altivec_f32, b_.altivec_f32, m); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_i8x16_shuffle(b_.wasm_v128, a_.wasm_v128, 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, b_.lsx_i64, 0); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 4, 1, 2, 3); #else r_.f32[0] = b_.f32[0]; r_.f32[1] = a_.f32[1]; @@ -738,7 +804,7 @@ simde_mm_move_ss (simde__m128 a, simde__m128 b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_x_mm_broadcastlow_ps(simde__m128 a) { - /* This function broadcasts the first element in the inpu 
vector to + /* This function broadcasts the first element in the input vector to * all lanes. It is used to avoid generating spurious exceptions in * *_ss functions since there may be garbage in the upper lanes. */ @@ -753,6 +819,10 @@ simde_x_mm_broadcastlow_ps(simde__m128 a) { r_.neon_f32 = vdupq_laneq_f32(a_.neon_f32, 0); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_splat(a_.altivec_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vreplvei_w(a_.lsx_i64, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_splat(a_.f32[0]); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0); #else @@ -783,6 +853,8 @@ simde_mm_add_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_add(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_add(a_.altivec_f32, b_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfadd_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.f32 = a_.f32 + b_.f32; #else @@ -848,6 +920,8 @@ simde_mm_and_ps (simde__m128 a, simde__m128 b) { r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = a_.i32 & b_.i32; #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) @@ -883,6 +957,8 @@ simde_mm_andnot_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) r_.altivec_f32 = vec_andc(b_.altivec_f32, a_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = ~a_.i32 & b_.i32; #else @@ -916,6 +992,8 @@ simde_mm_xor_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vxor_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = a_.i32f ^ b_.i32f; #else @@ -949,6 +1027,8 @@ simde_mm_or_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vor_v(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = a_.i32f | b_.i32f; #else @@ -987,6 +1067,8 @@ simde_x_mm_not_ps(simde__m128 a) { r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32); #elif defined(SIMDE_WASM_SIMD128_NATIVE) r_.wasm_v128 = wasm_v128_not(a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vnor_v(a_.lsx_i64, a_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = ~a_.i32; #else @@ -1026,6 +1108,8 @@ simde_x_mm_select_ps(simde__m128 a, simde__m128 b, simde__m128 mask) { r_.wasm_v128 = wasm_v128_bitselect(b_.wasm_v128, a_.wasm_v128, mask_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, mask_.altivec_u32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + 
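/* Scalar sketch of the bitwise select used above: the xor/and form
 * a ^ ((a ^ b) & mask) picks bits of b where mask is 1 and bits of a where
 * mask is 0, i.e. (a & ~mask) | (b & mask), which is what __lsx_vbitsel_v and
 * wasm_v128_bitselect compute across the whole 128-bit vector. */
#include <stdint.h>
static uint32_t sketch_bit_select(uint32_t a, uint32_t b, uint32_t mask) {
  return a ^ ((a ^ b) & mask);
}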
r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, mask_.lsx_i64); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = a_.i32 ^ ((a_.i32 ^ b_.i32) & mask_.i32); #else @@ -1160,6 +1244,8 @@ simde_mm_cmpeq_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_eq(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpeq(a_.altivec_f32, b_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_ceq_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.f32 == b_.f32); #else @@ -1221,6 +1307,8 @@ simde_mm_cmpge_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_ge(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpge(a_.altivec_f32, b_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cle_s(b_.lsx_f32, a_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32)); #else @@ -1282,6 +1370,8 @@ simde_mm_cmpgt_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpgt(a_.altivec_f32, b_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_clt_s(b_.lsx_f32, a_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32)); #else @@ -1343,6 +1433,8 @@ simde_mm_cmple_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_le(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmple(a_.altivec_f32, b_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cle_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32)); #else @@ -1404,6 +1496,8 @@ simde_mm_cmplt_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmplt(a_.altivec_f32, b_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_clt_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32)); #else @@ -1466,6 +1560,8 @@ simde_mm_cmpneq_ps (simde__m128 a, simde__m128 b) { #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpeq(a_.altivec_f32, b_.altivec_f32)); r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_nor(r_.altivec_f32, r_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cune_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32)); #else @@ -1607,6 +1703,9 @@ simde_mm_cmpord_ps (simde__m128 a, simde__m128 b) { #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) 
r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32))); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32); + r_.lsx_i64 = __lsx_vnor_v(r_.lsx_i64, r_.lsx_i64); #elif defined(simde_math_isnanf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -1649,6 +1748,8 @@ simde_mm_cmpunord_ps (simde__m128 a, simde__m128 b) { r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32))); r_.altivec_f32 = vec_nor(r_.altivec_f32, r_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32); #elif defined(simde_math_isnanf) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -1713,6 +1814,8 @@ simde_mm_comieq_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32); return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_f32x4_extract_lane(a_.wasm_v128, 0) == wasm_f32x4_extract_lane(b_.wasm_v128, 0); #else return a_.f32[0] == b_.f32[0]; #endif @@ -1738,6 +1841,8 @@ simde_mm_comige_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32); return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_f32x4_extract_lane(a_.wasm_v128, 0) >= wasm_f32x4_extract_lane(b_.wasm_v128, 0); #else return a_.f32[0] >= b_.f32[0]; #endif @@ -1763,6 +1868,8 @@ simde_mm_comigt_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32); return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_f32x4_extract_lane(a_.wasm_v128, 0) > wasm_f32x4_extract_lane(b_.wasm_v128, 0); #else return a_.f32[0] > b_.f32[0]; #endif @@ -1788,6 +1895,8 @@ simde_mm_comile_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32); return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_f32x4_extract_lane(a_.wasm_v128, 0) <= wasm_f32x4_extract_lane(b_.wasm_v128, 0); #else return a_.f32[0] <= b_.f32[0]; #endif @@ -1813,6 +1922,8 @@ simde_mm_comilt_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32); return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_f32x4_extract_lane(a_.wasm_v128, 0) < wasm_f32x4_extract_lane(b_.wasm_v128, 0); #else return a_.f32[0] < b_.f32[0]; #endif @@ -1838,6 +1949,8 @@ simde_mm_comineq_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32)); return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return 
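/* Scalar sketch of the lane predicate behind simde_mm_cmpord_ps and
 * simde_mm_cmpunord_ps above: a lane is "ordered" (all-ones result) when
 * neither input is NaN and "unordered" when at least one is.  The LSX path
 * computes the unordered case with __lsx_vfcmp_cun_s and inverts it for the
 * ordered variant. */
#include <math.h>
#include <stdint.h>
static uint32_t sketch_cmpord_lane(float a, float b) {
  return (!isnan(a) && !isnan(b)) ? UINT32_C(0xFFFFFFFF) : UINT32_C(0);
}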
wasm_f32x4_extract_lane(a_.wasm_v128, 0) != wasm_f32x4_extract_lane(b_.wasm_v128, 0); #else return a_.f32[0] != b_.f32[0]; #endif @@ -1870,6 +1983,9 @@ simde_x_mm_copysign_ps(simde__m128 dest, simde__m128 src) { #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) sign_pos = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), vec_splats(-0.0f)); r_.altivec_f32 = vec_sel(dest_.altivec_f32, src_.altivec_f32, sign_pos); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + const v4f32 sign_pos = {-0.0f, -0.0f, -0.0f, -0.0f}; + r_.lsx_i64 = __lsx_vbitsel_v(dest_.lsx_i64, src_.lsx_i64, (v2i64)sign_pos); #elif defined(SIMDE_IEEE754_STORAGE) (void) src_; (void) dest_; @@ -2492,6 +2608,8 @@ simde_mm_div_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_div(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f32 = vec_div(a_.altivec_f32, b_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + r_.lsx_f32 = __lsx_vfdiv_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.f32 = a_.f32 / b_.f32; #else @@ -2596,6 +2714,10 @@ simde_mm_load_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { r_.altivec_f32 = vec_vsx_ld(0, mem_addr); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_ld(0, mem_addr); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load(mem_addr); #else simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128), sizeof(r_)); #endif @@ -2617,6 +2739,10 @@ simde_mm_load1_ps (simde_float32 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vld1q_dup_f32(mem_addr); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vldrepl_w(mem_addr, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load32_splat(mem_addr); #else r_ = simde__m128_to_private(simde_mm_set1_ps(*mem_addr)); #endif @@ -2640,6 +2766,8 @@ simde_mm_load_ss (simde_float32 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load32_zero(mem_addr); #else r_.f32[0] = *mem_addr; r_.i32[1] = 0; @@ -2657,7 +2785,7 @@ simde_mm_load_ss (simde_float32 const* mem_addr) { SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_loadh_pi (simde__m128 a, simde__m64 const* mem_addr) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + #if defined(SIMDE_X86_SSE_NATIVE) return _mm_loadh_pi(a, HEDLEY_REINTERPRET_CAST(__m64 const*, mem_addr)); #else simde__m128_private @@ -2666,6 +2794,8 @@ simde_mm_loadh_pi (simde__m128 a, simde__m64 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vcombine_f32(vget_low_f32(a_.neon_f32), vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load64_lane(mem_addr, a_.wasm_v128, 1); #else simde__m64_private b_ = *HEDLEY_REINTERPRET_CAST(simde__m64_private const*, mem_addr); r_.f32[0] = a_.f32[0]; @@ -2707,6 +2837,8 @@ simde_mm_loadl_pi (simde__m128 a, simde__m64 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_f32 = vcombine_f32(vld1_f32( HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr)), vget_high_f32(a_.neon_f32)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load64_lane(mem_addr, 
a_.wasm_v128, 0); #else simde__m64_private b_; simde_memcpy(&b_, mem_addr, sizeof(b_)); @@ -2742,6 +2874,8 @@ simde_mm_loadr_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { r_.neon_f32 = vextq_f32(r_.neon_f32, r_.neon_f32, 2); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__) r_.altivec_f32 = vec_reve(v_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vshuf4i_w(v_.lsx_i64, 0x1b); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, v_.f32, v_.f32, 3, 2, 1, 0); #else @@ -2772,6 +2906,8 @@ simde_mm_loadu_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { r_.wasm_v128 = wasm_v128_load(mem_addr); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__) r_.altivec_f32 = vec_vsx_ld(0, mem_addr); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vld(mem_addr, 0); #else simde_memcpy(&r_, mem_addr, sizeof(r_)); #endif @@ -2857,6 +2993,8 @@ simde_mm_max_ps (simde__m128 a, simde__m128 b) { r_.altivec_f32 = vec_max(a_.altivec_f32, b_.altivec_f32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmpgt(a_.altivec_f32, b_.altivec_f32)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) + r_.lsx_f32 = __lsx_vfmax_s(a_.lsx_f32, b_.lsx_f32); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -2982,6 +3120,8 @@ simde_mm_min_ps (simde__m128 a, simde__m128 b) { #else r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmpgt(b_.altivec_f32, a_.altivec_f32)); #endif + #elif defined(SIMDE_FAST_NANS) && defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfmin_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) uint32_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32 < b_.f32); r_.f32 = @@ -3077,13 +3217,17 @@ simde_mm_movehl_ps (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + r_.neon_u64 = vzip2q_u64(b_.neon_u64, a_.neon_u64); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32x2_t a32 = vget_high_f32(a_.neon_f32); float32x2_t b32 = vget_high_f32(b_.neon_f32); r_.neon_f32 = vcombine_f32(b32, a32); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_mergel(b_.altivec_i64, a_.altivec_i64)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_d(a_.lsx_i64, b_.lsx_i64); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 6, 7, 2, 3); #else @@ -3111,15 +3255,17 @@ simde_mm_movelh_ps (simde__m128 a, simde__m128 b) { a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 1, 4, 5); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32x2_t a10 = vget_low_f32(a_.neon_f32); float32x2_t b10 = vget_low_f32(b_.neon_f32); r_.neon_f32 = vcombine_f32(a10, b10); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_mergeh(a_.altivec_i64, b_.altivec_i64)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_d(b_.lsx_i64, 
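/* Scalar sketch of the lane layouts produced by simde_mm_movehl_ps and
 * simde_mm_movelh_ps above, matching the SIMDE_SHUFFLE_VECTOR_ index lists
 * {6,7,2,3} and {0,1,4,5}; the 64-bit interleaves added for NEON A64 and LSX
 * build the same results in one instruction. */
static void sketch_movehl_movelh(const float a[4], const float b[4],
                                 float hl[4], float lh[4]) {
  hl[0] = b[2]; hl[1] = b[3]; hl[2] = a[2]; hl[3] = a[3];  /* movehl: high halves of b then a */
  lh[0] = a[0]; lh[1] = a[1]; lh[2] = b[0]; lh[3] = b[1];  /* movelh: low halves of a then b */
}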
a_.lsx_i64); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 1, 4, 5); #else r_.f32[0] = a_.f32[0]; r_.f32[1] = a_.f32[1]; @@ -3171,34 +3317,23 @@ simde_mm_movemask_pi8 (simde__m64 a) { SIMDE_FUNCTION_ATTRIBUTES int simde_mm_movemask_ps (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) + #if defined(SIMDE_X86_SSE_NATIVE) return _mm_movemask_ps(a); #else int r = 0; simde__m128_private a_ = simde__m128_to_private(a); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) + #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) + static const int32_t shift[4] = {0, 1, 2, 3}; + uint32x4_t tmp = vshrq_n_u32(a_.neon_u32, 31); + return HEDLEY_STATIC_CAST(int32_t, vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)))); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) // Shift out everything but the sign bits with a 32-bit unsigned shift right. uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(a_.neon_u32, 31)); // Merge the two pairs together with a 64-bit unsigned shift right + add. uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); // Extract the result. return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - static const uint32_t md[4] = { - 1 << 0, 1 << 1, 1 << 2, 1 << 3 - }; - - uint32x4_t extended = vreinterpretq_u32_s32(vshrq_n_s32(a_.neon_i32, 31)); - uint32x4_t masked = vandq_u32(vld1q_u32(md), extended); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return HEDLEY_STATIC_CAST(int32_t, vaddvq_u32(masked)); - #else - uint64x2_t t64 = vpaddlq_u32(masked); - return - HEDLEY_STATIC_CAST(int, vgetq_lane_u64(t64, 0)) + - HEDLEY_STATIC_CAST(int, vgetq_lane_u64(t64, 1)); - #endif #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && defined(SIMDE_BUG_CLANG_50932) SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) idx = { 96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) res = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_bperm(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned __int128), a_.altivec_u64), idx)); @@ -3207,6 +3342,11 @@ simde_mm_movemask_ps (simde__m128 a) { SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) idx = { 96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) res = vec_bperm(a_.altivec_u8, idx); return HEDLEY_STATIC_CAST(int32_t, vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), res), 2)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + v2i64 t64 = __lsx_vmskltz_w(a_.lsx_i64); + r = __lsx_vpickve2gr_wu(t64, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return HEDLEY_STATIC_CAST(int32_t, wasm_i32x4_bitmask(a_.wasm_v128)); #else SIMDE_VECTORIZE_REDUCTION(|:r) for (size_t i = 0 ; i < sizeof(a_.u32) / sizeof(a_.u32[0]) ; i++) { @@ -3240,6 +3380,8 @@ simde_mm_mul_ps (simde__m128 a, simde__m128 b) { r_.f32 = a_.f32 * b_.f32; #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f32 = vec_mul(a_.altivec_f32, b_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfmul_s(a_.lsx_f32, b_.lsx_f32); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { @@ -3471,6 +3613,9 @@ simde_mm_prefetch (const void* p, int i) { __prefetch_by_load(p, 0, 1); break; } + #elif HEDLEY_MSVC_VERSION + (void) i; + (void) p; #endif } #if defined(SIMDE_X86_SSE_NATIVE) @@ -3506,6 +3651,9 @@ simde_x_mm_negate_ps(simde__m128 a) { 
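/* Scalar sketch of simde_mm_movemask_ps above: bit i of the result is the sign
 * bit of lane i.  The new NEON A64 path shifts each sign bit down to bit 0,
 * shifts it back up by its lane index and sums the lanes; LSX (__lsx_vmskltz_w)
 * and WASM (wasm_i32x4_bitmask) produce the mask directly. */
#include <stdint.h>
static int sketch_movemask_ps(const uint32_t lanes[4]) {
  int r = 0;
  for (int i = 0; i < 4; i++) {
    r |= (int) (lanes[i] >> 31) << i;
  }
  return r;
}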
r_.wasm_v128 = wasm_f32x4_neg(a_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) r_.altivec_f32 = vec_neg(a_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + const v4f32 f32 = {0.0f, 0.0f, 0.0f, 0.0f}; + r_.lsx_f32 = __lsx_vfsub_s(f32, a_.lsx_f32); #elif defined(SIMDE_VECTOR_NEGATE) r_.f32 = -a_.f32; #else @@ -3543,6 +3691,8 @@ simde_mm_rcp_ps (simde__m128 a) { r_.wasm_v128 = wasm_f32x4_div(simde_mm_set1_ps(1.0f), a_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_re(a_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfrecip_s(a_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) r_.f32 = 1.0f / a_.f32; #elif defined(SIMDE_IEEE754_STORAGE) @@ -3611,6 +3761,10 @@ simde_mm_rsqrt_ps (simde__m128 a) { r_.neon_f32 = vrsqrteq_f32(a_.neon_f32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_rsqrte(a_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfrsqrt_s(a_.lsx_f32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f32x4_div(simde_mm_set1_ps(1.0f), wasm_f32x4_sqrt(a_.wasm_v128)); #elif defined(SIMDE_IEEE754_STORAGE) /* https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf Pages 100 - 103 */ @@ -3793,6 +3947,8 @@ simde_mm_setzero_ps (void) { return vdupq_n_f32(SIMDE_FLOAT32_C(0.0)); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return vec_splats(SIMDE_FLOAT32_C(0.0)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return wasm_f32x4_const(0.f, 0.f, 0.f, 0.f); #else simde__m128 r; simde_memset(&r, 0, sizeof(r)); @@ -3874,11 +4030,11 @@ simde_mm_sfence (void) { # define simde_mm_shuffle_pi16(a, imm8) _mm_shuffle_pi16(a, imm8) #elif defined(SIMDE_SHUFFLE_VECTOR_) # define simde_mm_shuffle_pi16(a, imm8) (__extension__ ({ \ - const simde__m64_private simde__tmp_a_ = simde__m64_to_private(a); \ + const simde__m64_private simde_tmp_a_ = simde__m64_to_private(a); \ simde__m64_from_private((simde__m64_private) { .i16 = \ SIMDE_SHUFFLE_VECTOR_(16, 8, \ - (simde__tmp_a_).i16, \ - (simde__tmp_a_).i16, \ + (simde_tmp_a_).i16, \ + (simde_tmp_a_).i16, \ (((imm8) ) & 3), \ (((imm8) >> 2) & 3), \ (((imm8) >> 4) & 3), \ @@ -3931,21 +4087,21 @@ simde_mm_shuffle_ps (simde__m128 a, simde__m128 b, const int imm8) } #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) # define simde_mm_shuffle_ps(a, b, imm8) _mm_shuffle_ps(a, b, imm8) -#elif defined(SIMDE_SHUFFLE_VECTOR_) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_mm_shuffle_ps(a, b, imm8) (__extension__ ({ \ - simde__m128_from_private((simde__m128_private) { .f32 = \ - SIMDE_SHUFFLE_VECTOR_(32, 16, \ - simde__m128_to_private(a).f32, \ - simde__m128_to_private(b).f32, \ - (((imm8) ) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4) }); })) + simde__m128_from_private((simde__m128_private) { .wasm_v128 = \ + wasm_i32x4_shuffle( \ + simde__m128_to_private(a).wasm_v128, \ + simde__m128_to_private(b).wasm_v128, \ + (((imm8) ) & 3), \ + (((imm8) >> 2) & 3), \ + (((imm8) >> 4) & 3) + 4, \ + (((imm8) >> 6) & 3) + 4) }); })) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm_shuffle_ps(a, b, imm8) \ (__extension__({ \ - float32x4_t simde_mm_shuffle_ps_a_ = simde__m128i_to_neon_f32(a); \ - float32x4_t simde_mm_shuffle_ps_b_ = simde__m128i_to_neon_f32(b); \ + float32x4_t simde_mm_shuffle_ps_a_ = simde__m128_to_neon_f32(a); \ + float32x4_t simde_mm_shuffle_ps_b_ = simde__m128_to_neon_f32(b); \ 
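/* Scalar sketch of the imm8 decoding shared by every simde_mm_shuffle_ps path
 * above (WASM shuffle, NEON lane moves, SIMDE_SHUFFLE_VECTOR_): two selector
 * bits per result lane, with lanes 0-1 taken from a and lanes 2-3 from b. */
static void sketch_shuffle_ps(const float a[4], const float b[4],
                              int imm8, float r[4]) {
  r[0] = a[(imm8 >> 0) & 3];
  r[1] = a[(imm8 >> 2) & 3];
  r[2] = b[(imm8 >> 4) & 3];
  r[3] = b[(imm8 >> 6) & 3];
}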
float32x4_t simde_mm_shuffle_ps_r_; \ \ simde_mm_shuffle_ps_r_ = vmovq_n_f32(vgetq_lane_f32(simde_mm_shuffle_ps_a_, (imm8) & (0x3))); \ @@ -3953,6 +4109,16 @@ simde_mm_shuffle_ps (simde__m128 a, simde__m128 b, const int imm8) simde_mm_shuffle_ps_r_ = vsetq_lane_f32(vgetq_lane_f32(simde_mm_shuffle_ps_b_, ((imm8) >> 4) & 0x3), simde_mm_shuffle_ps_r_, 2); \ vsetq_lane_f32(vgetq_lane_f32(simde_mm_shuffle_ps_b_, ((imm8) >> 6) & 0x3), simde_mm_shuffle_ps_r_, 3); \ })) +#elif defined(SIMDE_SHUFFLE_VECTOR_) + #define simde_mm_shuffle_ps(a, b, imm8) (__extension__ ({ \ + simde__m128_from_private((simde__m128_private) { .f32 = \ + SIMDE_SHUFFLE_VECTOR_(32, 16, \ + simde__m128_to_private(a).f32, \ + simde__m128_to_private(b).f32, \ + (((imm8) ) & 3), \ + (((imm8) >> 2) & 3), \ + (((imm8) >> 4) & 3) + 4, \ + (((imm8) >> 6) & 3) + 4) }); })) #endif #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) # define _mm_shuffle_ps(a, b, imm8) simde_mm_shuffle_ps((a), (b), imm8) @@ -3980,6 +4146,8 @@ simde_mm_sqrt_ps (simde__m128 a) { r_.wasm_v128 = wasm_f32x4_sqrt(a_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) r_.altivec_f32 = vec_sqrt(a_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfsqrt_s(a_.lsx_f32); #elif defined(simde_math_sqrt) SIMDE_VECTORIZE for (size_t i = 0 ; i < sizeof(r_.f32) / sizeof(r_.f32[0]) ; i++) { @@ -4044,6 +4212,8 @@ simde_mm_store_ps (simde_float32 mem_addr[4], simde__m128 a) { vec_st(a_.altivec_f32, 0, mem_addr); #elif defined(SIMDE_WASM_SIMD128_NATIVE) wasm_v128_store(mem_addr, a_.wasm_v128); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(a_.lsx_f32, mem_addr, 0); #else simde_memcpy(mem_addr, &a_, sizeof(a)); #endif @@ -4069,6 +4239,8 @@ simde_mm_store1_ps (simde_float32 mem_addr[4], simde__m128 a) { wasm_v128_store(mem_addr_, wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0, 0, 0)); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) vec_st(vec_splat(a_.altivec_f32, 0), 0, mem_addr_); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(__lsx_vreplvei_w(a_.lsx_f32, 0), mem_addr_, 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) simde__m128_private tmp_; tmp_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0); @@ -4097,6 +4269,10 @@ simde_mm_store_ss (simde_float32* mem_addr, simde__m128 a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1q_lane_f32(mem_addr, a_.neon_f32, 0); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vstelm_w(a_.lsx_f32, mem_addr, 0, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + wasm_v128_store32_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 0); #else *mem_addr = a_.f32[0]; #endif @@ -4116,6 +4292,8 @@ simde_mm_storeh_pi (simde__m64* mem_addr, simde__m128 a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1_f32(HEDLEY_REINTERPRET_CAST(float32_t*, mem_addr), vget_high_f32(a_.neon_f32)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 1); #else simde_memcpy(mem_addr, &(a_.m64[1]), sizeof(a_.m64[1])); #endif @@ -4130,6 +4308,9 @@ void simde_mm_storel_pi (simde__m64* mem_addr, simde__m128 a) { #if defined(SIMDE_X86_SSE_NATIVE) _mm_storel_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + simde__m128_private a_ = simde__m128_to_private(a); + wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 0); #else simde__m64_private* dest_ = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr); simde__m128_private a_ = 
simde__m128_to_private(a); @@ -4159,6 +4340,8 @@ simde_mm_storer_ps (simde_float32 mem_addr[4], simde__m128 a) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32x4_t tmp = vrev64q_f32(a_.neon_f32); vst1q_f32(mem_addr, vextq_f32(tmp, tmp, 2)); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(__lsx_vshuf4i_w(a_.lsx_f32, 0x1b), mem_addr, 0); #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 3, 2, 1, 0); simde_mm_store_ps(mem_addr, simde__m128_from_private(a_)); @@ -4186,6 +4369,10 @@ simde_mm_storeu_ps (simde_float32 mem_addr[4], simde__m128 a) { vst1q_f32(mem_addr, a_.neon_f32); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) vec_vsx_st(a_.altivec_f32, 0, mem_addr); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + __lsx_vst(a_.lsx_f32, mem_addr, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + wasm_v128_store(mem_addr, a_.wasm_v128); #else simde_memcpy(mem_addr, &a_, sizeof(a_)); #endif @@ -4212,6 +4399,8 @@ simde_mm_sub_ps (simde__m128 a, simde__m128 b) { r_.wasm_v128 = wasm_f32x4_sub(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_sub(a_.altivec_f32, b_.altivec_f32); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_f32 = __lsx_vfsub_s(a_.lsx_f32, b_.lsx_f32); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.f32 = a_.f32 - b_.f32; #else @@ -4273,6 +4462,8 @@ simde_mm_ucomieq_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32); r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) == wasm_f32x4_extract_lane(b_.wasm_v128, 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4307,6 +4498,8 @@ simde_mm_ucomige_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32); r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) >= wasm_f32x4_extract_lane(b_.wasm_v128, 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4341,6 +4534,8 @@ simde_mm_ucomigt_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32); r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) > wasm_f32x4_extract_lane(b_.wasm_v128, 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4375,6 +4570,8 @@ simde_mm_ucomile_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32); r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) <= wasm_f32x4_extract_lane(b_.wasm_v128, 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4409,6 +4606,8 @@ simde_mm_ucomilt_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32); r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = 
wasm_f32x4_extract_lane(a_.wasm_v128, 0) < wasm_f32x4_extract_lane(b_.wasm_v128, 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4443,6 +4642,8 @@ simde_mm_ucomineq_ss (simde__m128 a, simde__m128 b) { uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32)); r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) != wasm_f32x4_extract_lane(b_.wasm_v128, 0); #elif defined(SIMDE_HAVE_FENV_H) fenv_t envp; int x = feholdexcept(&envp); @@ -4470,11 +4671,6 @@ simde_mm_ucomineq_ss (simde__m128 a, simde__m128 b) { # endif #endif -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ -#endif - SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_unpackhi_ps (simde__m128 a, simde__m128 b) { @@ -4493,6 +4689,10 @@ simde_mm_unpackhi_ps (simde__m128 a, simde__m128 b) { float32x2_t b1 = vget_high_f32(b_.neon_f32); float32x2x2_t result = vzip_f32(a1, b1); r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvh_w(b_.lsx_i64, a_.lsx_i64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 6, 3, 7); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 2, 6, 3, 7); #else @@ -4524,13 +4724,17 @@ simde_mm_unpacklo_ps (simde__m128 a, simde__m128 b) { r_.neon_f32 = vzip1q_f32(a_.neon_f32, b_.neon_f32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_f32 = vec_mergeh(a_.altivec_f32, b_.altivec_f32); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 4, 1, 5); + #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) + r_.lsx_i64 = __lsx_vilvl_w(b_.lsx_i64, a_.lsx_i64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 4, 1, 5); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32x2_t a1 = vget_low_f32(a_.neon_f32); float32x2_t b1 = vget_low_f32(b_.neon_f32); float32x2x2_t result = vzip_f32(a1, b1); r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]); + #elif defined(SIMDE_SHUFFLE_VECTOR_) + r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 4, 1, 5); #else r_.f32[0] = a_.f32[0]; r_.f32[1] = b_.f32[0]; @@ -4550,16 +4754,19 @@ void simde_mm_stream_pi (simde__m64* mem_addr, simde__m64 a) { #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) _mm_stream_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) || \ + defined(SIMDE_VECTOR_SUBSCRIPT)) + __builtin_nontemporal_store(a, mem_addr); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde__m64_private a_ = simde__m64_to_private(a); + vst1_s64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), a_.neon_i64); #else simde__m64_private* dest = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr), a_ = simde__m64_to_private(a); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - dest->i64[0] = vget_lane_s64(a_.neon_i64, 0); - #else - dest->i64[0] = a_.i64[0]; - #endif + dest->i64[0] = a_.i64[0]; #endif } #if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) @@ -4571,9 +4778,11 @@ void simde_mm_stream_ps (simde_float32 mem_addr[4], simde__m128 a) { #if defined(SIMDE_X86_SSE_NATIVE) 
_mm_stream_ps(mem_addr, a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - simde__m128_private a_ = simde__m128_to_private(a); - __builtin_nontemporal_store(a_.f32, SIMDE_ALIGN_CAST(__typeof__(a_.f32)*, mem_addr)); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_VECTOR_SUBSCRIPT) || \ + defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ + defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || defined(SIMDE_LOONGARCH_LSX_NATIVE)) + __builtin_nontemporal_store(a, SIMDE_ALIGN_ASSUME_CAST(__typeof__(a)*, mem_addr)); #else simde_mm_store_ps(mem_addr, a); #endif diff --git a/lib/simd_wrapper/simde/x86/sse2.h b/lib/simd_wrapper/simde/x86/sse2.h index d4bd1950ea5..be85177df77 100644 --- a/lib/simd_wrapper/simde/x86/sse2.h +++ b/lib/simd_wrapper/simde/x86/sse2.h @@ -33,6 +33,7 @@ #define SIMDE_X86_SSE2_H #include "sse.h" +#include "../simde-f16.h" HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS @@ -52,6 +53,11 @@ typedef union { SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; #endif + #if defined(SIMDE_FLOAT16_VECTOR) + SIMDE_ALIGN_TO_16 simde_float16 f16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; + #else + SIMDE_ALIGN_TO_16 simde_float16 f16[8]; + #endif SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; SIMDE_ALIGN_TO_16 simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; @@ -70,6 +76,7 @@ typedef union { SIMDE_ALIGN_TO_16 simde_int128 i128[1]; SIMDE_ALIGN_TO_16 simde_uint128 u128[1]; #endif + SIMDE_ALIGN_TO_16 simde_float16 f16[8]; SIMDE_ALIGN_TO_16 simde_float32 f32[4]; SIMDE_ALIGN_TO_16 simde_float64 f64[2]; @@ -167,7 +174,7 @@ typedef union { SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2]; SIMDE_ALIGN_TO_16 simde__m64 m64[2]; - #if defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) || defined(SIMDE_X86_SVML_NATIVE) SIMDE_ALIGN_TO_16 __m128d n; #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) SIMDE_ALIGN_TO_16 int8x16_t neon_i8; @@ -219,7 +226,7 @@ typedef union { #endif } simde__m128d_private; -#if defined(SIMDE_X86_SSE2_NATIVE) +#if defined(SIMDE_X86_SSE2_NATIVE) || defined(SIMDE_X86_SVML_NATIVE) typedef __m128i simde__m128i; typedef __m128d simde__m128d; #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) @@ -249,7 +256,7 @@ typedef union { typedef simde__m128d_private simde__m128d; #endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) +#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) typedef simde__m128i __m128i; typedef simde__m128d __m128d; #endif @@ -442,6 +449,8 @@ simde_x_mm_abs_pd(simde__m128d a) { r_.neon_f64 = vabsq_f64(a_.neon_f64); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_f64 = vec_abs(a_.altivec_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_abs(a_.wasm_v128); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -740,6 +749,8 @@ simde_x_mm_broadcastlow_pd(simde__m128d a) { r_.neon_f64 = vdupq_laneq_f64(a_.neon_f64, 0); #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) r_.altivec_f64 = vec_splat(a_.altivec_f64, 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_splat(a_.f64[0]); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 0, 0); #else @@ -974,6 +985,8 @@ simde_mm_and_si128 
(simde__m128i a, simde__m128i b) { r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = a_.i32f & b_.i32f; #else @@ -1040,6 +1053,8 @@ simde_mm_andnot_si128 (simde__m128i a, simde__m128i b) { r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = ~a_.i32f & b_.i32f; #else @@ -1233,18 +1248,39 @@ simde_mm_bslli_si128 (simde__m128i a, const int imm8) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) #define simde_mm_bslli_si128(a, imm8) \ simde__m128i_from_neon_i8(((imm8) <= 0) ? simde__m128i_to_neon_i8(a) : (((imm8) > 15) ? (vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), simde__m128i_to_neon_i8(a), 16 - (imm8))))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_mm_bslli_si128(a, imm8) __extension__ ({ \ + simde__m128i_from_wasm_v128( \ + wasm_i8x16_shuffle(wasm_i32x4_splat(INT32_C(0)), \ + simde__m128i_to_wasm_v128((a)), \ + ((imm8)&0xF0) ? 0 : 16 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 17 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 18 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 19 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 20 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 21 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 22 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 23 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 24 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 25 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 26 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 27 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 28 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 29 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 0 : 30 - ((imm8)&0xF), \ + ((imm8)&0xF0) ? 
0 : 31 - ((imm8)&0xF))); }) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) #define simde_mm_bslli_si128(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \ - const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - simde__m128i_private simde__tmp_r_; \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + const simde__m128i_private simde_tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ + simde__m128i_private simde_tmp_r_; \ if (HEDLEY_UNLIKELY(imm8 > 15)) { \ - simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ + simde_tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ } else { \ - simde__tmp_r_.i8 = \ + simde_tmp_r_.i8 = \ SIMDE_SHUFFLE_VECTOR_(8, 16, \ - simde__tmp_z_.i8, \ - (simde__tmp_a_).i8, \ + simde_tmp_z_.i8, \ + (simde_tmp_a_).i8, \ HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31), \ HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31), \ HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31), \ @@ -1262,7 +1298,7 @@ simde_mm_bslli_si128 (simde__m128i a, const int imm8) HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31), \ HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \ } \ - simde__m128i_from_private(simde__tmp_r_); })) + simde__m128i_from_private(simde_tmp_r_); })) #endif #define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8) #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) @@ -1307,18 +1343,48 @@ simde_mm_bsrli_si128 (simde__m128i a, const int imm8) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) #define simde_mm_bsrli_si128(a, imm8) \ simde__m128i_from_neon_i8(((imm8 < 0) || (imm8 > 15)) ? vdupq_n_s8(0) : (vextq_s8(simde__m128i_to_private(a).neon_i8, vdupq_n_s8(0), ((imm8 & 15) != 0) ? 
imm8 : (imm8 & 15)))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + const simde__m128i_private simde_tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ + simde__m128i_private simde_tmp_r_ = simde__m128i_to_private(a); \ + if (HEDLEY_UNLIKELY(imm8 > 15)) { \ + simde_tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ + } else { \ + simde_tmp_r_.wasm_v128 = \ + wasm_i8x16_shuffle( \ + simde_tmp_z_.wasm_v128, \ + simde_tmp_a_.wasm_v128, \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \ + HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \ + } \ + simde__m128i_from_private(simde_tmp_r_); })) #elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) #define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \ - const simde__m128i_private simde__tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - simde__m128i_private simde__tmp_r_ = simde__m128i_to_private(a); \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + const simde__m128i_private simde_tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ + simde__m128i_private simde_tmp_r_ = simde__m128i_to_private(a); \ if (HEDLEY_UNLIKELY(imm8 > 15)) { \ - simde__tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ + simde_tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ } else { \ - simde__tmp_r_.i8 = \ + simde_tmp_r_.i8 = \ SIMDE_SHUFFLE_VECTOR_(8, 16, \ - simde__tmp_z_.i8, \ - (simde__tmp_a_).i8, \ + simde_tmp_z_.i8, \ + (simde_tmp_a_).i8, \ HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \ HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \ HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \ @@ -1336,7 +1402,7 @@ simde_mm_bsrli_si128 (simde__m128i a, const int imm8) HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \ HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \ } \ - simde__m128i_from_private(simde__tmp_r_); })) + simde__m128i_from_private(simde_tmp_r_); })) #endif #define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8)) #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) @@ -1354,7 +1420,7 @@ simde_mm_clflush (void const* p) { #endif } #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_clflush(a, b) simde_mm_clflush() + #define _mm_clflush(p) simde_mm_clflush(p) #endif SIMDE_FUNCTION_ATTRIBUTES @@ -2404,6 +2470,9 @@ simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) { uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64); uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64); r_.neon_u64 = vandq_u64(ceqaa, ceqbb); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = 
wasm_v128_and(wasm_f64x2_eq(a_.wasm_v128, a_.wasm_v128), + wasm_f64x2_eq(b_.wasm_v128, b_.wasm_v128)); #elif defined(simde_math_isnan) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -2429,6 +2498,8 @@ simde_mm_cvtsd_f64 (simde__m128d a) { simde__m128d_private a_ = simde__m128d_to_private(a); #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) return HEDLEY_STATIC_CAST(simde_float64, vgetq_lane_f64(a_.neon_f64, 0)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return HEDLEY_STATIC_CAST(simde_float64, wasm_f64x2_extract_lane(a_.wasm_v128, 0)); #else return a_.f64[0]; #endif @@ -2482,6 +2553,9 @@ simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) { uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64); uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64); r_.neon_u64 = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(ceqaa, ceqbb)))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_or(wasm_f64x2_ne(a_.wasm_v128, a_.wasm_v128), + wasm_f64x2_ne(b_.wasm_v128, b_.wasm_v128)); #elif defined(simde_math_isnan) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -2536,7 +2610,9 @@ simde_mm_cvtepi32_pd (simde__m128i a) { simde__m128d_private r_; simde__m128i_private a_ = simde__m128i_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_convert_low_i32x4(a_.wasm_v128); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32); #else SIMDE_VECTORIZE @@ -2746,7 +2822,9 @@ simde_mm_cvtps_pd (simde__m128 a) { simde__m128d_private r_; simde__m128_private a_ = simde__m128_to_private(a); - #if defined(SIMDE_CONVERT_VECTOR_) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_promote_low_f32x4(a_.wasm_v128); + #elif defined(SIMDE_CONVERT_VECTOR_) SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vcvt_f64_f32(vget_low_f32(a_.neon_f32)); @@ -3156,7 +3234,7 @@ simde_mm_cvttps_epi32 (simde__m128 a) { r_.wasm_v128 = wasm_v128_bitselect(r_.wasm_v128, wasm_i32x4_splat(INT32_MIN), valid_input); #endif - #elif defined(SIMDE_CONVERT_VECTOR_) + #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_ARCH_POWER) SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32); #if !defined(SIMDE_FAST_CONVERSION_RANGE) || !defined(SIMDE_FAST_NANS) @@ -3318,6 +3396,8 @@ simde_mm_extract_epi16 (simde__m128i a, const int imm8) #define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) #define simde_mm_extract_epi16(a, imm8) (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, (imm8))) & (INT32_C(0x0000ffff))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_mm_extract_epi16(a, imm8) HEDLEY_STATIC_CAST(int32_t, wasm_u16x8_extract_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 7)) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8) @@ -3335,6 +3415,8 @@ simde_mm_insert_epi16 (simde__m128i a, int16_t i, const int imm8) #define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8)) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) #define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_neon_i16(vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8))) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_mm_insert_epi16(a, i, imm8) wasm_i16x8_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 7, 
(i)) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8) @@ -3352,6 +3434,8 @@ simde_mm_load_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { r_.neon_f64 = vld1q_f64(mem_addr); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u32 = vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const*, mem_addr)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load(mem_addr); #else simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), sizeof(r_)); #endif @@ -3392,6 +3476,8 @@ simde_mm_load_sd (simde_float64 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load64_zero(HEDLEY_REINTERPRET_CAST(const void*, mem_addr)); #else r_.f64[0] = *mem_addr; r_.u64[1] = UINT64_C(0); @@ -3409,13 +3495,13 @@ simde__m128i simde_mm_load_si128 (simde__m128i const* mem_addr) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr)); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + return vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr)); #else simde__m128i_private r_; #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i32 = vec_ld(0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(int) const*, mem_addr)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr)); #else simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), sizeof(simde__m128i)); #endif @@ -3439,6 +3525,8 @@ simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vcombine_f64(vget_low_f64(a_.neon_f64), vld1_f64(HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load64_lane(HEDLEY_REINTERPRET_CAST(const void*, mem_addr), a_.wasm_v128, 1); #else simde_float64 t; @@ -3492,6 +3580,8 @@ simde_mm_loadl_pd (simde__m128d a, simde_float64 const* mem_addr) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vcombine_f64(vld1_f64( HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr)), vget_high_f64(a_.neon_f64)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_load64_lane(HEDLEY_REINTERPRET_CAST(const void*, mem_addr), a_.wasm_v128, 0); #else r_.f64[0] = *mem_addr; r_.u64[1] = a_.u64[1]; @@ -3553,12 +3643,15 @@ simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { #define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ + && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm_loadu_epi8(mem_addr) _mm_loadu_epi8(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadu_epi8(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm_loadu_epi8(mem_addr); - #elif defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); #else simde__m128i_private r_; @@ -3572,18 +3665,22 @@ simde_mm_loadu_epi8(void const * mem_addr) { return simde__m128i_from_private(r_); #endif } +#endif 
#define simde_x_mm_loadu_epi8(mem_addr) simde_mm_loadu_epi8(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm_loadu_epi8 #define _mm_loadu_epi8(a) simde_mm_loadu_epi8(a) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ + && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm_loadu_epi16(mem_addr) _mm_loadu_epi16(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadu_epi16(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm_loadu_epi16(mem_addr); - #elif defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); #else simde__m128i_private r_; @@ -3597,18 +3694,21 @@ simde_mm_loadu_epi16(void const * mem_addr) { return simde__m128i_from_private(r_); #endif } +#endif #define simde_x_mm_loadu_epi16(mem_addr) simde_mm_loadu_epi16(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm_loadu_epi16 #define _mm_loadu_epi16(a) simde_mm_loadu_epi16(a) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ + && !defined(SIMDE_BUG_CLANG_REV_344862) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm_loadu_epi32(mem_addr) _mm_loadu_epi32(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadu_epi32(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm_loadu_epi32(mem_addr); - #elif defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); #else simde__m128i_private r_; @@ -3622,18 +3722,22 @@ simde_mm_loadu_epi32(void const * mem_addr) { return simde__m128i_from_private(r_); #endif } +#endif #define simde_x_mm_loadu_epi32(mem_addr) simde_mm_loadu_epi32(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm_loadu_epi32 #define _mm_loadu_epi32(a) simde_mm_loadu_epi32(a) #endif +#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ + && !defined(SIMDE_BUG_CLANG_REV_344862) \ + && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) + #define simde_mm_loadu_epi64(mem_addr) _mm_loadu_epi64(mem_addr) +#else SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadu_epi64(void const * mem_addr) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) - return _mm_loadu_epi64(mem_addr); - #elif defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); #else simde__m128i_private r_; @@ -3647,6 +3751,7 @@ simde_mm_loadu_epi64(void const * mem_addr) { return simde__m128i_from_private(r_); #endif } +#endif 
#define simde_x_mm_loadu_epi64(mem_addr) simde_mm_loadu_epi64(mem_addr) #if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) #undef _mm_loadu_epi64 @@ -3707,6 +3812,8 @@ simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) { r_.altivec_i32 = vec_msum(a_.altivec_i16, b_.altivec_i16, vec_splats(0)); #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) r_.altivec_i32 = vec_mule(a_.altivec_i16, b_.altivec_i16) + vec_mulo(a_.altivec_i16, b_.altivec_i16); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_dot_i16x8(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) int32_t SIMDE_VECTOR(32) a32, b32, p32; SIMDE_CONVERT_VECTOR_(a32, a_.i16); @@ -3790,6 +3897,8 @@ simde_mm_movemask_epi8 (simde__m128i a) { #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG) static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 }; r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 14)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = HEDLEY_STATIC_CAST(int32_t, wasm_i8x16_bitmask(a_.wasm_v128)); #else SIMDE_VECTORIZE_REDUCTION(|:r) for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { @@ -3829,6 +3938,8 @@ simde_mm_movemask_pd (simde__m128d a) { SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) idx = { 64, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) res = vec_bperm(a_.altivec_u8, idx); r = HEDLEY_STATIC_CAST(int32_t, vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), res), 2)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_bitmask(a_.wasm_v128)); #else SIMDE_VECTORIZE_REDUCTION(|:r) for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { @@ -4146,6 +4257,8 @@ simde_mm_move_epi64 (simde__m128i a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, wasm_i64x2_const(0, 0), 0, 2); #else r_.i64[0] = a_.i64[0]; r_.i64[1] = 0; @@ -4173,6 +4286,10 @@ simde_mm_mul_epu32 (simde__m128i a, simde__m128i b) { uint32x2_t a_lo = vmovn_u64(a_.neon_u64); uint32x2_t b_lo = vmovn_u64(b_.neon_u64); r_.neon_u64 = vmull_u32(a_lo, b_lo); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u64x2_extmul_low_u32x4( + wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 2, 0, 2), + wasm_i32x4_shuffle(b_.wasm_v128, b_.wasm_v128, 0, 2, 0, 2)); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(a_.u32) z = { 0, }; a_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 2, 6); @@ -4201,7 +4318,9 @@ simde_x_mm_mul_epi64 (simde__m128i a, simde__m128i b) { a_ = simde__m128i_to_private(a), b_ = simde__m128i_to_private(b); - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) + #if defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_mul(a_.wasm_v128, b_.wasm_v128); + #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = a_.i64 * b_.i64; #else SIMDE_VECTORIZE @@ -4343,6 +4462,10 @@ simde_mm_mulhi_epi16 (simde__m128i a, simde__m128i b) { uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); 
r_.neon_u16 = rv.val[1]; #endif + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + const v128_t lo = wasm_i32x4_extmul_low_i16x8(a_.wasm_v128, b_.wasm_v128); + const v128_t hi = wasm_i32x4_extmul_high_i16x8(a_.wasm_v128, b_.wasm_v128); + r_.wasm_v128 = wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -4382,6 +4505,10 @@ simde_mm_mulhi_epu16 (simde__m128i a, simde__m128i b) { uint16x8x2_t neon_r = vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); r_.neon_u16 = neon_r.val[1]; #endif + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + const v128_t lo = wasm_u32x4_extmul_low_u16x8(a_.wasm_v128, b_.wasm_v128); + const v128_t hi = wasm_u32x4_extmul_high_u16x8(a_.wasm_v128, b_.wasm_v128); + r_.wasm_v128 = wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { @@ -4413,6 +4540,8 @@ simde_mm_mullo_epi16 (simde__m128i a, simde__m128i b) { (void) a_; (void) b_; r_.altivec_i16 = vec_mul(a_.altivec_i16, b_.altivec_i16); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_mul(a_.wasm_v128, b_.wasm_v128); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { @@ -4473,6 +4602,8 @@ simde_mm_or_si128 (simde__m128i a, simde__m128i b) { r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = a_.i32f | b_.i32f; #else @@ -4639,6 +4770,28 @@ void simde_mm_pause (void) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_pause(); + #elif defined(SIMDE_ARCH_X86) + #if defined(_MSC_VER) + __asm pause; + #else + __asm__ __volatile__("pause"); + #endif + #elif defined(SIMDE_ARCH_ARM_NEON) + #if defined(_MSC_VER) + __isb(_ARM64_BARRIER_SY); + #else + __asm__ __volatile__("isb\n"); + #endif + #elif defined(SIMDE_ARCH_POWER) + __asm__ __volatile__ ("or 27,27,27" ::: "memory"); + #elif defined(SIMDE_ARCH_WASM) + __asm__ __volatile__ ("nop"); + #elif defined(HEDLEY_GCC_VERSION) + #if defined(SIMDE_ARCH_RISCV) + __builtin_riscv_pause(); + #else + __asm__ __volatile__ ("nop" ::: "memory"); + #endif #endif } #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) @@ -4768,7 +4921,8 @@ simde__m128i simde_mm_loadu_si16 (void const* mem_addr) { #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1)) + HEDLEY_INTEL_VERSION_CHECK(20,21,1) || \ + HEDLEY_GCC_VERSION_CHECK(12,1,0)) return _mm_loadu_si16(mem_addr); #else int16_t val; @@ -4812,8 +4966,15 @@ simde__m128i simde_mm_loadu_si32 (void const* mem_addr) { #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1)) + HEDLEY_INTEL_VERSION_CHECK(20,21,1) || \ + HEDLEY_GCC_VERSION_CHECK(12,1,0)) return _mm_loadu_si32(mem_addr); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128i_from_wasm_v128(wasm_v128_load32_zero(mem_addr)); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + simde__m128i_private r_; + r_.neon_i32 = vsetq_lane_s32(* HEDLEY_REINTERPRET_CAST(const int32_t *, mem_addr), vdupq_n_s32(0), 0); + return simde__m128i_from_private(r_); #else int32_t val; simde_memcpy(&val, mem_addr, sizeof(val)); @@ -4911,6 +5072,8 @@ simde_x_mm_set_epu8 (uint8_t 
e15, uint8_t e14, uint8_t e13, uint8_t e12, e8, e9, e10, e11, e12, e13, e14, e15}; r_.neon_u8 = vld1q_u8(data); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u8x16_make(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); #else r_.u8[ 0] = e0; r_.u8[ 1] = e1; r_.u8[ 2] = e2; r_.u8[ 3] = e3; r_.u8[ 4] = e4; r_.u8[ 5] = e5; r_.u8[ 6] = e6; r_.u8[ 7] = e7; @@ -4936,6 +5099,8 @@ simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4, #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) SIMDE_ALIGN_LIKE_16(uint16x8_t) uint16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 }; r_.neon_u16 = vld1q_u16(data); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u16x8_make(e0, e1, e2, e3, e4, e5, e6, e7); #else r_.u16[0] = e0; r_.u16[1] = e1; r_.u16[2] = e2; r_.u16[3] = e3; r_.u16[4] = e4; r_.u16[5] = e5; r_.u16[6] = e6; r_.u16[7] = e7; @@ -4957,6 +5122,8 @@ simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) SIMDE_ALIGN_LIKE_16(uint32x4_t) uint32_t data[4] = { e0, e1, e2, e3 }; r_.neon_u32 = vld1q_u32(data); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u32x4_make(e0, e1, e2, e3); #else r_.u32[0] = e0; r_.u32[1] = e1; @@ -4979,6 +5146,8 @@ simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) SIMDE_ALIGN_LIKE_16(uint64x2_t) uint64_t data[2] = {e0, e1}; r_.neon_u64 = vld1q_u64(data); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u64x2_make(e0, e1); #else r_.u64[0] = e0; r_.u64[1] = e1; @@ -4995,6 +5164,8 @@ simde_mm_set_sd (simde_float64 a) { return _mm_set_sd(a); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128d_from_wasm_v128(wasm_f64x2_make(a, 0)); #else return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a); #endif @@ -5134,6 +5305,8 @@ simde__m128i simde_x_mm_set1_epu8 (uint8_t value) { #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return simde__m128i_from_altivec_u8(vec_splats(HEDLEY_STATIC_CAST(unsigned char, value))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128i_from_wasm_v128(wasm_u8x16_splat(value)); #else return simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value)); #endif @@ -5144,6 +5317,8 @@ simde__m128i simde_x_mm_set1_epu16 (uint16_t value) { #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return simde__m128i_from_altivec_u16(vec_splats(HEDLEY_STATIC_CAST(unsigned short, value))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128i_from_wasm_v128(wasm_u16x8_splat(value)); #else return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value)); #endif @@ -5154,6 +5329,8 @@ simde__m128i simde_x_mm_set1_epu32 (uint32_t value) { #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) return simde__m128i_from_altivec_u32(vec_splats(HEDLEY_STATIC_CAST(unsigned int, value))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128i_from_wasm_v128(wasm_u32x4_splat(value)); #else return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value)); #endif @@ -5164,6 +5341,8 @@ simde__m128i simde_x_mm_set1_epu64 (uint64_t value) { #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) return simde__m128i_from_altivec_u64(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, value))); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128i_from_wasm_v128(wasm_u64x2_splat(value)); #else return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value)); #endif @@ -5247,6 +5426,8 @@ 
simde__m128d simde_mm_setzero_pd (void) { #if defined(SIMDE_X86_SSE2_NATIVE) return _mm_setzero_pd(); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128d_from_wasm_v128(wasm_f64x2_const(0.0, 0.0)); #else return simde_mm_castsi128_pd(simde_mm_setzero_si128()); #endif @@ -5326,17 +5507,17 @@ simde_mm_shuffle_epi32 (simde__m128i a, const int imm8) } #if defined(SIMDE_X86_SSE2_NATIVE) #define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8)) -#elif defined(SIMDE_SHUFFLE_VECTOR_) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) #define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \ - simde__m128i_from_private((simde__m128i_private) { .i32 = \ - SIMDE_SHUFFLE_VECTOR_(32, 16, \ - (simde__tmp_a_).i32, \ - (simde__tmp_a_).i32, \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + simde__m128i_from_wasm_v128( \ + wasm_i32x4_shuffle( \ + (simde_tmp_a_).wasm_v128, \ + (simde_tmp_a_).wasm_v128, \ ((imm8) ) & 3, \ ((imm8) >> 2) & 3, \ ((imm8) >> 4) & 3, \ - ((imm8) >> 6) & 3) }); })) + ((imm8) >> 6) & 3)); })) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm_shuffle_epi32(a, imm8) \ (__extension__ ({ \ @@ -5348,6 +5529,17 @@ simde_mm_shuffle_epi32 (simde__m128i a, const int imm8) simde_mm_shuffle_epi32_r_ = vsetq_lane_s32(vgetq_lane_s32(simde_mm_shuffle_epi32_a_, ((imm8) >> 6) & 0x3), simde_mm_shuffle_epi32_r_, 3); \ vreinterpretq_s64_s32(simde_mm_shuffle_epi32_r_); \ })) +#elif defined(SIMDE_SHUFFLE_VECTOR_) + #define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + simde__m128i_from_private((simde__m128i_private) { .i32 = \ + SIMDE_SHUFFLE_VECTOR_(32, 16, \ + (simde_tmp_a_).i32, \ + (simde_tmp_a_).i32, \ + ((imm8) ) & 3, \ + ((imm8) >> 2) & 3, \ + ((imm8) >> 4) & 3, \ + ((imm8) >> 6) & 3) }); })) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8) @@ -5402,18 +5594,6 @@ simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8) } #if defined(SIMDE_X86_SSE2_NATIVE) #define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8)) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \ - simde__m128i_from_private((simde__m128i_private) { .i16 = \ - SIMDE_SHUFFLE_VECTOR_(16, 16, \ - (simde__tmp_a_).i16, \ - (simde__tmp_a_).i16, \ - 0, 1, 2, 3, \ - (((imm8) ) & 3) + 4, \ - (((imm8) >> 2) & 3) + 4, \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4) }); })) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm_shufflehi_epi16(a, imm8) \ (__extension__ ({ \ @@ -5425,6 +5605,30 @@ simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8) simde_mm_shufflehi_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflehi_epi16_a_, (((imm8) >> 6) & 0x3) + 4), simde_mm_shufflehi_epi16_r_, 7); \ simde__m128i_from_neon_i16(simde_mm_shufflehi_epi16_r_); \ })) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + simde__m128i_from_private((simde__m128i_private) { .wasm_v128 = \ + wasm_i16x8_shuffle( \ + (simde_tmp_a_).wasm_v128, \ + (simde_tmp_a_).wasm_v128, \ + 0, 1, 2, 3, \ + (((imm8) ) & 3) + 4, \ + 
(((imm8) >> 2) & 3) + 4, \ + (((imm8) >> 4) & 3) + 4, \ + (((imm8) >> 6) & 3) + 4) }); })) +#elif defined(SIMDE_SHUFFLE_VECTOR_) + #define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + simde__m128i_from_private((simde__m128i_private) { .i16 = \ + SIMDE_SHUFFLE_VECTOR_(16, 16, \ + (simde_tmp_a_).i16, \ + (simde_tmp_a_).i16, \ + 0, 1, 2, 3, \ + (((imm8) ) & 3) + 4, \ + (((imm8) >> 2) & 3) + 4, \ + (((imm8) >> 4) & 3) + 4, \ + (((imm8) >> 6) & 3) + 4) }); })) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8) @@ -5450,18 +5654,17 @@ simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8) } #if defined(SIMDE_X86_SSE2_NATIVE) #define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8)) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde__tmp_a_ = simde__m128i_to_private(a); \ - simde__m128i_from_private((simde__m128i_private) { .i16 = \ - SIMDE_SHUFFLE_VECTOR_(16, 16, \ - (simde__tmp_a_).i16, \ - (simde__tmp_a_).i16, \ - (((imm8) ) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3), \ - (((imm8) >> 6) & 3), \ - 4, 5, 6, 7) }); })) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_mm_shufflelo_epi16(a, imm8) \ + simde__m128i_from_wasm_v128( \ + wasm_i16x8_shuffle( \ + simde__m128i_to_wasm_v128((a)), \ + wasm_i16x8_splat(0), \ + (((imm8) & 0x03) ), \ + (((imm8) & 0x0c) >> 2), \ + (((imm8) & 0x30) >> 4), \ + (((imm8) & 0xc0) >> 6), \ + 4, 5, 6, 7)) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) #define simde_mm_shufflelo_epi16(a, imm8) \ (__extension__({ \ @@ -5473,6 +5676,18 @@ simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8) simde_mm_shufflelo_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflelo_epi16_a_, (((imm8) >> 6) & 0x3)), simde_mm_shufflelo_epi16_r_, 3); \ simde__m128i_from_neon_i16(simde_mm_shufflelo_epi16_r_); \ })) +#elif defined(SIMDE_SHUFFLE_VECTOR_) + #define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \ + const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ + simde__m128i_from_private((simde__m128i_private) { .i16 = \ + SIMDE_SHUFFLE_VECTOR_(16, 16, \ + (simde_tmp_a_).i16, \ + (simde_tmp_a_).i16, \ + (((imm8) ) & 3), \ + (((imm8) >> 2) & 3), \ + (((imm8) >> 4) & 3), \ + (((imm8) >> 6) & 3), \ + 4, 5, 6, 7) }); })) #endif #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) #define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8) @@ -6194,6 +6409,8 @@ simde_mm_store_sd (simde_float64* mem_addr, simde__m128d a) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) const int64_t v = vgetq_lane_s64(a_.neon_i64, 0); simde_memcpy(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), &v, sizeof(v)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 0); #else simde_float64 v = a_.f64[0]; simde_memcpy(mem_addr, &v, sizeof(simde_float64)); @@ -6233,6 +6450,8 @@ void #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) *mem_addr = vgetq_lane_f64(a_.neon_f64, 1); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 1); #else *mem_addr = a_.f64[1]; #endif @@ -6277,6 +6496,8 @@ void simde_mm_storel_pd (simde_float64* mem_addr, simde__m128d a) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_storel_pd(mem_addr, a); + #elif 
defined(SIMDE_WASM_SIMD128_NATIVE) + wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), simde__m128d_to_wasm_v128(a), 0); #else simde__m128d_private a_ = simde__m128d_to_private(a); @@ -6303,6 +6524,9 @@ simde_mm_storer_pd (simde_float64 mem_addr[2], simde__m128d a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), vextq_s64(a_.neon_i64, a_.neon_i64, 1)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + a_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, a_.wasm_v128, 1, 0); + simde_mm_store_pd(mem_addr, simde__m128d_from_private(a_)); #elif defined(SIMDE_SHUFFLE_VECTOR_) a_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 1, 0); simde_mm_store_pd(mem_addr, simde__m128d_from_private(a_)); @@ -6369,6 +6593,8 @@ simde_mm_storeu_si32 (void* mem_addr, simde__m128i a) { HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ HEDLEY_INTEL_VERSION_CHECK(20,21,1)) _mm_storeu_si32(mem_addr, a); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + wasm_v128_store32_lane(mem_addr, simde__m128i_to_wasm_v128(a), 0); #else int32_t val = simde_mm_cvtsi128_si32(a); simde_memcpy(mem_addr, &val, sizeof(val)); @@ -6400,8 +6626,13 @@ void simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_stream_pd(mem_addr, a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ + defined(SIMDE_VECTOR_SUBSCRIPT) || defined(SIMDE_ARM_NEON_A64V8_NATIVE) || \ + defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || \ + defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) + __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else - simde_memcpy(mem_addr, &a, sizeof(a)); + simde_mm_store_pd(mem_addr, a); #endif } #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) @@ -6413,8 +6644,13 @@ void simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ + defined(SIMDE_VECTOR_SUBSCRIPT) || defined(SIMDE_ARM_NEON_A32V7_NATIVE) || \ + defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ + defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) + __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); #else - simde_memcpy(mem_addr, &a, sizeof(a)); + simde_mm_store_si128(mem_addr, a); #endif } #if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) @@ -6426,6 +6662,10 @@ void simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) _mm_stream_si32(mem_addr, a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, mem_addr); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1q_lane_s32(mem_addr, vdupq_n_s32(a), 0); #else *mem_addr = a; #endif @@ -6439,6 +6679,10 @@ void simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) { #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(HEDLEY_MSVC_VERSION) _mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(long long int*, int64_t*, mem_addr), a); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, mem_addr); + #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) + vst1_s64(mem_addr, vdup_n_s64(a)); #else *mem_addr = a; #endif @@ -6462,6 +6706,8 @@ simde_mm_sub_epi8 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8); + #elif 
defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i8x16_sub(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i8 = a_.i8 - b_.i8; #else @@ -6491,6 +6737,8 @@ simde_mm_sub_epi16 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_sub(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i16 = a_.i16 - b_.i16; #else @@ -6520,6 +6768,8 @@ simde_mm_sub_epi32 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_sub(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32 = a_.i32 - b_.i32; #else @@ -6549,6 +6799,8 @@ simde_mm_sub_epi64 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_sub(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = a_.i64 - b_.i64; #else @@ -7005,15 +7257,6 @@ simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) { #define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b) #endif -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ -#endif - -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - HEDLEY_DIAGNOSTIC_POP -#endif - SIMDE_FUNCTION_ATTRIBUTES void simde_mm_lfence (void) { @@ -7058,6 +7301,8 @@ simde_mm_unpackhi_epi8 (simde__m128i a, simde__m128i b) { int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16)); int8x8x2_t result = vzip_s8(a1, b1); r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i8x16_shuffle(a_.wasm_v128, b_.wasm_v128, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); #else @@ -7093,6 +7338,8 @@ simde_mm_unpackhi_epi16 (simde__m128i a, simde__m128i b) { int16x4_t b1 = vget_high_s16(b_.neon_i16); int16x4x2_t result = vzip_s16(a1, b1); r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 4, 12, 5, 13, 6, 14, 7, 15); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6, 14, 7, 15); #else @@ -7128,6 +7375,8 @@ simde_mm_unpackhi_epi32 (simde__m128i a, simde__m128i b) { int32x2_t b1 = vget_high_s32(b_.neon_i32); int32x2x2_t result = vzip_s32(a1, b1); r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 6, 3, 7); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7); #else @@ -7160,6 +7409,8 @@ simde_mm_unpackhi_epi64 (simde__m128i a, simde__m128i b) { int64x1_t a_h = vget_high_s64(a_.neon_i64); int64x1_t b_h = vget_high_s64(b_.neon_i64); r_.neon_i64 = vcombine_s64(a_h, b_h); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3); #else @@ 
-7227,6 +7478,8 @@ simde_mm_unpacklo_epi8 (simde__m128i a, simde__m128i b) { int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16)); int8x8x2_t result = vzip_s8(a1, b1); r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i8x16_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); #else @@ -7262,6 +7515,8 @@ simde_mm_unpacklo_epi16 (simde__m128i a, simde__m128i b) { int16x4_t b1 = vget_low_s16(b_.neon_i16); int16x4x2_t result = vzip_s16(a1, b1); r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 8, 1, 9, 2, 10, 3, 11); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2, 10, 3, 11); #else @@ -7297,6 +7552,8 @@ simde_mm_unpacklo_epi32 (simde__m128i a, simde__m128i b) { int32x2_t b1 = vget_low_s32(b_.neon_i32); int32x2x2_t result = vzip_s32(a1, b1); r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 4, 1, 5); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5); #else @@ -7329,6 +7586,8 @@ simde_mm_unpacklo_epi64 (simde__m128i a, simde__m128i b) { int64x1_t a_l = vget_low_s64(a_.neon_i64); int64x1_t b_l = vget_low_s64(b_.neon_i64); r_.neon_i64 = vcombine_s64(a_l, b_l); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2); #else @@ -7359,6 +7618,8 @@ simde_mm_unpacklo_pd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vzip1q_f64(a_.neon_f64, b_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2); #else @@ -7421,6 +7682,8 @@ simde_mm_xor_si128 (simde__m128i a, simde__m128i b) { r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32); #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_v128_xor(b_.wasm_v128, a_.wasm_v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i32f = a_.i32f ^ b_.i32f; #else diff --git a/lib/simd_wrapper/simde/x86/sse3.h b/lib/simd_wrapper/simde/x86/sse3.h index f46e2798a21..db2683c3027 100644 --- a/lib/simd_wrapper/simde/x86/sse3.h +++ b/lib/simd_wrapper/simde/x86/sse3.h @@ -46,6 +46,8 @@ simde_x_mm_deinterleaveeven_epi16 (simde__m128i a, simde__m128i b) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) int16x8x2_t t = vuzpq_s16(a_.neon_i16, b_.neon_i16); r_.neon_i16 = t.val[0]; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2, 4, 6, 8, 10, 12, 14); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 2, 4, 6, 8, 10, 12, 14); #else @@ -72,6 +74,8 @@ simde_x_mm_deinterleaveodd_epi16 (simde__m128i a, simde__m128i b) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) int16x8x2_t t = vuzpq_s16(a_.neon_i16, b_.neon_i16); 
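These WASM fallbacks express each interleave as a single shuffle whose index list mirrors the SIMDE_SHUFFLE_VECTOR_ branch right below it: indices smaller than the lane count select from the first operand, the rest from the second. A minimal scalar sketch of the 16-bit interleave encoded by the unpacklo indices (0, 8, 1, 9, 2, 10, 3, 11) and the unpackhi indices (4, 12, 5, 13, 6, 14, 7, 15); the helper name and the use of std::array are illustrative only, not part of SIMDe:

#include <array>
#include <cstdint>

// Scalar model of _mm_unpacklo_epi16 / _mm_unpackhi_epi16.
static std::array<int16_t, 8> interleave16(const std::array<int16_t, 8>& a,
                                           const std::array<int16_t, 8>& b,
                                           bool high)
{
    std::array<int16_t, 8> r{};
    const int base = high ? 4 : 0;       // unpackhi reads the upper half of each input
    for (int i = 0; i < 4; i++)
    {
        r[2 * i]     = a[base + i];      // even result lanes come from a
        r[2 * i + 1] = b[base + i];      // odd result lanes come from b
    }
    return r;
}

The deinterleave helpers in sse3.h are the same idea with even/odd index lists: 0, 2, 4, ... picks the even lanes of the concatenated inputs, 1, 3, 5, ... the odd ones.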
r_.neon_i16 = t.val[1]; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3, 5, 7, 9, 11, 13, 15); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 1, 3, 5, 7, 9, 11, 13, 15); #else @@ -98,6 +102,8 @@ simde_x_mm_deinterleaveeven_epi32 (simde__m128i a, simde__m128i b) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) int32x4x2_t t = vuzpq_s32(a_.neon_i32, b_.neon_i32); r_.neon_i32 = t.val[0]; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2, 4, 6); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 2, 4, 6); #else @@ -124,6 +130,8 @@ simde_x_mm_deinterleaveodd_epi32 (simde__m128i a, simde__m128i b) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) int32x4x2_t t = vuzpq_s32(a_.neon_i32, b_.neon_i32); r_.neon_i32 = t.val[1]; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3, 5, 7); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 1, 3, 5, 7); #else @@ -150,6 +158,8 @@ simde_x_mm_deinterleaveeven_ps (simde__m128 a, simde__m128 b) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32x4x2_t t = vuzpq_f32(a_.neon_f32, b_.neon_f32); r_.neon_f32 = t.val[0]; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2, 4, 6); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 2, 4, 6); #else @@ -176,6 +186,8 @@ simde_x_mm_deinterleaveodd_ps (simde__m128 a, simde__m128 b) { #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) float32x4x2_t t = vuzpq_f32(a_.neon_f32, b_.neon_f32); r_.neon_f32 = t.val[1]; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3, 5, 7); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 1, 3, 5, 7); #else @@ -199,6 +211,8 @@ simde_x_mm_deinterleaveeven_pd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vuzp1q_f64(a_.neon_f64, b_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2); #else @@ -222,6 +236,8 @@ simde_x_mm_deinterleaveodd_pd (simde__m128d a, simde__m128d b) { #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vuzp2q_f64(a_.neon_f64, b_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3); #elif defined(SIMDE_SHUFFLE_VECTOR_) r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3); #else @@ -294,7 +310,7 @@ simde_mm_addsub_ps (simde__m128 a, simde__m128 b) { #endif } #if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_addsub_ps(a, b) simde_mm_addsub_ps(a, b) +# define _mm_addsub_ps(a, b) simde_mm_addsub_ps((a), (b)) #endif SIMDE_FUNCTION_ATTRIBUTES diff --git a/lib/simd_wrapper/simde/x86/sse4.1.h b/lib/simd_wrapper/simde/x86/sse4.1.h index 57f1029c19f..15a197b95b3 100644 --- a/lib/simd_wrapper/simde/x86/sse4.1.h +++ b/lib/simd_wrapper/simde/x86/sse4.1.h @@ -352,6 +352,9 @@ simde__m128d simde_mm_blendv_pd (simde__m128d a, simde__m128d b, simde__m128d mask) { #if defined(SIMDE_X86_SSE4_1_NATIVE) return _mm_blendv_pd(a, b, mask); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t m_ = 
wasm_i64x2_shr(HEDLEY_REINTERPRET_CAST(v128_t, mask), 63); + return simde__m128d_from_wasm_v128(wasm_v128_bitselect(simde__m128d_to_wasm_v128(b), simde__m128d_to_wasm_v128(a), m_)); #else return simde_mm_castsi128_pd(simde_x_mm_blendv_epi64(simde_mm_castpd_si128(a), simde_mm_castpd_si128(b), simde_mm_castpd_si128(mask))); #endif @@ -366,6 +369,9 @@ simde__m128 simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) { #if defined(SIMDE_X86_SSE4_1_NATIVE) return _mm_blendv_ps(a, b, mask); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t m_ = wasm_i32x4_shr(HEDLEY_REINTERPRET_CAST(v128_t, mask), 31); + return simde__m128d_from_wasm_v128(wasm_v128_bitselect(simde__m128d_to_wasm_v128(b), simde__m128d_to_wasm_v128(a), m_)); #else return simde_mm_castsi128_ps(simde_x_mm_blendv_epi32(simde_mm_castps_si128(a), simde_mm_castps_si128(b), simde_mm_castps_si128(mask))); #endif @@ -395,6 +401,8 @@ simde_mm_round_pd (simde__m128d a, int rounding) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64)); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vrndiq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); #elif defined(simde_math_nearbyint) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -410,6 +418,8 @@ simde_mm_round_pd (simde__m128d a, int rounding) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64)); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vrndaq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); #elif defined(simde_math_roundeven) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -425,6 +435,8 @@ simde_mm_round_pd (simde__m128d a, int rounding) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_floor(a_.altivec_f64)); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vrndmq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_floor(a_.wasm_v128); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -438,6 +450,8 @@ simde_mm_round_pd (simde__m128d a, int rounding) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.altivec_f64)); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vrndpq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_ceil(a_.wasm_v128); #elif defined(simde_math_ceil) SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -453,6 +467,8 @@ simde_mm_round_pd (simde__m128d a, int rounding) r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.altivec_f64)); #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) r_.neon_f64 = vrndq_f64(a_.neon_f64); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_f64x2_trunc(a_.wasm_v128); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { @@ -478,6 +494,9 @@ simde_mm_round_pd (simde__m128d a, int rounding) SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_ceil_pd (simde__m128d a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128d_from_wasm_v128(wasm_f64x2_ceil(simde__m128d_to_wasm_v128(a))); + #endif return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF); } #if 
defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) @@ -488,6 +507,9 @@ simde_mm_ceil_pd (simde__m128d a) { SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_ceil_ps (simde__m128 a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128_from_wasm_v128(wasm_f32x4_ceil(simde__m128_to_wasm_v128(a))); + #endif return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF); } #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) @@ -602,6 +624,8 @@ simde_mm_cvtepi8_epi16 (simde__m128i a) { int8x16_t s8x16 = a_.neon_i8; /* xxxx xxxx xxxx DCBA */ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ r_.neon_i16 = s16x8; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i16x8_extend_low_i8x16(a_.wasm_v128); #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8, -1, 0, -1, 1, -1, 2, -1, 3, @@ -643,6 +667,8 @@ simde_mm_cvtepi8_epi32 (simde__m128i a) { int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ r_.neon_i32 = s32x4; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(a_.wasm_v128)); #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8, -1, -1, -1, 0, -1, -1, -1, 1, @@ -679,6 +705,10 @@ simde_mm_cvtepi8_epi64 (simde__m128i a) { int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ r_.neon_i64 = s64x2; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t extra = wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(a_.wasm_v128)); + v128_t sign = wasm_i32x4_gt(wasm_i64x2_const(0, 0), extra); + r_.wasm_v128 = wasm_i32x4_shuffle(extra, sign, 0, 4, 1, 5); #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) /* Disabled on x86 due to lack of 64-bit arithmetic shift until * until AVX-512 (at which point we would be using the native @@ -718,6 +748,8 @@ simde_mm_cvtepu8_epi16 (simde__m128i a) { uint8x16_t u8x16 = a_.neon_u8; /* xxxx xxxx xxxx DCBA */ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ r_.neon_u16 = u16x8; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u16x8_extend_low_u8x16(a_.wasm_v128); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(r_.i8) z = { 0, }; r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z, @@ -765,6 +797,8 @@ simde_mm_cvtepu8_epi32 (simde__m128i a) { uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ r_.neon_u32 = u32x4; + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(a_.wasm_v128)); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(r_.i8) z = { 0, }; r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z, @@ -845,6 +879,8 @@ simde_mm_cvtepi16_epi32 (simde__m128i a) { #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_i32 = vmovl_s16(vget_low_s16(a_.neon_i16)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i32x4_extend_low_i16x8(a_.wasm_v128); #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16, 8, 0, 10, 1, 12, 2, 14, 3)); r_.i32 >>= 16; @@ -877,6 +913,8 @@ simde_mm_cvtepu16_epi32 (simde__m128i a) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) r_.neon_u32 = vmovl_u16(vget_low_u16(a_.neon_u16)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_u32x4_extend_low_u16x8(a_.wasm_v128); #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) __typeof__(r_.u16) z = { 0, }; r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z, @@ -1206,7 +1244,9 @@ simde_mm_extract_epi8 (simde__m128i a, const int imm8) #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8) # define simde_mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int8_t, _mm_extract_epi8(a, imm8)) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_extract_epi8(a, imm8) vgetq_lane_s8(simde__m128i_to_private(a).neon_i8, imm8) +# define simde_mm_extract_epi8(a, imm8) vgetq_lane_s8(simde__m128i_to_neon_i8(a), imm8) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +# define simde_mm_extract_epi8(a, imm8) wasm_u8x16_extract_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 15) #endif #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) #undef _mm_extract_epi8 @@ -1236,9 +1276,11 @@ simde_mm_extract_epi32 (simde__m128i a, const int imm8) #if defined(SIMDE_X86_SSE4_1_NATIVE) # define simde_mm_extract_epi32(a, imm8) _mm_extract_epi32(a, imm8) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_extract_epi32(a, imm8) vgetq_lane_s32(simde__m128i_to_private(a).neon_i32, imm8) +# define simde_mm_extract_epi32(a, imm8) vgetq_lane_s32(simde__m128i_to_neon_i32(a), imm8) #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) -# define simde_mm_extract_epi32(a, imm8) HEDLEY_STATIC_CAST(int32_t, vec_extract(simde__m128i_to_private(a).altivec_i32, imm8)) +# define simde_mm_extract_epi32(a, imm8) HEDLEY_STATIC_CAST(int32_t, vec_extract(simde__m128i_to_altivec_i32(a), imm8)) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +# define simde_mm_extract_epi32(a, imm8) wasm_i32x4_extract_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 3) #endif #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) #undef _mm_extract_epi32 @@ -1268,9 +1310,9 @@ simde_mm_extract_epi64 (simde__m128i a, const int imm8) #if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64) # define simde_mm_extract_epi64(a, imm8) _mm_extract_epi64(a, imm8) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(simde__m128i_to_private(a).neon_i64, imm8) +# define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(simde__m128i_to_neon_i64(a), imm8) #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) -# define simde_mm_extract_epi64(a, imm8) HEDLEY_STATIC_CAST(int64_t, vec_extract(simde__m128i_to_private(a).altivec_i64, imm8)) +# define simde_mm_extract_epi64(a, imm8) HEDLEY_STATIC_CAST(int64_t, vec_extract(simde__m128i_to_altivec_i64(a), imm8)) #endif #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) #undef _mm_extract_epi64 @@ -1292,7 +1334,9 @@ 
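In the WASM variants of the extract-lane macros above, the immediate is masked ((imm8) & 15 for bytes, (imm8) & 3 for 32-bit lanes) because wasm_u8x16_extract_lane and wasm_i32x4_extract_lane require an in-range constant, while the corresponding x86 instructions simply ignore the upper immediate bits; the mask keeps both paths in agreement. A small illustrative snippet (the function and values are made up for the example; the include path follows the vendored lib/simd_wrapper layout):

#include "simde/x86/sse4.1.h"

int extract_example(void)
{
    simde__m128i v = simde_mm_set_epi32(3, 2, 1, 0);   // lane 0 holds 0, lane 3 holds 3
    return simde_mm_extract_epi32(v, 6);               // on the WASM path the lane index wraps: 6 & 3 == 2
}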
simde_mm_extract_ps (simde__m128 a, const int imm8) #if defined(SIMDE_X86_SSE4_1_NATIVE) #define simde_mm_extract_ps(a, imm8) _mm_extract_ps(a, imm8) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_extract_ps(a, imm8) vgetq_lane_s32(simde__m128_to_private(a).neon_i32, imm8) + #define simde_mm_extract_ps(a, imm8) vgetq_lane_s32(simde__m128_to_neon_i32(a), imm8) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) + #define simde_mm_extract_ps(a, imm8) wasm_i32x4_extract_lane(simde__m128_to_wasm_v128((a)), (imm8) & 3) #endif #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) #undef _mm_extract_ps @@ -1302,6 +1346,9 @@ simde_mm_extract_ps (simde__m128 a, const int imm8) SIMDE_FUNCTION_ATTRIBUTES simde__m128d simde_mm_floor_pd (simde__m128d a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128d_from_wasm_v128(wasm_f64x2_floor(simde__m128d_to_wasm_v128(a))); + #endif return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF); } #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) @@ -1312,6 +1359,9 @@ simde_mm_floor_pd (simde__m128d a) { SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_floor_ps (simde__m128 a) { + #if defined(SIMDE_WASM_SIMD128_NATIVE) + return simde__m128_from_wasm_v128(wasm_f32x4_floor(simde__m128_to_wasm_v128(a))); + #endif return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF); } #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) @@ -1399,6 +1449,8 @@ simde_mm_insert_epi8 (simde__m128i a, int i, const int imm8) #endif #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) # define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_neon_i8(vsetq_lane_s8(i, simde__m128i_to_neon_i8(a), imm8)) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +# define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_wasm_v128(wasm_i8x16_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 15, HEDLEY_STATIC_CAST(int8_t, (i)))) #endif #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) #undef _mm_insert_epi8 @@ -1424,6 +1476,8 @@ simde_mm_insert_epi32 (simde__m128i a, int i, const int imm8) #endif #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) # define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_neon_i32(vsetq_lane_s32(i, simde__m128i_to_neon_i32(a), imm8)) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +# define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_wasm_v128(wasm_i32x4_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 3, (i))) #endif #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) #undef _mm_insert_epi32 @@ -1461,6 +1515,8 @@ simde_mm_insert_epi64 (simde__m128i a, int64_t i, const int imm8) # define simde_mm_insert_epi64(a, i, imm8) _mm_insert_epi64(a, i, imm8) #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) # define simde_mm_insert_epi64(a, i, imm8) simde__m128i_from_neon_i64(vsetq_lane_s64(i, simde__m128i_to_neon_i64(a), imm8)) +#elif defined(SIMDE_WASM_SIMD128_NATIVE) +# define simde_mm_insert_epi64(a, i, imm8) simde__m128i_from_wasm_v128(wasm_i64x2_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 1, (i))) #endif #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) #undef _mm_insert_epi64 @@ -1476,12 +1532,12 @@ simde_mm_insert_ps (simde__m128 a, simde__m128 b, const int imm8) a_ = simde__m128_to_private(a), b_ = simde__m128_to_private(b); - a_.f32[0] = b_.f32[(imm8 >> 6) & 3]; - a_.f32[(imm8 >> 4) & 3] = a_.f32[0]; + float tmp1_ = b_.f32[(imm8 >> 6) & 3]; + a_.f32[(imm8 >> 4) & 3] = tmp1_; SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (imm8 >> i) ? 
SIMDE_FLOAT32_C(0.0) : a_.f32[i]; + r_.f32[i] = ((imm8 >> i) & 1 ) ? SIMDE_FLOAT32_C(0.0) : a_.f32[i]; } return simde__m128_from_private(r_); @@ -2083,10 +2139,13 @@ simde__m128i simde_mm_stream_load_si128 (const simde__m128i* mem_addr) { #if defined(SIMDE_X86_SSE4_1_NATIVE) return _mm_stream_load_si128(HEDLEY_CONST_CAST(simde__m128i*, mem_addr)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vreinterpretq_s64_s32(vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr))); + #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_load) && ( \ + defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_VECTOR_SUBSCRIPT) || \ + defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ + defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) + return __builtin_nontemporal_load(mem_addr); #else - return *mem_addr; + return simde_mm_load_si128(mem_addr); #endif } #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) @@ -2106,7 +2165,9 @@ simde_mm_test_all_ones (simde__m128i a) { #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r = vec_all_eq(a_.altivec_i32, vec_splats(~0)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return r = ((vgetq_lane_s64(a_.neon_i64, 0) & vgetq_lane_s64(a_.neon_i64, 1)) == ~HEDLEY_STATIC_CAST(int64_t, 0)); + r = ((vgetq_lane_s64(a_.neon_i64, 0) & vgetq_lane_s64(a_.neon_i64, 1)) == ~HEDLEY_STATIC_CAST(int64_t, 0)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = HEDLEY_STATIC_CAST(unsigned long long, wasm_i64x2_extract_lane(a_.wasm_v128, 0) & wasm_i64x2_extract_lane(a_.wasm_v128, 1)) == 0xFFFFFFFFFFFFFFFFull; #else int_fast32_t r_ = ~HEDLEY_STATIC_CAST(int_fast32_t, 0); @@ -2138,7 +2199,9 @@ simde_mm_test_all_zeros (simde__m128i a, simde__m128i mask) { #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) r = vec_all_eq(tmp_.altivec_i32, vec_splats(0)); #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return !(vgetq_lane_s64(tmp_.neon_i64, 0) | vgetq_lane_s64(tmp_.neon_i64, 1)); + r = !(vgetq_lane_s64(tmp_.neon_i64, 0) | vgetq_lane_s64(tmp_.neon_i64, 1)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r = (wasm_i64x2_extract_lane(tmp_.wasm_v128, 0) | wasm_i64x2_extract_lane(tmp_.wasm_v128, 1)) == 0; #else int_fast32_t r_ = HEDLEY_STATIC_CAST(int_fast32_t, 0); @@ -2172,6 +2235,13 @@ simde_mm_test_mix_ones_zeros (simde__m128i a, simde__m128i mask) { int64x2_t s640 = vandq_s64(a_.neon_i64, mask_.neon_i64); int64x2_t s641 = vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a_.neon_i64))), mask_.neon_i64); return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) & (vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)))!=0); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t m = wasm_v128_and(a_.wasm_v128, mask_.wasm_v128); + long long c0 = wasm_i64x2_extract_lane(m, 0); + long long c1 = wasm_i64x2_extract_lane(m, 1); + long long ones = c0 | c1; + long long zeros = ~(c0 & c1); + return ones && zeros; #else for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) if (((a_.u64[i] & mask_.u64[i]) != 0) && ((~a_.u64[i] & mask_.u64[i]) != 0)) @@ -2198,7 +2268,10 @@ simde_mm_testc_si128 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int64x2_t s64 = vbicq_s64(b_.neon_i64, a_.neon_i64); - return !(vgetq_lane_s64(s64, 0) & vgetq_lane_s64(s64, 1)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t m = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); + return (wasm_i64x2_extract_lane(m, 0) | wasm_i64x2_extract_lane(m, 1)) == 0; #else int_fast32_t r = 0; @@ -2229,7 +2302,13 @@ 
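The test-intrinsic changes above (simde_mm_testc_si128 now ORs the two 64-bit halves of b & ~a before negating, and simde_mm_testz_si128 only reports 1 when every qword of a & b is zero) match the PTEST flag definitions: ZF is set when a AND b is all zeros, CF is set when NOT a AND b is all zeros, and the testnzc ("mix") case is the one where neither flag would be set. A scalar reference over two 64-bit halves per vector; the helper names are illustrative, not SIMDe API:

#include <cstdint>

static int ref_testz(const uint64_t a[2], const uint64_t b[2])
{   // ZF: no bit is set in both a and b
    return ((a[0] & b[0]) | (a[1] & b[1])) == 0;
}

static int ref_testc(const uint64_t a[2], const uint64_t b[2])
{   // CF: every bit set in b is also set in a
    return ((~a[0] & b[0]) | (~a[1] & b[1])) == 0;
}

static int ref_testnzc(const uint64_t a[2], const uint64_t b[2])
{   // true only when a & b has some bit set and ~a & b has some bit set
    return !ref_testz(a, b) && !ref_testc(a, b);
}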
simde_mm_testnzc_si128 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int64x2_t s640 = vandq_s64(b_.neon_i64, a_.neon_i64); int64x2_t s641 = vbicq_s64(b_.neon_i64, a_.neon_i64); - return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) & (vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)))!=0); + return !( !(vgetq_lane_s64(s641, 0) || vgetq_lane_s64(s641, 1)) \ + || !(vgetq_lane_s64(s640, 0) || vgetq_lane_s64(s640, 1)) ); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t m1 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); + v128_t m2 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); + return (wasm_i64x2_extract_lane(m1, 0) | wasm_i64x2_extract_lane(m1, 1)) \ + && (wasm_i64x2_extract_lane(m2, 0) | wasm_i64x2_extract_lane(m2, 1)); #else for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { if (((a_.u64[i] & b_.u64[i]) != 0) && ((~a_.u64[i] & b_.u64[i]) != 0)) @@ -2258,14 +2337,22 @@ simde_mm_testz_si128 (simde__m128i a, simde__m128i b) { #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) int64x2_t s64 = vandq_s64(a_.neon_i64, b_.neon_i64); return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t m = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); + return (wasm_i64x2_extract_lane(m, 0) | wasm_i64x2_extract_lane(m, 1)) == 0; + #elif defined(SIMDE_HAVE_INT128_) + if ((a_.u128[0] & b_.u128[0]) == 0) { + return 1; + } + return 0; #else for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - if ((a_.u64[i] & b_.u64[i]) == 0) - return 1; + if ((a_.u64[i] & b_.u64[i]) > 0) + return 0; } #endif - return 0; + return 1; #endif } #if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) diff --git a/lib/simd_wrapper/simde/x86/sse4.2.h b/lib/simd_wrapper/simde/x86/sse4.2.h index 504fe2f0b95..ae9e7569e62 100644 --- a/lib/simd_wrapper/simde/x86/sse4.2.h +++ b/lib/simd_wrapper/simde/x86/sse4.2.h @@ -172,6 +172,8 @@ simde_mm_cmpgt_epi64 (simde__m128i a, simde__m128i b) { r_.neon_i64 = vshrq_n_s64(vqsubq_s64(b_.neon_i64, a_.neon_i64), 63); #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) r_.altivec_u64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), vec_cmpgt(a_.altivec_i64, b_.altivec_i64)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i64x2_gt(a_.wasm_v128, b_.wasm_v128); #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 > b_.i64); #else diff --git a/lib/simd_wrapper/simde/x86/ssse3.h b/lib/simd_wrapper/simde/x86/ssse3.h index 9c88f016f12..6c4c12d5f83 100644 --- a/lib/simd_wrapper/simde/x86/ssse3.h +++ b/lib/simd_wrapper/simde/x86/ssse3.h @@ -334,6 +334,9 @@ simde_mm_shuffle_epi8 (simde__m128i a, simde__m128i b) { SIMDE_POWER_ALTIVEC_VECTOR(signed char) msb_mask = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmplt(b_.altivec_i8, z)); SIMDE_POWER_ALTIVEC_VECTOR(signed char) c = vec_perm(a_.altivec_i8, a_.altivec_i8, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), b_.altivec_i8)); r_.altivec_i8 = vec_sel(c, z, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), msb_mask)); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + r_.wasm_v128 = wasm_i8x16_swizzle( + a_.wasm_v128, wasm_v128_and(b_.wasm_v128, wasm_i8x16_splat(0x8F))); #else for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { r_.i8[i] = a_.i8[b_.i8[i] & 15] & (~(b_.i8[i]) >> 7); @@ -763,6 +766,15 @@ simde_mm_mulhrs_epi16 (simde__m128i a, simde__m128i b) { /* Join together */ r_.neon_i16 = 
vcombine_s16(narrow_lo, narrow_hi); + #elif defined(SIMDE_WASM_SIMD128_NATIVE) + v128_t __lo = wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(a_.wasm_v128), wasm_i32x4_extend_low_i16x8(b_.wasm_v128)); + v128_t __hi = wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(a_.wasm_v128), wasm_i32x4_extend_high_i16x8(b_.wasm_v128)); + const v128_t __inc = wasm_i32x4_splat(0x4000); + __lo = wasm_i32x4_add(__lo, __inc); + __hi = wasm_i32x4_add(__hi, __inc); + __lo = wasm_i32x4_add(__lo, __lo); + __hi = wasm_i32x4_add(__hi, __hi); + r_.wasm_v128 = wasm_i16x8_shuffle(__lo, __hi, 1, 3, 5, 7, 9, 11, 13, 15); #else SIMDE_VECTORIZE for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { diff --git a/lib/simd_wrapper/simde/x86/svml.h b/lib/simd_wrapper/simde/x86/svml.h index 81509e96ab1..40fe0cd6d7d 100644 --- a/lib/simd_wrapper/simde/x86/svml.h +++ b/lib/simd_wrapper/simde/x86/svml.h @@ -49,18 +49,10 @@ #include "../simde-complex.h" -#if !defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) -# define SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES -#endif - HEDLEY_DIAGNOSTIC_PUSH SIMDE_DISABLE_UNWANTED_DIAGNOSTICS SIMDE_BEGIN_DECLS_ -#if !defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) -# define SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES -#endif - SIMDE_FUNCTION_ATTRIBUTES simde__m128 simde_mm_acos_ps (simde__m128 a) { @@ -2683,7 +2675,7 @@ simde_mm512_mask_cosh_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epi8(a, b); #else simde__m128i_private @@ -2713,7 +2705,7 @@ simde_mm_div_epi8 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epi16(a, b); #else simde__m128i_private @@ -2743,7 +2735,7 @@ simde_mm_div_epi16 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epi32(a, b); #else simde__m128i_private @@ -2776,7 +2768,7 @@ simde_mm_div_epi32 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epi64(a, b); #else simde__m128i_private @@ -2806,7 +2798,7 @@ simde_mm_div_epi64 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epu8(a, b); #else simde__m128i_private @@ -2836,7 +2828,7 @@ simde_mm_div_epu8 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epu16(a, b); #else 
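The SSSE3 _mm_mulhrs_epi16 fallback added above widens both inputs to 32 bits, multiplies, adds the 0x4000 rounding constant, doubles each sum, and then keeps the upper 16 bits of every 32-bit lane by shuffling out the odd half-words; that is the rounded high product the intrinsic defines, (a * b + 0x4000) >> 15. A per-lane scalar sketch (the helper name is illustrative):

#include <cstdint>

static int16_t mulhrs_lane(int16_t a, int16_t b)
{
    int32_t p = (int32_t)a * (int32_t)b;   // widened product, as in the extend+multiply step
    p += 0x4000;                           // rounding constant
    return (int16_t)(p >> 15);             // equal to doubling the sum and taking bits 31..16
}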
simde__m128i_private @@ -2866,7 +2858,7 @@ simde_mm_div_epu16 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epu32(a, b); #else simde__m128i_private @@ -2899,7 +2891,7 @@ simde_mm_div_epu32 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_div_epu64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_div_epu64(a, b); #else simde__m128i_private @@ -5101,7 +5093,7 @@ simde_mm512_mask_cdfnorm_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_idivrem_epi32 (simde__m128i* mem_addr, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_idivrem_epi32(HEDLEY_REINTERPRET_CAST(__m128i*, mem_addr), a, b); #else simde__m128i r; @@ -8825,7 +8817,7 @@ simde_mm_clog_ps (simde__m128 a) { SIMDE_FUNCTION_ATTRIBUTES simde__m256 simde_mm256_clog_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) return _mm256_clog_ps(a); #else simde__m256_private @@ -8880,7 +8872,7 @@ simde_mm_csqrt_ps (simde__m128 a) { SIMDE_FUNCTION_ATTRIBUTES simde__m256 simde_mm256_csqrt_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) return _mm256_csqrt_ps(a); #else simde__m256_private @@ -8910,7 +8902,7 @@ simde_mm256_csqrt_ps (simde__m256 a) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epi8(a, b); #else simde__m128i_private @@ -8938,7 +8930,7 @@ simde_mm_rem_epi8 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epi16(a, b); #else simde__m128i_private @@ -8966,7 +8958,7 @@ simde_mm_rem_epi16 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epi32(a, b); #else simde__m128i_private @@ -8997,7 +8989,7 @@ simde_mm_rem_epi32 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epi64(a, b); #else simde__m128i_private @@ -9025,7 +9017,7 @@ simde_mm_rem_epi64 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if 
defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epu8(a, b); #else simde__m128i_private @@ -9053,7 +9045,7 @@ simde_mm_rem_epu8 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epu16(a, b); #else simde__m128i_private @@ -9081,7 +9073,7 @@ simde_mm_rem_epu16 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epu32(a, b); #else simde__m128i_private @@ -9112,7 +9104,7 @@ simde_mm_rem_epu32 (simde__m128i a, simde__m128i b) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_rem_epu64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_rem_epu64(a, b); #else simde__m128i_private @@ -12095,7 +12087,7 @@ simde_mm512_mask_trunc_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { SIMDE_FUNCTION_ATTRIBUTES simde__m128i simde_mm_udivrem_epi32 (simde__m128i * mem_addr, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) + #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) return _mm_udivrem_epi32(mem_addr, a, b); #else simde__m128i r; diff --git a/lib/simd_wrapper/simde/x86/xop.h b/lib/simd_wrapper/simde/x86/xop.h index 8b83ed27930..5249f06d7e3 100644 --- a/lib/simd_wrapper/simde/x86/xop.h +++ b/lib/simd_wrapper/simde/x86/xop.h @@ -3727,7 +3727,7 @@ simde_mm256_permute2_pd (simde__m256d a, simde__m256d b, simde__m256i c, const i SIMDE_LCC_REVERT_DEPRECATED_WARNINGS \ })) #else - #define simde_mm256_permute2_pd(a, b, c, imm8) simde_undeprecated_mm256_permute2_pd((a), (b), (c), (imm8)) + #define simde_mm256_permute2_pd(a, b, c, imm8) _mm256_permute2_pd((a), (b), (c), (imm8)) #endif #endif #if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) diff --git a/src/config/favorite_track_status.cpp b/src/config/favorite_track_status.cpp new file mode 100644 index 00000000000..f5dedb43310 --- /dev/null +++ b/src/config/favorite_track_status.cpp @@ -0,0 +1,129 @@ +// +// SuperTuxKart - a fun racing game with go-kart +// Copyright (C) 2012-2015 SuperTuxKart-Team +// +// This program is free software; you can redistribute it and/or +// modify it under the terms of the GNU General Public License +// as published by the Free Software Foundation; either version 3 +// of the License, or (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
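The FavoriteTrackStatus class whose implementation starts below stores favorites as groups of track idents; the rest of this patch reaches it through thin PlayerProfile wrappers and uses it from the track and arena screens to toggle a track in or out of the default Favorites group. A minimal usage sketch, assuming a current player is set; the helper function and the track ident are made up for the example:

#include "config/player_manager.hpp"
#include <string>

void toggleFavorite(const std::string& ident)
{
    PlayerProfile* player = PlayerManager::getCurrentPlayer();
    if (player->isFavoriteTrack(ident))
        player->removeFavoriteTrack(ident);   // drops it from the default group
    else
        player->addFavoriteTrack(ident);      // inserts it into the "Favorites" group
}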
+
+#include "config/favorite_track_status.hpp"
+
+#include "config/player_manager.hpp"
+#include "io/utf_writer.hpp"
+#include "io/xml_node.hpp"
+#include "utils/string_utils.hpp"
+
+const std::string FavoriteTrackStatus::DEFAULT_FAVORITE_GROUP_NAME = "Favorites";
+
+//------------------------------------------------------------------------------
+FavoriteTrackStatus::FavoriteTrackStatus(const XMLNode* node)
+{
+    std::vector<XMLNode*> xml_favorite_tracks;
+    std::vector<XMLNode*> xml_favorite_groups;
+
+    if (node)
+    {
+        node->getNodes("track", xml_favorite_tracks);
+        node->getNodes("group", xml_favorite_groups);
+    }
+    for (unsigned int i = 0; i < xml_favorite_tracks.size(); i++)
+    {
+        std::string temp_string;
+        xml_favorite_tracks[i]->get("ident", &temp_string);
+        m_favorite_tracks[DEFAULT_FAVORITE_GROUP_NAME].insert(temp_string);
+    }
+    for (unsigned int i = 0; i < xml_favorite_groups.size(); i++)
+    {
+        std::string temp_group_string;
+        std::vector<XMLNode*> temp_group;
+
+        xml_favorite_groups[i]->get("name", &temp_group_string);
+        xml_favorite_groups[i]->getNodes("track", temp_group);
+
+        for (unsigned int j = 0; j < temp_group.size(); j++)
+        {
+            std::string temp_string;
+            temp_group[j]->get("ident", &temp_string);
+            m_favorite_tracks[temp_group_string].insert(temp_string);
+        }
+    }
+} // FavoriteTrackStatus
+
+//------------------------------------------------------------------------------
+FavoriteTrackStatus::~FavoriteTrackStatus()
+{
+
+} // ~FavoriteTrackStatus
+
+//------------------------------------------------------------------------------
+/** Checks whether a track is among the favorite tracks of this player profile
+ * (i.e. in the default favorite group).
+ * To be used only if this player profile is the current player.
+ */
+bool FavoriteTrackStatus::isFavoriteTrack(std::string ident)
+{
+    return m_favorite_tracks[DEFAULT_FAVORITE_GROUP_NAME].find(ident)
+        != m_favorite_tracks[DEFAULT_FAVORITE_GROUP_NAME].end();
+} // isFavoriteTrack
+
+//------------------------------------------------------------------------------
+/** Adds a new favorite track to this player profile and to the group
+ * of favorite tracks of the Track Manager.
+ */
+void FavoriteTrackStatus::addFavoriteTrack(std::string ident, std::string group)
+{
+    m_favorite_tracks[group].insert(ident);
+} // addFavoriteTrack
+
+//------------------------------------------------------------------------------
+/** Removes a favorite track from this player profile and from the group
+ * of favorite tracks of the Track Manager.
+ */
+void FavoriteTrackStatus::removeFavoriteTrack(std::string ident, std::string group)
+{
+    if (m_favorite_tracks[group].find(ident) != m_favorite_tracks[group].end())
+    {
+        m_favorite_tracks[group].erase(ident);
+    }
+} // removeFavoriteTrack
+
+//------------------------------------------------------------------------------
+/** Writes the data for this player to the specified UTFWriter.
+ * \param out The utf writer to write the data to.
+ */
+void FavoriteTrackStatus::save(UTFWriter &out)
+{
+    out << "      <favorites>\n";
+    for (auto it_group = m_favorite_tracks.begin(); it_group != m_favorite_tracks.end(); it_group++)
+    {
+        std::string group_name = it_group->first;
+
+        if (group_name == DEFAULT_FAVORITE_GROUP_NAME)
+        {
+            for (auto it_track = it_group->second.begin(); it_track != it_group->second.end(); it_track++)
+            {
+                out << "          <track ident=\"" << *it_track << "\"/>\n";
+            }
+        }
+        else
+        {
+            out << "          <group name=\"" << group_name << "\">\n";
+            for (auto it_track = it_group->second.begin(); it_track != it_group->second.end(); it_track++)
+            {
+                out << "              <track ident=\"" << *it_track << "\"/>\n";
+            }
+            out << "          </group>\n";
+        }
+    }
+    out << "      </favorites>\n";
+} // save
diff --git a/src/config/favorite_track_status.hpp b/src/config/favorite_track_status.hpp
new file mode 100644
index 00000000000..6defd245dcc
--- /dev/null
+++ b/src/config/favorite_track_status.hpp
@@ -0,0 +1,74 @@
+//
+// SuperTuxKart - a fun racing game with go-kart
+// Copyright (C) 2024 SuperTuxKart-Team
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 3
+// of the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+#ifndef HEADER_FAVORITE_TRACK_STATUS_HPP
+#define HEADER_FAVORITE_TRACK_STATUS_HPP
+
+#include "utils/leak_check.hpp"
+
+#include <irrString.h>
+
+#include <string>
+#include <set>
+#include <unordered_map>
+
+using namespace irr;
+
+class TrackManager;
+class UTFWriter;
+class XMLNode;
+
+/** Class that stores the favorite tracks of a player profile, grouped by
+ * group name. The data is loaded from and saved to players.xml, and may
+ * reference tracks that are not currently installed (e.g. removed addons).
+ * \ingroup config
+ */
+class FavoriteTrackStatus
+{
+private:
+    LEAK_CHECK()
+
+    /** Maps a group name to the set of favorite track idents in that group. */
+    std::unordered_map<std::string, std::set<std::string> > m_favorite_tracks;
+
+public:
+    friend class TrackManager;
+
+    static const std::string DEFAULT_FAVORITE_GROUP_NAME;
+
+    FavoriteTrackStatus(const XMLNode *node);
+
+    virtual ~FavoriteTrackStatus();
+
+    void save(UTFWriter &out);
+
+    bool isFavoriteTrack(std::string ident);
+
+    void addFavoriteTrack(std::string ident, std::string group = DEFAULT_FAVORITE_GROUP_NAME);
+
+    void removeFavoriteTrack(std::string ident, std::string group = DEFAULT_FAVORITE_GROUP_NAME);
+}; // class FavoriteTrackStatus
+
+#endif
+
+/*EOF*/
diff --git a/src/config/player_manager.cpp b/src/config/player_manager.cpp
index ed1b04bccce..f3637e49fde 100644
--- a/src/config/player_manager.cpp
+++ b/src/config/player_manager.cpp
@@ -480,7 +480,9 @@ void PlayerManager::setCurrentPlayer(PlayerProfile *player)
     m_current_player = player;
     if(m_current_player)
+    {
         m_current_player->computeActive();
+    }
     if (player_has_changed)
         story_mode_timer->playerHasChanged();
diff --git a/src/config/player_profile.cpp b/src/config/player_profile.cpp
index 980f805bee0..52c4016f965 100644
--- a/src/config/player_profile.cpp
+++ b/src/config/player_profile.cpp
@@ -27,6 +27,7 @@
 #include "karts/kart_properties.hpp"
 #include "karts/kart_properties_manager.hpp"
 #include "online/online_player_profile.hpp"
+#include "tracks/track_manager.hpp"
 #include "utils/string_utils.hpp"
 //------------------------------------------------------------------------------
@@ -78,6 +79,7 @@ PlayerProfile::PlayerProfile(const XMLNode* node)
     m_remember_password = false;
     m_story_mode_status = NULL;
     m_achievements_status = NULL;
+    m_favorite_track_status = NULL;
     m_default_kart_color = 0.0f;
     m_icon_filename = "";
@@ -104,6 +106,7 @@ PlayerProfile::~PlayerProfile()
 {
     delete m_story_mode_status;
     delete m_achievements_status;
+    delete m_favorite_track_status;
 #ifdef DEBUG
     m_magic_number = 0xDEADBEEF;
 #endif
@@ -111,21 +114,29 @@ PlayerProfile::~PlayerProfile()
 //------------------------------------------------------------------------------
-/** This function loads the achievement and story mode data. These can only
- *  be loaded after the UnlockManager is created, which needs the karts
- *  and tracks to be loaded first.
+/** This function loads the achievement, story mode and favorites data.
+ *  These can only be loaded after the karts and tracks.
  */
 void PlayerProfile::loadRemainingData(const XMLNode *node)
 {
+    assert(m_story_mode_status == NULL);
     const XMLNode *xml_story_mode = node->getNode("story-mode");
-    m_story_mode_status =
-        unlock_manager->createStoryModeStatus(xml_story_mode);
+    m_story_mode_status = unlock_manager->createStoryModeStatus(xml_story_mode);
+
+    assert(m_achievements_status == NULL);
     const XMLNode *xml_achievements = node->getNode("achievements");
     m_achievements_status = AchievementsManager::get()
-        ->createAchievementsStatus(xml_achievements, m_unique_id == 1);
+        ->createAchievementsStatus(xml_achievements, m_unique_id == 1);
+
+    // We first load the list of all favorite tracks
+    // Some favorites may correspond to uninstalled addons, so we do not sanitize the strings
+    assert(m_favorite_track_status == NULL);
+    const XMLNode *xml_favorites = node->getNode("favorites");
+    m_favorite_track_status = new FavoriteTrackStatus(xml_favorites);
+
     // Fix up any potentially missing icons.
addIcon(); -} // initRemainingData +} // loadRemainingData //------------------------------------------------------------------------------ /** Initialises the story- and achievement data structure in case of the first @@ -136,6 +147,7 @@ void PlayerProfile::initRemainingData() m_story_mode_status = unlock_manager->createStoryModeStatus(); m_achievements_status = AchievementsManager::get()->createAchievementsStatus(); + m_favorite_track_status = new FavoriteTrackStatus(NULL); addIcon(); } // initRemainingData @@ -223,11 +235,14 @@ void PlayerProfile::save(UTFWriter &out) if (player != NULL && (getName() == player->getName())) is_current_player = true; - if(m_story_mode_status) + if (m_story_mode_status) m_story_mode_status->save(out, is_current_player); - if(m_achievements_status) + if (m_achievements_status) m_achievements_status->save(out); + + if (m_favorite_track_status) + m_favorite_track_status->save(out); } out << " \n"; } // save diff --git a/src/config/player_profile.hpp b/src/config/player_profile.hpp index f70c5fe99e6..d0ecf5c9811 100644 --- a/src/config/player_profile.hpp +++ b/src/config/player_profile.hpp @@ -20,6 +20,7 @@ #define HEADER_PLAYER_PROFILE_HPP #include "challenges/story_mode_status.hpp" +#include "config/favorite_track_status.hpp" #include "network/remote_kart_info.hpp" #include "utils/leak_check.hpp" #include "utils/no_copy.hpp" @@ -111,8 +112,12 @@ class PlayerProfile : public NoCopy /** The complete challenge state. */ StoryModeStatus *m_story_mode_status; + /** The complete achievement data. */ AchievementsStatus *m_achievements_status; + /** The favorite tracks selected by this player. */ + FavoriteTrackStatus *m_favorite_track_status; + public: PlayerProfile(const core::stringw &name, bool is_guest = false); @@ -289,6 +294,23 @@ class PlayerProfile : public NoCopy // ---------------------------------------------------------------------------------------- StoryModeStatus* getStoryModeStatus() { return m_story_mode_status; } // ---------------------------------------------------------------------------------------- + FavoriteTrackStatus* getFavoriteTrackStatus() { return m_favorite_track_status; } + // ---------------------------------------------------------------------------------------- + bool isFavoriteTrack(std::string ident) + { + return m_favorite_track_status->isFavoriteTrack(ident); + } // getNumBestTrophies + void addFavoriteTrack(std::string ident, std::string group = + FavoriteTrackStatus::DEFAULT_FAVORITE_GROUP_NAME) + { + m_favorite_track_status->addFavoriteTrack(ident, group); + } // getNumBestTrophies + void removeFavoriteTrack(std::string ident, std::string group = + FavoriteTrackStatus::DEFAULT_FAVORITE_GROUP_NAME) + { + m_favorite_track_status->removeFavoriteTrack(ident, group); + } // getNumBestTrophies + // ---------------------------------------------------------------------------------------- /** If a session was saved, return the id of the saved user. 
*/ int getSavedUserId() const { diff --git a/src/guiengine/skin.cpp b/src/guiengine/skin.cpp index a3b32407101..be5a7a12941 100644 --- a/src/guiengine/skin.cpp +++ b/src/guiengine/skin.cpp @@ -1143,6 +1143,7 @@ void Skin::drawRatingBar(Widget *w, const core::recti &rect, RatingBarWidget *ratingBar = (RatingBarWidget*)w; const ITexture *texture = SkinConfig::m_render_params["rating::neutral"].getImage(); + int all_steps = ratingBar->getSteps(); const int texture_w = texture->getSize().Width / 4; const int texture_h = texture->getSize().Height; const float aspect_ratio = 1.0f; @@ -1216,9 +1217,10 @@ void Skin::drawRatingBar(Widget *w, const core::recti &rect, star_rect.LowerRightCorner.Y = y_from + star_h; int step = ratingBar->getStepsOfStar(i); + int begin = roundf(2.0f * step / (all_steps - 1)) * texture_w; // Round to the closest actual image - const core::recti source_area(texture_w * step, 0, - texture_w * (step + 1), texture_h); + const core::recti source_area(begin, 0, + begin + texture_w, texture_h); draw2DImage(texture, star_rect, source_area, @@ -1473,76 +1475,44 @@ void Skin::drawRibbonChild(const core::recti &rect, Widget* widget, } } + // Handle focus from players - //Handle drawing for the first player int nPlayersOnThisItem = 0; if (mark_focused) { - if (use_glow) - { - // don't mark filler items as focused - if (widget->m_properties[PROP_ID] == RibbonWidget::NO_ITEM_ID) - return; - - static float glow_effect = 0; - - const float dt = GUIEngine::getLatestDt(); - glow_effect += dt * 3; - if (glow_effect > 6.2832f /* 2*PI */) glow_effect -= 6.2832f; - float grow = 10.0f * sinf(glow_effect); - - const int glow_center_x = rect.UpperLeftCorner.X - + rect.getWidth() / 2; - const int glow_center_y = rect.LowerRightCorner.Y; - - ITexture* tex_ficonhighlight = - SkinConfig::m_render_params["focusHalo::neutral"] - .getImage(); - const int texture_w = tex_ficonhighlight->getSize().Width; - const int texture_h = tex_ficonhighlight->getSize().Height; - - core::recti source_area(0, 0, texture_w, texture_h); - - float scale = (float)std::min(irr_driver->getActualScreenSize().Height / 1080.0f, - irr_driver->getActualScreenSize().Width / 1350.0f); - int size = (int)((90.0f + grow) * scale); - const core::recti rect2(glow_center_x - size, - glow_center_y - size / 2, - glow_center_x + size, - glow_center_y + size / 2); + // Don't mark filler items as focused + if (widget->m_properties[PROP_ID] == RibbonWidget::NO_ITEM_ID) + return; + + // Hide focus when not forced + if (!always_show_selection && !focused && !parent_focused) + return; + + nPlayersOnThisItem = 1; + } - draw2DImage(tex_ficonhighlight, rect2, - source_area, - /*clipping*/ 0, - /*color*/ 0, - /*alpha*/true); - } - // if we're not using glow, draw square focus instead - else + for (unsigned i = 1; i < MAX_PLAYER_COUNT; i++) + { + // ---- Draw selection for other players than player 1 + if (parentRibbon->isFocusedForPlayer(i) && + parentRibbon->getSelectionIDString(i) == + widget->m_properties[PROP_ID]) { - const bool show_focus = (focused || parent_focused); - - if (!always_show_selection && !show_focus) return; - - // don't mark filler items as focused - if (widget->m_properties[PROP_ID] == RibbonWidget::NO_ITEM_ID) - return; - - drawBoxFromStretchableTexture(parentRibbonWidget, rect, - SkinConfig::m_render_params["squareFocusHalo1::neutral"]); nPlayersOnThisItem++; } - } // end if mark_focused + } - //Handle drawing for everyone else - for (unsigned i = 1; i < MAX_PLAYER_COUNT; i++) + // Handle drawing for everyone else + for 
(unsigned i = MAX_PLAYER_COUNT - 1; i >= 1; i--) { // ---- Draw selection for other players than player 1 if (parentRibbon->isFocusedForPlayer(i) && parentRibbon->getSelectionIDString(i) == widget->m_properties[PROP_ID]) { + nPlayersOnThisItem--; + short red_previous = parentRibbonWidget->m_skin_r; short green_previous = parentRibbonWidget->m_skin_g; short blue_previous = parentRibbonWidget->m_skin_b; @@ -1589,10 +1559,55 @@ void Skin::drawRibbonChild(const core::recti &rect, Widget* widget, parentRibbonWidget->m_skin_g = green_previous; parentRibbonWidget->m_skin_b = blue_previous; } - nPlayersOnThisItem++; } } + // Handle drawing for the first player + if (mark_focused) + { + if (use_glow) + { + static float glow_effect = 0; + + const float dt = GUIEngine::getLatestDt(); + glow_effect += dt * 3; + if (glow_effect > 6.2832f /* 2*PI */) glow_effect -= 6.2832f; + float grow = 10.0f * sinf(glow_effect); + + const int glow_center_x = rect.UpperLeftCorner.X + + rect.getWidth() / 2; + const int glow_center_y = rect.LowerRightCorner.Y; + + ITexture* tex_ficonhighlight = + SkinConfig::m_render_params["focusHalo::neutral"] + .getImage(); + const int texture_w = tex_ficonhighlight->getSize().Width; + const int texture_h = tex_ficonhighlight->getSize().Height; + + core::recti source_area(0, 0, texture_w, texture_h); + + float scale = (float)std::min(irr_driver->getActualScreenSize().Height / 1080.0f, + irr_driver->getActualScreenSize().Width / 1350.0f); + int size = (int)((90.0f + grow) * scale); + const core::recti rect2(glow_center_x - size, + glow_center_y - size / 2, + glow_center_x + size, + glow_center_y + size / 2); + + draw2DImage(tex_ficonhighlight, rect2, + source_area, + /*clipping*/ 0, + /*color*/ 0, + /*alpha*/true); + } + // if we're not using glow, draw square focus instead + else + { + drawBoxFromStretchableTexture(parentRibbonWidget, rect, + SkinConfig::m_render_params["squareFocusHalo1::neutral"]); + } + } // end if mark_focused + drawIconButton(rect, widget, pressed, focused); } // end if icon ribbons @@ -2562,6 +2577,13 @@ void Skin::drawBadgeOn(const Widget* widget, const core::recti& rect) "down.png"); doDrawBadge(texture, rect, max_icon_size, false); } + if (widget->m_badges & HEART_BADGE) + { + float max_icon_size = 0.43f; + video::ITexture* texture = irr_driver->getTexture(FileManager::GUI_ICON, + "heart.png"); + doDrawBadge(texture, rect, max_icon_size, false); + } } // drawBadgeOn // ----------------------------------------------------------------------------- diff --git a/src/guiengine/widget.hpp b/src/guiengine/widget.hpp index 1b8ce3d8364..5f61738591f 100644 --- a/src/guiengine/widget.hpp +++ b/src/guiengine/widget.hpp @@ -80,7 +80,9 @@ namespace GUIEngine /** A anchor badge to indicate that this player receives a handicap */ ANCHOR_BADGE = 256, /** A down arrow badge to indicate new addons for downloading */ - DOWN_BADGE = 512 + DOWN_BADGE = 512, + /** A heart badge, to indicate e.g. a favorite track */ + HEART_BADGE = 1024 }; diff --git a/src/guiengine/widgets/rating_bar_widget.hpp b/src/guiengine/widgets/rating_bar_widget.hpp index ff3147e15a9..a18db253016 100644 --- a/src/guiengine/widgets/rating_bar_widget.hpp +++ b/src/guiengine/widgets/rating_bar_widget.hpp @@ -53,8 +53,6 @@ namespace GUIEngine RatingBarWidget(); virtual ~RatingBarWidget() {} - - void add() OVERRIDE; @@ -69,6 +67,11 @@ namespace GUIEngine /** Get the current number of stars of the widget. */ int getStarNumber() {return m_stars; }; + + /** Set steps in each star. 
Interpolate steps-2 values between 0 star and 1 star. */ + void setSteps(int steps) { m_steps = steps; } + + int getSteps() { return m_steps; } int getStepsOfStar(int index); diff --git a/src/states_screens/arenas_screen.cpp b/src/states_screens/arenas_screen.cpp index fff9cc6e050..ec840e1d8eb 100644 --- a/src/states_screens/arenas_screen.cpp +++ b/src/states_screens/arenas_screen.cpp @@ -20,6 +20,7 @@ #include "config/user_config.hpp" #include "graphics/irr_driver.hpp" #include "guiengine/widget.hpp" +#include "guiengine/widgets/check_box_widget.hpp" #include "guiengine/widgets/dynamic_ribbon_widget.hpp" #include "guiengine/widgets/icon_button_widget.hpp" #include "io/file_manager.hpp" @@ -57,6 +58,12 @@ void ArenasScreen::loadedFromFile() void ArenasScreen::beforeAddingWidget() { + // Add user-defined group to track groups + track_manager->setFavoriteTrackStatus(PlayerManager::getCurrentPlayer()->getFavoriteTrackStatus()); + + CheckBoxWidget* favorite_cb = getWidget("favorite"); + assert( favorite_cb != NULL ); + favorite_cb->setState(false); // Dynamically add tabs RibbonWidget* tabs = this->getWidget("trackgroups"); @@ -79,6 +86,8 @@ void ArenasScreen::beforeAddingWidget() //I18N: track group name FOR_GETTEXT_ONLY( _("All") ) //I18N: track group name + FOR_GETTEXT_ONLY( _("Favorites") ) + //I18N: track group name FOR_GETTEXT_ONLY( _("Standard") ) //I18N: track group name FOR_GETTEXT_ONLY( _("Add-Ons") ) @@ -204,8 +213,20 @@ void ArenasScreen::eventCallback(Widget* widget, const std::string& name, const Track* clicked_track = track_manager->getTrack(selection); if (clicked_track != NULL) { - TrackInfoScreen::getInstance()->setTrack(clicked_track); - TrackInfoScreen::getInstance()->push(); + if (getWidget("favorite")->getState()) + { + if(PlayerManager::getCurrentPlayer()->isFavoriteTrack(clicked_track->getIdent())) + PlayerManager::getCurrentPlayer()->removeFavoriteTrack(clicked_track->getIdent()); + else + PlayerManager::getCurrentPlayer()->addFavoriteTrack(clicked_track->getIdent()); + + buildTrackList(); + } + else + { + TrackInfoScreen::getInstance()->setTrack(clicked_track); + TrackInfoScreen::getInstance()->push(); + } } // clickedTrack != NULL } // if random_track @@ -225,6 +246,9 @@ void ArenasScreen::eventCallback(Widget* widget, const std::string& name, const void ArenasScreen::buildTrackList() { + // Add user-defined group to track groups + track_manager->setFavoriteTrackStatus(PlayerManager::getCurrentPlayer()->getFavoriteTrackStatus()); + DynamicRibbonWidget* w = this->getWidget("tracks"); assert( w != NULL ); @@ -237,6 +261,7 @@ void ArenasScreen::buildTrackList() bool soccer_mode = RaceManager::get()->getMinorMode() == RaceManager::MINOR_MODE_SOCCER; bool arenas_have_navmesh = false; + PtrVector tracks; if (curr_group_name == ALL_ARENA_GROUPS_ID) { @@ -275,19 +300,8 @@ void ArenasScreen::buildTrackList() continue; } } - - if (PlayerManager::getCurrentPlayer()->isLocked(curr->getIdent())) - { - w->addItem( _("Locked : solve active challenges to gain access to more!"), - "locked", curr->getScreenshotFile(), LOCKED_BADGE ); - } - else - { - w->addItem(curr->getName(), curr->getIdent(), curr->getScreenshotFile(), 0, - IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE ); - } + tracks.push_back(curr); } - } else { @@ -327,17 +341,29 @@ void ArenasScreen::buildTrackList() continue; } } + tracks.push_back(curr); + } + } + tracks.insertionSort(); - if (PlayerManager::getCurrentPlayer()->isLocked(curr->getIdent())) - { - w->addItem( _("Locked : solve active challenges to gain access to 
more!"), - "locked", curr->getScreenshotFile(), LOCKED_BADGE ); - } - else - { - w->addItem(curr->getName(), curr->getIdent(), curr->getScreenshotFile(), 0, - IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE ); - } + for (unsigned int i = 0; i < tracks.size(); i++) + { + Track *curr = tracks.get(i); + if (PlayerManager::getCurrentPlayer()->isLocked(curr->getIdent())) + { + w->addItem( _("Locked : solve active challenges to gain access to more!"), + "locked", curr->getScreenshotFile(), LOCKED_BADGE ); + } + else if (PlayerManager::getCurrentPlayer()->isFavoriteTrack(curr->getIdent())) + { + w->addItem(curr->getName(), curr->getIdent(), + curr->getScreenshotFile(), HEART_BADGE, + IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE); + } + else + { + w->addItem(curr->getName(), curr->getIdent(), curr->getScreenshotFile(), 0, + IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE ); } } if (arenas_have_navmesh || RaceManager::get()->getNumLocalPlayers() > 1 || diff --git a/src/states_screens/dialogs/vote_dialog.cpp b/src/states_screens/dialogs/vote_dialog.cpp index 56b4bf0b498..9849101e322 100644 --- a/src/states_screens/dialogs/vote_dialog.cpp +++ b/src/states_screens/dialogs/vote_dialog.cpp @@ -53,6 +53,7 @@ VoteDialog::VoteDialog(const std::string & addon_id) m_rating_widget = getWidget("rating"); assert(m_rating_widget != NULL); + m_rating_widget->setSteps(2); m_rating_widget->setRating(0); m_rating_widget->allowVoting(); m_options_widget = getWidget("options"); diff --git a/src/states_screens/easter_egg_screen.cpp b/src/states_screens/easter_egg_screen.cpp index 6c348c2f726..771a21f0d3a 100644 --- a/src/states_screens/easter_egg_screen.cpp +++ b/src/states_screens/easter_egg_screen.cpp @@ -121,6 +121,9 @@ void EasterEggScreen::eventCallback(Widget* widget, const std::string& name, con void EasterEggScreen::beforeAddingWidget() { + // Add user-defined group to track groups + track_manager->setFavoriteTrackStatus(PlayerManager::getCurrentPlayer()->getFavoriteTrackStatus()); + Screen::init(); // Dynamically add tabs RibbonWidget* tabs = this->getWidget("trackgroups"); @@ -201,6 +204,9 @@ void EasterEggScreen::init() void EasterEggScreen::buildTrackList() { + // Add user-defined group to track groups + track_manager->setFavoriteTrackStatus(PlayerManager::getCurrentPlayer()->getFavoriteTrackStatus()); + DynamicRibbonWidget* tracks_widget = this->getWidget("tracks"); assert( tracks_widget != NULL ); @@ -213,6 +219,8 @@ void EasterEggScreen::buildTrackList() const std::string curr_group_name = tabs->getSelectionIDString(0); + PtrVector tracks; + // Build track list if (curr_group_name == ALL_TRACK_GROUPS_ID) { @@ -227,21 +235,8 @@ void EasterEggScreen::buildTrackList() if (curr->isArena() || curr->isSoccer()) continue; if (curr->isInternal()) continue; - if (PlayerManager::getCurrentPlayer()->isLocked(curr->getIdent())) - { - tracks_widget->addItem( _("Locked : solve active challenges to gain access to more!"), - "locked", curr->getScreenshotFile(), LOCKED_BADGE, - IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE); - } - else - { - tracks_widget->addItem(curr->getName(), curr->getIdent(), - curr->getScreenshotFile(), 0, - IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE ); - m_random_track_list.push_back(curr->getIdent()); - } + tracks.push_back(curr); } - } else { @@ -258,19 +253,34 @@ void EasterEggScreen::buildTrackList() if (curr->isSoccer()) continue; if (curr->isInternal()) continue; - if (PlayerManager::getCurrentPlayer()->isLocked(curr->getIdent())) - { - tracks_widget->addItem( _("Locked : solve active challenges to gain access 
to more!"), - "locked", curr->getScreenshotFile(), LOCKED_BADGE, - IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE); - } - else - { - tracks_widget->addItem(curr->getName(), curr->getIdent(), - curr->getScreenshotFile(), 0 /* no badge */, - IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE ); - m_random_track_list.push_back(curr->getIdent()); - } + tracks.push_back(curr); + } + } + tracks.insertionSort(); + + for (int n=0; nisLocked(curr->getIdent())) + { + tracks_widget->addItem( _("Locked : solve active challenges to gain access to more!"), + "locked", curr->getScreenshotFile(), LOCKED_BADGE, + IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE); + } + else if (PlayerManager::getCurrentPlayer()->isFavoriteTrack(curr->getIdent())) + { + tracks_widget->addItem(curr->getName(), curr->getIdent(), + curr->getScreenshotFile(), HEART_BADGE, + IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE ); + m_random_track_list.push_back(curr->getIdent()); + } + else + { + tracks_widget->addItem(curr->getName(), curr->getIdent(), + curr->getScreenshotFile(), 0, + IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE ); + m_random_track_list.push_back(curr->getIdent()); } } diff --git a/src/states_screens/online/online_profile_achievements.cpp b/src/states_screens/online/online_profile_achievements.cpp index 73742ad3eb3..4822c24998c 100644 --- a/src/states_screens/online/online_profile_achievements.cpp +++ b/src/states_screens/online/online_profile_achievements.cpp @@ -228,8 +228,8 @@ void BaseOnlineProfileAchievements::displayResults() if (a->getInfo()->isSecret() && !a->isAchieved()) continue; ListWidget::ListCell title(a->getInfo()->getName(), -1, 2); - ListWidget::ListCell goals(a->getGoalProgressAsString(), -1, 1); - ListWidget::ListCell progress(a->getProgressAsString(), -1, 1); + ListWidget::ListCell goals(a->getGoalProgressAsString(), -1, 1, true); + ListWidget::ListCell progress(a->getProgressAsString(), -1, 1, true); row.push_back(title); row.push_back(goals); row.push_back(progress); diff --git a/src/states_screens/online/tracks_screen.cpp b/src/states_screens/online/tracks_screen.cpp index 49d53a0a8aa..434c8e9f658 100644 --- a/src/states_screens/online/tracks_screen.cpp +++ b/src/states_screens/online/tracks_screen.cpp @@ -219,6 +219,9 @@ void TracksScreen::beforeAddingWidget() { Screen::init(); + // Add user-defined group to track groups + track_manager->setFavoriteTrackStatus(PlayerManager::getCurrentPlayer()->getFavoriteTrackStatus()); + m_selected_track = NULL; m_search_track = NULL; m_timer = getWidget("timer"); @@ -567,6 +570,9 @@ void TracksScreen::init() */ void TracksScreen::buildTrackList() { + // Add user-defined group to track groups + track_manager->setFavoriteTrackStatus(PlayerManager::getCurrentPlayer()->getFavoriteTrackStatus()); + DynamicRibbonWidget* tracks_widget = this->getWidget("tracks"); RibbonWidget* tabs = this->getWidget("trackgroups"); @@ -575,7 +581,6 @@ void TracksScreen::buildTrackList() m_random_track_list.clear(); const std::string& curr_group_name = tabs->getSelectionIDString(0); - const int track_amount = (int)track_manager->getNumberOfTracks(); // First build a list of all tracks to be displayed // (e.g. exclude arenas, ...) 
@@ -587,35 +592,75 @@ void TracksScreen::buildTrackList() assert(clrp); } PtrVector tracks; - for (int n = 0; n < track_amount; n++) + if (curr_group_name == ALL_TRACK_GROUPS_ID) { - Track* curr = track_manager->getTrack(n); - core::stringw search_text; - if (m_search_track) - { - search_text = m_search_track->getText(); - search_text.make_lower(); - } - if (!search_text.empty() && - curr->getName().make_lower().find(search_text.c_str()) == -1) - continue; - if (RaceManager::get()->getMinorMode() == RaceManager::MINOR_MODE_EASTER_EGG - && !curr->hasEasterEggs()) - continue; - if (!is_network && - (curr->isArena() || curr->isSoccer() || curr->isInternal())) - continue; - if (curr_group_name != ALL_TRACK_GROUPS_ID && - !curr->isInGroup(curr_group_name)) continue; - if (is_network && - clrp->getAvailableTracks().find(curr->getIdent()) == - clrp->getAvailableTracks().end()) + const int track_amount = (int)track_manager->getNumberOfTracks(); + for (int n = 0; n < track_amount; n++) { - continue; - } - tracks.push_back(curr); - } // for ngetTrack(n); + core::stringw search_text; + if (m_search_track) + { + search_text = m_search_track->getText(); + search_text.make_lower(); + } + if (!search_text.empty() && + curr->getName().make_lower().find(search_text.c_str()) == -1) + continue; + if (RaceManager::get()->getMinorMode() == RaceManager::MINOR_MODE_EASTER_EGG + && !curr->hasEasterEggs()) + continue; + if (!is_network && + (curr->isArena() || curr->isSoccer() || curr->isInternal())) + continue; + if (is_network && + clrp->getAvailableTracks().find(curr->getIdent()) == + clrp->getAvailableTracks().end()) + { + continue; + } + tracks.push_back(curr); + } // for n curr_tracks = track_manager->getTracksInGroup(curr_group_name); + const std::vector& curr_arenas = track_manager->getArenasInGroup(curr_group_name, false); + const std::vector& curr_soccers = track_manager->getArenasInGroup(curr_group_name, true); + curr_tracks.insert(curr_tracks.end(), curr_arenas.begin(), curr_arenas.end()); + curr_tracks.insert(curr_tracks.end(), curr_soccers.begin(), curr_soccers.end()); + + const int track_amount = (int)curr_tracks.size(); + + for (int n = 0; n < track_amount; n++) + { + Track* curr = track_manager->getTrack(curr_tracks[n]); + core::stringw search_text; + if (m_search_track) + { + search_text = m_search_track->getText(); + search_text.make_lower(); + } + if (!search_text.empty() && + curr->getName().make_lower().find(search_text.c_str()) == -1) + continue; + if (RaceManager::get()->getMinorMode() == RaceManager::MINOR_MODE_EASTER_EGG + && !curr->hasEasterEggs()) + continue; + if (!is_network && + (curr->isArena() || curr->isSoccer() || curr->isInternal())) + continue; + if (is_network && + clrp->getAvailableTracks().find(curr->getIdent()) == + clrp->getAvailableTracks().end()) + { + continue; + } + tracks.push_back(curr); + } // for ngetScreenshotFile(), LOCKED_BADGE, IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE); } + else if (PlayerManager::getCurrentPlayer()->isFavoriteTrack(curr->getIdent())) + { + tracks_widget->addItem(curr->getName(), curr->getIdent(), + curr->getScreenshotFile(), HEART_BADGE, + IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE ); + m_random_track_list.push_back(curr->getIdent()); + } else { tracks_widget->addItem(curr->getName(), diff --git a/src/states_screens/race_result_gui.cpp b/src/states_screens/race_result_gui.cpp index 7a2c4fb00d9..bf22b9fc669 100644 --- a/src/states_screens/race_result_gui.cpp +++ b/src/states_screens/race_result_gui.cpp @@ -2307,11 +2307,12 @@ void 
RaceResultGUI::displayBenchmarkSummary() // Draw the results core::dimension2du rect = font->getDimension(title_text.c_str()); current_y += rect.Height; + int info_y = current_y; current_x /= 2; font = GUIEngine::getFont(); rect = font->getDimension(title_text.c_str()); - core::stringw info_text[5]; + core::stringw info_text[8]; core::stringw value = StringUtils::toWString( StringUtils::timeToString(float(profiler.getTotalFrametime())/1000000.0f, 2, true)); info_text[0] = _("Test duration: %s", value); @@ -2320,7 +2321,7 @@ void RaceResultGUI::displayBenchmarkSummary() value = StringUtils::toWString(profiler.getFPSMetricsLow()); info_text[2] = _("Steady FPS: %s", value); value = StringUtils::toWString(profiler.getFPSMetricsMid()); - info_text[3] = _("Mostly Stable FPS: %s", value); // TODO - better name + info_text[3] = _("Mostly Steady FPS: %s", value); // TODO - better name value = StringUtils::toWString(profiler.getFPSMetricsHigh()); info_text[4] = _("Typical FPS: %s", value); @@ -2330,10 +2331,35 @@ void RaceResultGUI::displayBenchmarkSummary() font->draw(info_text[i].c_str(), pos, white_color, true, false); current_y += (5 * rect.Height) / 4; } - // TODO : Draw info on the settings - // * resolution - // * Render scale - // * graphics settings + + // Draw info on the graphical settings + current_y = info_y; + current_x *= 3; + + value = StringUtils::toWString(UserConfigParams::m_real_width); + info_text[0] = _("Horizontal resolution: %s", value); + value = StringUtils::toWString(UserConfigParams::m_real_height); + info_text[1] = _("Vertical resolution: %s", value); + info_text[2] = UserConfigParams::m_dynamic_lights ? _("Dynamic lighting: ON") + : _("Dynamic lighting: OFF"); + value = StringUtils::toWString(UserConfigParams::m_dynamic_lights ? + UserConfigParams::m_scale_rtts_factor * 100 : 100); + info_text[3] = _("Render resolution: %s%%", value); + info_text[4] = UserConfigParams::m_mlaa ? _("Anti-aliasing: ON") + : _("Anti-aliasing : OFF"); + info_text[5] = UserConfigParams::m_degraded_IBL ? _("Image-based lighting: OFF") + : _("Image-based lighting: ON"); + info_text[6] = UserConfigParams::m_ssao ? 
_("Ambient occlusion: ON") + : _("Ambient occlusion: OFF"); + value = StringUtils::toWString(UserConfigParams::m_shadows_resolution); + info_text[7] = _("Shadow resolution: %s", value); + + for (int i=0; i<8; i++) + { + pos = core::rect(current_x, current_y, current_x, current_y); + font->draw(info_text[i].c_str(), pos, white_color, true, false); + current_y += (5 * rect.Height) / 4; + } #endif } // displayBenchmarkSummary diff --git a/src/states_screens/tracks_and_gp_screen.cpp b/src/states_screens/tracks_and_gp_screen.cpp index 97e1b6205ec..a8ab6542edd 100644 --- a/src/states_screens/tracks_and_gp_screen.cpp +++ b/src/states_screens/tracks_and_gp_screen.cpp @@ -18,10 +18,12 @@ #include "states_screens/tracks_and_gp_screen.hpp" #include "challenges/unlock_manager.hpp" +#include "config/favorite_track_status.hpp" #include "config/player_manager.hpp" #include "config/user_config.hpp" #include "graphics/stk_tex_manager.hpp" #include "guiengine/widget.hpp" +#include "guiengine/widgets/check_box_widget.hpp" #include "guiengine/widgets/dynamic_ribbon_widget.hpp" #include "guiengine/widgets/icon_button_widget.hpp" #include "io/file_manager.hpp" @@ -86,8 +88,21 @@ void TracksAndGPScreen::eventCallback(Widget* widget, const std::string& name, if (track) { - TrackInfoScreen::getInstance()->setTrack(track); - TrackInfoScreen::getInstance()->push(); + // In favorite edit mode, switch the status of the selected track + if (getWidget("favorite")->getState()) + { + if(PlayerManager::getCurrentPlayer()->isFavoriteTrack(track->getIdent())) + PlayerManager::getCurrentPlayer()->removeFavoriteTrack(track->getIdent()); + else + PlayerManager::getCurrentPlayer()->addFavoriteTrack(track->getIdent()); + + buildTrackList(); + } + else // Normal mode + { + TrackInfoScreen::getInstance()->setTrack(track); + TrackInfoScreen::getInstance()->push(); + } } // if clicked_track } // name=="tracks" @@ -132,6 +147,7 @@ void TracksAndGPScreen::eventCallback(Widget* widget, const std::string& name, { StateManager::get()->escapePressed(); } + // The favorite track checkbox does not need any specific additional handling } // eventCallback // ----------------------------------------------------------------------------- @@ -139,9 +155,17 @@ void TracksAndGPScreen::eventCallback(Widget* widget, const std::string& name, void TracksAndGPScreen::beforeAddingWidget() { Screen::init(); + + // Add user-defined group to track groups + track_manager->setFavoriteTrackStatus(PlayerManager::getCurrentPlayer()->getFavoriteTrackStatus()); + RibbonWidget* tabs = getWidget("trackgroups"); tabs->clearAllChildren(); + CheckBoxWidget* favorite_cb = getWidget("favorite"); + assert( favorite_cb != NULL ); + favorite_cb->setState(false); + const std::vector& groups = track_manager->getAllTrackGroups(); const int group_amount = (int)groups.size(); @@ -156,6 +180,8 @@ void TracksAndGPScreen::beforeAddingWidget() //I18N: track group name FOR_GETTEXT_ONLY( _("All") ) //I18N: track group name + FOR_GETTEXT_ONLY( _("Favorites") ) + //I18N: track group name FOR_GETTEXT_ONLY( _("Standard") ) //I18N: track group name FOR_GETTEXT_ONLY( _("Add-Ons") ) @@ -250,11 +276,14 @@ void TracksAndGPScreen::init() } // init // ----------------------------------------------------------------------------- -/** Rebuild the list of tracks and GPs. This need to be recomputed e.g. to - * take unlocked tracks into account. +/** Rebuild the list of tracks and GPs. This need to be recomputed to + * take unlocked tracks into account, when changing the current track group, etc. 
*/ void TracksAndGPScreen::buildTrackList() { + // Add user-defined group to track groups + track_manager->setFavoriteTrackStatus(PlayerManager::getCurrentPlayer()->getFavoriteTrackStatus()); + DynamicRibbonWidget* tracks_widget = this->getWidget("tracks"); RibbonWidget* tabs = this->getWidget("trackgroups"); @@ -264,28 +293,49 @@ void TracksAndGPScreen::buildTrackList() const std::string& curr_group_name = tabs->getSelectionIDString(0); - const int track_amount = (int)track_manager->getNumberOfTracks(); - // First build a list of all tracks to be displayed // (e.g. exclude arenas, ...) PtrVector tracks; - for (int n = 0; n < track_amount; n++) + if (curr_group_name == ALL_TRACK_GROUPS_ID) { - Track* curr = track_manager->getTrack(n); - if (RaceManager::get()->getMinorMode() == RaceManager::MINOR_MODE_EASTER_EGG - && !curr->hasEasterEggs()) - continue; - core::stringw search_text = m_search_box->getText(); - search_text.make_lower(); - if (!search_text.empty() && - curr->getName().make_lower().find(search_text.c_str()) == -1) - continue; - if (curr->isArena() || curr->isSoccer()||curr->isInternal()) continue; - if (curr_group_name != ALL_TRACK_GROUPS_ID && - !curr->isInGroup(curr_group_name)) continue; + const int track_amount = (int)track_manager->getNumberOfTracks(); + for (int n = 0; n < track_amount; n++) + { + Track* curr = track_manager->getTrack(n); + if (curr->isArena() || curr->isSoccer() || curr->isInternal()) continue; + if (RaceManager::get()->getMinorMode() == RaceManager::MINOR_MODE_EASTER_EGG + && !curr->hasEasterEggs()) + continue; + core::stringw search_text = m_search_box->getText(); + search_text.make_lower(); + if (!search_text.empty() && + curr->getName().make_lower().find(search_text.c_str()) == -1) + continue; + + tracks.push_back(curr); + } // for n& curr_tracks = track_manager->getTracksInGroup(curr_group_name); + const int track_amount = (int)curr_tracks.size(); - tracks.push_back(curr); - } // for ngetTrack(curr_tracks[n]); + if (curr->isArena() || curr->isSoccer() || curr->isInternal()) continue; + if (RaceManager::get()->getMinorMode() == RaceManager::MINOR_MODE_EASTER_EGG + && !curr->hasEasterEggs()) + continue; + core::stringw search_text = m_search_box->getText(); + search_text.make_lower(); + if (!search_text.empty() && + curr->getName().make_lower().find(search_text.c_str()) == -1) + continue; + + tracks.push_back(curr); + } // for ngetScreenshotFile(), LOCKED_BADGE, IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE); } + else if (PlayerManager::getCurrentPlayer()->isFavoriteTrack(curr->getIdent())) + { + tracks_widget->addItem(curr->getName(), curr->getIdent(), + curr->getScreenshotFile(), HEART_BADGE, + IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE); + m_random_track_list.push_back(curr->getIdent()); + } else { - tracks_widget->addItem(curr->getName(), - curr->getIdent(), + tracks_widget->addItem(curr->getName(), curr->getIdent(), curr->getScreenshotFile(), 0, IconButtonWidget::ICON_PATH_TYPE_ABSOLUTE); m_random_track_list.push_back(curr->getIdent()); diff --git a/src/tracks/track.cpp b/src/tracks/track.cpp index e2219c50812..d5934ec0e67 100644 --- a/src/tracks/track.cpp +++ b/src/tracks/track.cpp @@ -23,6 +23,7 @@ #include "audio/music_manager.hpp" #include "challenges/challenge_status.hpp" #include "challenges/unlock_manager.hpp" +#include "config/favorite_track_status.hpp" #include "config/player_manager.hpp" #include "config/stk_config.hpp" #include "config/user_config.hpp" @@ -212,12 +213,15 @@ bool Track::operator<(const Track &other) const PlayerProfile *p = 
PlayerManager::getCurrentPlayer(); bool this_is_locked = p->isLocked(getIdent()); bool other_is_locked = p->isLocked(other.getIdent()); - if(this_is_locked == other_is_locked) - { - return getSortName() < other.getSortName(); - } - else + bool this_is_favorite = p->isFavoriteTrack(getIdent()); + bool other_is_favorite = p->isFavoriteTrack(other.getIdent()); + // Locked tracks cannot be favorite, so favorites < normal < locked + if (this_is_favorite != other_is_favorite) + return this_is_favorite; + else if(this_is_locked != other_is_locked) return other_is_locked; + else + return getSortName() < other.getSortName(); } // operator< //----------------------------------------------------------------------------- @@ -612,7 +616,7 @@ void Track::loadTrackInfo() m_all_modes.push_back(tm); } - if(m_groups.size()==0) m_groups.push_back(DEFAULT_GROUP_NAME); + if(m_groups.size()==0) m_groups.push_back(FavoriteTrackStatus::DEFAULT_FAVORITE_GROUP_NAME); const XMLNode *xml_node = root->getNode("curves"); if(xml_node) loadCurves(*xml_node); diff --git a/src/tracks/track_manager.cpp b/src/tracks/track_manager.cpp index aae7cd80d30..677cbe9d8e7 100644 --- a/src/tracks/track_manager.cpp +++ b/src/tracks/track_manager.cpp @@ -43,7 +43,9 @@ std::vector TrackManager::m_track_search_path; /** Constructor (currently empty). The real work happens in loadTrackList. */ TrackManager::TrackManager() -{} // TrackManager +{ + m_current_favorite_status = NULL; +} // TrackManager //----------------------------------------------------------------------------- /** Delete all tracks. @@ -151,12 +153,18 @@ std::vector TrackManager::getAllTrackIdentifiers() void TrackManager::loadTrackList() { m_all_track_dirs.clear(); + m_track_group_names.clear(); m_track_groups.clear(); m_arena_group_names.clear(); m_soccer_arena_group_names.clear(); m_arena_groups.clear(); m_soccer_arena_groups.clear(); + + m_track_groups_no_custom.clear(); + m_arena_groups_no_custom.clear(); + m_soccer_arena_groups_no_custom.clear(); + m_track_avail.clear(); // This function is called when install a new addons, delete previous // tracks @@ -254,18 +262,47 @@ void TrackManager::removeTrack(const std::string &ident) (track->isArena() ? m_arena_groups : (track->isSoccer() ? m_soccer_arena_groups : m_track_groups)); + + Group2Indices &group_2_indices_no_custom = + (track->isArena() ? m_arena_groups_no_custom : + (track->isSoccer() ? m_soccer_arena_groups_no_custom : + m_track_groups_no_custom)); std::vector &group_names = (track->isArena() ? m_arena_group_names : (track->isSoccer() ? 
m_soccer_arena_group_names : m_track_group_names)); - const std::vector& groups=track->getGroups(); + std::vector groups=track->getGroups(); + if (m_current_favorite_status) + { + for (auto it = m_current_favorite_status->m_favorite_tracks.begin(); + it != m_current_favorite_status->m_favorite_tracks.end(); it++) + { // User-defined groups + if (it->second.find(ident) != it->second.end()) + { + groups.push_back(it->first); + } + } + } + for(unsigned int i=0; i &indices = group_2_indices[groups[i]]; + std::vector &indices = group_2_indices_no_custom[groups[i]]; std::vector::iterator j; j = std::find(indices.begin(), indices.end(), index); + if (j != indices.end()) + indices.erase(j); + + // If the track was the last member of a group, + // completely remove the group + if(indices.size()==0) + { + group_2_indices_no_custom.erase(groups[i]); + } // if complete group must be removed + + indices = group_2_indices[groups[i]]; + j = std::find(indices.begin(), indices.end(), index); assert(j!=indices.end()); indices.erase(j); @@ -276,7 +313,7 @@ void TrackManager::removeTrack(const std::string &ident) group_2_indices.erase(groups[i]); std::vector::iterator it_g; it_g = std::find(group_names.begin(), group_names.end(), - groups[i]); + groups[i]); assert(it_g!=group_names.end()); group_names.erase(it_g); } // if complete group must be removed @@ -290,11 +327,19 @@ void TrackManager::removeTrack(const std::string &ident) Group2Indices &g2i = (i==0 ? m_soccer_arena_groups : (i==1 ? m_arena_groups : m_track_groups)); + Group2Indices &g2i_nc = (i==0 ? m_soccer_arena_groups_no_custom : + (i==1 ? m_arena_groups_no_custom : + m_track_groups_no_custom)); Group2Indices::iterator j; - for(j=g2i.begin(); j!=g2i.end(); j++) + for(j = g2i.begin(); j != g2i.end(); j++) + { + for(unsigned int i = 0; i < (*j).second.size(); i++) + if((*j).second[i] > index) (*j).second[i]--; + } // for j in group_2_indices + for(j = g2i_nc.begin(); j != g2i_nc.end(); j++) { - for(unsigned int i=0; i<(*j).second.size(); i++) - if((*j).second[i]>index) (*j).second[i]--; + for(unsigned int i = 0; i < (*j).second.size(); i++) + if((*j).second[i] > index) (*j).second[i]--; } // for j in group_2_indices } // for i in arenas, tracks @@ -312,12 +357,18 @@ void TrackManager::updateGroups(const Track* track) { if (track->isInternal()) return; - const std::vector& new_groups = track->getGroups(); + std::string ident = track->getIdent(); + std::vector new_groups = track->getGroups(); Group2Indices &group_2_indices = (track->isArena() ? m_arena_groups : (track->isSoccer() ? m_soccer_arena_groups : m_track_groups)); + + Group2Indices &group_2_indices_no_custom = + (track->isArena() ? m_arena_groups_no_custom : + (track->isSoccer() ? m_soccer_arena_groups_no_custom : + m_track_groups_no_custom)); std::vector &group_names = (track->isArena() ? 
m_arena_group_names : @@ -332,9 +383,93 @@ void TrackManager::updateGroups(const Track* track) if(!group_exists) group_names.push_back(new_groups[i]); group_2_indices[new_groups[i]].push_back((int)m_tracks.size()-1); + group_2_indices_no_custom[new_groups[i]].push_back((int)m_tracks.size()-1); + } + + if (m_current_favorite_status) + { + for (auto it = m_current_favorite_status->m_favorite_tracks.begin(); + it != m_current_favorite_status->m_favorite_tracks.end(); it++) + { // User-defined groups + if (it->second.find(ident) != it->second.end()) + { + bool group_exists = group_2_indices.find(ident) + != group_2_indices.end(); + if(!group_exists) + group_names.push_back(ident); + group_2_indices[ident].push_back((int)m_tracks.size()-1); + } + } } } // updateGroups +// ---------------------------------------------------------------------------- +/** \brief Adds a player's favorite track status to define the custom group + */ +void TrackManager::setFavoriteTrackStatus(FavoriteTrackStatus *status) +{ + clearFavoriteTrackStatus(); + + m_current_favorite_status = status; + + if (status == NULL) + { + return; + } + + // Add all user-defined groups + for (auto it = status->m_favorite_tracks.begin(); it != status->m_favorite_tracks.end(); it++) + { + for (auto it_name = it->second.begin(); it_name != it->second.end(); it_name++) + { + int id = getTrackIndexByIdent(*it_name); + Track *track = m_tracks[id]; + + Group2Indices &group_2_indices = + (track->isArena() ? m_arena_groups : + (track->isSoccer() ? m_soccer_arena_groups : + m_track_groups)); + + group_2_indices[it->first].push_back(id); + } + } + for (int i = 0; i < 3; i++) + { + Group2Indices &g2i = (i==0 ? m_soccer_arena_groups : + (i==1 ? m_arena_groups : + m_track_groups)); + std::vector &gn = (i==0 ? m_soccer_arena_group_names : + (i==1 ? m_arena_group_names : + m_track_group_names)); + gn.clear(); + for (auto it = g2i.begin(); it != g2i.end(); it++) + { + std::sort(it->second.begin(), it->second.end()); + auto unique_end = std::unique(it->second.begin(), it->second.end()); + it->second.erase(unique_end, it->second.end()); + gn.push_back(it->first); + } + // Make sure the order of groups are right + std::sort(gn.begin(), gn.end(), [&, g2i](std::string &a, std::string &b)->bool{ + int x = g2i.find(a)->second[0], y = g2i.find(b)->second[0]; + return x == y ? a < b : x < y; + }); + } +} // addFavoriteTrack + +// ---------------------------------------------------------------------------- +/** \brief Clears the list of active favorite tracks, used e.g. when switching + * between player profiles. + */ +void TrackManager::clearFavoriteTrackStatus() +{ + m_track_groups = m_track_groups_no_custom; + m_arena_groups = m_arena_groups_no_custom; + m_soccer_arena_groups = m_soccer_arena_groups_no_custom; + + m_current_favorite_status = NULL; +} // clearFavoriteTracks + // ---------------------------------------------------------------------------- int TrackManager::getTrackIndexByIdent(const std::string& ident) const { diff --git a/src/tracks/track_manager.hpp b/src/tracks/track_manager.hpp index 2328b0baafa..c95b1d14cfb 100644 --- a/src/tracks/track_manager.hpp +++ b/src/tracks/track_manager.hpp @@ -19,6 +19,8 @@ #ifndef HEADER_TRACK_MANAGER_HPP #define HEADER_TRACK_MANAGER_HPP +#include "config/favorite_track_status.hpp" + #include #include #include @@ -44,13 +46,13 @@ class TrackManager Tracks m_tracks; typedef std::map > Group2Indices; - /** List of all racing track groups. */ + /** List of all track indexes for each racing track group. 
*/ Group2Indices m_track_groups; - /** List of all arena groups. */ + /** List of all arena indexes for each arena group. */ Group2Indices m_arena_groups; - /** List of all soccer arena groups. */ + /** List of all soccer arena indexes for each soccer arena group. */ Group2Indices m_soccer_arena_groups; /** List of the names of all groups containing tracks */ @@ -62,11 +64,18 @@ class TrackManager /** List of the names of all groups containing soccer arenas */ std::vector<std::string> m_soccer_arena_group_names; + /** Same as above but without user-defined groups. */ + Group2Indices m_track_groups_no_custom; + Group2Indices m_arena_groups_no_custom; + Group2Indices m_soccer_arena_groups_no_custom; + /** Flag if this track is available or not. Tracks are set unavailable * if they are not available on all clients (applies only to network mode) */ std::vector<bool> m_track_avail; + FavoriteTrackStatus *m_current_favorite_status; + void updateGroups(const Track* track); public: @@ -74,6 +83,14 @@ class TrackManager ~TrackManager(); static void removeTrackSearchDirs(); static void addTrackSearchDir(const std::string &dir); + + /** Adds a track to the special group of favorite tracks. + * We need a special treatment, because the list of tracks in this group + * depends on the player-profile, not on the track data. */ + void setFavoriteTrackStatus(FavoriteTrackStatus *status); + + void clearFavoriteTrackStatus(); + /** Returns a list of all track identifiers. */ std::vector<std::string> getAllTrackIdentifiers();
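To summarize the TrackManager changes above: the *_no_custom maps keep the groups as derived from the track data alone, and setFavoriteTrackStatus() overlays one player's user-defined groups on top of them, so switching profiles only needs clearFavoriteTrackStatus() rather than a full reload. A standalone sketch of that overlay idea follows; the map-of-sets layout for the favorite groups is inferred from how m_favorite_tracks is iterated in the diff, and all names and sample data here are hypothetical, not the actual FavoriteTrackStatus API.

// Standalone sketch (not STK code) of the group-merging idea behind
// setFavoriteTrackStatus() / clearFavoriteTrackStatus().
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

using Group2Indices  = std::map<std::string, std::vector<int>>;
// Assumed shape: group name -> set of track identifiers in that group.
using FavoriteGroups = std::map<std::string, std::set<std::string>>;

// Base groups built from track data only, kept untouched so the per-player
// groups can be dropped again when switching profiles.
struct GroupTable
{
    Group2Indices base;    // corresponds to m_track_groups_no_custom
    Group2Indices current; // corresponds to m_track_groups
};

// Apply one player's favorite groups on top of the base table.
void setFavoriteStatus(GroupTable& table,
                       const FavoriteGroups& favorites,
                       const std::map<std::string, int>& ident_to_index)
{
    table.current = table.base; // clearFavoriteTrackStatus() equivalent
    for (const auto& [group, idents] : favorites)
        for (const std::string& ident : idents)
        {
            auto it = ident_to_index.find(ident);
            if (it != ident_to_index.end())
                table.current[group].push_back(it->second);
        }
}

int main()
{
    GroupTable table;
    table.base = {{"standard", {0, 1, 2}}, {"add-ons", {3}}};
    std::map<std::string, int> index = {{"hacienda", 1}, {"volcano", 3}};

    FavoriteGroups favorites = {{"Favorites", {"hacienda", "volcano"}}};
    setFavoriteStatus(table, favorites, index);

    for (const auto& [group, ids] : table.current)
    {
        std::cout << group << ":";
        for (int id : ids) std::cout << " " << id;
        std::cout << "\n";
    }
    return 0;
}

Keeping the base table untouched is what lets clearFavoriteTrackStatus() restore the non-custom groups by plain assignment, as the patch does with m_track_groups = m_track_groups_no_custom.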